In [9]:
import csv
import pandas as pd


In [10]:
df = pd.read_csv("IN-gov-txts.tsv", sep = '\t', encoding = 'utf-8',\
                 quoting = csv.QUOTE_MINIMAL)
df['id'] = range(1, df.shape[0]+1)
print df.shape
print df.iloc[:5, :]
print df.iloc[145:155, :]
# df.set_index('id')

(1087, 3)
                                                 txt general_label  id
0  ATTORNEYS FOR APPELLANT\nRobert D. MacGill\nMi...         Civil   1
1  ATTORNEYS FOR APPELLANT\nJohn D. LaDue\nErin L...      Criminal   2
2  ATTORNEYS FOR APPELLANTS\nBryan L. Ciyou\nLori...    Government   3
3  ATTORNEYS FOR APPELLANTS\nRobert E. Lehman\nIn...      Criminal   4
4  ATTORNEYS FOR APPELLANTS\n\nATTORNEYS FOR APPE...      Criminal   5
                                                   txt general_label   id
145  ATTORNEY FOR APPELLANT\n\nATTORNEYS FOR APPELL...         Civil  146
146  ATTORNEY FOR APPELLANT\nDeborah Markisohn\nInd...      Criminal  147
147  ATTORNEY FOR APPELLANT\nRobert T. Sanders III\...      Criminal  148
148  ATTORNEY FOR APPELLANT\nKimberly A. Jackson\nI...      Criminal  149
149  ATTORNEYS FOR APPELLANTS\n\nATTORNEYS FOR AMIC...      Criminal  150
150  ATTORNEYS FOR APPELLANT\n\nATTORNEYS FOR APPEL...      Criminal  151
151  ATTORNEYS FOR APPELLANT\n\nATTORNEYS FOR 

In [11]:
# df = df[pd.notnull(df['general_label'])]
df.ix[pd.isnull(df['general_label']),'general_label'] = -1
print df.shape
print df.index
print len(df)
print df['id'][:10]
print df.iloc[245:255, :]
print "there are %d unique labels in total" % len(set(df['general_label']))

(1087, 3)
Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086],
           dtype='int64', length=1087)
1087
0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
Name: id, dtype: int64
                                                   txt   general_label   id
245  ATTORNEYS FOR APPELLANTS\nCENTER TOWNSHIP TRUS...           Civil  246
246  ATTORNEY FOR APPELLANT\n\nATTORNEYS FOR APPELL...           Civil  247
247  ATTORNEYS FOR APPELLANT\n\nATTORNEYS FOR APPEL...  Constitutional  248
248  ATTORNEY FOR APPELLANT\nAmy O. Carson\nIndiana...        Criminal  249
249  ATTORNEY FOR APPELLANT\n\nATTORNEYS FOR APPELL...          Family  250
250  ATTORNEY FOR APPELLANT\nJim Brugh\nLogansport,...              -1  251
251  ATTORNEY FOR APPELLANTS\nKatherine A. Brown-He...              -1  252
252  ATTORNEY FOR APPELLANT\n\nATTORNEYS FOR APPELL...              -1  25

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.semi_supervised import LabelPropagation
from sklearn import svm
from Word2VecUtility import Word2VecUtility
from gensim.models import Word2Vec
import numpy as np

In [13]:
def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0.
    # Index2word is a list that contains the names of the words in
    # the model's vocabulary. Convert it to a set, for speed
    index2word_set = set(model.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    #
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0.
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        counter = counter + 1.
    return reviewFeatureVecs


def getCleanReviews(reviews):
    clean_reviews = []
    for review in reviews['txt']:
        clean_reviews.append( Word2VecUtility.review_to_wordlist\
                             ( review, remove_stopwords=True ))
    return clean_reviews

In [7]:
# only use those labeled data, supervised learning
model = Word2Vec.load("300features_40minwords_10context")
num_features = 300
np.random.seed(520)
mask = np.random.rand(len(df)) < 0.8
train = df[mask]
test = df[~mask]

print "Creating average feature vecs for %d train case txt" %len(train)
trainDataVecs = getAvgFeatureVecs( getCleanReviews(train), model, num_features )
print "Creating average feature vecs for %d test case txt" %len(test)
testDataVecs = getAvgFeatureVecs( getCleanReviews(test), model, num_features )

classifiers = { 'random forest' : RandomForestClassifier(n_estimators = 100),
                'svm' : svm.SVC(C = 5.0)}
for name, classifier in classifiers.items():
    print "Fitting a %s to labeled training data..." %(name)
    classifier.fit( trainDataVecs, train['general_label'])
    train_pred = classifier.predict( trainDataVecs )
    test_pred = classifier.predict( testDataVecs)
    # accuracy
    print("accuracy on train: %d/100"%(sum(train_pred == train['general_label'])/len(train)*100))
    print("accuracy on test: %d/100"%(sum(test_pred == test['general_label'])/len(test) *100))
    # Write the train predict results
#     output = pd.DataFrame( data={"id":test["id"], "predict_label":test_pred} )
#     output.to_csv( "Word2Vec_AverageVectors_predict.csv", index=False, quoting=3 )
#     print "Wrote Word2Vec_AverageVectors_predict.csv"

Creating average feature vecs for 127 train case txt
Creating average feature vecs for 24 test case txt
Fitting a svm to labeled training data...
accuracy on train: 0/100
accuracy on test: 0/100
Fitting a random forest to labeled training data...
accuracy on train: 100/100
accuracy on test: 0/100


In [16]:
# semisupervised learning
model = Word2Vec.load("300features_40minwords_10context")
num_features = 300
labeled_index = np.where(df['general_label'] != -1)
print "Creating average feature vecs for all(%d) case txt file..." %len(df)
print "there are %d unique labels in total" % len(set(df['general_label']))

X = getAvgFeatureVecs( getCleanReviews(df), model, num_features )
# replace nan value with 0.
X[pd.isnull(X)] = 0
y = df['general_label']
label_prop_model = LabelPropagation(kernel='rbf', gamma=20, \
                                    n_neighbors=3, alpha=1, max_iter=30, tol=0.001)
label_prop_model.fit(X, y)
y_pred = label_prop_model.predict(X)
print 'accuracy: %d/%d' %(np.sum(y_pred[labeled_index] == y.ix[labeled_index]),\
                          len(labeled_index[0]))

Creating average feature vecs for all(1087) case txt file...
there are 16 unique labels in total
accuracy: 106/250


In [40]:
X[pd.isnull(X)] = 0
print np.isfinite(X).all()
np.where(np.sum(np.isfinite(X), axis=1) != 300)

True


(array([], dtype=int64),)

In [75]:
print type(labeled_index)
print len(labeled_index[0])

<type 'tuple'>
151


In [15]:
print sum(y == 'Criminal')

106
