In [1]:
import csv
import numpy as np
import pandas as pd
from Word2VecUtility import Word2VecUtility
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

In [3]:
df = pd.read_csv("IN-gov-txts.tsv", sep = '\t', encoding = 'utf-8',\
                 quoting = csv.QUOTE_MINIMAL)
df['id'] = range(1, df.shape[0]+1)
print df.shape
print df.iloc[:5, :]
print df.iloc[245:255, :]
# df.set_index('id')

(1087, 3)
                                                 txt general_label  id
0  ATTORNEYS FOR APPELLANT\nRobert D. MacGill\nMi...         Civil   1
1  ATTORNEYS FOR APPELLANT\nJohn D. LaDue\nErin L...      Criminal   2
2  ATTORNEYS FOR APPELLANTS\nBryan L. Ciyou\nLori...    Government   3
3  ATTORNEYS FOR APPELLANTS\nRobert E. Lehman\nIn...      Criminal   4
4  ATTORNEYS FOR APPELLANTS\n\nATTORNEYS FOR APPE...      Criminal   5
                                                   txt   general_label   id
245  ATTORNEYS FOR APPELLANTS\nCENTER TOWNSHIP TRUS...           Civil  246
246  ATTORNEY FOR APPELLANT\n\nATTORNEYS FOR APPELL...           Civil  247
247  ATTORNEYS FOR APPELLANT\n\nATTORNEYS FOR APPEL...  Constitutional  248
248  ATTORNEY FOR APPELLANT\nAmy O. Carson\nIndiana...        Criminal  249
249  ATTORNEY FOR APPELLANT\n\nATTORNEYS FOR APPELL...          Family  250
250  ATTORNEY FOR APPELLANT\nJim Brugh\nLogansport,...             NaN  251
251  ATTORNEY FOR APPELLANTS\nKa

In [3]:
# for supervised learning
df = df[pd.notnull(df['general_label'])]
np.random.seed(520)
mask = np.random.rand(len(df)) < 0.8
train = df[mask]
test = df[~mask]

train_clean = []
print "Cleaning and parsing the case txt files...\n"
for txt in train["txt"]:
    train_clean.append(" ".join(Word2VecUtility.review_to_wordlist(txt, True)))

test_clean = []
for txt in test["txt"]:
    test_clean.append(" ".join(Word2VecUtility.review_to_wordlist(txt, True)))
print "finished!"

Cleaning and parsing the case txt files...

finished!


In [6]:
# for supervised learning

# ****** Create a bag of words from the training set
#
print "Creating the bag of words...\n"

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer = "word",   \
                         tokenizer = None,    \
                         preprocessor = None, \
                         stop_words = None,   \
                         max_features = 50)

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
train_data_features = vectorizer.fit_transform(train_clean)
test_data_features = vectorizer.transform(test_clean)
# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()
test_data_features = test_data_features.toarray()
# ******* Train a random forest using the bag of words
#
classifiers = { 'random forest' : RandomForestClassifier(n_estimators = 20),
                'svm' : svm.SVC(C = 5.0)}
for name, classifier in classifiers.items():
    print "Fitting a %s to labeled training data..." %(name)
    classifier.fit( train_data_features, train['general_label'])
    train_pred = classifier.predict( train_data_features )
    test_pred = classifier.predict( test_data_features)
    # accuracy
    print("accuracy on train: %d/100" %(sum(train_pred == train['general_label'])/len(train) *100))
    print("accuracy on test: %d/100" %(sum(test_pred == test['general_label'])/len(test) *100))
# Write the train predict results
# output = pd.DataFrame( data={"id":test["id"], "predict_label":test_pred} )
# output.to_csv( "Word2Vec_AverageVectors_predict.csv", index=False, quoting=3 )
# print "Wrote Word2Vec_AverageVectors_predict.csv"

Creating the bag of words...

Fitting a svm to labeled training data...
accuracy on train: 100/100
accuracy on test: 0/100
Fitting a random forest to labeled training data...
accuracy on train: 100/100
accuracy on test: 0/100


In [4]:
# for semi-supervised learning
df.ix[pd.isnull(df['general_label']),'general_label'] = -1
print df.shape
print df.index
print len(df)
print df['id'][:10]
print df.iloc[245:255, :]
print "there are %d unique labels in total" % len(set(df['general_label']))

(1087, 3)
Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086],
           dtype='int64', length=1087)
1087
0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
Name: id, dtype: int64
                                                   txt   general_label   id
245  ATTORNEYS FOR APPELLANTS\nCENTER TOWNSHIP TRUS...           Civil  246
246  ATTORNEY FOR APPELLANT\n\nATTORNEYS FOR APPELL...           Civil  247
247  ATTORNEYS FOR APPELLANT\n\nATTORNEYS FOR APPEL...  Constitutional  248
248  ATTORNEY FOR APPELLANT\nAmy O. Carson\nIndiana...        Criminal  249
249  ATTORNEY FOR APPELLANT\n\nATTORNEYS FOR APPELL...          Family  250
250  ATTORNEY FOR APPELLANT\nJim Brugh\nLogansport,...              -1  251
251  ATTORNEY FOR APPELLANTS\nKatherine A. Brown-He...              -1  252
252  ATTORNEY FOR APPELLANT\n\nATTORNEYS FOR APPELL...              -1  25

In [8]:
# for semi-supervised learning
from sklearn.semi_supervised import LabelPropagation
clean_df = []
print "Cleaning and parsing the case txt files...\n"
for txt in df["txt"]:
    clean_df.append(" ".join(Word2VecUtility.review_to_wordlist(txt, True)))
    
print "Creating the bag of words...\n"

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer = "word",   \
                         tokenizer = None,    \
                         preprocessor = None, \
                         stop_words = None,   \
                         max_features = 300)
X = vectorizer.fit_transform(clean_df).toarray()
print X.shape
y = df['general_label']
label_prop_model = LabelPropagation(kernel='rbf', gamma=20, \
                                    n_neighbors=3, alpha=1, max_iter=30, tol=0.001)
label_prop_model.fit(X, y)
y_pred = label_prop_model.predict(X)
labeled_index = np.where(df['general_label'] != -1)
print 'accuracy: %d/%d' %(np.sum(y_pred[labeled_index] == y.ix[labeled_index]),\
                          len(labeled_index[0]))

Cleaning and parsing the case txt files...

Creating the bag of words...

(1087, 300)
accuracy: 1/250
