In [None]:
# Shuffle the examples, then split them into training and dev sets.
# `text_features` and `target` are defined outside this cell; indexing them with
# an index array assumes both are NumPy arrays of equal length -- TODO confirm.
# The permutation must be stored: it is the index array used on the next line
# (previously the result was discarded and `shuffle` was undefined).
shuffle = np.random.permutation(len(text_features))
text_features, target = text_features[shuffle], target[shuffle]
# The first 3200 shuffled examples form the training set; the rest form the dev set.
train_data, train_labels = text_features[:3200], target[:3200]
dev_data, dev_labels = text_features[3200:], target[3200:]

In [46]:
import os
import csv
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Collect the names of all cleaned transcript files in ./case_txts.
# NOTE: os.listdir returns entries in arbitrary order, and the train/test split
# later in this cell is purely positional, so it depends on this listing order.
file_names = [name for name in os.listdir("./case_txts") if name.endswith("_clean.txt")]

# Read every docket: texts[i] holds the list of lines of file_names[i].
# The `with` block closes each handle as soon as the file has been read instead
# of leaving one open descriptor per docket until garbage collection.
texts = []
for f in file_names:
    with open("./case_txts/" + f, 'r') as docket:
        texts.append(docket.readlines())
    
# Load the verdict table: one [row[13], row[36]] pair per CSV row.
# Column 13 is later compared against the docket numbers derived from the file
# names and column 36 is later converted to the integer verdict code.
# Every row is kept as-is (presumably including a header row, which would simply
# never match a docket number -- verify against the CSV).
verdict = []
with open('SCDB_2017_01_caseCentered_Citation.csv') as verdict_file:
    verdict_csv = csv.reader(verdict_file)
    for row in verdict_csv:
        verdict.append([row[13], row[36]])
    
# Turn each file name into a docket number in the CSV's "NN-NNNN" style.
##### fix dockets with the word 'orig' in title
clean_suffix = '_clean.txt'
docket_number = []
for file_name in file_names:
    # Remove the exact suffix. str.strip('_clean.txt') treats its argument as a
    # SET of characters and strips any run of them from both ends, which can eat
    # trailing letters that belong to the docket itself.
    if file_name.endswith(clean_suffix):
        new_file_name = file_name[:-len(clean_suffix)]
    else:
        new_file_name = file_name
    # NOTE(review): this still strips any leading/trailing 'O' or 'A' characters
    # (not the literal substring "OA"); kept as-is because the file-naming scheme
    # is not visible here -- confirm it cannot clip a real docket character.
    new_file_name = new_file_name.strip('OA')
    if new_file_name.count('_') >= 2:
        # Keep everything before the first underscore at index 4 or later.
        # find() returns -1 when there is none; slicing with -1 would silently
        # drop the last character, so only cut when a position was found.
        cut = new_file_name.find('_', 4)
        if cut != -1:
            new_file_name = new_file_name[:cut]
    new_file_name = new_file_name.replace('_', '-')
    docket_number.append(new_file_name)

# combine verdict (y variable) with the index of the texts (x variable)
# 31 docket numbers that were not clean(includes random characters or can't find it at all)
# so for the sake of simplicity for now I'm just excluding them from the training set
#
# Index the CSV rows by docket number once, so each file needs a single dictionary
# lookup instead of a scan over every CSV row. Each docket maps to the list of its
# verdict values in CSV order, so a docket that appears in several rows still
# yields one (verdict, text) pair per row, in the same order as a full scan.
verdicts_by_docket = {}
for csv_docket, csv_verdict in verdict:
    verdicts_by_docket.setdefault(csv_docket, []).append(csv_verdict)

winning_party = []   # [docket number, verdict string] per matched row
texts_cleaned = []   # text of the matching docket, aligned with winning_party
for docket, docket_text in zip(docket_number, texts):
    # Dockets with no CSV match are skipped (excluded from the data set).
    for outcome in verdicts_by_docket.get(docket, ()):
        winning_party.append([docket, outcome])
        texts_cleaned.append(docket_text)

# Turn each docket's list of lines into ONE string so CountVectorizer receives
# exactly one document per docket. Flattening all lines into a single list would
# create one document per line, which breaks the one-to-one alignment with the
# verdict list whenever a file has more than one line (or none at all).
texts_cleaned = np.array([''.join(docket_lines) for docket_lines in texts_cleaned])

# Verdict codes, for reference:
#   0 = no favorable disposition for petitioning party apparent
#   1 = petitioning party received a favorable disposition
#   2 = favorable disposition for petitioning party unclear
# Build the y variable: the integer verdict code of every [docket, verdict] pair.
verdict_cleaned = np.array([int(pair[1]) for pair in winning_party])

# Separate texts and verdicts into test (first third) and train (remaining two
# thirds) sets using ONE shared split index, so both arrays are always cut at the
# same position.
assert len(texts_cleaned) == len(verdict_cleaned), (
    "texts and verdicts are misaligned: %d texts vs %d verdicts"
    % (len(texts_cleaned), len(verdict_cleaned)))
# Floor division keeps the index an int on both Python 2 and Python 3.
split_at = len(texts_cleaned) // 3
test_text, train_text = texts_cleaned[:split_at], texts_cleaned[split_at:]
test_verdict, train_verdict = verdict_cleaned[:split_at], verdict_cleaned[split_at:]

# Bag-of-words counts, fitted on the training texts only.
# could optimize by changing min_df only take counts above n
count_vect = CountVectorizer(min_df = 3)
x_train_counts = count_vect.fit_transform(train_text)

# tf-idf to normalize word frequency (also fitted on the training counts only)
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

# Transform the test texts with the vocabulary and idf weights learned above.
x_test_counts = count_vect.transform(test_text)
x_test_tfidf = tfidf_transformer.transform(x_test_counts)
# The matrices are deliberately left as returned by the transformers: the earlier
# bare `.toarray()` calls built full dense copies whose results were discarded.

def _fit_and_score(model):
    """Fit `model` on the tf-idf training matrix and evaluate it on the test split.

    Reads the module-level x_train_tfidf / train_verdict / x_test_tfidf /
    test_verdict arrays built earlier in this cell.

    Returns a tuple (fitted_model, test_predictions, accuracy), where accuracy is
    the fraction of test predictions equal to the true verdict.
    """
    fitted = model.fit(x_train_tfidf, train_verdict)
    predictions = fitted.predict(x_test_tfidf)
    return fitted, predictions, np.mean(predictions == test_verdict)


# The prints below use a single pre-formatted string so they produce the same
# text under both the Python 2 print statement and the Python 3 print function.

# Multinomial Naive Bayes
# could optimize by changing alpha
clf, naive_baynes_prediction, nb_accuracy = _fit_and_score(MultinomialNB(alpha = 0.1))
print('Accuracy of Bag-of-Words Multinomial Naive Baynes Model: %s' % round(nb_accuracy,4))

# Random Forest
# could optimize by changing n_estimators
# random_state is fixed so the reported accuracy is reproducible across runs.
forest = RandomForestClassifier(n_estimators = 100, random_state = 0)
forest, random_forest_prediction, rf_accuracy = _fit_and_score(forest)
print('Accuracy of Bag-of-Words Random Forest Model: %s' % round(rf_accuracy,4))

# logistic regression
# could optimize by changing C
lr_model = LogisticRegression(C = 1, penalty = "l2")
lr_model_train, lr_prediction, lr_accuracy = _fit_and_score(lr_model)
print('Accuracy of Bag-of-Words Logistic Regression Model: %s' % round(lr_accuracy,4))

Accuracy of Bag-of-Words Multinomial Naive Baynes Model: 0.6842
Accuracy of Bag-of-Words Random Forest Model: 0.6868
Accuracy of Bag-of-Words Logistic Regression Model: 0.6842


In [2]:
# Report the (documents, vocabulary terms) shape of the tf-idf matrices.
# A single pre-formatted string prints identically on Python 2 and Python 3;
# the shape tuple is wrapped in a 1-tuple so `%` formats it as one value.
print('x_train shape: %s' % (x_train_tfidf.shape,))
print('x_test shape: %s' % (x_test_tfidf.shape,))

x_train shape: (760, 16181)
x_test shape: (380, 16181)


In [47]:
# Display the random forest's predicted verdict codes for the test split.
random_forest_prediction

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1,