In [130]:
import os
import csv
import numpy as np
import pandas as pd
import zipfile
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import column_or_1d
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [131]:
#unzip and read turns files into pandas df
zf1 = zipfile.ZipFile('./turns_part1.zip') 
turns1 = pd.read_csv(zf1.open('turns_part1.csv'))
zf2 = zipfile.ZipFile('./turns_part2.zip') 
turns2 = pd.read_csv(zf2.open('turns_part2.csv'))
zf3 = zipfile.ZipFile('./turns_part3.zip') 
turns3 = pd.read_csv(zf3.open('turns_part3.csv'))
zf4 = zipfile.ZipFile('./turns_part4.zip') 
turns4 = pd.read_csv(zf4.open('turns_part4.csv'))

In [132]:
###Setup Data

#use original scdb winning party as verdict
#verdict value 0 = no favorable disposition for petitioning part apparent
#verdict value 1 = petitioning party received a favorable disposition
verdict = []
verdict_csv = csv.reader(open('../SCDB_2017_01_caseCentered_Citation.csv'))
for row in verdict_csv:
    docket_number = re.sub('-', '_', row[13])
    docket_number = re.sub(' ORIG', '_orig', docket_number)
    case_id = row[10]+'_'+docket_number
    verdict.append([case_id, row[12], row[17], row[19], row[36], row[39], row[40]])
    
verdict_header = verdict.pop(0)
verdict = pd.DataFrame(verdict, columns = verdict_header)
    
print "Verdict extraction done!"
    
#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#remove _t01 and _t02 from transcript_id in turns_combined
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t01','')
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t02','')

#get advocate sides
advocates = pd.read_json('./advocate_dict.json')

advocate_turns = []

for index, row in turns_combined.iterrows():
    if row['speaker_role'] == 'scotus_justice':
        advocate_turns.append('scotus_justice')
    else:
        speaker = row['speaker']
        transcript_id = row['transcript_id']
        try:
            lawyer_side = advocates.ix[speaker][transcript_id]
            advocate_turns.append(lawyer_side)
        except:
            advocate_turns.append('None')

#insert advocate side to the turns_combined dataframe
turns_combined['lawyer_side'] = advocate_turns


#create speaking length column
turns_combined = turns_combined.assign(speaking_length = lambda x: x.text_stop - x.text_start)
turns_combined.loc[turns_combined['speaking_length'] < 0, 'speaking_length'] = 1

print "Advocate side extraction done!"

#pivot turns files by transcript_id and lawyer_side for texts
lawyer_side_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'lawyer_side', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))
#reset index
lawyer_side_text_pivot = lawyer_side_text_pivot.reset_index()

#drop columns with no lawyer side tags
lawyer_side_text_pivot = lawyer_side_text_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#count of number of times speaker spoke
counts_pivot = pd.pivot_table(turns_combined[['transcript_id', 'lawyer_side']],
                              index = 'transcript_id',
                              columns = 'lawyer_side',
                              aggfunc=len,
                              fill_value=0)

#reset index
counts_pivot = counts_pivot.reset_index()

#drop columns with no lawyer side tags
counts_pivot = counts_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#rename headers
counts_pivot.columns = ['transcript_id', 'appellant/petitioner_count', 'appellee/respondent_count',
                       'scotus_justice_count']

#length of speaker speaking
length_pivot = pd.pivot_table(turns_combined[['transcript_id', 'speaking_length', 'lawyer_side']],
                              index = 'transcript_id',
                              columns = 'lawyer_side',
                              values = 'speaking_length',
                              aggfunc=np.sum,
                              fill_value=0)

#reset index
length_pivot = length_pivot.reset_index()

#drop columns with no lawyer side tags
length_pivot = length_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#rename headers
length_pivot.columns = ['transcript_id', 'appellant/petitioner_length', 'appellee/respondent_length',
                       'scotus_justice_length']

#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'speaker_role', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))

#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#concatenate pivots together
pivots_concate = pd.concat([lawyer_side_text_pivot,
                            counts_pivot[counts_pivot.columns[1:4]], 
                            length_pivot[length_pivot.columns[1:4]],
                            speaker_role_text_pivot[speaker_role_text_pivot.columns[1]]],
                            axis=1,
                            join='inner')

#Convert feature attributes to ints
verdict['petitioner'] = verdict['petitioner'].astype(int)
verdict['respondent'] = verdict.respondent.apply(lambda x: 0 if x == '' else x)
verdict['respondent'] = verdict['respondent'].astype(int)
verdict['issue'] = verdict.issue.apply(lambda x: 0 if x == '' else x)
verdict['issue'] = verdict['issue'].astype(int)
verdict['issueArea'] = verdict.issueArea.apply(lambda x: 0 if x == '' else x)
verdict['issueArea'] = verdict['issueArea'].astype(int)

#join verdict into df
train_test_df = pivots_concate.join(verdict.set_index('term_docket'), on='transcript_id')

#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df.dropna()
#train_test_df = train_test_df[train_test_df['appellant/petitioner_length'] != 0.0] 

#create train and test split
x = train_test_df[['appellant/petitioner', 'appellee/respondent', 'scotus_justice', 'chief',
                   'appellant/petitioner_count', 'appellee/respondent_count', 'scotus_justice_count',
                   'appellant/petitioner_length', 'appellee/respondent_length','scotus_justice_length',
                   'not_a_justice', 'petitioner', 'respondent', 'issue', 'issueArea']]
y = train_test_df.partyWinning
x_train_orig, x_test_orig, y_train_orig, y_test_orig = train_test_split(x, y, test_size = 0.33)

print "Data re-shaping and combining done!"

Verdict extraction done!
Advocate side extraction done!
Data re-shaping and combining done!


In [133]:
###initial test using count vectorizer bag of words using not_a_justice and scotus_justice

#include only not_a_justice and scotus_justice features
x_train = x_train_orig[['not_a_justice', 'scotus_justice']]
x_test = x_test_orig[['not_a_justice', 'scotus_justice']]

#count vectorizer
count_vect_1 = CountVectorizer()
count_vect_2 = CountVectorizer()
x_train_cv_not_a_justice = count_vect_1.fit(x_train['not_a_justice'])
x_train_cv_scotus_justice = count_vect_2.fit(x_train['scotus_justice'])
x_test_cv_not_a_justice = count_vect_1.fit(x_test['not_a_justice'])
x_test_cv_scotus_justice = count_vect_2.fit(x_test['scotus_justice'])

x_train_cv_not_a_justice = pd.DataFrame(x_train_cv_not_a_justice.transform(x_train['not_a_justice']).todense(),
                                        columns = x_train_cv_not_a_justice.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice.get_feature_names())
x_test_cv_not_a_justice = pd.DataFrame(x_test_cv_not_a_justice.transform(x_test['not_a_justice']).todense(),
                                        columns = x_test_cv_not_a_justice.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.transform(x_test['scotus_justice']).todense(),
                                        columns = x_test_cv_scotus_justice.get_feature_names())

#concatenate the not_a_justice bow and scotus_justice bow
x_train = pd.concat([x_train_cv_not_a_justice, x_train_cv_scotus_justice], axis = 1)
x_test = pd.concat([x_test_cv_not_a_justice, x_test_cv_scotus_justice], axis = 1)

#get y variables
y_train = y_train_orig
y_test = y_test_orig

#random forest
#forest = RandomForestClassifier(n_estimators = 100)
#forest = forest.fit(x_train, y_train)
#random_forest_prediction = forest.predict(x_test)
#rf_accuracy = np.mean(random_forest_prediction == y_test)
#print 'Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4)

# Multinomial Naive Baynes
#nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
#naive_baynes_prediction = nb.predict(x_test)
#nb_accuracy = np.mean(naive_baynes_prediction == y_test)
#print 'Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4)

# logistic regression
lr_model_1 = LogisticRegression(C = 0.2, penalty = "l1")
lr_model_train_1 = lr_model_1.fit(x_train, y_train)
lr_prediction_1 = lr_model_1.predict(x_test)
lr_accuracy_1 = np.mean(lr_prediction_1 == y_test)
print 'Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy_1,4)

#print 'Actual Count of y_test Verdicts:\n', y_test.value_counts()

#print 'LR predictions:\n', np.unique(lr_prediction, return_counts=True)

Accuracy of Bag-of-Words Logistic Regression Model: 0.553


In [134]:
###tfidf vectorizer bag of words using scotus_justice, petitioner, respondent

#get x_train and x_test features
x_train = x_train_orig[['appellant/petitioner', 'appellee/respondent', 'scotus_justice',
                   'appellant/petitioner_count', 'appellee/respondent_count', 'scotus_justice_count',
                   'appellant/petitioner_length', 'appellee/respondent_length','scotus_justice_length']]
x_test = x_test_orig[['appellant/petitioner', 'appellee/respondent', 'scotus_justice',
                   'appellant/petitioner_count', 'appellee/respondent_count', 'scotus_justice_count',
                   'appellant/petitioner_length', 'appellee/respondent_length','scotus_justice_length']]

#count vectorizer
count_vect_1 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words='english', lowercase = False)
count_vect_2 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words='english', lowercase = False)
count_vect_3 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words='english', lowercase = False)

x_train_cv_petitioner_vec = count_vect_1.fit(x_train['appellant/petitioner'])
voc1 = count_vect_1.vocabulary_
x_train_cv_respondant_vec = count_vect_2.fit(x_train['appellee/respondent'])
voc2 = count_vect_2.vocabulary_
x_train_cv_scotus_justice_vec = count_vect_3.fit(x_train['scotus_justice'])

x_test_cv_petitioner = count_vect_1.transform(x_test['appellant/petitioner'])
x_test_cv_respondant = count_vect_2.transform(x_test['appellee/respondent'])
x_test_cv_scotus_justice = count_vect_3.transform(x_test['scotus_justice'])

x_train_cv_petitioner = pd.DataFrame(x_train_cv_petitioner_vec.transform(x_train['appellant/petitioner']).todense(),
                                        columns = x_train_cv_petitioner_vec.get_feature_names())
x_train_cv_respondant = pd.DataFrame(x_train_cv_respondant_vec.transform(x_train['appellee/respondent']).todense(),
                                        columns = x_train_cv_respondant_vec.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice_vec.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice_vec.get_feature_names())
x_test_cv_petitioner = pd.DataFrame(x_test_cv_petitioner.todense(),
                                        columns = x_train_cv_petitioner_vec.get_feature_names())
x_test_cv_respondant = pd.DataFrame(x_test_cv_respondant.todense(),
                                        columns = x_train_cv_respondant_vec.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.todense(),
                                        columns = x_train_cv_scotus_justice_vec.get_feature_names())

#put features into its own df for concatenate in next step
x_train_features = x_train[x_train.columns[3:9]]
x_train_features = x_train_features.reset_index()
x_test_features = x_test[x_train.columns[3:9]]
x_test_features = x_test_features.reset_index()

#change negative numbers into 0
x_train_features[x_train_features < 0 ] = 0
x_test_features[x_test_features < 0 ] = 0

#concatenate the bow back
x_train = pd.concat([x_train_cv_petitioner, 
                     x_train_cv_respondant, 
                     x_train_cv_scotus_justice,
                     x_train_features], 
                     axis = 1)
x_test = pd.concat([x_test_cv_petitioner, 
                    x_test_cv_respondant, 
                    x_test_cv_scotus_justice,
                    x_test_features],
                    axis = 1)

#reset index to match x
y_train = y_train_orig.reset_index()
y_train = y_train.drop(['index'], axis=1)
x_train = x_train.drop(['index'], axis=1)
x_test = x_test.drop(['index'], axis=1)
y_test = y_test_orig

#random forest
#forest = RandomForestClassifier(n_estimators = 1000)
#forest = forest.fit(x_train, y_train)
#random_forest_prediction = forest.predict(x_test)
#rf_accuracy = np.mean(random_forest_prediction == y_test)
#print 'Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4)

#Multinomial Naive Baynes
#nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
#naive_baynes_prediction = nb.predict(x_test)
#nb_accuracy = np.mean(naive_baynes_prediction == y_test)
#print 'Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4)

# logistic regression
lr_model_2 = LogisticRegression(C = 0.2, penalty = "l1")
lr_model_train_2 = lr_model_2.fit(x_train, y_train)
lr_prediction_2 = lr_model_2.predict(x_test)
lr_accuracy_2 = np.mean(lr_prediction_2 == y_test)

print 'Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy_2,4)
#print 'Actual Count of y_test Verdicts:\n', y_test.value_counts()
#print 'RF predictions:', np.unique(random_forest_prediction, return_counts=True)
#print 'NB predictions:', np.unique(naive_baynes_prediction, return_counts=True)
#print 'LR predictions:', np.unique(lr_prediction, return_counts=True)

Accuracy of Bag-of-Words Logistic Regression Model: 0.6667


In [135]:
###tfidf vectorizer bag of words using scotus_justice, petitioner, respondent, and additional features
##features: chief justice indicator, petitioner code, respondent code, issue code, issue area code

#get x_train and x_test features
x_train = x_train_orig[['appellant/petitioner', 'appellee/respondent', 'scotus_justice', 'chief',
                   'appellant/petitioner_count', 'appellee/respondent_count', 'scotus_justice_count',
                   'appellant/petitioner_length', 'appellee/respondent_length','scotus_justice_length',
                   'petitioner', 'respondent', 'issue', 'issueArea']]
x_test = x_test_orig[['appellant/petitioner', 'appellee/respondent', 'scotus_justice', 'chief',
                   'appellant/petitioner_count', 'appellee/respondent_count', 'scotus_justice_count',
                   'appellant/petitioner_length', 'appellee/respondent_length','scotus_justice_length',
                   'petitioner', 'respondent', 'issue', 'issueArea']]

#count vectorizer
stop_words = ["that", "the", "court", "of", "justice", "and", "please", "this", "to"]

count_vect_1 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words=stop_words, lowercase = False)
count_vect_2 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words=stop_words, lowercase = False)
count_vect_3 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words=stop_words, lowercase = False)
count_vect_chief = CountVectorizer()

print "Vectorizing done!"

x_train_cv_petitioner_vec = count_vect_1.fit(x_train['appellant/petitioner'])
voc1 = count_vect_1.vocabulary_
x_train_cv_respondant_vec = count_vect_2.fit(x_train['appellee/respondent'])
voc2 = count_vect_2.vocabulary_
x_train_cv_scotus_justice_vec = count_vect_3.fit(x_train['scotus_justice'])
x_train_chief = count_vect_chief.fit(x_train['chief'])

x_test_cv_petitioner = count_vect_1.transform(x_test['appellant/petitioner'])
x_test_cv_respondant = count_vect_2.transform(x_test['appellee/respondent'])
x_test_cv_scotus_justice = count_vect_3.transform(x_test['scotus_justice'])
x_test_chief = count_vect_chief.fit(x_test['chief'])

x_train_cv_petitioner = pd.DataFrame(x_train_cv_petitioner_vec.transform(x_train['appellant/petitioner']).todense(),
                                        columns = x_train_cv_petitioner_vec.get_feature_names())
x_train_cv_respondant = pd.DataFrame(x_train_cv_respondant_vec.transform(x_train['appellee/respondent']).todense(),
                                        columns = x_train_cv_respondant_vec.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice_vec.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice_vec.get_feature_names())
x_train_cv_chief = pd.DataFrame(x_train_chief.transform(x_train['chief']).todense(),
                               columns = x_train_chief.get_feature_names())

x_test_cv_petitioner = pd.DataFrame(x_test_cv_petitioner.todense(),
                                        columns = x_train_cv_petitioner_vec.get_feature_names())
x_test_cv_respondant = pd.DataFrame(x_test_cv_respondant.todense(),
                                        columns = x_train_cv_respondant_vec.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.todense(),
                                        columns = x_train_cv_scotus_justice_vec.get_feature_names())
x_test_cv_chief = pd.DataFrame(x_test_chief.transform(x_test['chief']).todense(),
                               columns = x_test_chief.get_feature_names())

#put features into its own df for concatenate in next step
x_train_features = x_train[x_train.columns[4:]]
x_train_features = x_train_features.reset_index()
x_test_features = x_test[x_train.columns[4:]]
x_test_features = x_test_features.reset_index()

#rest y train + test indices, drop 'index' columns, and convert to 1-d matrices
y_train = y_train_orig.reset_index()
y_test = y_test_orig.reset_index()
y_train = y_train.drop(['index'], axis = 1)
y_test = y_test.drop(['index'], axis = 1)
y_train = y_train.as_matrix()
y_test = y_test.as_matrix()
y_train = column_or_1d(y_train)
y_test = column_or_1d(y_test)

#change negative numbers into 0
x_train_features[x_train_features < 0 ] = 0
x_test_features[x_test_features < 0 ] = 0

#concatenate the bow back
x_train = pd.concat([x_train_cv_petitioner, 
                     x_train_cv_respondant, 
                     x_train_cv_scotus_justice,
                     x_train_cv_chief,
                     x_train_features], 
                     axis = 1)
x_test = pd.concat([x_test_cv_petitioner, 
                    x_test_cv_respondant, 
                    x_test_cv_scotus_justice,
                    x_test_cv_chief,
                    x_test_features],
                    axis = 1)

# #random forest
# forest = RandomForestClassifier(n_estimators = 1000, n_jobs = -1)
# forest = forest.fit(x_train, y_train)
# random_forest_prediction = forest.predict(x_test)
# rf_accuracy = np.mean(random_forest_prediction == y_test)
# print 'Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4)

# #Multinomial Naive Baynes
# nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
# naive_baynes_prediction = nb.predict(x_test)
# nb_accuracy = np.mean(naive_baynes_prediction == y_test)
# print 'Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4)

# logistic regression
lr_model_3 = LogisticRegression(C = 0.2, penalty = "l1", n_jobs = -1)
lr_model_train_3 = lr_model_3.fit(x_train, y_train)
lr_prediction_3 = lr_model_3.predict(x_test)
lr_accuracy_3 = np.mean(lr_prediction_3 == y_test)

print 'Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy_3,4)
# print 'Actual Count of y_test Verdicts:\n', np.unique(y_test, return_counts=True)
# print 'RF predictions:', np.unique(random_forest_prediction, return_counts=True)
# print 'NB predictions:', np.unique(naive_baynes_prediction, return_counts=True)
# print 'LR predictions:', np.unique(lr_prediction, return_counts=True)

Vectorizing done!
Accuracy of Bag-of-Words Logistic Regression Model: 0.6667


In [136]:
actual_count = np.unique(y_test_orig, return_counts=True)[1]
num = round(actual_count[1],2)
denom  = round(actual_count[0] + actual_count[1],2)
print "Floor is:", round(np.divide(num, denom), 4)

print "LR Model 1 Accuracy:", round(lr_accuracy_1,4)
print "LR Model 2 Accuracy:", round(lr_accuracy_2,4)
print "LR Model 3 Accuracy:", round(lr_accuracy_3,4)

Floor is: 0.6459
LR Model 1 Accuracy: 0.553
LR Model 2 Accuracy: 0.6667
LR Model 3 Accuracy: 0.6667
