In [1]:
import os
import csv
import numpy as np
import pandas as pd
import zipfile
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import column_or_1d
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
#unzip and read turns files into pandas df
zf1 = zipfile.ZipFile('./data_and_etl//turns_part1.zip') 
turns1 = pd.read_csv(zf1.open('turns_part1.csv'), encoding='utf-8')
zf2 = zipfile.ZipFile('./data_and_etl//turns_part2.zip') 
turns2 = pd.read_csv(zf2.open('turns_part2.csv'), encoding='utf-8')
zf3 = zipfile.ZipFile('./data_and_etl//turns_part3.zip') 
turns3 = pd.read_csv(zf3.open('turns_part3.csv'), encoding='utf-8')
zf4 = zipfile.ZipFile('./data_and_etl//turns_part4.zip') 
turns4 = pd.read_csv(zf4.open('turns_part4.csv'), encoding='utf-8')

In [3]:
###Setup Data

#use original scdb winning party as verdict
#verdict value 0 = no favorable disposition for petitioning part apparent
#verdict value 1 = petitioning party received a favorable disposition
verdict = []
verdict_csv = csv.reader(open('./data_and_etl//SCDB_2017_01_caseCentered_Citation.csv'))
for row in verdict_csv:
    docket_number = re.sub('-', '_', row[13])
    docket_number = re.sub(' ORIG', '_orig', docket_number)
    case_id = row[10]+'_'+docket_number
    verdict.append([case_id, row[12], row[17], row[19], row[36], row[39], row[40]])
    
verdict_header = verdict.pop(0)
verdict = pd.DataFrame(verdict, columns = verdict_header)
    
print "Verdict extraction done!"
    
#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#remove _t01 and _t02 from transcript_id in turns_combined
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t01','')
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t02','')

#get advocate sides
advocates = pd.read_json('./advocate_dict.json')

advocate_turns = []

for index, row in turns_combined.iterrows():
    if row['speaker_role'] == 'scotus_justice':
        advocate_turns.append('scotus_justice')
    else:
        speaker = row['speaker']
        transcript_id = row['transcript_id']
        try:
            lawyer_side = advocates.ix[speaker][transcript_id]
            advocate_turns.append(lawyer_side)
        except:
            advocate_turns.append('None')

#insert advocate side to the turns_combined dataframe
turns_combined['lawyer_side'] = advocate_turns


#create speaking length column
turns_combined = turns_combined.assign(speaking_length = lambda x: x.text_stop - x.text_start)
turns_combined.loc[turns_combined['speaking_length'] < 0, 'speaking_length'] = 1

print "Advocate side extraction done!"

#pivot turns files by transcript_id and lawyer_side for texts
lawyer_side_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'lawyer_side', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))
#reset index
lawyer_side_text_pivot = lawyer_side_text_pivot.reset_index()

#drop columns with no lawyer side tags
lawyer_side_text_pivot = lawyer_side_text_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#count of number of times speaker spoke
counts_pivot = pd.pivot_table(turns_combined[['transcript_id', 'lawyer_side']],
                              index = 'transcript_id',
                              columns = 'lawyer_side',
                              aggfunc=len,
                              fill_value=0)

#reset index
counts_pivot = counts_pivot.reset_index()

#drop columns with no lawyer side tags
counts_pivot = counts_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#rename headers
counts_pivot.columns = ['transcript_id', 'appellant/petitioner_count', 'appellee/respondent_count',
                       'scotus_justice_count']

#length of speaker speaking
length_pivot = pd.pivot_table(turns_combined[['transcript_id', 'speaking_length', 'lawyer_side']],
                              index = 'transcript_id',
                              columns = 'lawyer_side',
                              values = 'speaking_length',
                              aggfunc=np.sum,
                              fill_value=0)

#reset index
length_pivot = length_pivot.reset_index()

#drop columns with no lawyer side tags
length_pivot = length_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#rename headers
length_pivot.columns = ['transcript_id', 'appellant/petitioner_length', 'appellee/respondent_length',
                       'scotus_justice_length']

#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'speaker_role', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))

#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#concatenate pivots together
pivots_concate = pd.concat([lawyer_side_text_pivot,
                            counts_pivot[counts_pivot.columns[1:4]], 
                            length_pivot[length_pivot.columns[1:4]],
                            speaker_role_text_pivot[speaker_role_text_pivot.columns[1]]],
                            axis=1,
                            join='inner')

#Convert feature attributes to ints
verdict['petitioner'] = verdict['petitioner'].astype(int)
verdict['respondent'] = verdict.respondent.apply(lambda x: 0 if x == '' else x)
verdict['respondent'] = verdict['respondent'].astype(int)
verdict['issue'] = verdict.issue.apply(lambda x: 0 if x == '' else x)
verdict['issue'] = verdict['issue'].astype(int)
verdict['issueArea'] = verdict.issueArea.apply(lambda x: 0 if x == '' else x)
verdict['issueArea'] = verdict['issueArea'].astype(int)

#join verdict into df
train_test_df = pivots_concate.join(verdict.set_index('term_docket'), on='transcript_id')

#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df.dropna()
#train_test_df = train_test_df[train_test_df['appellant/petitioner_length'] != 0.0] 

#create train and test split
x_orig = train_test_df[['appellant/petitioner', 'appellee/respondent', 'scotus_justice', 'chief',
                   'appellant/petitioner_count', 'appellee/respondent_count', 'scotus_justice_count',
                   'appellant/petitioner_length', 'appellee/respondent_length','scotus_justice_length',
                   'not_a_justice', 'petitioner', 'respondent', 'issue', 'issueArea']]
y_orig = train_test_df.partyWinning

print "Data re-shaping and combining done!"

Verdict extraction done!
Advocate side extraction done!
Data re-shaping and combining done!


In [4]:
###tfidf vectorizer bag of words using scotus_justice, petitioner, respondent, and additional features
##features: chief justice indicator, petitioner code, respondent code, issue code, issue area code

#get x_train and x_test features
x_train = x_orig[['appellant/petitioner', 'appellee/respondent', 'scotus_justice', 'chief',
                   'appellant/petitioner_count', 'appellee/respondent_count', 'scotus_justice_count',
                   'appellant/petitioner_length', 'appellee/respondent_length','scotus_justice_length',
                   'petitioner', 'respondent', 'issue', 'issueArea']]

#count vectorizer
stop_words = ["that", "the", "court", "of", "justice", "and", "please", "this", "to"]

count_vect_1 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words=stop_words, lowercase = False)
count_vect_2 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words=stop_words, lowercase = False)
count_vect_3 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words=stop_words, lowercase = False)
count_vect_chief = CountVectorizer()

print "Vectorizing done!"

x_train_cv_petitioner_vec = count_vect_1.fit(x_train['appellant/petitioner'])
voc1 = count_vect_1.vocabulary_
x_train_cv_respondent_vec = count_vect_2.fit(x_train['appellee/respondent'])
voc2 = count_vect_2.vocabulary_
x_train_cv_scotus_justice_vec = count_vect_3.fit(x_train['scotus_justice'])
x_train_chief = count_vect_chief.fit(x_train['chief'])

x_train_cv_petitioner = pd.DataFrame(x_train_cv_petitioner_vec.transform(x_train['appellant/petitioner']).todense(),
                                        columns = x_train_cv_petitioner_vec.get_feature_names())
x_train_cv_respondent = pd.DataFrame(x_train_cv_respondent_vec.transform(x_train['appellee/respondent']).todense(),
                                        columns = x_train_cv_respondent_vec.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice_vec.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice_vec.get_feature_names())
x_train_cv_chief = pd.DataFrame(x_train_chief.transform(x_train['chief']).todense(),
                               columns = x_train_chief.get_feature_names())

#put features into its own df for concatenate in next step
x_train_features = x_train[x_train.columns[4:]]
x_train_features = x_train_features.reset_index()

#rest y train + test indices, drop 'index' columns, and convert to 1-d matrices
y = y_orig.reset_index()
y = y.drop(['index'], axis = 1)
y = y.as_matrix()
y = column_or_1d(y)

#change negative numbers into 0
x_train_features[x_train_features < 0 ] = 0

#concatenate the bow back
x = pd.concat([x_train_cv_petitioner, 
                     x_train_cv_respondent, 
                     x_train_cv_scotus_justice,
                     x_train_cv_chief,
                     x_train_features], 
                     axis = 1)


# #random forest
#forest_3 = RandomForestClassifier(n_estimators = 1000, n_jobs = -1)
#rf_model_3_scores = cross_val_score(forest_3, x, y, cv=20)

# #Multinomial Naive Baynes
#nb_3 = MultinomialNB(alpha = 0.1)
#nb_model_3_scores = cross_val_score(nb_3, x, y, cv=20)

# logistic regression
lr_model_3 = LogisticRegression(C = 0.82, penalty = "l1", n_jobs = -1)
lr_model_train_3 = lr_model_3.fit(x, y)
lr_model_3_scores = cross_val_score(lr_model_3, x, y, cv=20)

#print "Accuracy of RF Model 3: %0.4f (+/- %0.4f)" % (rf_model_3_scores.mean(), rf_model_3_scores.std() * 2)
#print "Accuracy of NB Model 3: %0.4f (+/- %0.4f)" % (nb_model_3_scores.mean(), nb_model_3_scores.std() * 2)
print "Accuracy of LR Model 3: %0.4f (+/- %0.4f)" % (lr_model_3_scores.mean(), lr_model_3_scores.std() * 2)
actual_count = np.unique(y, return_counts=True)[1]
num = round(actual_count[1],2)
denom  = round(actual_count[0] + actual_count[1],2)
print "Floor is:", round(np.divide(num, denom), 4)

Vectorizing done!
Accuracy of LR Model 3: 0.6581 (+/- 0.0689)
Floor is: 0.6318


In [5]:
###tfidf vectorizer bag of words using scotus_justice, petitioner, respondent

#get x_train and x_test features
x_train = x_orig[['appellant/petitioner', 'appellee/respondent', 'scotus_justice',
                   'appellant/petitioner_count', 'appellee/respondent_count', 'scotus_justice_count',
                   'appellant/petitioner_length', 'appellee/respondent_length','scotus_justice_length']]

#count vectorizer
count_vect_1 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words='english', lowercase = False)
count_vect_2 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words='english', lowercase = False)
count_vect_3 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words='english', lowercase = False)

x_train_cv_petitioner_vec = count_vect_1.fit(x_train['appellant/petitioner'])
voc1 = count_vect_1.vocabulary_
x_train_cv_respondent_vec = count_vect_2.fit(x_train['appellee/respondent'])
voc2 = count_vect_2.vocabulary_
x_train_cv_scotus_justice_vec = count_vect_3.fit(x_train['scotus_justice'])

x_train_cv_petitioner = pd.DataFrame(x_train_cv_petitioner_vec.transform(x_train['appellant/petitioner']).todense(),
                                        columns = x_train_cv_petitioner_vec.get_feature_names())
x_train_cv_respondent = pd.DataFrame(x_train_cv_respondent_vec.transform(x_train['appellee/respondent']).todense(),
                                        columns = x_train_cv_respondent_vec.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice_vec.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice_vec.get_feature_names())

#put features into its own df for concatenate in next step
x_train_features = x_train[x_train.columns[3:9]]
x_train_features = x_train_features.reset_index()

#change negative numbers into 0
x_train_features[x_train_features < 0 ] = 0

#concatenate the bow back
x = pd.concat([x_train_cv_petitioner, 
                     x_train_cv_respondent, 
                     x_train_cv_scotus_justice,
                     x_train_features], 
                     axis = 1)

#rest y train + test indices, drop 'index' columns, and convert to 1-d matrices
y = y_orig.reset_index()
y = y.drop(['index'], axis = 1)
y = y.as_matrix()
y = column_or_1d(y)

#random forest
#forest_2 = RandomForestClassifier(n_estimators = 1000)
#rf_model_2_scores = cross_val_score(forest_2, x, y, cv=20)

#Multinomial Naive Baynes
#nb_2 = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
#nb_model_2_scores = cross_val_score(nb_2, x, y, cv=20)

# logistic regression
lr_model_2 = LogisticRegression(C = 0.57, penalty = "l1", n_jobs = -1)
lr_model_2_scores = cross_val_score(lr_model_2, x, y, cv=20)

#print "Accuracy of RF Model 2: %0.4f (+/- %0.4f)" % (rf_model_2_scores.mean(), rf_model_2_scores.std() * 2)
#print "Accuracy of NB Model 2: %0.4f (+/- %0.4f)" % (nb_model_2_scores.mean(), nb_model_2_scores.std() * 2)
print "Accuracy of LR Model 2: %0.4f (+/- %0.4f)" % (lr_model_2_scores.mean(), lr_model_2_scores.std() * 2)
actual_count = np.unique(y, return_counts=True)[1]
num = round(actual_count[1],2)
denom  = round(actual_count[0] + actual_count[1],2)
print "Floor is:", round(np.divide(num, denom), 4)

Accuracy of LR Model 2: 0.6574 (+/- 0.0690)
Floor is: 0.6318


In [6]:
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
# nltk.download('popular')

model = Doc2Vec.load('./data_and_etl//D2V_model_3.model') 

In [7]:
###
##features: chief justice indicator, petitioner code, respondent code, issue code, issue area code

#get x_train and x_test features
x_train = x_orig[['appellant/petitioner', 'appellee/respondent', 'scotus_justice',
                   'appellant/petitioner_count', 'appellee/respondent_count', 'scotus_justice_count',
                   'appellant/petitioner_length', 'appellee/respondent_length','scotus_justice_length']]

x_train = x_train.reset_index()
x_train = x_train.drop(['index'], axis = 1)



#Embeddings
# tokenize function from http://nlpforhackers.io/tf-idf/


stop_words = stopwords.words('english') + list(punctuation) 
 
def tokenize(text):
    words = word_tokenize(text)
    words = [w.lower() for w in words]
    return [w for w in words if w not in stop_words]

print "tokenizing start"

x_train_cv_petitioner_tkn =  x_train['appellant/petitioner'].apply(lambda x: tokenize(x))
x_train_cv_respondent_tkn  = x_train['appellee/respondent'].apply(lambda x: tokenize(x))
x_train_cv_scotus_justice_tkn  = x_train['scotus_justice'].apply(lambda x: tokenize(x))

# x_test_cv_petitioner_tkn = x_test['appellant/petitioner'].apply(lambda x: tokenize(x))
# x_test_cv_respondent_tkn = x_test['appellee/respondent'].apply(lambda x: tokenize(x))
# x_test_cv_scotus_justice_tkn = x_test['scotus_justice'].apply(lambda x: tokenize(x))

print "tokenizing complete, infer start"

x_train_cv_petitioner  = x_train_cv_petitioner_tkn.apply(lambda x: pd.Series(model.infer_vector(x)))
x_train_cv_respondent  = x_train_cv_respondent_tkn.apply(lambda x: pd.Series(model.infer_vector(x)))
x_train_cv_scotus_justice  = x_train_cv_scotus_justice_tkn.apply(lambda x: pd.Series(model.infer_vector(x)))

# x_test_cv_petitioner = x_test_cv_petitioner_tkn.apply(lambda x: pd.Series(model.infer_vector(x)))
# x_test_cv_respondent = x_test_cv_respondent_tkn.apply(lambda x: pd.Series(model.infer_vector(x)))
# x_test_cv_scotus_justice = x_test_cv_scotus_justice_tkn.apply(lambda x: pd.Series(model.infer_vector(x)))



print "infer vectors done, data manipulation start"

#put features into its own df for concatenate in next step
x_train_features = x_train[x_train.columns[3:9]]
x_train_features = x_train_features.reset_index()

#change negative numbers into 0
x_train_features[x_train_features < 0 ] = 0

#concatenate the bow back
x = pd.concat([x_train_cv_petitioner, 
                     x_train_cv_respondent, 
                     x_train_cv_scotus_justice,
                     x_train_features], 
                     axis = 1)

#rest y train + test indices, drop 'index' columns, and convert to 1-d matrices
y = y_orig.reset_index()
y = y.drop(['index'], axis = 1)
y = y.as_matrix()
y = column_or_1d(y)

#random forest
#forest_2 = RandomForestClassifier(n_estimators = 1000)
#rf_model_2_scores = cross_val_score(forest_2, x, y, cv=20)

#Multinomial Naive Baynes
#nb_2 = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
#nb_model_2_scores = cross_val_score(nb_2, x, y, cv=20)

# logistic regression
lr_model_2 = LogisticRegression(C = 0.2, penalty = "l1", n_jobs = -1)
lr_model_2_scores = cross_val_score(lr_model_2, x, y, cv=20)

#print "Accuracy of RF Model 2: %0.4f (+/- %0.4f)" % (rf_model_2_scores.mean(), rf_model_2_scores.std() * 2)
#print "Accuracy of NB Model 2: %0.4f (+/- %0.4f)" % (nb_model_2_scores.mean(), nb_model_2_scores.std() * 2)
print "Accuracy of LR Model 2: %0.4f (+/- %0.4f)" % (lr_model_2_scores.mean(), lr_model_2_scores.std() * 2)
actual_count = np.unique(y, return_counts=True)[1]
num = round(actual_count[1],2)
denom  = round(actual_count[0] + actual_count[1],2)
print "Floor is:", round(np.divide(num, denom), 4)




tokenizing start
tokenizing complete, infer start
infer vectors done, data manipulation start
Accuracy of LR Model 2: 0.6509 (+/- 0.0499)
Floor is: 0.6318
