In [255]:
import os
import csv
import numpy as np
import pandas as pd
import zipfile
import re
import time
from IPython.display import HTML
from IPython.display import display
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import column_or_1d
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore") 

In [2]:
#unzip and read turns files into pandas df
zf1 = zipfile.ZipFile('./turns_part1.zip') 
turns1 = pd.read_csv(zf1.open('turns_part1.csv'), encoding='utf-8')
zf2 = zipfile.ZipFile('./turns_part2.zip') 
turns2 = pd.read_csv(zf2.open('turns_part2.csv'), encoding='utf-8')
zf3 = zipfile.ZipFile('./turns_part3.zip') 
turns3 = pd.read_csv(zf3.open('turns_part3.csv'), encoding='utf-8')
zf4 = zipfile.ZipFile('./turns_part4.zip') 
turns4 = pd.read_csv(zf4.open('turns_part4.csv'), encoding='utf-8')

In [270]:
###Setup Data

#use original scdb winning party as verdict
#verdict value 0 = no favorable disposition for petitioning part apparent
#verdict value 1 = petitioning party received a favorable disposition

###Python 3
verdict = []
verdict_csv = []

with open('../SCDB_2017_01_caseCentered_Citation.csv', encoding = 'cp1252') as f:
    reader = csv.reader(f)
    for row in reader:
        verdict_csv.append(row)
###Python 3

###Python 2
# verdict = []
# verdict_csv = csv.reader(open('../SCDB_2017_01_caseCentered_Citation.csv'))
###Python 2
for row in verdict_csv:
    docket_number = re.sub('-', '_', row[13])
    docket_number = re.sub(' ORIG', '_orig', docket_number)
    case_id = row[10]+'_'+docket_number
    verdict.append([case_id, row[12], row[17], row[19], row[36], row[39], row[40]])

    
verdict_header = verdict.pop(0)
verdict = pd.DataFrame(verdict, columns = verdict_header)
    
print("Verdict extraction done!")
    
#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#remove _t01 and _t02 from transcript_id in turns_combined
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t01','')
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t02','')

#get advocate sides
advocates = pd.read_json('./advocate_dict.json')

advocate_turns = []

for index, row in turns_combined.iterrows():
    if row['speaker_role'] == 'scotus_justice':
        advocate_turns.append('scotus_justice')
    else:
        speaker = row['speaker']
        transcript_id = row['transcript_id']
        try:
            lawyer_side = advocates.ix[speaker][transcript_id]
            advocate_turns.append(lawyer_side)
        except:
            advocate_turns.append('None')

#insert advocate side to the turns_combined dataframe
turns_combined['lawyer_side'] = advocate_turns


#create speaking length column
turns_combined = turns_combined.assign(speaking_length = lambda x: x.text_stop - x.text_start)
turns_combined.loc[turns_combined['speaking_length'] < 0, 'speaking_length'] = 1

print("Advocate side extraction done!")

#pivot turns files by transcript_id and lawyer_side for texts
lawyer_side_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'lawyer_side', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))
#reset index
lawyer_side_text_pivot = lawyer_side_text_pivot.reset_index()

#drop columns with no lawyer side tags
lawyer_side_text_pivot = lawyer_side_text_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#count of number of times speaker spoke
counts_pivot = pd.pivot_table(turns_combined[['transcript_id', 'lawyer_side']],
                              index = 'transcript_id',
                              columns = 'lawyer_side',
                              aggfunc=len,
                              fill_value=0)

#reset index
counts_pivot = counts_pivot.reset_index()

#drop columns with no lawyer side tags
counts_pivot = counts_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#rename headers
counts_pivot.columns = ['transcript_id', 'appellant/petitioner_count', 'appellee/respondent_count',
                       'scotus_justice_count']

#length of speaker speaking
length_pivot = pd.pivot_table(turns_combined[['transcript_id', 'speaking_length', 'lawyer_side']],
                              index = 'transcript_id',
                              columns = 'lawyer_side',
                              values = 'speaking_length',
                              aggfunc=np.sum,
                              fill_value=0)

#reset index
length_pivot = length_pivot.reset_index()

#drop columns with no lawyer side tags
length_pivot = length_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#rename headers
length_pivot.columns = ['transcript_id', 'appellant/petitioner_length', 'appellee/respondent_length',
                       'scotus_justice_length']

#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'speaker_role', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))

#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#concatenate pivots together
pivots_concate = pd.concat([lawyer_side_text_pivot,
                            counts_pivot[counts_pivot.columns[1:4]], 
                            length_pivot[length_pivot.columns[1:4]],
                            speaker_role_text_pivot[speaker_role_text_pivot.columns[1]]],
                            axis=1,
                            join='inner')

#Convert feature attributes to ints
verdict['petitioner'] = verdict['petitioner'].astype(int)
verdict['respondent'] = verdict.respondent.apply(lambda x: 0 if x == '' else x)
verdict['respondent'] = verdict['respondent'].astype(int)
verdict['issue'] = verdict.issue.apply(lambda x: 0 if x == '' else x)
verdict['issue'] = verdict['issue'].astype(int)
verdict['issueArea'] = verdict.issueArea.apply(lambda x: 0 if x == '' else x)
verdict['issueArea'] = verdict['issueArea'].astype(int)

#join verdict into df
train_test_df = pivots_concate.join(verdict.set_index('term_docket'), on='transcript_id')

#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df.dropna()
#train_test_df = train_test_df[train_test_df['appellant/petitioner_length'] != 0.0] 

#create train and test split
x_orig = train_test_df[['appellant/petitioner', 'appellee/respondent', 'scotus_justice', 'chief',
                   'appellant/petitioner_count', 'appellee/respondent_count', 'scotus_justice_count',
                   'appellant/petitioner_length', 'appellee/respondent_length','scotus_justice_length',
                   'not_a_justice', 'petitioner', 'respondent', 'issue', 'issueArea']]
y_orig = train_test_df.partyWinning

print("Data re-shaping and combining done!")

Verdict extraction done!
Advocate side extraction done!
Data re-shaping and combining done!


In [271]:
###tfidf vectorizer bag of words using scotus_justice, petitioner, respondent, and additional features
##features: chief justice indicator, petitioner code, respondent code, issue code, issue area code

start_time = time.time()

#get x_train and x_test features
x_train = x_orig[['appellant/petitioner', 'appellee/respondent', 'scotus_justice', 'chief',
                   'appellant/petitioner_count', 'appellee/respondent_count', 'scotus_justice_count',
                   'appellant/petitioner_length', 'appellee/respondent_length','scotus_justice_length',
                   'petitioner', 'respondent', 'issue', 'issueArea']]

#count vectorizer
stop_words = ["that", "the", "court", "of", "justice", "and", "please", "this", "to"]

count_vect_1 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words=stop_words, lowercase = False)
count_vect_2 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words=stop_words, lowercase = False)
count_vect_3 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words=stop_words, lowercase = False)
count_vect_chief = CountVectorizer()

print("Vectorizing done!")

x_train_cv_petitioner_vec = count_vect_1.fit(x_train['appellant/petitioner'])
voc1 = count_vect_1.vocabulary_
x_train_cv_respondant_vec = count_vect_2.fit(x_train['appellee/respondent'])
voc2 = count_vect_2.vocabulary_
x_train_cv_scotus_justice_vec = count_vect_3.fit(x_train['scotus_justice'])
x_train_chief = count_vect_chief.fit(x_train['chief'])

x_train_cv_petitioner = pd.DataFrame(x_train_cv_petitioner_vec.transform(x_train['appellant/petitioner']).todense(),
                                        columns = x_train_cv_petitioner_vec.get_feature_names())
x_train_cv_respondant = pd.DataFrame(x_train_cv_respondant_vec.transform(x_train['appellee/respondent']).todense(),
                                        columns = x_train_cv_respondant_vec.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice_vec.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice_vec.get_feature_names())
x_train_cv_chief = pd.DataFrame(x_train_chief.transform(x_train['chief']).todense(),
                               columns = x_train_chief.get_feature_names())

#put features into its own df for concatenate in next step
x_train_features = x_train[x_train.columns[4:]]
x_train_features = x_train_features.reset_index()

#rest y train + test indices, drop 'index' columns, and convert to 1-d matrices
y = y_orig.reset_index()
y = y.drop(['index'], axis = 1)
y = y.as_matrix()
y = column_or_1d(y)

#change negative numbers into 0
x_train_features[x_train_features < 0 ] = 0

#concatenate the bow back
x = pd.concat([x_train_cv_petitioner, 
                     x_train_cv_respondant, 
                     x_train_cv_scotus_justice,
                     x_train_cv_chief,
                     x_train_features], 
                     axis = 1)


# #random forest
forest_3 = RandomForestClassifier(n_estimators = 1000, n_jobs = -1)
rf_model_3_scores = cross_val_score(forest_3, x, y, cv=20)

# #Multinomial Naive Baynes
nb_3 = MultinomialNB(alpha = 0.1)
nb_model_3_scores = cross_val_score(nb_3, x, y, cv=20)

# logistic regression
lr_model_3 = LogisticRegression(C = 0.2, penalty = "l1", n_jobs = -1)
lr_model_train_3 = lr_model_3.fit(x, y)
lr_model_3_scores = cross_val_score(lr_model_3, x, y, cv=20)

end_time = time.time()

print("Accuracy of RF Model 3: %0.4f (+/- %0.4f)" % (rf_model_3_scores.mean(), rf_model_3_scores.std() * 2))
print("Accuracy of NB Model 3: %0.4f (+/- %0.4f)" % (nb_model_3_scores.mean(), nb_model_3_scores.std() * 2))
print("Accuracy of LR Model 3: %0.4f (+/- %0.4f)" % (lr_model_3_scores.mean(), lr_model_3_scores.std() * 2))
actual_count = np.unique(y, return_counts=True)[1]
num = round(actual_count[1],2)
denom  = round(actual_count[0] + actual_count[1],2)
print("Floor is:", round(np.divide(num, denom), 4))
print("Time taken:", round((end_time - start_time) / 60, 1), " minutes")

Vectorizing done!
Accuracy of RF Model 3: 0.6505 (+/- 0.0299)
Accuracy of NB Model 3: 0.5296 (+/- 0.0718)
Accuracy of LR Model 3: 0.6462 (+/- 0.0623)
Floor is: 0.6318
Time taken: 16.9  minutes


## Token Analysis

In [272]:
#Convert Labels Back to DataFrame, add Label Name
y = pd.DataFrame(y)
y.columns = ['partyWinning']
xycombined = pd.concat([x, y], axis = 1)

In [273]:
xycombined.head()

Unnamed: 0,Absolutely Your Honor,Act does not,Act which is,Administrative Procedure Act,And Court has,And Your Honor,And as Court,And as we,And don think,And if it,...,appellee/respondent_count,scotus_justice_count,appellant/petitioner_length,appellee/respondent_length,scotus_justice_length,petitioner,respondent,issue,issueArea,partyWinning
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,154,221,2242.485,1876.84,2448.269,8.0,21.0,30110.0,3.0,1
1,0.0,0.0,0.0,0.0,0.0,0.203908,0.0,0.0,0.0,0.0,...,97,90,298.389,2098.089,672.408,126.0,28.0,90180.0,9.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,105,100,3304.774,2110.115,793.406,145.0,116.0,80070.0,8.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,27,42,1193.185,1896.673,458.936,382.0,151.0,70070.0,7.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39,132,1660.955,677.257,1151.771,27.0,100.0,80130.0,8.0,1


In [274]:
#Split Datasets into Petitioner Wins + Respondent Wins
petitioner_wins = xycombined[xycombined['partyWinning'] == '1']
respondent_wins = xycombined[xycombined['partyWinning'] == '0']

#Remove columns after the final n-gram
petitioner_wins = petitioner_wins.iloc[:, :7500]
respondent_wins = respondent_wins.iloc[:, :7500]

#Add unique occurrences of n-grams
petitioner_wins = petitioner_wins.append(petitioner_wins.apply(np.count_nonzero, axis = 0), ignore_index = True)
respondent_wins = respondent_wins.append(respondent_wins.apply(np.count_nonzero, axis = 0), ignore_index = True)

#Most common n-grams when petitioner/respondent wins
petitioner_wins_petitioner_most_common = petitioner_wins.iloc[-1][:2500].nlargest(100)
petitioner_wins_respondent_most_common = petitioner_wins.iloc[-1][2500:5000].nlargest(100)
petitioner_wins_scotus_most_common = petitioner_wins.iloc[-1][5000:].nlargest(100)
petitioner_wins_most_common = pd.DataFrame({"Petitioner Most Common":petitioner_wins_petitioner_most_common.index,
                                           "Respondent Most Common":petitioner_wins_respondent_most_common.index,
                                           "Scotus Most Common":petitioner_wins_scotus_most_common.index})

respondent_wins_petitioner_most_common = respondent_wins.iloc[-1][:2500].nlargest(100)
respondent_wins_respondent_most_common = respondent_wins.iloc[-1][2500:5000].nlargest(100)
respondent_wins_scotus_most_common = respondent_wins.iloc[-1][5000:].nlargest(100)
respondent_wins_most_common = pd.DataFrame({"Petitioner Most Common":respondent_wins_petitioner_most_common.index,
                                           "Respondent Most Common":respondent_wins_respondent_most_common.index,
                                           "Scotus Most Common":respondent_wins_scotus_most_common.index})

petitioner_wins_matrix = petitioner_wins_most_common.head(10).as_matrix().reshape([1,30])[0]
petitioner_counts = Counter(petitioner_wins_matrix)

respondent_wins_matrix = respondent_wins_most_common.head(10).as_matrix().reshape([1,30])[0]
respondent_counts = Counter(respondent_wins_matrix)

def highlight_unique(val, counts):
    """Highlights values that only appear once in the DataFrame"""
    if counts[val] > 1:
        color = None
    else:
        color = 'yellow'
    return 'background-color: %s' % color

print("10 most common tokens when petitioner wins:")
display(petitioner_wins_most_common.head(10).style.applymap(highlight_unique, counts=petitioner_counts,
                                                            subset=["Petitioner Most Common", "Respondent Most Common"]))

print(" ")
print("-" * 100)
print(" ")

print("10 most common tokens when respondent wins:")
display(respondent_wins_most_common.head(10).style.applymap(highlight_unique, counts=respondent_counts,
                                                           subset=["Petitioner Most Common", "Respondent Most Common"]))


10 most common tokens when petitioner wins:


Unnamed: 0,Petitioner Most Common,Respondent Most Common,Scotus Most Common
0,Mr Chief Justice,Mr Chief Justice,case is submitted
1,may it Court,may it Court,The case is
2,Chief Justice may,Chief Justice may,We ll hear
3,Justice may it,Justice may it,Thank you Mr
4,there is no,there is no,it seems me
5,it would be,it would be,do you think
6,you Mr Chief,whether or not,it would be
7,Thank you Mr,there was no,ll hear argument
8,whether or not,it is not,you re saying
9,there was no,would have been,Do you think


 
----------------------------------------------------------------------------------------------------
 
 10 most common tokens when respondent wins:


Unnamed: 0,Petitioner Most Common,Respondent Most Common,Scotus Most Common
0,Mr Chief Justice,Mr Chief Justice,case is submitted
1,may it Court,may it Court,The case is
2,Chief Justice may,Chief Justice may,Thank you Mr
3,Justice may it,Justice may it,We ll hear
4,there is no,there is no,it seems me
5,it would be,it would be,do you think
6,it is not,it is not,Do you think
7,whether or not,whether or not,ll hear argument
8,there was no,there was no,you may proceed
9,you Mr Chief,in our brief,you re saying


In [275]:
#Most common n-grams when petitioner/respondent wins
petitioner_most_common = pd.DataFrame({"Petitioner Most Common When Petitioner Wins":
                                       petitioner_wins_petitioner_most_common.index,
                                       "Petitioner Most Common When Respondent Wins":
                                       respondent_wins_petitioner_most_common.index})

respondent_most_common = pd.DataFrame({"Respondent Most Common When Petitioner Wins":
                                       petitioner_wins_respondent_most_common.index,
                                       "Respondent Most Common When Respondent Wins":
                                       respondent_wins_respondent_most_common.index})

scotus_most_common = pd.DataFrame({"SCOTUS Most Common When Petitioner Wins":
                                       petitioner_wins_scotus_most_common.index,
                                       "SCOTUS Most Common When Respondent Wins":
                                       respondent_wins_scotus_most_common.index})

petitioner_most_common_matrix = petitioner_most_common.head(10).as_matrix().reshape([1,20])[0]
petitioner_only_counts = Counter(petitioner_most_common_matrix)

respondent_most_common_matrix = respondent_most_common.head(10).as_matrix().reshape([1,20])[0]
respondent_only_counts = Counter(respondent_most_common_matrix)

scotus_most_common_matrix = scotus_most_common.head(10).as_matrix().reshape([1,20])[0]
scotus_only_counts = Counter(scotus_most_common_matrix)

print("10 most common tokens among petitioners:")
display(petitioner_most_common.head(10).style.applymap(highlight_unique, counts=petitioner_only_counts))

print(" ")
print("-" * 100)
print(" ")

print("10 most common tokens among respondents:")
display(respondent_most_common.head(10).style.applymap(highlight_unique, counts=respondent_only_counts))

print(" ")
print("-" * 100)
print(" ")

print("10 most common tokens among SCOTUS:")
display(scotus_most_common.head(10).style.applymap(highlight_unique, counts=scotus_only_counts))

10 most common tokens among petitioners:


Unnamed: 0,Petitioner Most Common When Petitioner Wins,Petitioner Most Common When Respondent Wins
0,Mr Chief Justice,Mr Chief Justice
1,may it Court,may it Court
2,Chief Justice may,Chief Justice may
3,Justice may it,Justice may it
4,there is no,there is no
5,it would be,it would be
6,you Mr Chief,it is not
7,Thank you Mr,whether or not
8,whether or not,there was no
9,there was no,you Mr Chief


 
----------------------------------------------------------------------------------------------------
 
10 most common tokens among respondents:


Unnamed: 0,Respondent Most Common When Petitioner Wins,Respondent Most Common When Respondent Wins
0,Mr Chief Justice,Mr Chief Justice
1,may it Court,may it Court
2,Chief Justice may,Chief Justice may
3,Justice may it,Justice may it
4,there is no,there is no
5,it would be,it would be
6,whether or not,it is not
7,there was no,whether or not
8,it is not,there was no
9,would have been,in our brief


 
----------------------------------------------------------------------------------------------------
 
10 most common tokens among SCOTUS:


Unnamed: 0,SCOTUS Most Common When Petitioner Wins,SCOTUS Most Common When Respondent Wins
0,case is submitted,case is submitted
1,The case is,The case is
2,We ll hear,Thank you Mr
3,Thank you Mr,We ll hear
4,it seems me,it seems me
5,do you think,do you think
6,it would be,Do you think
7,ll hear argument,ll hear argument
8,you re saying,you may proceed
9,Do you think,you re saying


In [276]:
#Find overlap between most common respondent n_grams
petitioner_stop_n_grams = []
for n_gram_1 in petitioner_wins_petitioner_most_common.index:
    for n_gram_2 in respondent_wins_petitioner_most_common.index:
        if n_gram_2 == n_gram_1:
            petitioner_stop_n_grams.append(n_gram_2)

respondent_stop_n_grams = []
for n_gram_1 in petitioner_wins_respondent_most_common.index:
    for n_gram_2 in respondent_wins_respondent_most_common.index:
        if n_gram_2 == n_gram_1:
            respondent_stop_n_grams.append(n_gram_2)

scotus_stop_n_grams = []
for n_gram_1 in petitioner_wins_scotus_most_common.index:
    for n_gram_2 in respondent_wins_scotus_most_common.index:
        if n_gram_2 == n_gram_1:
            scotus_stop_n_grams.append(n_gram_2)

In [277]:
#Update x by dropping 100 most common n-grams among petitioners, respondents, scotuses
petitioner_vocab = x.iloc[:, :2500]
for n_gram in petitioner_stop_n_grams:
    del petitioner_vocab[n_gram]
    
respondent_vocab = x.iloc[:, 2500:5000]
for n_gram in respondent_stop_n_grams:
    del respondent_vocab[n_gram]

scotus_vocab = x.iloc[:, 5000:7500]
for n_gram in scotus_stop_n_grams:
    del scotus_vocab[n_gram]

x = pd.concat([petitioner_vocab,
               respondent_vocab,
               scotus_vocab,
               x_train_cv_chief,
               x_train_features],
               axis = 1)

## Re-Train Models After Removing Overlapping N-Grams

In [299]:
###tfidf vectorizer bag of words using scotus_justice, petitioner, respondent, and additional features
###Bag of words trimmed based on overlap
##features: chief justice indicator, petitioner code, respondent code, issue code, issue area code

start_time = time.time()

#rest y train + test indices, drop 'index' columns, and convert to 1-d matrices
y = y_orig.reset_index()
y = y.drop(['index'], axis = 1)
y = y.as_matrix()
y = column_or_1d(y)

# #random forest
forest_3 = RandomForestClassifier(n_estimators = 1000, n_jobs = -1)
rf_model_3_scores = cross_val_score(forest_3, x, y, cv=20)

# #Multinomial Naive Baynes
nb_3 = MultinomialNB(alpha = 0.1)
nb_model_3_scores = cross_val_score(nb_3, x, y, cv=20)

# logistic regression
lr_model_3 = LogisticRegression(C = 0.2, penalty = "l1", n_jobs = -1)
lr_model_train_3 = lr_model_3.fit(x, y)
lr_model_3_scores = cross_val_score(lr_model_3, x, y, cv=20)

end_time = time.time()

print("Accuracy of RF Model 3: %0.4f (+/- %0.4f)" % (rf_model_3_scores.mean(), rf_model_3_scores.std() * 2))
print("Accuracy of NB Model 3: %0.4f (+/- %0.4f)" % (nb_model_3_scores.mean(), nb_model_3_scores.std() * 2))
print("Accuracy of LR Model 3: %0.4f (+/- %0.4f)" % (lr_model_3_scores.mean(), lr_model_3_scores.std() * 2))
actual_count = np.unique(y, return_counts=True)[1]
num = round(actual_count[1],2)
denom  = round(actual_count[0] + actual_count[1],2)
print("Floor is:", round(np.divide(num, denom), 4))

print("Time taken:", round((end_time - start_time) / 60, 1), " minutes")

In [300]:
#Amount by which to decrease attributes
petitioner_trim = len(petitioner_stop_n_grams)
respondent_trim = len(respondent_stop_n_grams)
scotus_trim = len(scotus_stop_n_grams)

In [301]:
y = pd.DataFrame(y)
y.columns = ['partyWinning']
xycombined = pd.concat([x, y], axis = 1)

#Split Datasets into Petitioner Wins + Respondent Wins
petitioner_wins = xycombined[xycombined['partyWinning'] == '1']
respondent_wins = xycombined[xycombined['partyWinning'] == '0']

#Remove columns after the final n-gram
petitioner_wins = petitioner_wins.iloc[:, :7500-(petitioner_trim + respondent_trim + scotus_trim)]
respondent_wins = respondent_wins.iloc[:, :7500-(petitioner_trim + respondent_trim + scotus_trim)]

#Add unique occurrences of n-grams
petitioner_wins = petitioner_wins.append(petitioner_wins.apply(np.count_nonzero, axis = 0), ignore_index = True)
respondent_wins = respondent_wins.append(respondent_wins.apply(np.count_nonzero, axis = 0), ignore_index = True)

#Most common n-grams when petitioner/respondent wins
petitioner_wins_petitioner_most_common = petitioner_wins.iloc[-1][:2500-petitioner_trim].nlargest(10)
petitioner_wins_respondent_most_common = petitioner_wins.iloc[-1][2500-petitioner_trim:5000-respondent_trim].nlargest(10)
petitioner_wins_scotus_most_common = petitioner_wins.iloc[-1][5000-respondent_trim:].nlargest(10)
petitioner_wins_most_common = pd.DataFrame({"Petitioner Most Common":petitioner_wins_petitioner_most_common.index,
                                           "Respondent Most Common":petitioner_wins_respondent_most_common.index,
                                           "Scotus Most Common":petitioner_wins_scotus_most_common.index})

respondent_wins_petitioner_most_common = respondent_wins.iloc[-1][:2500-petitioner_trim].nlargest(10)
respondent_wins_respondent_most_common = respondent_wins.iloc[-1][2500-petitioner_trim:5000-respondent_trim].nlargest(10)
respondent_wins_scotus_most_common = respondent_wins.iloc[-1][5000-respondent_trim:].nlargest(10)
respondent_wins_most_common = pd.DataFrame({"Petitioner Most Common":respondent_wins_petitioner_most_common.index,
                                           "Respondent Most Common":respondent_wins_respondent_most_common.index,
                                           "Scotus Most Common":respondent_wins_scotus_most_common.index})

petitioner_wins_matrix = petitioner_wins_most_common.head(10).as_matrix().reshape([1,30])[0]
petitioner_counts = Counter(petitioner_wins_matrix)

respondent_wins_matrix = respondent_wins_most_common.head(10).as_matrix().reshape([1,30])[0]
respondent_counts = Counter(respondent_wins_matrix)

print("10 most common tokens when petitioner wins:")
display(petitioner_wins_most_common.head(10).style.applymap(highlight_unique, counts=petitioner_counts,
                                                            subset=["Petitioner Most Common", "Respondent Most Common"]))

print(" ")
print("-" * 100)
print(" ")

print(" 10 most common tokens when respondent wins:")
display(respondent_wins_most_common.head(10).style.applymap(highlight_unique, counts=respondent_counts,
                                                           subset=["Petitioner Most Common", "Respondent Most Common"]))


10 most common tokens when petitioner wins:


Unnamed: 0,Petitioner Most Common,Respondent Most Common,Scotus Most Common
0,no further questions,did not have,don know what
1,If there are,Your Honor because,if it is
2,Court said in,they did not,counsel The case
3,remainder my time,Your Honor don,there is an
4,did not have,Your Honor in,it was not
5,in case was,Are you saying,thought it was
6,in our view,That correct Your,hear argument first
7,in case The,it can be,you counsel The
8,could have been,do not think,in other words
9,don think there,we re going,nothing do with


 
----------------------------------------------------------------------------------------------------
 
 10 most common tokens when respondent wins:


Unnamed: 0,Petitioner Most Common,Respondent Most Common,Scotus Most Common
0,is going be,may not be,whenever you re
1,does not have,we have here,So you re
2,Your Honor We,think would be,would not have
3,is there is,it may be,take it you
4,do not have,no further questions,But you re
5,by Court in,Are you saying,if it were
6,it can be,there may be,re not going
7,they are not,in United States,you say you
8,That correct Your,And you say,they re not
9,we have here,in our view,you have any


In [302]:
#Most common n-grams when petitioner/respondent wins
petitioner_most_common = pd.DataFrame({"Petitioner Most Common When Petitioner Wins":
                                       petitioner_wins_petitioner_most_common.index,
                                       "Petitioner Most Common When Respondent Wins":
                                       respondent_wins_petitioner_most_common.index})

respondent_most_common = pd.DataFrame({"Respondent Most Common When Petitioner Wins":
                                       petitioner_wins_respondent_most_common.index,
                                       "Respondent Most Common When Respondent Wins":
                                       respondent_wins_respondent_most_common.index})

scotus_most_common = pd.DataFrame({"SCOTUS Most Common When Petitioner Wins":
                                       petitioner_wins_scotus_most_common.index,
                                       "SCOTUS Most Common When Respondent Wins":
                                       respondent_wins_scotus_most_common.index})

petitioner_most_common_matrix = petitioner_most_common.head(10).as_matrix().reshape([1,20])[0]
petitioner_only_counts = Counter(petitioner_most_common_matrix)

respondent_most_common_matrix = respondent_most_common.head(10).as_matrix().reshape([1,20])[0]
respondent_only_counts = Counter(respondent_most_common_matrix)

scotus_most_common_matrix = scotus_most_common.head(10).as_matrix().reshape([1,20])[0]
scotus_only_counts = Counter(scotus_most_common_matrix)

print("10 most common tokens among petitioners:")
display(petitioner_most_common.head(10).style.applymap(highlight_unique, counts=petitioner_only_counts))

print(" ")
print("-" * 100)
print(" ")

print("10 most common tokens among respondents:")
display(respondent_most_common.head(10).style.applymap(highlight_unique, counts=respondent_only_counts))

print(" ")
print("-" * 100)
print(" ")

print("10 most common tokens among SCOTUS:")
display(scotus_most_common.head(10).style.applymap(highlight_unique, counts=scotus_only_counts))

10 most common tokens among petitioners:


Unnamed: 0,Petitioner Most Common When Petitioner Wins,Petitioner Most Common When Respondent Wins
0,no further questions,is going be
1,If there are,does not have
2,Court said in,Your Honor We
3,remainder my time,is there is
4,did not have,do not have
5,in case was,by Court in
6,in our view,it can be
7,in case The,they are not
8,could have been,That correct Your
9,don think there,we have here


 
----------------------------------------------------------------------------------------------------
 
10 most common tokens among respondents:


Unnamed: 0,Respondent Most Common When Petitioner Wins,Respondent Most Common When Respondent Wins
0,did not have,may not be
1,Your Honor because,we have here
2,they did not,think would be
3,Your Honor don,it may be
4,Your Honor in,no further questions
5,Are you saying,Are you saying
6,That correct Your,there may be
7,it can be,in United States
8,do not think,And you say
9,we re going,in our view


 
----------------------------------------------------------------------------------------------------
 
10 most common tokens among SCOTUS:


Unnamed: 0,SCOTUS Most Common When Petitioner Wins,SCOTUS Most Common When Respondent Wins
0,don know what,whenever you re
1,if it is,So you re
2,counsel The case,would not have
3,there is an,take it you
4,it was not,But you re
5,thought it was,if it were
6,hear argument first,re not going
7,you counsel The,you say you
8,in other words,they re not
9,nothing do with,you have any


In [346]:
def contains_your_honor(row, df):
    counter = 0
    columns = df.columns
    for column in columns:
        if "Your Honor" in column:
            if row[column] > 0:
                counter += 1
    return counter

In [347]:
petitioner_vocab = x.iloc[:, :2500-petitioner_trim]
petitioner_vocab['your_honor_count'] = petitioner_vocab.apply(contains_your_honor, df=petitioner_vocab, axis=1)
    
respondent_vocab = x.iloc[:, 2500-petitioner_trim:5000-respondent_trim]
respondent_vocab['your_honor_count'] = respondent_vocab.apply(contains_your_honor, df=respondent_vocab, axis=1)

scotus_vocab = x.iloc[:, 5000-respondent_trim:]
scotus_vocab['your_honor_count'] = scotus_vocab.apply(contains_your_honor, df=scotus_vocab, axis=1)

x = pd.concat([petitioner_vocab,
               respondent_vocab,
               scotus_vocab,
               x_train_cv_chief,
               x_train_features],
               axis = 1)

## Re-train Models After Adding Count of N-Grams Containing "Your Honor"

In [349]:
###tfidf vectorizer bag of words using scotus_justice, petitioner, respondent, and additional features
###Bag of words trimmed based on overlap
##features: chief justice indicator, petitioner code, respondent code, issue code, issue area code, your honor count

start_time = time.time()

#rest y train + test indices, drop 'index' columns, and convert to 1-d matrices
y = y_orig.reset_index()
y = y.drop(['index'], axis = 1)
y = y.as_matrix()
y = column_or_1d(y)

# #random forest
forest_3 = RandomForestClassifier(n_estimators = 1000, n_jobs = -1)
rf_model_3_scores = cross_val_score(forest_3, x, y, cv=20)

# #Multinomial Naive Baynes
nb_3 = MultinomialNB(alpha = 0.1)
nb_model_3_scores = cross_val_score(nb_3, x, y, cv=20)

# logistic regression
lr_model_3 = LogisticRegression(C = 0.2, penalty = "l1", n_jobs = -1)
lr_model_train_3 = lr_model_3.fit(x, y)
lr_model_3_scores = cross_val_score(lr_model_3, x, y, cv=20)

end_time = time.time()

print("Accuracy of RF Model 3: %0.4f (+/- %0.4f)" % (rf_model_3_scores.mean(), rf_model_3_scores.std() * 2))
print("Accuracy of NB Model 3: %0.4f (+/- %0.4f)" % (nb_model_3_scores.mean(), nb_model_3_scores.std() * 2))
print("Accuracy of LR Model 3: %0.4f (+/- %0.4f)" % (lr_model_3_scores.mean(), lr_model_3_scores.std() * 2))
actual_count = np.unique(y, return_counts=True)[1]
num = round(actual_count[1],2)
denom  = round(actual_count[0] + actual_count[1],2)
print("Floor is:", round(np.divide(num, denom), 4))

print("Time taken:", round((end_time - start_time) / 60, 1), " minutes")

Accuracy of RF Model 3: 0.6455 (+/- 0.0334)
Accuracy of NB Model 3: 0.5296 (+/- 0.0718)
Accuracy of LR Model 3: 0.6455 (+/- 0.0557)
Floor is: 0.6318
Time taken: 10.4  minutes
