In [131]:
import os
import csv
import numpy as np
import pandas as pd
import zipfile
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import column_or_1d

In [3]:
#unzip and read turns files into pandas df
zf1 = zipfile.ZipFile('./turns_part1.zip') 
turns1 = pd.read_csv(zf1.open('turns_part1.csv'))
zf2 = zipfile.ZipFile('./turns_part2.zip') 
turns2 = pd.read_csv(zf2.open('turns_part2.csv'))
zf3 = zipfile.ZipFile('./turns_part3.zip') 
turns3 = pd.read_csv(zf3.open('turns_part3.csv'))
zf4 = zipfile.ZipFile('./turns_part4.zip') 
turns4 = pd.read_csv(zf4.open('turns_part4.csv'))

#read summaries file into pandas df
summaries = pd.read_csv('summaries.csv')

In [4]:
#remove OA and _orig from transcript id in summaries df
summaries['transcript_id'] = summaries['transcript_id'].str.replace('OA','')
summaries['transcript_id'] = summaries['transcript_id'].str.replace('_orig','')

#create new column for verdict variable
def f(row):
    if row['winning_party'] == row['first_party']:
        val = row['first_party_label']
    elif row['winning_party'] == row['second_party']:
        val = row['second_party_label']
    else:
        val = 'No Verdict'
    return val

summaries['verdict'] = summaries.apply(f, axis=1)

#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#remove _t01 and _t02 from transcript_id in turns_combined
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t01','')
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t02','')

#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'speaker_role', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))

#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#join verdict into df
train_test_df = speaker_role_text_pivot.join(summaries.set_index('transcript_id')['verdict'], on='transcript_id')

#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df[train_test_df.verdict.notnull()]
train_test_df = train_test_df.dropna()
train_test_df = train_test_df[train_test_df['verdict'] != 'No Verdict']

#create train and test split
x = train_test_df[['not_a_justice', 'scotus_justice']]
y = train_test_df.verdict
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33)

#count vectorizer
count_vect = CountVectorizer()
x_train_cv_not_a_justice = count_vect.fit(x_train['not_a_justice'])
x_train_cv_scotus_justice = count_vect.fit(x_train['scotus_justice'])
x_test_cv_not_a_justice = count_vect.fit(x_test['not_a_justice'])
x_test_cv_scotus_justice = count_vect.fit(x_test['scotus_justice'])

x_train_cv_not_a_justice = pd.DataFrame(x_train_cv_not_a_justice.transform(x_train['not_a_justice']).todense(),
                                        columns = x_train_cv_not_a_justice.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice.get_feature_names())
x_test_cv_not_a_justice = pd.DataFrame(x_test_cv_not_a_justice.transform(x_test['not_a_justice']).todense(),
                                        columns = x_test_cv_not_a_justice.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.transform(x_test['scotus_justice']).todense(),
                                        columns = x_test_cv_scotus_justice.get_feature_names())

#concatenate the not_a_justice bow and scotus_justice bow
x_train = pd.concat([x_train_cv_not_a_justice, x_train_cv_scotus_justice], axis = 1)
x_test = pd.concat([x_test_cv_not_a_justice, x_test_cv_scotus_justice], axis = 1)

#random forest
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(x_train, y_train)
random_forest_prediction = forest.predict(x_test)
rf_accuracy = np.mean(random_forest_prediction == y_test)
print('Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4))

# Multinomial Naive Baynes
nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
naive_baynes_prediction = nb.predict(x_test)
nb_accuracy = np.mean(naive_baynes_prediction == y_test)
print('Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4))

# logistic regression
lr_model = LogisticRegression(C = 1, penalty = "l2")
lr_model_train = lr_model.fit(x_train, y_train)
lr_prediction = lr_model.predict(x_test)
lr_accuracy = np.mean(lr_prediction == y_test)
print('Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy,4))

print('Actual Count of y_test Verdicts:\n', y_test.value_counts())

Accuracy of Bag-of-Words Random Forest Model: 0.8332
Accuracy of Bag-of-Words Multinomial Naive Baynes Model: 0.7822
Accuracy of Bag-of-Words Logistic Regression Model: 0.8761
Actual Count of y_test Verdicts:
 Petitioner    815
Appellant     162
Respondent     24
Name: verdict, dtype: int64


In [5]:
####rerunning above but using the case name instead of case id

#create new column for verdict variable
def f(row):
    if row['winning_party'] == row['first_party']:
        val = row['first_party_label']
    elif row['winning_party'] == row['second_party']:
        val = row['second_party_label']
    else:
        val = 'No Verdict'
    return val

summaries['verdict'] = summaries.apply(f, axis=1)

#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'title', 
                                                     columns = 'speaker_role', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))

#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#join verdict into df
train_test_df = speaker_role_text_pivot.join(summaries.set_index('case_name')['verdict'], on='title')

#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df[train_test_df.verdict.notnull()]
train_test_df = train_test_df.dropna()
train_test_df = train_test_df[train_test_df['verdict'] != 'No Verdict']

#create train and test split
x = train_test_df[['not_a_justice', 'scotus_justice']]
y = train_test_df.verdict
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33)

#count vectorizer
count_vect = CountVectorizer()
x_train_cv_not_a_justice = count_vect.fit(x_train['not_a_justice'])
x_train_cv_scotus_justice = count_vect.fit(x_train['scotus_justice'])
x_test_cv_not_a_justice = count_vect.fit(x_test['not_a_justice'])
x_test_cv_scotus_justice = count_vect.fit(x_test['scotus_justice'])

x_train_cv_not_a_justice = pd.DataFrame(x_train_cv_not_a_justice.transform(x_train['not_a_justice']).todense(),
                                        columns = x_train_cv_not_a_justice.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice.get_feature_names())
x_test_cv_not_a_justice = pd.DataFrame(x_test_cv_not_a_justice.transform(x_test['not_a_justice']).todense(),
                                        columns = x_test_cv_not_a_justice.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.transform(x_test['scotus_justice']).todense(),
                                        columns = x_test_cv_scotus_justice.get_feature_names())

#concatenate the not_a_justice bow and scotus_justice bow
x_train = pd.concat([x_train_cv_not_a_justice, x_train_cv_scotus_justice], axis = 1)
x_test = pd.concat([x_test_cv_not_a_justice, x_test_cv_scotus_justice], axis = 1)

#random forest
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(x_train, y_train)
random_forest_prediction = forest.predict(x_test)
rf_accuracy = np.mean(random_forest_prediction == y_test)
print('Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4))

# Multinomial Naive Baynes
nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
naive_baynes_prediction = nb.predict(x_test)
nb_accuracy = np.mean(naive_baynes_prediction == y_test)
print('Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4))

# logistic regression
lr_model = LogisticRegression(C = 1, penalty = "l2")
lr_model_train = lr_model.fit(x_train, y_train)
lr_prediction = lr_model.predict(x_test)
lr_accuracy = np.mean(lr_prediction == y_test)
print('Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy,4))

print('Actual Count of y_test Verdicts:\n', y_test.value_counts())

print('LR predictions:\n', np.unique(lr_prediction, return_counts=True))

Accuracy of Bag-of-Words Random Forest Model: 0.851
Accuracy of Bag-of-Words Multinomial Naive Baynes Model: 0.7996
Accuracy of Bag-of-Words Logistic Regression Model: 0.8839
Actual Count of y_test Verdicts:
 Petitioner    803
Appellant     155
Respondent     14
Appellee        1
Name: verdict, dtype: int64
LR predictions:
 (array(['Appellant', 'Petitioner', 'Respondent'], dtype=object), array([136, 835,   2]))


In [6]:
#use original scdb winning party as verdict
#verdict value 0 = no favorable disposition for petitioning part apparent
#verdict value 1 = petitioning party received a favorable disposition
verdict = []
verdict_csv = []

with open('../SCDB_2017_01_caseCentered_Citation.csv', encoding="cp1252") as f:
    reader = csv.reader(f)
    for row in reader:
        verdict_csv.append(row)
for row in verdict_csv:
    docket_number = re.sub('-', '_', row[13])
    docket_number = re.sub(' ORIG', '_orig', docket_number)
    case_id = row[10]+'_'+docket_number
    verdict.append([case_id, row[36]])
    
verdict_header = verdict.pop(0)
verdict = pd.DataFrame(verdict, columns = verdict_header)
    
    
#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#remove _t01 and _t02 from transcript_id in turns_combined
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t01','')
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t02','')

#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'speaker_role', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))

#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#join verdict into df
train_test_df = speaker_role_text_pivot.join(verdict.set_index('term_docket')['partyWinning'], on='transcript_id')

#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df.dropna()

#create train and test split
x = train_test_df[['not_a_justice', 'scotus_justice']]
y = train_test_df.partyWinning
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33)

#count vectorizer
count_vect = CountVectorizer()
x_train_cv_not_a_justice = count_vect.fit(x_train['not_a_justice'])
x_train_cv_scotus_justice = count_vect.fit(x_train['scotus_justice'])
x_test_cv_not_a_justice = count_vect.fit(x_test['not_a_justice'])
x_test_cv_scotus_justice = count_vect.fit(x_test['scotus_justice'])

x_train_cv_not_a_justice = pd.DataFrame(x_train_cv_not_a_justice.transform(x_train['not_a_justice']).todense(),
                                        columns = x_train_cv_not_a_justice.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice.get_feature_names())
x_test_cv_not_a_justice = pd.DataFrame(x_test_cv_not_a_justice.transform(x_test['not_a_justice']).todense(),
                                        columns = x_test_cv_not_a_justice.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.transform(x_test['scotus_justice']).todense(),
                                        columns = x_test_cv_scotus_justice.get_feature_names())

#concatenate the not_a_justice bow and scotus_justice bow
x_train = pd.concat([x_train_cv_not_a_justice, x_train_cv_scotus_justice], axis = 1)
x_test = pd.concat([x_test_cv_not_a_justice, x_test_cv_scotus_justice], axis = 1)

#random forest
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(x_train, y_train)
random_forest_prediction = forest.predict(x_test)
rf_accuracy = np.mean(random_forest_prediction == y_test)
print('Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4))

# Multinomial Naive Baynes
nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
naive_baynes_prediction = nb.predict(x_test)
nb_accuracy = np.mean(naive_baynes_prediction == y_test)
print('Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4))

# logistic regression
lr_model = LogisticRegression(C = 1, penalty = "l2")
lr_model_train = lr_model.fit(x_train, y_train)
lr_prediction = lr_model.predict(x_test)
lr_accuracy = np.mean(lr_prediction == y_test)
print('Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy,4))

print('Actual Count of y_test Verdicts:\n', y_test.value_counts())

print('LR predictions:\n', np.unique(lr_prediction, return_counts=True))

Accuracy of Bag-of-Words Random Forest Model: 0.6169
Accuracy of Bag-of-Words Multinomial Naive Baynes Model: 0.5533
Accuracy of Bag-of-Words Logistic Regression Model: 0.5715
Actual Count of y_test Verdicts:
 1    1120
0     642
Name: partyWinning, dtype: int64
LR predictions:
 (array(['0', '1'], dtype=object), array([ 537, 1225]))


In [15]:
##SET UP

#use original scdb winning party as verdict
#verdict value 0 = no favorable disposition for petitioning part apparent
#verdict value 1 = petitioning party received a favorable disposition
verdict = []
verdict_csv = []

with open('../SCDB_2017_01_caseCentered_Citation.csv', encoding="cp1252") as f:
    reader = csv.reader(f)
    for row in reader:
        verdict_csv.append(row)
        
for row in verdict_csv:
    docket_number = re.sub('-', '_', row[13])
    docket_number = re.sub(' ORIG', '_orig', docket_number)
    case_id = row[10]+'_'+docket_number
    verdict.append([case_id, row[12], row[17], row[19], row[36], row[39], row[40]])
    
verdict_header = verdict.pop(0)
verdict = pd.DataFrame(verdict, columns = verdict_header)
    
    
#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#remove _t01 and _t02 from transcript_id in turns_combined
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t01','')
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t02','')

#get advocate sides
advocates = pd.read_json('./advocate_dict.json')

advocate_turns = []

for index, row in turns_combined.iterrows():
    if row['speaker_role'] == 'scotus_justice':
        advocate_turns.append('scotus_justice')
    else:
        speaker = row['speaker']
        transcript_id = row['transcript_id']
        try:
            lawyer_side = advocates.ix[speaker][transcript_id]
            advocate_turns.append(lawyer_side)
        except:
            advocate_turns.append('None')

#insert advocate side to the turns_combined dataframe
turns_combined['lawyer_side'] = advocate_turns

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [16]:
##PIVOT

#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'lawyer_side', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))
#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#drop columns with no lawyer side tags
speaker_role_text_pivot = speaker_role_text_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#count of number of times speaker spoke
counts_pivot = pd.pivot_table(turns_combined[['transcript_id', 'lawyer_side']],
                              index = 'transcript_id',
                              columns = 'lawyer_side',
                              aggfunc=len,
                              fill_value=0)

#reset index
counts_pivot = counts_pivot.reset_index()

#drop columns with no lawyer side tags
counts_pivot = counts_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#rename headers
counts_pivot.columns = ['transcript_id', 'appellant/petitioner_count', 'appellee/respondent_count',
                       'scotus_justice_count']

#length of speaker speaking
length_pivot = pd.pivot_table(turns_combined[['transcript_id', 'text_duration', 'lawyer_side']],
                              index = 'transcript_id',
                              columns = 'lawyer_side',
                              values = 'text_duration',
                              aggfunc=np.sum,
                              fill_value=0)

#reset index
length_pivot = length_pivot.reset_index()

#drop columns with no lawyer side tags
length_pivot = length_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#rename headers
length_pivot.columns = ['transcript_id', 'appellant/petitioner_length', 'appellee/respondent_length',
                       'scotus_justice_length']

#concatenate pivots together
pivots_concate = pd.concat([speaker_role_text_pivot, 
                            counts_pivot[counts_pivot.columns[1:4]], 
                            length_pivot[length_pivot.columns[1:4]]],
                            axis=1,
                            join='inner')

In [17]:
verdict.head()

Unnamed: 0,term_docket,chief,petitioner,respondent,partyWinning,issue,issueArea
0,1946_24,Vinson,198,172,1,80180,8
1,1946_12,Vinson,100,27,0,10500,1
2,1946_21,Vinson,209,27,0,80250,8
3,1946_26,Vinson,27,170,0,20150,2
4,1946_50,Vinson,27,176,1,80060,8


In [42]:
#Convert attributes to ints
verdict['petitioner'] = verdict['petitioner'].astype(int)
verdict['respondent'] = verdict.respondent.apply(lambda x: 0 if x == '' else x)
verdict['respondent'] = verdict['respondent'].astype(int)
verdict['issue'] = verdict.issue.apply(lambda x: 0 if x == '' else x)
verdict['issue'] = verdict['issue'].astype(int)
verdict['issueArea'] = verdict.issueArea.apply(lambda x: 0 if x == '' else x)
verdict['issueArea'] = verdict['issueArea'].astype(int)

In [187]:
##MODELING

#join verdict into df
train_test_df = pivots_concate.join(verdict.set_index('term_docket'), on='transcript_id')
#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df.dropna()

#create train and test split
x = train_test_df[['appellant/petitioner', 'appellee/respondent', 'scotus_justice', 'chief',
                   'appellant/petitioner_count', 'appellee/respondent_count', 'scotus_justice_count',
                   'appellant/petitioner_length', 'appellee/respondent_length','scotus_justice_length',
                   'petitioner', 'respondent', 'issue', 'issueArea']]
y = train_test_df.partyWinning
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33)

#count vectorizer
count_vect_petitioner = CountVectorizer(ngram_range = (2,3), max_features = 2500, analyzer = 'word')
count_vect_respondent = CountVectorizer(ngram_range = (2,3), max_features = 2500, analyzer = 'word')
count_vect_justice = CountVectorizer(ngram_range = (2,3), max_features = 2500, analyzer = 'word')
count_vect_chief = CountVectorizer()

x_train_cv_petitionervec = count_vect_petitioner.fit(x_train['appellant/petitioner'])
x_train_cv_respondentvec = count_vect_respondent.fit(x_train['appellee/respondent'])
x_train_cv_scotus_justicevec = count_vect_justice.fit(x_train['scotus_justice'])
x_train_chief = count_vect_chief.fit(x_train['chief'])

x_test_cv_petitionervec = count_vect_petitioner.fit(x_test['appellant/petitioner'])
x_test_cv_respondentvec = count_vect_respondent.fit(x_test['appellee/respondent'])
x_test_cv_scotus_justicevec = count_vect_justice.fit(x_test['scotus_justice'])
x_test_chief = count_vect_chief.fit(x_test['chief'])

x_train_cv_petitioner = pd.DataFrame(x_train_cv_petitionervec.transform(x_train['appellant/petitioner']).todense(),
                                        columns = x_train_cv_petitionervec.get_feature_names())
x_train_cv_respondant = pd.DataFrame(x_train_cv_respondantvec.transform(x_train['appellee/respondent']).todense(),
                                        columns = x_train_cv_respondantvec.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justicevec.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justicevec.get_feature_names())
x_train_cv_chief = pd.DataFrame(x_train_chief.transform(x_train['chief']).todense(),
                               columns = x_train_chief.get_feature_names())

x_test_cv_petitioner = pd.DataFrame(x_test_cv_petitionervec.transform(x_test['appellant/petitioner']).todense(),
                                        columns = x_test_cv_petitionervec.get_feature_names())
x_test_cv_respondant = pd.DataFrame(x_test_cv_respondantvec.transform(x_test['appellee/respondent']).todense(),
                                        columns = x_test_cv_respondantvec.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justicevec.transform(x_test['scotus_justice']).todense(),
                                        columns = x_test_cv_scotus_justicevec.get_feature_names())
x_test_cv_chief = pd.DataFrame(x_test_chief.transform(x_test['chief']).todense(),
                               columns = x_test_chief.get_feature_names())

#put features into its own df for concatenate in next step
x_train_features = x_train[x_train.columns[4:]]
x_train_features = x_train_features.reset_index()
x_test_features = x_test[x_train.columns[4:]]
x_test_features = x_test_features.reset_index()

#rest y train + test indices, drop 'index' columns, and convert to 1-d matrices
y_train = y_train.reset_index()
y_test = y_test.reset_index()
y_train = y_train.drop(['index'], axis = 1)
y_test = y_test.drop(['index'], axis = 1)
y_train = y_train.as_matrix()
y_test = y_test.as_matrix()
y_train = column_or_1d(y_train)
y_test = column_or_1d(y_test)


#change negative numbers into 0
x_train_features[x_train_features < 0 ] = 0
x_test_features[x_test_features < 0 ] = 0

#concatenate the bow back
x_train = pd.concat([x_train_cv_petitioner, 
                     x_train_cv_respondant, 
                     x_train_cv_scotus_justice,
                     x_train_cv_chief,
                     x_train_features], 
                     axis = 1)
x_test = pd.concat([x_test_cv_petitioner, 
                    x_test_cv_respondant, 
                    x_test_cv_scotus_justice,
                    x_test_cv_chief,
                    x_test_features],
                    axis = 1)


#random forest
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(x_train, y_train)
random_forest_prediction = forest.predict(x_test)
rf_accuracy = np.mean(random_forest_prediction == y_test)
print('Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4))

# Multinomial Naive Baynes
nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
naive_baynes_prediction = nb.predict(x_test)
nb_accuracy = np.mean(naive_baynes_prediction == y_test)
print('Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4))

# logistic regression
lr_model = LogisticRegression(C = 1, penalty = "l1")
lr_model_train = lr_model.fit(x_train, y_train)
lr_prediction = lr_model.predict(x_test)
lr_accuracy = np.mean(lr_prediction == y_test)
print('Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy,4))

print('Actual Count of y_test Verdicts:\n', np.unique(y_test, return_counts=True))

print('RF predictions:', np.unique(random_forest_prediction, return_counts=True))
print('NB predictions:', np.unique(naive_baynes_prediction, return_counts=True))
print('LR predictions:', np.unique(lr_prediction, return_counts=True))

Accuracy of Bag-of-Words Random Forest Model: 0.6448
Accuracy of Bag-of-Words Multinomial Naive Baynes Model: 0.5705
Accuracy of Bag-of-Words Logistic Regression Model: 0.576
Actual Count of y_test Verdicts:
 (array(['0', '1'], dtype=object), array([323, 592]))
RF predictions: (array(['0', '1'], dtype=object), array([ 32, 883]))
NB predictions: (array(['0', '1'], 
      dtype='<U1'), array([374, 541]))
LR predictions: (array(['0', '1'], dtype=object), array([333, 582]))


In [189]:
y_train = pd.DataFrame(y_train)
y_train.columns = ["partyWinning"]

In [190]:
train_concat = pd.concat([x_train, y_train], axis = 1)

In [192]:
train_concat.head()

Unnamed: 0,ability to,able to,about it,about that,about the,about this,about what,about whether,absence of,access to,...,appellee/respondent_count,scotus_justice_count,appellant/petitioner_length,appellee/respondent_length,scotus_justice_length,petitioner,respondent,issue,issueArea,partyWinning
0,0,0,0,0,3,0,0,0,0,5,...,77,113,1417.802,1268.599,882.82,249.0,249.0,30010.0,3.0,1
1,0,0,0,0,1,0,0,0,1,0,...,40,73,1271.119,1410.646,792.277,208.0,143.0,90090.0,9.0,1
2,0,1,1,1,2,1,0,0,0,0,...,93,4,0.0,0.0,0.0,151.0,371.0,70050.0,7.0,1
3,0,1,0,0,1,0,0,0,0,0,...,57,107,1157.113,1237.199,1242.412,137.0,27.0,10390.0,1.0,0
4,0,0,0,0,0,1,1,0,1,1,...,68,139,1030.501,988.384,1644.819,100.0,28.0,90150.0,9.0,0


In [300]:
petitioner_wins = train_concat[train_concat['partyWinning'] == '1']
petitioner_wins = petitioner_wins.append(petitioner_wins.apply(np.count_nonzero, axis = 0), ignore_index = True)

In [301]:
petitioner_wins.tail()

Unnamed: 0,ability to,able to,about it,about that,about the,about this,about what,about whether,absence of,access to,...,appellee/respondent_count,scotus_justice_count,appellant/petitioner_length,appellee/respondent_length,scotus_justice_length,petitioner,respondent,issue,issueArea,partyWinning
1154,0,0,0,0,2,0,0,0,0,0,...,42,70,0.0,0.0,0.0,8.0,21.0,30010.0,3.0,1
1155,0,0,1,0,1,0,0,0,0,0,...,78,101,0.0,0.0,0.0,102.0,146.0,70180.0,7.0,1
1156,1,2,0,0,0,0,0,0,0,0,...,74,140,1116.45,1041.375,1329.769,100.0,28.0,40010.0,4.0,1
1157,0,2,0,0,1,0,0,0,0,0,...,1,12,0.0,0.0,0.0,186.0,324.0,10020.0,1.0,1
1158,211,528,234,304,726,181,202,190,198,137,...,1158,1158,699.0,698.0,668.0,1158.0,1158.0,1158.0,1158.0,1158


In [298]:
respondent_wins = train_concat[train_concat['partyWinning'] == '0']
respondent_wins = respondent_wins.append(respondent_wins.apply(np.count_nonzero, axis = 0), ignore_index = True)

In [299]:
respondent_wins.tail()

Unnamed: 0,ability to,able to,about it,about that,about the,about this,about what,about whether,absence of,access to,...,appellee/respondent_count,scotus_justice_count,appellant/petitioner_length,appellee/respondent_length,scotus_justice_length,petitioner,respondent,issue,issueArea,partyWinning
693,0,1,1,1,2,0,0,0,0,0,...,39,116,1470.044,1322.339,868.866,4.0,198.0,80100.0,8.0,0
694,0,1,0,0,2,0,0,0,0,0,...,78,150,0.0,0.0,0.0,21.0,239.0,20210.0,2.0,0
695,1,0,0,0,2,0,2,1,1,0,...,101,222,1443.44,1251.734,2485.569,126.0,28.0,100020.0,10.0,0
696,2,8,0,0,0,0,0,0,0,0,...,68,194,1323.691,926.045,794.287,126.0,27.0,30200.0,3.0,0
697,140,343,149,179,441,105,106,106,92,76,...,697,697,422.0,427.0,411.0,697.0,697.0,689.0,689.0,697


In [302]:
#Index of the last token token
print(petitioner_wins.columns[29598])
print(respondent_wins.columns[29598])

your your
your your


In [303]:
#Remove non-BOW Columns
petitioner_wins = petitioner_wins.loc[:, :"your your"]
respondent_wins = respondent_wins.loc[:, :"your your"]

In [304]:
#Find 10 most common tokens when petitioner/respondent wins
petitioner_wins_most_common = petitioner_wins.iloc[-1].nlargest(10)
respondent_wins_most_common = respondent_wins.iloc[-1].nlargest(10)
print("Most common tokens when petitioner wins: \n", petitioner_wins_most_common, sep = '')
print("-" * 100)
print("Most common tokens when respondent wins: \n", respondent_wins_most_common, sep = '')

Most common tokens when petitioner wins: 
in the       1158
the court    1158
in           1158
it           1158
of           1158
that         1158
the          1158
to           1158
of the       1157
that the     1157
Name: 1158, dtype: int64
----------------------------------------------------------------------------------------------------
Most common tokens when respondent wins: 
in the    697
of the    697
and       697
as        697
be        697
for       697
have      697
in        697
is        697
it        697
Name: 697, dtype: int64


In [305]:
#Find the tokens never used when the petitioner/respondent wins
zero_counts_petitioner_wins = petitioner_wins.columns[petitioner_wins.iloc[-1] == 0]
zero_counts_respondent_wins = respondent_wins.columns[respondent_wins.iloc[-1] == 0]

In [332]:
#Find tokens used when petitioner wins but not when respondent wins and vice versa
token_only_present_when_respondent_wins = {}
token_only_present_when_petitioner_wins = {}
for token in list(zero_counts_petitioner_wins):
    if token not in list(zero_counts_respondent_wins):
        token_only_present_when_respondent_wins[token] = respondent_wins.iloc[-1][token]
for t in list(zero_counts_respondent_wins):
    if t not in list(zero_counts_petitioner_wins):
        token_only_present_when_petitioner_wins[t] = petitioner_wins.iloc[-1][t]

In [344]:
#Convert to DF
token_only_present_when_respondent_wins = pd.DataFrame(token_only_present_when_respondent_wins, 
                                                       index = [0])
token_only_present_when_petitioner_wins = pd.DataFrame(token_only_present_when_petitioner_wins,
                                                       index = [0])

In [346]:
#Find the most common tokens used when the respondent wins but not present when the petitioner wins and vice versa
most_common_token_only_present_when_respondent_wins = token_only_present_when_respondent_wins.iloc[0].nlargest(10)
most_common_token_only_present_when_petitioner_wins = token_only_present_when_petitioner_wins.iloc[0].nlargest(10)
print("Most common tokens only present when petitioner wins: \n", most_common_token_only_present_when_respondent_wins,
      sep = '')
print("-" * 100)
print("Most common tokens only present when respondent wins: \n", most_common_token_only_present_when_petitioner_wins,
      sep = '')

Most common tokens only present when petitioner wins: 
watershed        9
apportioning     6
disparagement    5
draftsmanship    5
elevation        5
explosive        5
invalidates      5
lessees          5
multistate       5
oust             5
Name: 0, dtype: int64
----------------------------------------------------------------------------------------------------
Most common tokens only present when respondent wins: 
personality    15
schneckloth    13
cat            12
disappears     11
theater        11
thou           11
tight          11
litigations    10
servant        10
112             9
Name: 0, dtype: int64
