In [1]:
import os
import csv
import numpy as np
import pandas as pd
import zipfile
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
#unzip and read turns files into pandas df
#each archive is opened in a `with` block so that its file handle is closed as
#soon as the csv inside it has been parsed
with zipfile.ZipFile('./turns_part1.zip') as zf1:
    turns1 = pd.read_csv(zf1.open('turns_part1.csv'))
with zipfile.ZipFile('./turns_part2.zip') as zf2:
    turns2 = pd.read_csv(zf2.open('turns_part2.csv'))
with zipfile.ZipFile('./turns_part3.zip') as zf3:
    turns3 = pd.read_csv(zf3.open('turns_part3.csv'))
with zipfile.ZipFile('./turns_part4.zip') as zf4:
    turns4 = pd.read_csv(zf4.open('turns_part4.csv'))

#read summaries file into pandas df
summaries = pd.read_csv('summaries.csv')

In [72]:
#strip the 'OA' and '_orig' markers from transcript_id in the summaries df
#(both replacements are applied in one chained expression, 'OA' first)
summaries['transcript_id'] = (summaries['transcript_id']
                              .str.replace('OA','')
                              .str.replace('_orig',''))

#create new column for verdict variable
def f(row):
    """Return the label of the party that won, for one row of the summaries df.

    `row` is indexed by key and must provide 'winning_party', 'first_party',
    'second_party', 'first_party_label' and 'second_party_label'.

    The first party is checked before the second one; when the winning party
    equals neither of them the string 'No Verdict' is returned.
    """
    winner = row['winning_party']
    if winner == row['first_party']:
        return row['first_party_label']
    if winner == row['second_party']:
        return row['second_party_label']
    return 'No Verdict'

summaries['verdict'] = summaries.apply(f, axis=1)

#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#remove _t01 and _t02 from transcript_id in turns_combined
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t01','')
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t02','')

#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'speaker_role', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))

#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#join verdict into df
train_test_df = speaker_role_text_pivot.join(summaries.set_index('transcript_id')['verdict'], on='transcript_id')

#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df[train_test_df.verdict.notnull()]
train_test_df = train_test_df.dropna()
train_test_df = train_test_df[train_test_df['verdict'] != 'No Verdict']

#create train and test split
x = train_test_df[['not_a_justice', 'scotus_justice']]
y = train_test_df.verdict
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33)

#count vectorizer
count_vect = CountVectorizer()
x_train_cv_not_a_justice = count_vect.fit(x_train['not_a_justice'])
x_train_cv_scotus_justice = count_vect.fit(x_train['scotus_justice'])
x_test_cv_not_a_justice = count_vect.fit(x_test['not_a_justice'])
x_test_cv_scotus_justice = count_vect.fit(x_test['scotus_justice'])

x_train_cv_not_a_justice = pd.DataFrame(x_train_cv_not_a_justice.transform(x_train['not_a_justice']).todense(),
                                        columns = x_train_cv_not_a_justice.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice.get_feature_names())
x_test_cv_not_a_justice = pd.DataFrame(x_test_cv_not_a_justice.transform(x_test['not_a_justice']).todense(),
                                        columns = x_test_cv_not_a_justice.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.transform(x_test['scotus_justice']).todense(),
                                        columns = x_test_cv_scotus_justice.get_feature_names())

#concatenate the not_a_justice bow and scotus_justice bow
x_train = pd.concat([x_train_cv_not_a_justice, x_train_cv_scotus_justice], axis = 1)
x_test = pd.concat([x_test_cv_not_a_justice, x_test_cv_scotus_justice], axis = 1)

#random forest
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(x_train, y_train)
random_forest_prediction = forest.predict(x_test)
rf_accuracy = np.mean(random_forest_prediction == y_test)
print 'Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4)

# Multinomial Naive Baynes
nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
naive_baynes_prediction = nb.predict(x_test)
nb_accuracy = np.mean(naive_baynes_prediction == y_test)
print 'Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4)

# logistic regression
lr_model = LogisticRegression(C = 1, penalty = "l2")
lr_model_train = lr_model.fit(x_train, y_train)
lr_prediction = lr_model.predict(x_test)
lr_accuracy = np.mean(lr_prediction == y_test)
print 'Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy,4)

print 'Actual Count of y_test Verdicts:\n', y_test.value_counts()

Accuracy of Bag-of-Words Random Forest Model: 0.8531
Accuracy of Bag-of-Words Multinomial Naive Baynes Model: 0.7922
Accuracy of Bag-of-Words Logistic Regression Model: 0.8831
Actual Count of y_test Verdicts:
Petitioner    824
Appellant     155
Respondent     21
Appellee        1
Name: verdict, dtype: int64


In [28]:
####rerunning above but using the case name instead of case id

#create new column for verdict variable
def f(row):
    """Map one summaries row to the label of its winning party.

    Same logic as the `f` defined in the earlier cell: the winning party is
    compared with the first party, then with the second party, and the label
    of the first match is returned. 'No Verdict' is returned when neither
    party matches.
    """
    party_and_label_keys = (('first_party', 'first_party_label'),
                            ('second_party', 'second_party_label'))
    for party_key, label_key in party_and_label_keys:
        if row['winning_party'] == row[party_key]:
            return row[label_key]
    return 'No Verdict'

summaries['verdict'] = summaries.apply(f, axis=1)

#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'title', 
                                                     columns = 'speaker_role', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))

#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#join verdict into df
train_test_df = speaker_role_text_pivot.join(summaries.set_index('case_name')['verdict'], on='title')

#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df[train_test_df.verdict.notnull()]
train_test_df = train_test_df.dropna()
train_test_df = train_test_df[train_test_df['verdict'] != 'No Verdict']

#create train and test split
x = train_test_df[['not_a_justice', 'scotus_justice']]
y = train_test_df.verdict
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33)

#count vectorizer
count_vect = CountVectorizer()
x_train_cv_not_a_justice = count_vect.fit(x_train['not_a_justice'])
x_train_cv_scotus_justice = count_vect.fit(x_train['scotus_justice'])
x_test_cv_not_a_justice = count_vect.fit(x_test['not_a_justice'])
x_test_cv_scotus_justice = count_vect.fit(x_test['scotus_justice'])

x_train_cv_not_a_justice = pd.DataFrame(x_train_cv_not_a_justice.transform(x_train['not_a_justice']).todense(),
                                        columns = x_train_cv_not_a_justice.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice.get_feature_names())
x_test_cv_not_a_justice = pd.DataFrame(x_test_cv_not_a_justice.transform(x_test['not_a_justice']).todense(),
                                        columns = x_test_cv_not_a_justice.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.transform(x_test['scotus_justice']).todense(),
                                        columns = x_test_cv_scotus_justice.get_feature_names())

#concatenate the not_a_justice bow and scotus_justice bow
x_train = pd.concat([x_train_cv_not_a_justice, x_train_cv_scotus_justice], axis = 1)
x_test = pd.concat([x_test_cv_not_a_justice, x_test_cv_scotus_justice], axis = 1)

#random forest
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(x_train, y_train)
random_forest_prediction = forest.predict(x_test)
rf_accuracy = np.mean(random_forest_prediction == y_test)
print 'Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4)

# Multinomial Naive Baynes
nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
naive_baynes_prediction = nb.predict(x_test)
nb_accuracy = np.mean(naive_baynes_prediction == y_test)
print 'Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4)

# logistic regression
lr_model = LogisticRegression(C = 1, penalty = "l2")
lr_model_train = lr_model.fit(x_train, y_train)
lr_prediction = lr_model.predict(x_test)
lr_accuracy = np.mean(lr_prediction == y_test)
print 'Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy,4)

print 'Actual Count of y_test Verdicts:\n', y_test.value_counts()

print 'LR predictions:\n', np.unique(lr_prediction, return_counts=True)

Accuracy of Bag-of-Words Random Forest Model: 0.851
Accuracy of Bag-of-Words Multinomial Naive Baynes Model: 0.8345
Accuracy of Bag-of-Words Logistic Regression Model: 0.8859
Actual Count of y_test Verdicts:
Petitioner    801
Appellant     159
Respondent     13
Name: verdict, dtype: int64
LR predictions:
(array(['Appellant', 'Petitioner', 'Respondent'], dtype=object), array([137, 835,   1]))


In [107]:
#use original scdb winning party as verdict
#verdict value 0 = no favorable disposition for petitioning party apparent
#verdict value 1 = petitioning party received a favorable disposition
verdict = []
#the csv is read inside a `with` block so that the file handle is closed afterwards
with open('../SCDB_2017_01_caseCentered_Citation.csv') as verdict_file:
    verdict_csv = csv.reader(verdict_file)
    for row in verdict_csv:
        #build the case id as <column 10>_<column 13>, rewriting the docket so that
        #'-' becomes '_' and ' ORIG' becomes '_orig'
        docket_number = re.sub('-', '_', row[13])
        docket_number = re.sub(' ORIG', '_orig', docket_number)
        case_id = row[10]+'_'+docket_number
        verdict.append([case_id, row[36]])

#the header row went through the same transformation as the data rows, so the
#first entry holds the column names (the join further down looks them up as
#'term_docket' and 'partyWinning')
verdict_header = verdict.pop(0)
verdict = pd.DataFrame(verdict, columns = verdict_header)
    
    
#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#remove _t01 and _t02 from transcript_id in turns_combined
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t01','')
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t02','')

#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'speaker_role', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))

#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#join verdict into df
train_test_df = speaker_role_text_pivot.join(verdict.set_index('term_docket')['partyWinning'], on='transcript_id')

#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df.dropna()

#create train and test split
x = train_test_df[['not_a_justice', 'scotus_justice']]
y = train_test_df.partyWinning
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33)

#count vectorizer
count_vect = CountVectorizer()
x_train_cv_not_a_justice = count_vect.fit(x_train['not_a_justice'])
x_train_cv_scotus_justice = count_vect.fit(x_train['scotus_justice'])
x_test_cv_not_a_justice = count_vect.fit(x_test['not_a_justice'])
x_test_cv_scotus_justice = count_vect.fit(x_test['scotus_justice'])

x_train_cv_not_a_justice = pd.DataFrame(x_train_cv_not_a_justice.transform(x_train['not_a_justice']).todense(),
                                        columns = x_train_cv_not_a_justice.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice.get_feature_names())
x_test_cv_not_a_justice = pd.DataFrame(x_test_cv_not_a_justice.transform(x_test['not_a_justice']).todense(),
                                        columns = x_test_cv_not_a_justice.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.transform(x_test['scotus_justice']).todense(),
                                        columns = x_test_cv_scotus_justice.get_feature_names())

#concatenate the not_a_justice bow and scotus_justice bow
x_train = pd.concat([x_train_cv_not_a_justice, x_train_cv_scotus_justice], axis = 1)
x_test = pd.concat([x_test_cv_not_a_justice, x_test_cv_scotus_justice], axis = 1)

#random forest
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(x_train, y_train)
random_forest_prediction = forest.predict(x_test)
rf_accuracy = np.mean(random_forest_prediction == y_test)
print 'Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4)

# Multinomial Naive Baynes
nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
naive_baynes_prediction = nb.predict(x_test)
nb_accuracy = np.mean(naive_baynes_prediction == y_test)
print 'Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4)

# logistic regression
lr_model = LogisticRegression(C = 1, penalty = "l2")
lr_model_train = lr_model.fit(x_train, y_train)
lr_prediction = lr_model.predict(x_test)
lr_accuracy = np.mean(lr_prediction == y_test)
print 'Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy,4)

print 'Actual Count of y_test Verdicts:\n', y_test.value_counts()

print 'LR predictions:\n', np.unique(lr_prediction, return_counts=True)

Accuracy of Bag-of-Words Random Forest Model: 0.6215
Accuracy of Bag-of-Words Multinomial Naive Baynes Model: 0.5715
Accuracy of Bag-of-Words Logistic Regression Model: 0.559
Actual Count of y_test Verdicts:
1    1107
0     655
Name: partyWinning, dtype: int64
LR predictions:
(array(['0', '1'], dtype=object), array([ 582, 1180]))
