In [12]:
import os
import csv
import numpy as np
import pandas as pd
import zipfile

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
pd.set_option("display.max_columns",100)

In [10]:
#unzip and read turns files into pandas df
zf1 = zipfile.ZipFile('./turns_part1.zip') 
turns1 = pd.read_csv(zf1.open('turns_part1.csv'))
zf2 = zipfile.ZipFile('./turns_part2.zip') 
turns2 = pd.read_csv(zf2.open('turns_part2.csv'))
zf3 = zipfile.ZipFile('./turns_part3.zip') 
turns3 = pd.read_csv(zf3.open('turns_part3.csv'))
zf4 = zipfile.ZipFile('./turns_part4.zip') 
turns4 = pd.read_csv(zf4.open('turns_part4.csv'))

#read summaries file into pandas df
#summaries = pd.read_csv('summaries.csv')
summaries = pd.read_csv("[Not_Oyez]SCDB_2017_01_caseCentered_Citation.csv", encoding="cp1252")

In [18]:
#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#remove _t01 and _t02 from transcript_id in turns_combined
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t01','')
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t02','')

In [23]:
#Create transcript_id by concatenating year and docket
summaries['transcript_id'] = summaries.apply(lambda x: str(x['term']) + "_" + str(x['docket']), axis = 1)

In [29]:
#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'speaker_role', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))

#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#join verdict into df
train_test_df = speaker_role_text_pivot.join(summaries.set_index('transcript_id')['decisionType'], on='transcript_id')

#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df.dropna()

In [30]:
#create train and test split
x = train_test_df[['not_a_justice', 'scotus_justice']]
y = train_test_df.decisionType
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33)

In [31]:
#count vectorizer
count_vect = CountVectorizer()
x_train_cv_not_a_justice = count_vect.fit(x_train['not_a_justice'])
x_train_cv_scotus_justice = count_vect.fit(x_train['scotus_justice'])
x_test_cv_not_a_justice = count_vect.fit(x_test['not_a_justice'])
x_test_cv_scotus_justice = count_vect.fit(x_test['scotus_justice'])

In [32]:
x_train_cv_not_a_justice = pd.DataFrame(x_train_cv_not_a_justice.transform(x_train['not_a_justice']).todense(),
                                        columns = x_train_cv_not_a_justice.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice.get_feature_names())
x_test_cv_not_a_justice = pd.DataFrame(x_test_cv_not_a_justice.transform(x_test['not_a_justice']).todense(),
                                        columns = x_test_cv_not_a_justice.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.transform(x_test['scotus_justice']).todense(),
                                        columns = x_test_cv_scotus_justice.get_feature_names())

#concatenate the not_a_justice bow and scotus_justice bow
x_train = pd.concat([x_train_cv_not_a_justice, x_train_cv_scotus_justice], axis = 1)
x_test = pd.concat([x_test_cv_not_a_justice, x_test_cv_scotus_justice], axis = 1)

In [33]:
#random forest
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(x_train, y_train)
random_forest_prediction = forest.predict(x_test)
rf_accuracy = np.mean(random_forest_prediction == y_test)
print('Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4))

# Multinomial Naive Baynes
nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
naive_baynes_prediction = nb.predict(x_test)
nb_accuracy = np.mean(naive_baynes_prediction == y_test)
print('Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4))

# logistic regression
lr_model = LogisticRegression(C = 1, penalty = "l2")
lr_model_train = lr_model.fit(x_train, y_train)
lr_prediction = lr_model.predict(x_test)
lr_accuracy = np.mean(lr_prediction == y_test)
print('Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy,4))

Accuracy of Bag-of-Words Random Forest Model: 0.8198
Accuracy of Bag-of-Words Multinomial Naive Baynes Model: 0.6832
Accuracy of Bag-of-Words Logistic Regression Model: 0.7624


In [36]:
petitioner_count = 0
respondent_count = 0
others = 0
for prediction in list(forest.predict(x_test)):
    if prediction == 1.0:
        petitioner_count += 1
    else:
        respondent_count += 1
        
print("Petitioner Count:", petitioner_count)
print("Respondent Count:", respondent_count)

Petitioner Count: 504
Respondent Count: 1


In [4]:
### For Use With Oyez Summaries File

# #remove OA and _orig from transcript id in summaries df
# summaries['transcript_id'] = summaries['transcript_id'].str.replace('OA','')
# summaries['transcript_id'] = summaries['transcript_id'].str.replace('_orig','')

# #create new column for verdict variable
# def f(row):
#     if row['winning_party'] == row['first_party']:
#         val = row['first_party_label']
#     elif row['winning_party'] == row['second_party']:
#         val = row['second_party_label']
#     else:
#         val = 'No Verdict'
#     return val

# summaries['verdict'] = summaries.apply(f, axis=1)

# #concate the turns files
# turns_combined = pd.concat([turns1, turns2, turns3, turns4])

# #remove _t01 and _t02 from transcript_id in turns_combined
# turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t01','')
# turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t02','')

# #pivot turns files by transcript_id and speaker_role for texts
# speaker_role_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
#                                                      columns = 'speaker_role', 
#                                                      values = 'text',
#                                                      aggfunc=lambda x: ' '.join(x))

# #reset index
# speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

# #join verdict into df
# train_test_df = speaker_role_text_pivot.join(summaries.set_index('transcript_id')['verdict'], on='transcript_id')

# #remove NAs and blanks (these give errors when vectorizing)
# train_test_df = train_test_df[train_test_df.verdict.notnull()]
# train_test_df = train_test_df.dropna()
# train_test_df = train_test_df[train_test_df['verdict'] != 'No Verdict']

# #create train and test split
# x = train_test_df[['not_a_justice', 'scotus_justice']]
# y = train_test_df.verdict
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33)

# #count vectorizer
# count_vect = CountVectorizer()
# x_train_cv_not_a_justice = count_vect.fit(x_train['not_a_justice'])
# x_train_cv_scotus_justice = count_vect.fit(x_train['scotus_justice'])
# x_test_cv_not_a_justice = count_vect.fit(x_test['not_a_justice'])
# x_test_cv_scotus_justice = count_vect.fit(x_test['scotus_justice'])

# x_train_cv_not_a_justice = pd.DataFrame(x_train_cv_not_a_justice.transform(x_train['not_a_justice']).todense(),
#                                         columns = x_train_cv_not_a_justice.get_feature_names())
# x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice.transform(x_train['scotus_justice']).todense(),
#                                         columns = x_train_cv_scotus_justice.get_feature_names())
# x_test_cv_not_a_justice = pd.DataFrame(x_test_cv_not_a_justice.transform(x_test['not_a_justice']).todense(),
#                                         columns = x_test_cv_not_a_justice.get_feature_names())
# x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.transform(x_test['scotus_justice']).todense(),
#                                         columns = x_test_cv_scotus_justice.get_feature_names())

# #concatenate the not_a_justice bow and scotus_justice bow
# x_train = pd.concat([x_train_cv_not_a_justice, x_train_cv_scotus_justice], axis = 1)
# x_test = pd.concat([x_test_cv_not_a_justice, x_test_cv_scotus_justice], axis = 1)

# #random forest
# forest = RandomForestClassifier(n_estimators = 100)
# forest = forest.fit(x_train, y_train)
# random_forest_prediction = forest.predict(x_test)
# rf_accuracy = np.mean(random_forest_prediction == y_test)
# print('Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4))

# # Multinomial Naive Baynes
# nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
# naive_baynes_prediction = nb.predict(x_test)
# nb_accuracy = np.mean(naive_baynes_prediction == y_test)
# print('Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4))

# # logistic regression
# lr_model = LogisticRegression(C = 1, penalty = "l2")
# lr_model_train = lr_model.fit(x_train, y_train)
# lr_prediction = lr_model.predict(x_test)
# lr_accuracy = np.mean(lr_prediction == y_test)
# print('Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy,4))

Accuracy of Bag-of-Words Random Forest Model: 0.8402
Accuracy of Bag-of-Words Multinomial Naive Baynes Model: 0.7972
Accuracy of Bag-of-Words Logistic Regression Model: 0.8851
