In [6]:
import os
import csv
import numpy as np
import pandas as pd
import zipfile
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
#unzip and read turns files into pandas df
zf1 = zipfile.ZipFile('./turns_part1.zip') 
turns1 = pd.read_csv(zf1.open('turns_part1.csv'))
zf2 = zipfile.ZipFile('./turns_part2.zip') 
turns2 = pd.read_csv(zf2.open('turns_part2.csv'))
zf3 = zipfile.ZipFile('./turns_part3.zip') 
turns3 = pd.read_csv(zf3.open('turns_part3.csv'))
zf4 = zipfile.ZipFile('./turns_part4.zip') 
turns4 = pd.read_csv(zf4.open('turns_part4.csv'))

#read summaries file into pandas df
summaries = pd.read_csv('summaries.csv')

In [3]:
#remove OA and _orig from transcript id in summaries df
summaries['transcript_id'] = summaries['transcript_id'].str.replace('OA','')
summaries['transcript_id'] = summaries['transcript_id'].str.replace('_orig','')

#create new column for verdict variable
def f(row):
    if row['winning_party'] == row['first_party']:
        val = row['first_party_label']
    elif row['winning_party'] == row['second_party']:
        val = row['second_party_label']
    else:
        val = 'No Verdict'
    return val

summaries['verdict'] = summaries.apply(f, axis=1)

#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#remove _t01 and _t02 from transcript_id in turns_combined
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t01','')
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t02','')

#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'speaker_role', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))

#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#join verdict into df
train_test_df = speaker_role_text_pivot.join(summaries.set_index('transcript_id')['verdict'], on='transcript_id')

#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df[train_test_df.verdict.notnull()]
train_test_df = train_test_df.dropna()
train_test_df = train_test_df[train_test_df['verdict'] != 'No Verdict']

#create train and test split
x = train_test_df[['not_a_justice', 'scotus_justice']]
y = train_test_df.verdict
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33)

#count vectorizer
count_vect = CountVectorizer()
x_train_cv_not_a_justice = count_vect.fit(x_train['not_a_justice'])
x_train_cv_scotus_justice = count_vect.fit(x_train['scotus_justice'])
x_test_cv_not_a_justice = count_vect.fit(x_test['not_a_justice'])
x_test_cv_scotus_justice = count_vect.fit(x_test['scotus_justice'])

x_train_cv_not_a_justice = pd.DataFrame(x_train_cv_not_a_justice.transform(x_train['not_a_justice']).todense(),
                                        columns = x_train_cv_not_a_justice.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice.get_feature_names())
x_test_cv_not_a_justice = pd.DataFrame(x_test_cv_not_a_justice.transform(x_test['not_a_justice']).todense(),
                                        columns = x_test_cv_not_a_justice.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.transform(x_test['scotus_justice']).todense(),
                                        columns = x_test_cv_scotus_justice.get_feature_names())

#concatenate the not_a_justice bow and scotus_justice bow
x_train = pd.concat([x_train_cv_not_a_justice, x_train_cv_scotus_justice], axis = 1)
x_test = pd.concat([x_test_cv_not_a_justice, x_test_cv_scotus_justice], axis = 1)

#random forest
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(x_train, y_train)
random_forest_prediction = forest.predict(x_test)
rf_accuracy = np.mean(random_forest_prediction == y_test)
print 'Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4)

# Multinomial Naive Baynes
nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
naive_baynes_prediction = nb.predict(x_test)
nb_accuracy = np.mean(naive_baynes_prediction == y_test)
print 'Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4)

# logistic regression
lr_model = LogisticRegression(C = 1, penalty = "l2")
lr_model_train = lr_model.fit(x_train, y_train)
lr_prediction = lr_model.predict(x_test)
lr_accuracy = np.mean(lr_prediction == y_test)
print 'Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy,4)

print 'Actual Count of y_test Verdicts:\n', y_test.value_counts()

Accuracy of Bag-of-Words Random Forest Model: 0.8511
Accuracy of Bag-of-Words Multinomial Naive Baynes Model: 0.8112
Accuracy of Bag-of-Words Logistic Regression Model: 0.8901
Actual Count of y_test Verdicts:
Petitioner    831
Appellant     153
Respondent     16
Appellee        1
Name: verdict, dtype: int64


In [4]:
####rerunning above but using the case name instead of case id

#create new column for verdict variable
def f(row):
    if row['winning_party'] == row['first_party']:
        val = row['first_party_label']
    elif row['winning_party'] == row['second_party']:
        val = row['second_party_label']
    else:
        val = 'No Verdict'
    return val

summaries['verdict'] = summaries.apply(f, axis=1)

#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'title', 
                                                     columns = 'speaker_role', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))

#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#join verdict into df
train_test_df = speaker_role_text_pivot.join(summaries.set_index('case_name')['verdict'], on='title')

#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df[train_test_df.verdict.notnull()]
train_test_df = train_test_df.dropna()
train_test_df = train_test_df[train_test_df['verdict'] != 'No Verdict']

#create train and test split
x = train_test_df[['not_a_justice', 'scotus_justice']]
y = train_test_df.verdict
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33)

#count vectorizer
count_vect = CountVectorizer()
x_train_cv_not_a_justice = count_vect.fit(x_train['not_a_justice'])
x_train_cv_scotus_justice = count_vect.fit(x_train['scotus_justice'])
x_test_cv_not_a_justice = count_vect.fit(x_test['not_a_justice'])
x_test_cv_scotus_justice = count_vect.fit(x_test['scotus_justice'])

x_train_cv_not_a_justice = pd.DataFrame(x_train_cv_not_a_justice.transform(x_train['not_a_justice']).todense(),
                                        columns = x_train_cv_not_a_justice.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice.get_feature_names())
x_test_cv_not_a_justice = pd.DataFrame(x_test_cv_not_a_justice.transform(x_test['not_a_justice']).todense(),
                                        columns = x_test_cv_not_a_justice.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.transform(x_test['scotus_justice']).todense(),
                                        columns = x_test_cv_scotus_justice.get_feature_names())

#concatenate the not_a_justice bow and scotus_justice bow
x_train = pd.concat([x_train_cv_not_a_justice, x_train_cv_scotus_justice], axis = 1)
x_test = pd.concat([x_test_cv_not_a_justice, x_test_cv_scotus_justice], axis = 1)

#random forest
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(x_train, y_train)
random_forest_prediction = forest.predict(x_test)
rf_accuracy = np.mean(random_forest_prediction == y_test)
print 'Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4)

# Multinomial Naive Baynes
nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
naive_baynes_prediction = nb.predict(x_test)
nb_accuracy = np.mean(naive_baynes_prediction == y_test)
print 'Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4)

# logistic regression
lr_model = LogisticRegression(C = 1, penalty = "l2")
lr_model_train = lr_model.fit(x_train, y_train)
lr_prediction = lr_model.predict(x_test)
lr_accuracy = np.mean(lr_prediction == y_test)
print 'Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy,4)

print 'Actual Count of y_test Verdicts:\n', y_test.value_counts()

print 'LR predictions:\n', np.unique(lr_prediction, return_counts=True)

Accuracy of Bag-of-Words Random Forest Model: 0.8294
Accuracy of Bag-of-Words Multinomial Naive Baynes Model: 0.8191
Accuracy of Bag-of-Words Logistic Regression Model: 0.888
Actual Count of y_test Verdicts:
Petitioner    782
Appellant     176
Respondent     15
Name: verdict, dtype: int64
LR predictions:
(array(['Appellant', 'Petitioner', 'Respondent'], dtype=object), array([146, 825,   2]))


In [5]:
#use original scdb winning party as verdict
#verdict value 0 = no favorable disposition for petitioning part apparent
#verdict value 1 = petitioning party received a favorable disposition
verdict = []
verdict_csv = csv.reader(open('../SCDB_2017_01_caseCentered_Citation.csv'))
for row in verdict_csv:
    docket_number = re.sub('-', '_', row[13])
    docket_number = re.sub(' ORIG', '_orig', docket_number)
    case_id = row[10]+'_'+docket_number
    verdict.append([case_id, row[36]])
    
verdict_header = verdict.pop(0)
verdict = pd.DataFrame(verdict, columns = verdict_header)
    
    
#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#remove _t01 and _t02 from transcript_id in turns_combined
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t01','')
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t02','')

#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'speaker_role', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))

#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#join verdict into df
train_test_df = speaker_role_text_pivot.join(verdict.set_index('term_docket')['partyWinning'], on='transcript_id')

#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df.dropna()

#create train and test split
x = train_test_df[['not_a_justice', 'scotus_justice']]
y = train_test_df.partyWinning
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33)

#count vectorizer
count_vect = CountVectorizer()
x_train_cv_not_a_justice = count_vect.fit(x_train['not_a_justice'])
x_train_cv_scotus_justice = count_vect.fit(x_train['scotus_justice'])
x_test_cv_not_a_justice = count_vect.fit(x_test['not_a_justice'])
x_test_cv_scotus_justice = count_vect.fit(x_test['scotus_justice'])

x_train_cv_not_a_justice = pd.DataFrame(x_train_cv_not_a_justice.transform(x_train['not_a_justice']).todense(),
                                        columns = x_train_cv_not_a_justice.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice.get_feature_names())
x_test_cv_not_a_justice = pd.DataFrame(x_test_cv_not_a_justice.transform(x_test['not_a_justice']).todense(),
                                        columns = x_test_cv_not_a_justice.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.transform(x_test['scotus_justice']).todense(),
                                        columns = x_test_cv_scotus_justice.get_feature_names())

#concatenate the not_a_justice bow and scotus_justice bow
x_train = pd.concat([x_train_cv_not_a_justice, x_train_cv_scotus_justice], axis = 1)
x_test = pd.concat([x_test_cv_not_a_justice, x_test_cv_scotus_justice], axis = 1)

#random forest
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(x_train, y_train)
random_forest_prediction = forest.predict(x_test)
rf_accuracy = np.mean(random_forest_prediction == y_test)
print 'Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4)

# Multinomial Naive Baynes
nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
naive_baynes_prediction = nb.predict(x_test)
nb_accuracy = np.mean(naive_baynes_prediction == y_test)
print 'Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4)

# logistic regression
lr_model = LogisticRegression(C = 1, penalty = "l2")
lr_model_train = lr_model.fit(x_train, y_train)
lr_prediction = lr_model.predict(x_test)
lr_accuracy = np.mean(lr_prediction == y_test)
print 'Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy,4)

print 'Actual Count of y_test Verdicts:\n', y_test.value_counts()

print 'LR predictions:\n', np.unique(lr_prediction, return_counts=True)

Accuracy of Bag-of-Words Random Forest Model: 0.6311
Accuracy of Bag-of-Words Multinomial Naive Baynes Model: 0.576
Accuracy of Bag-of-Words Logistic Regression Model: 0.5658
Actual Count of y_test Verdicts:
1    1118
0     644
Name: partyWinning, dtype: int64
LR predictions:
(array(['0', '1'], dtype=object), array([ 619, 1143]))


In [3]:
##SET UP

#use original scdb winning party as verdict
#verdict value 0 = no favorable disposition for petitioning part apparent
#verdict value 1 = petitioning party received a favorable disposition
verdict = []
verdict_csv = csv.reader(open('../SCDB_2017_01_caseCentered_Citation.csv'))
for row in verdict_csv:
    docket_number = re.sub('-', '_', row[13])
    docket_number = re.sub(' ORIG', '_orig', docket_number)
    case_id = row[10]+'_'+docket_number
    verdict.append([case_id, row[36]])
    
verdict_header = verdict.pop(0)
verdict = pd.DataFrame(verdict, columns = verdict_header)
    
    
#concate the turns files
turns_combined = pd.concat([turns1, turns2, turns3, turns4])

#remove _t01 and _t02 from transcript_id in turns_combined
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t01','')
turns_combined['transcript_id'] = turns_combined['transcript_id'].str.replace('_t02','')

#get advocate sides
advocates = pd.read_json('./advocate_dict.json')

advocate_turns = []

for index, row in turns_combined.iterrows():
    if row['speaker_role'] == 'scotus_justice':
        advocate_turns.append('scotus_justice')
    else:
        speaker = row['speaker']
        transcript_id = row['transcript_id']
        try:
            lawyer_side = advocates.ix[speaker][transcript_id]
            advocate_turns.append(lawyer_side)
        except:
            advocate_turns.append('None')

#insert advocate side to the turns_combined dataframe
turns_combined['lawyer_side'] = advocate_turns

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  return self._engine.get_loc(key)
  return self._engine.get_loc(self._maybe_cast_indexer(key))


In [4]:
##PIVOT

#pivot turns files by transcript_id and speaker_role for texts
speaker_role_text_pivot = turns_combined.pivot_table(index = 'transcript_id', 
                                                     columns = 'lawyer_side', 
                                                     values = 'text',
                                                     aggfunc=lambda x: ' '.join(x))
#reset index
speaker_role_text_pivot = speaker_role_text_pivot.reset_index()

#drop columns with no lawyer side tags
speaker_role_text_pivot = speaker_role_text_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#count of number of times speaker spoke
counts_pivot = pd.pivot_table(turns_combined[['transcript_id', 'lawyer_side']],
                              index = 'transcript_id',
                              columns = 'lawyer_side',
                              aggfunc=len,
                              fill_value=0)

#reset index
counts_pivot = counts_pivot.reset_index()

#drop columns with no lawyer side tags
counts_pivot = counts_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#rename headers
counts_pivot.columns = ['transcript_id', 'appellant/petitioner_count', 'appellee/respondent_count',
                       'scotus_justice_count']

#length of speaker speaking
length_pivot = pd.pivot_table(turns_combined[['transcript_id', 'text_duration', 'lawyer_side']],
                              index = 'transcript_id',
                              columns = 'lawyer_side',
                              values = 'text_duration',
                              aggfunc=np.sum,
                              fill_value=0)

#reset index
length_pivot = length_pivot.reset_index()

#drop columns with no lawyer side tags
length_pivot = length_pivot.drop(['NEED MORE INFO', 'None'], axis = 1)

#rename headers
length_pivot.columns = ['transcript_id', 'appellant/petitioner_length', 'appellee/respondent_length',
                       'scotus_justice_length']

#concatenate pivots together
pivots_concate = pd.concat([speaker_role_text_pivot, 
                            counts_pivot[counts_pivot.columns[1:4]], 
                            length_pivot[length_pivot.columns[1:4]]],
                            axis=1,
                            join='inner')

In [25]:
##MODELING

#join verdict into df
train_test_df = pivots_concate.join(verdict.set_index('term_docket')['partyWinning'], on='transcript_id')

#remove NAs and blanks (these give errors when vectorizing)
train_test_df = train_test_df.dropna()

#create train and test split
x = train_test_df[['appellant/petitioner', 'appellee/respondent', 'scotus_justice',
                   'appellant/petitioner_count', 'appellee/respondent_count', 'scotus_justice_count',
                   'appellant/petitioner_length', 'appellee/respondent_length','scotus_justice_length']]
y = train_test_df.partyWinning
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33)

#count vectorizer
count_vect_1 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words='english', lowercase = False)
count_vect_2 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words='english', lowercase = False)
count_vect_3 = TfidfVectorizer(ngram_range = (3,3), max_features = 2500, stop_words='english', lowercase = False)

x_train_cv_petitioner_vec = count_vect_1.fit(x_train['appellant/petitioner'])
voc1 = count_vect_1.vocabulary_
x_train_cv_respondant_vec = count_vect_2.fit(x_train['appellee/respondent'])
voc2 = count_vect_2.vocabulary_
x_train_cv_scotus_justice_vec = count_vect_3.fit(x_train['scotus_justice'])

x_test_cv_petitioner = count_vect_1.transform(x_test['appellant/petitioner'])
x_test_cv_respondant = count_vect_2.transform(x_test['appellee/respondent'])
x_test_cv_scotus_justice = count_vect_3.transform(x_test['scotus_justice'])

x_train_cv_petitioner = pd.DataFrame(x_train_cv_petitioner_vec.transform(x_train['appellant/petitioner']).todense(),
                                        columns = x_train_cv_petitioner_vec.get_feature_names())
x_train_cv_respondant = pd.DataFrame(x_train_cv_respondant_vec.transform(x_train['appellee/respondent']).todense(),
                                        columns = x_train_cv_respondant_vec.get_feature_names())
x_train_cv_scotus_justice = pd.DataFrame(x_train_cv_scotus_justice_vec.transform(x_train['scotus_justice']).todense(),
                                        columns = x_train_cv_scotus_justice_vec.get_feature_names())
x_test_cv_petitioner = pd.DataFrame(x_test_cv_petitioner.todense(),
                                        columns = x_train_cv_petitioner_vec.get_feature_names())
x_test_cv_respondant = pd.DataFrame(x_test_cv_respondant.todense(),
                                        columns = x_train_cv_respondant_vec.get_feature_names())
x_test_cv_scotus_justice = pd.DataFrame(x_test_cv_scotus_justice.todense(),
                                        columns = x_train_cv_scotus_justice_vec.get_feature_names())

#put features into its own df for concatenate in next step
x_train_features = x_train[x_train.columns[3:9]]
x_train_features = x_train_features.reset_index()
x_test_features = x_test[x_train.columns[3:9]]
x_test_features = x_test_features.reset_index()

#change negative numbers into 0
x_train_features[x_train_features < 0 ] = 0
x_test_features[x_test_features < 0 ] = 0

#concatenate the bow back
x_train = pd.concat([x_train_cv_petitioner, 
                     x_train_cv_respondant, 
                     x_train_cv_scotus_justice,
                     x_train_features], 
                     axis = 1)
x_test = pd.concat([x_test_cv_petitioner, 
                    x_test_cv_respondant, 
                    x_test_cv_scotus_justice,
                    x_test_features],
                    axis = 1)

#reset index to match x
y_train = y_train.reset_index()
y_train = y_train.drop(['index'], axis=1)
x_train = x_train.drop(['index'], axis=1)
x_test = x_test.drop(['index'], axis=1)

# #random forest
# forest = RandomForestClassifier(n_estimators = 1000)
# forest = forest.fit(x_train, y_train)
# random_forest_prediction = forest.predict(x_test)
# rf_accuracy = np.mean(random_forest_prediction == y_test)
# print 'Accuracy of Bag-of-Words Random Forest Model:', round(rf_accuracy,4)

# # # Multinomial Naive Baynes
# # nb = MultinomialNB(alpha = 0.1).fit(x_train, y_train)
# # naive_baynes_prediction = nb.predict(x_test)
# # nb_accuracy = np.mean(naive_baynes_prediction == y_test)
# # print 'Accuracy of Bag-of-Words Multinomial Naive Baynes Model:', round(nb_accuracy,4)

# logistic regression
lr_model = LogisticRegression(C = 1, penalty = "l2")
lr_model_train = lr_model.fit(x_train, y_train)
lr_prediction = lr_model.predict(x_test)
lr_accuracy = np.mean(lr_prediction == y_test)

print 'Accuracy of Bag-of-Words Logistic Regression Model:', round(lr_accuracy,4)
print 'Actual Count of y_test Verdicts:\n', y_test.value_counts()
# print 'RF predictions:', np.unique(random_forest_prediction, return_counts=True)
# # print 'NB predictions:', np.unique(naive_baynes_prediction, return_counts=True)
print 'LR predictions:', np.unique(lr_prediction, return_counts=True)

Accuracy of Bag-of-Words Logistic Regression Model: 0.6448
Actual Count of y_test Verdicts:
1    570
0    345
Name: partyWinning, dtype: int64
LR predictions: (array(['0', '1'], dtype=object), array([154, 761]))
