In [20]:
# Import libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Set styling parameters
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6
plt.style.use('fivethirtyeight')



In [107]:
# Import ML models 

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import xgboost as xgb

In [28]:
og_data = pd.read_csv('SCDB_2018_01_justiceCentered_Citation.csv', encoding = 'ISO-8859-1')

In [29]:
og_data.head()

Unnamed: 0,caseId,docketId,caseIssuesId,voteId,dateDecision,decisionType,usCite,sctCite,ledCite,lexisCite,...,majVotes,minVotes,justice,justiceName,vote,opinion,direction,majority,firstAgreement,secondAgreement
0,1946-001,1946-001-01,1946-001-01-01,1946-001-01-01-01-01,11/18/1946,1,329 U.S. 1,67 S. Ct. 6,91 L. Ed. 3,1946 U.S. LEXIS 1724,...,8,1,86,HHBurton,2.0,1.0,1.0,1.0,,
1,1946-001,1946-001-01,1946-001-01-01,1946-001-01-01-01-02,11/18/1946,1,329 U.S. 1,67 S. Ct. 6,91 L. Ed. 3,1946 U.S. LEXIS 1724,...,8,1,84,RHJackson,1.0,1.0,2.0,2.0,,
2,1946-001,1946-001-01,1946-001-01-01,1946-001-01-01-01-03,11/18/1946,1,329 U.S. 1,67 S. Ct. 6,91 L. Ed. 3,1946 U.S. LEXIS 1724,...,8,1,81,WODouglas,1.0,1.0,2.0,2.0,,
3,1946-001,1946-001-01,1946-001-01-01,1946-001-01-01-01-04,11/18/1946,1,329 U.S. 1,67 S. Ct. 6,91 L. Ed. 3,1946 U.S. LEXIS 1724,...,8,1,80,FFrankfurter,4.0,2.0,2.0,2.0,,
4,1946-001,1946-001-01,1946-001-01-01,1946-001-01-01-01-05,11/18/1946,1,329 U.S. 1,67 S. Ct. 6,91 L. Ed. 3,1946 U.S. LEXIS 1724,...,8,1,79,SFReed,1.0,1.0,2.0,2.0,,


In [30]:
og_data.columns

Index(['caseId', 'docketId', 'caseIssuesId', 'voteId', 'dateDecision',
       'decisionType', 'usCite', 'sctCite', 'ledCite', 'lexisCite', 'term',
       'naturalCourt', 'chief', 'docket', 'caseName', 'dateArgument',
       'dateRearg', 'petitioner', 'petitionerState', 'respondent',
       'respondentState', 'jurisdiction', 'adminAction', 'adminActionState',
       'threeJudgeFdc', 'caseOrigin', 'caseOriginState', 'caseSource',
       'caseSourceState', 'lcDisagreement', 'certReason', 'lcDisposition',
       'lcDispositionDirection', 'declarationUncon', 'caseDisposition',
       'caseDispositionUnusual', 'partyWinning', 'precedentAlteration',
       'voteUnclear', 'issue', 'issueArea', 'decisionDirection',
       'decisionDirectionDissent', 'authorityDecision1', 'authorityDecision2',
       'lawType', 'lawSupp', 'lawMinor', 'majOpinWriter', 'majOpinAssigner',
       'splitVote', 'majVotes', 'minVotes', 'justice', 'justiceName', 'vote',
       'opinion', 'direction', 'majority', 'firstA

In [31]:
working_data = og_data.drop(columns = ['docketId', 'caseIssuesId', 'voteId', 'dateDecision',
                                        'usCite', 'sctCite', 'ledCite', 'lexisCite', 'chief', 
                                        'docket', 'caseName', 'petitionerState', 'respondentState', 
                                        'adminActionState', 'threeJudgeFdc', 'caseOriginState',
                                        'caseSourceState', 'certReason', 'declarationUncon', 
                                        'caseDispositionUnusual', 'partyWinning', 'voteUnclear',
                                        'decisionDirectionDissent', 'authorityDecision1', 'authorityDecision2',
                                        'lawSupp', 'lawMinor', 'majOpinWriter', 'majOpinAssigner',
                                        'splitVote','firstAgreement', 'secondAgreement', 
                                        'dateArgument', 'dateRearg', 'petitioner', 'respondent', 
                                        'jurisdiction', 'decisionType', 'term', 'caseOrigin',
                                        'caseSource', 'caseDisposition', 'decisionDirection',
                                        'majVotes', 'minVotes', 'majority'])

In [32]:
# fence_categories = ['lcDisposition', 'majority', 'lawType']
# working_data.drop(columns = fence_categories)

In [33]:
working_data.columns

Index(['caseId', 'naturalCourt', 'adminAction', 'lcDisagreement',
       'lcDisposition', 'lcDispositionDirection', 'precedentAlteration',
       'issue', 'issueArea', 'lawType', 'justice', 'justiceName', 'vote',
       'opinion', 'direction'],
      dtype='object')

In [35]:
# working_data.decisionDirection.value_counts()

In [36]:
working_data.justiceName.value_counts()

WJBrennan       5327
BRWhite         4946
WHRehnquist     4535
JPStevens       4268
WODouglas       4001
TMarshall       3881
HABlackmun      3771
PStewart        3592
HLBlack         3302
SDOConnor       2914
AMKennedy       2879
AScalia         2857
WEBurger        2809
LFPowell        2652
JHarlan2        2351
CThomas         2330
TCClark         2292
EWarren         2205
RBGinsburg      2092
SGBreyer        1994
FFrankfurter    1917
DHSouter        1753
HHBurton        1388
SFReed          1151
JGRoberts       1023
SAAlito          997
RHJackson        899
FMVinson         812
SMinton          717
SSotomayor       704
CEWhittaker      691
EKagan           612
AFortas          581
AJGoldberg       475
WBRutledge       387
FMurphy          387
NMGorsuch        120
Name: justiceName, dtype: int64

In [46]:
working_data.isna().sum()

caseId                        0
naturalCourt                  0
adminAction               57631
lcDisagreement               81
lcDisposition             11209
lcDispositionDirection     1866
precedentAlteration           9
issue                       539
issueArea                   539
lawType                   11233
justice                       0
justiceName                   0
vote                       1997
opinion                    2006
direction                  4516
dtype: int64

## Modeling 

### Random Forest 

Just issue areas (for the sake of dimensionality, to see if there's any true value to keeping all those categories). 

In [125]:
no_issue_data = working_data.drop(columns = ['issue'])
# no_issue_data = working_data

In [126]:
Ginsburg_df = no_issue_data[no_issue_data['justiceName'] == 'RBGinsburg']

In [127]:
Ginsburg_df = Ginsburg_df.drop(columns = ['caseId', 'justice', 'justiceName'])
# df = df[pd.notnull(df['EPS'])]

In [128]:
# if Ginsburg_df[Ginsburg_df['adminAction'].notnull()]:
#     Ginsburg_df['lcDisposition'] = 4
    
Ginsburg_df = Ginsburg_df.dropna(subset=['direction', 'vote'])

In [129]:
Ginsburg_df = Ginsburg_df.fillna(999).astype(int)
Ginsburg_df.head()

Unnamed: 0,naturalCourt,adminAction,lcDisagreement,lcDisposition,lcDispositionDirection,precedentAlteration,issueArea,lawType,vote,opinion,direction
60889,1606,999,0,999,1,0,2,4,1,1,1
60898,1606,999,0,999,1,0,2,4,1,1,1
60907,1606,999,0,2,2,0,2,3,1,1,2
60916,1606,999,0,2,1,0,2,3,3,2,2
60925,1606,999,0,6,1,0,9,4,1,1,1


In [130]:
Ginsburg_df.isna().sum()

naturalCourt              0
adminAction               0
lcDisagreement            0
lcDisposition             0
lcDispositionDirection    0
precedentAlteration       0
issueArea                 0
lawType                   0
vote                      0
opinion                   0
direction                 0
dtype: int64

In [131]:
Ginsburg_target = Ginsburg_df['direction']
Ginsburg_df = Ginsburg_df.drop(columns = ['direction'])

In [132]:
for i in Ginsburg_df.columns:
    Ginsburg_df[i] = Ginsburg_df[i].astype('category')

In [133]:
Ginsburg_dumb_data = pd.get_dummies(Ginsburg_df)

In [134]:
data_train, data_test, target_train, target_test = train_test_split(Ginsburg_dumb_data, Ginsburg_target)

In [135]:
forest = RandomForestClassifier()
forest.fit(data_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [136]:
print("On Training, score was {}".format(forest.score(data_train, target_train)))
print("On Test, score was {}".format(forest.score(data_test, target_test)))

On Training, score was 0.9324146981627297
On Test, score was 0.6673228346456693


In [137]:
Ginsburg_dumb_data.shape

(2032, 121)

In [138]:
# Using GridSearchCV to do hyperparameter tuning, note that this takes a while to run and will tax your system
# a bit if you're running this on a laptop. 

n_estimators = [300, 500, 700]
max_depth = [10,12,14,16]

param_grid_forest = dict(n_estimators=n_estimators, max_depth=max_depth)


kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

grid_search = GridSearchCV(forest, param_grid_forest, scoring="accuracy", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(data_train, target_train)

# Interpreting results 
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.703412 using {'n_estimators': 500, 'max_depth': 14}
0.695538 (0.035037) with: {'n_estimators': 300, 'max_depth': 10}
0.694226 (0.039259) with: {'n_estimators': 500, 'max_depth': 10}
0.699475 (0.038142) with: {'n_estimators': 700, 'max_depth': 10}
0.700131 (0.040887) with: {'n_estimators': 300, 'max_depth': 12}
0.695538 (0.043165) with: {'n_estimators': 500, 'max_depth': 12}
0.696194 (0.039666) with: {'n_estimators': 700, 'max_depth': 12}
0.698163 (0.032463) with: {'n_estimators': 300, 'max_depth': 14}
0.703412 (0.039623) with: {'n_estimators': 500, 'max_depth': 14}
0.700131 (0.037303) with: {'n_estimators': 700, 'max_depth': 14}
0.697507 (0.033157) with: {'n_estimators': 300, 'max_depth': 16}
0.698163 (0.037470) with: {'n_estimators': 500, 'max_depth': 16}
0.699475 (0.038109) with: {'n_estimators': 700, 'max_depth': 16}


In [139]:
eval_set = [(data_test, target_test)]
xgb_model = xgb.XGBClassifier()
xgb_model.fit(data_train, target_train, early_stopping_rounds=30, eval_metric="error", eval_set=eval_set, verbose=True)

[0]	validation_0-error:0.309055
Will train until validation_0-error hasn't improved in 30 rounds.
[1]	validation_0-error:0.301181
[2]	validation_0-error:0.309055
[3]	validation_0-error:0.301181
[4]	validation_0-error:0.301181
[5]	validation_0-error:0.299213
[6]	validation_0-error:0.301181
[7]	validation_0-error:0.301181
[8]	validation_0-error:0.301181
[9]	validation_0-error:0.299213
[10]	validation_0-error:0.299213
[11]	validation_0-error:0.299213
[12]	validation_0-error:0.299213
[13]	validation_0-error:0.301181
[14]	validation_0-error:0.299213
[15]	validation_0-error:0.299213
[16]	validation_0-error:0.299213
[17]	validation_0-error:0.28937
[18]	validation_0-error:0.285433
[19]	validation_0-error:0.285433
[20]	validation_0-error:0.285433
[21]	validation_0-error:0.295276
[22]	validation_0-error:0.283465
[23]	validation_0-error:0.281496
[24]	validation_0-error:0.281496
[25]	validation_0-error:0.301181
[26]	validation_0-error:0.299213
[27]	validation_0-error:0.299213
[28]	validation_0-err

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [140]:
print("On Training, score was {}".format(xgb_model.score(data_train, target_train)))
print("On Test, score was {}".format(xgb_model.score(data_test, target_test)))

On Training, score was 0.7198162729658792
On Test, score was 0.7086614173228346


In [144]:
# Using GridSearchCV to do hyperparameter tuning, note that this takes a while to run and will tax your system
# a bit if you're running this on a laptop. 

learning_rate = [0.005, 0.01, 0.02]
n_estimators = [150]
max_depth = [2,4,6]

param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth)


kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

grid_search = GridSearchCV(xgb_model, param_grid, scoring="accuracy", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(data_train, target_train)

# Interpreting results 
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.694882 using {'n_estimators': 150, 'learning_rate': 0.01, 'max_depth': 4}
0.674541 (0.028003) with: {'n_estimators': 150, 'learning_rate': 0.005, 'max_depth': 2}
0.687664 (0.038605) with: {'n_estimators': 150, 'learning_rate': 0.005, 'max_depth': 4}
0.680446 (0.035468) with: {'n_estimators': 150, 'learning_rate': 0.005, 'max_depth': 6}
0.673885 (0.027641) with: {'n_estimators': 150, 'learning_rate': 0.01, 'max_depth': 2}
0.694882 (0.044332) with: {'n_estimators': 150, 'learning_rate': 0.01, 'max_depth': 4}
0.690945 (0.036382) with: {'n_estimators': 150, 'learning_rate': 0.01, 'max_depth': 6}
0.673885 (0.023236) with: {'n_estimators': 150, 'learning_rate': 0.02, 'max_depth': 2}
0.687664 (0.046944) with: {'n_estimators': 150, 'learning_rate': 0.02, 'max_depth': 4}
0.690945 (0.030626) with: {'n_estimators': 150, 'learning_rate': 0.02, 'max_depth': 6}


In [44]:
Scalia_df = no_issue_data[no_issue_data['justiceName'] == 'AScalia']
Scalia_df = Scalia_df.drop(columns = ['caseId','justice', 'justiceName'])
Scalia_df.isna().sum()

naturalCourt                 0
adminAction               2230
lcDisagreement               4
lcDisposition              186
lcDispositionDirection      45
precedentAlteration          1
issueArea                   10
lawType                    268
vote                        13
opinion                     14
direction                   84
dtype: int64

In [45]:
Scalia_df.shape

(2857, 11)