In [9]:
# if you are having difficulty downloading numpy or scipy (requirements for sklearn),
# https://scikit-learn.org/stable/modules/tree.html
from sklearn import tree, metrics
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

In [22]:
# read the table that was cleaned by the R script
SC_DECISIONS_CSV = '../sc_decisions_final.csv'
sc_df = pd.read_csv(SC_DECISIONS_CSV)

Unnamed: 0,c_lcDisagreement,c_lcDispositionDirection,c_issueArea,c_adminActionBool,c_oralArgBool,c_reargBool,c_decisionTime,c_formerCourt,c_caseOriginRegion,c_caseOriginCourt,...,e_decisionPresidentParty,e_JCApproval,j_justiceGender,j_justiceDecisionAgeBand,j_justiceRegion,j_justiceReligion,j_justiceEthnicity,j_justicePresidentParty,j_justiceVotesAppointed,direction
0,0.0,Conservative,Judicial Power,1,1,0,31-180 Days,0,South,District Court,...,Democrat,Approve,Male,65+,Midwest,Lutheran,Caucasian,Republican,<75%,Liberal
1,0.0,Conservative,Judicial Power,1,1,0,31-180 Days,1,South,District Court,...,Democrat,Approve,Male,65+,Midwest,Protestant,Caucasian,Republican,>=75%,Liberal
2,0.0,Conservative,Judicial Power,1,1,0,31-180 Days,0,South,District Court,...,Democrat,Approve,Female,65+,West,,Caucasian,Republican,>=75%,Liberal
3,0.0,Conservative,Judicial Power,1,1,0,31-180 Days,1,South,District Court,...,Democrat,Approve,Male,0-65,Northeast,Catholicism,Caucasian,Republican,>=75%,Conservative
4,0.0,Conservative,Judicial Power,1,1,0,31-180 Days,1,South,District Court,...,Democrat,Approve,Male,0-65,West,Catholicism,Caucasian,Republican,>=75%,Liberal


In [23]:
# missing entries are legitimate here, so keep the rows and
# replace every missing value with the literal string "N/A"
sc_df = sc_df.fillna(value="N/A")

In [24]:
# predictors: every column except the response column 'direction'
is_feature = sc_df.columns != 'direction'
X = sc_df.loc[:, is_feature]

# response: 'direction' kept as a one-column frame and replaced by the
# integer codes of its categorical encoding
y = sc_df[['direction']].astype("category").apply(lambda col: col.cat.codes)

# to see which label each code stands for (run on the un-encoded column):
# sc_df[['direction']].astype("category").apply(lambda col: col.cat.categories)

# count distinct values once, then separate two-level columns from
# columns with more than two levels
level_counts = X.nunique()
cat_vars = level_counts.index[level_counts > 2].tolist()
bool_vars = level_counts.index[level_counts == 2].tolist()

# dummy-encode each group (first level of every variable is dropped),
# then put the two encoded frames side by side
X_cat, X_bool = (
    pd.get_dummies(X[group], columns=group, drop_first=True)
    for group in (cat_vars, bool_vars)
)
X = pd.concat([X_cat, X_bool], axis=1)

In [25]:
# hold out 25% of the rows for testing; the seed keeps the split repeatable
splits = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = splits

# report the shape of each feature matrix
for label, part in (("Training", X_train), ("Testing", X_test)):
    print(label + " split input- ", part.shape)

Training split input-  (10535, 51)
Testing split input-  (3512, 51)


In [35]:
# Fit a shallow decision tree on the training split.
#
# NOTE(review): assigning to `tree` rebinds the name that the imports cell
# bound to the `sklearn.tree` module (`from sklearn import tree, metrics`).
# The name is kept because later cells may refer to it; consider renaming
# the estimator if the module is needed after this point.

# create tree object
# random_state is fixed (same seed as the train/test split) because sklearn
# permutes the candidate features at every split, so an unseeded tree can
# differ between runs when several splits are equally good.
tree = DecisionTreeClassifier(max_depth=4, random_state=42)  # test depth

# train tree (fit returns the fitted estimator itself, so `model` and `tree`
# refer to the same object)
model = tree.fit(X_train, y_train)

In [31]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid

# Grid search over random-forest hyperparameters, scored by out-of-bag (OOB)
# accuracy on the training split. Leaves `best_score` and `best_params` set
# to the best OOB score seen and the grid point that produced it.

# reference value only: square root of the number of training features
# (not used by the search below)
default_max_features_param = np.sqrt(X_train.shape[1])

# every combination of these values is tried
hyperparam_grid = {"max_features": [3, 4, 5, 6, 7],
                   "min_samples_leaf": [1, 3, 5, 7, 9, 11]}

# one estimator object is re-parameterised and refit for each grid point;
# warm_start=False means each fit builds a fresh forest
rfc = RandomForestClassifier(n_estimators=2000, oob_score=True,
                             warm_start=False, random_state=1, n_jobs=-2)

# start below any possible score and with best_params defined, so the
# summary prints below cannot hit an unbound name when every OOB score is low
best_score = -np.inf
best_params = None

for g in ParameterGrid(hyperparam_grid):
    # each grid point carries exactly the two keys of hyperparam_grid
    rfc.set_params(**g)
    # y_train is a one-column frame; flatten it to the 1-D array fit expects
    rfc.fit(X_train, y_train.values.ravel())
    # save if best
    if rfc.oob_score_ > best_score:
        best_score = rfc.oob_score_
        best_params = g

# NOTE: after the loop `rfc` holds the forest fitted with the LAST grid point,
# not necessarily the best one; refit with `best_params` before using it.
print(f"OOB: {best_score:0.5f}")
print("Best parameters:", best_params)

Accuracy: 0.6526195899772209


In [None]:
import time

from sklearn.inspection import permutation_importance

# Permutation importance of each feature on the held-out test split,
# shown as a bar chart with the standard deviation over repeats as error bars.
#
# NOTE(review): `rfcopt` is not assigned in any cell visible in this notebook.
# Presumably it is the random forest refit with the best grid-search
# parameters; confirm it is defined before this cell runs.

start_time = time.time()
result = permutation_importance(
    rfcopt, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

# the importances are computed on X_test, so label them with its columns
feature_names = list(X_test.columns)
forest_importances = pd.Series(result.importances_mean, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()