In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the full judicial-review workbook; every column is kept because the
# whole frame is copied for profiling in a later cell.
path = '../data/judicial_review_of_congress_database_1789-2018.xlsx'
# Optional narrower column set (original spreadsheet headers), currently unused:
# columns = ['YEAR', 'DECISION2', 'TIME TO DECISION', 'CONGRESS', 'AREA1',
#            'AREA2', 'ATT GEN', 'CERT', 'REVERSD', 'GVT PRTY', 'H-CT']
jr = pd.read_excel(path)  # , usecols=columns)

# Normalise headers to snake_case: lower-case, then turn spaces and hyphens
# into underscores.  `regex=True` must be passed explicitly -- since pandas 2.0
# `str.replace` treats the pattern as a literal string by default, which would
# silently leave names such as 'att gen' and 'h-ct' unchanged.
jr.columns = jr.columns.str.lower().str.replace(r'[ -]', '_', regex=True)
print(jr.shape)
jr.head()

(1308, 25)


Unnamed: 0,case,citation,date,year,decade,decision,decision2,effect,stat_name,stat_cite,...,area2,lndmrk,lnddtl,att_gen,cert,reversd,crs,gvt_prty,dissent,h_ct
0,United States v. Yale Todd,"*54 U.S. 40, 52",2/17/1794,1794,1790,2,1,struck down on face,Invalid Pensions Act,1 Stat. 243,...,,0.0,0.0,1,0,0.0,0.0,1,0,0
1,Penhallow v. Doane's Adm'rs,003 U.S. 054,2/24/1795,1795,1790,0,0,upheld,Court of Appeals Resolution,17 JCC 459,...,,0.0,0.0,1,0,1.0,0.0,0,0,1
2,Hylton v. United States,003 U.S. 171,3/8/1796,1796,1790,0,0,upheld,Carriage Tax Act,1 Stat. 373,...,,0.0,0.0,1,0,0.0,0.0,1,0,1
3,United States v. La Vengeance,003 U.S. 297,8/11/1796,1796,1790,0,0,upheld,Arms Exportation Act,1 Stat. 369,...,1.0,0.0,0.0,1,0,0.0,0.0,1,0,1
4,Mossman v. Higginson,004 U.S. 12,8/11/1800,1800,1800,1,1,struck down as applied,Judiciary Act of 1789,1 Stat. 73,...,6.0,1.0,0.0,0,0,0.0,0.0,0,0,0


In [3]:
# Snapshot the frame as loaded, so profiling still sees every row after `jr` is filtered.
full = jr.copy(deep=True)

In [5]:
import pandas_profiling

In [None]:
%%time
profile = full.profile_report(title='Judical Review Profile')
profile.to_file(output_file='judical_review_full_pandas_profile.html')

In [None]:
# Classification baseline: relative frequency of each value of the target
# column, with missing values counted as their own category.
target = 'decision2'
jr[target].value_counts(dropna=False, normalize=True)

In [None]:
# Quick look at the frame's summary statistics before any filtering.
jr.describe()

In [None]:
# Keep only the rows where `reversd` is recorded, then show the resulting shape.
jr = jr.loc[jr['reversd'].notna()]
jr.shape

In [None]:
# 70/15/15 split: carve off 70 % for training, then halve the remainder into
# validation and test.  Both splits are stratified on the outcome column and
# use the same seed.
from sklearn.model_selection import train_test_split

train, val_test = train_test_split(
    jr,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
    stratify=jr['decision2'],
)

val, test = train_test_split(
    val_test,
    train_size=0.5,
    test_size=0.5,
    random_state=42,
    stratify=val_test['decision2'],
)

train.shape, val.shape, test.shape

In [None]:
# Restrict the predictors to an explicit whitelist.  Dropping only the target
# would leave `decision` and `effect` in the feature matrix -- both appear to
# record the ruling itself, so the model could read the answer straight from
# them -- along with free-text columns (`case`, `citation`, `stat_name`, ...)
# that are not encoded before reaching the classifier.
# NOTE(review): names follow the snake_case headers produced at load time;
# `time_to_decision`, `congress` and `area1` are hidden in the truncated
# preview -- confirm they exist.
features = ['year', 'time_to_decision', 'congress', 'area1', 'area2',
            'att_gen', 'cert', 'reversd', 'gvt_prty', 'h_ct']

X_train = train[features]
X_val = val[features]
X_test = test[features]

y_train = train[target]
y_val = val[target]
y_test = test[target]

In [None]:
import xgboost as xgb
# Show which xgboost version this notebook is running against.
xgb.__version__

In [None]:
# Gradient-boosted classifier fed by a one-hot encoder for the two area columns.
import category_encoders as ce
from sklearn.pipeline import make_pipeline

area_encoder = ce.OneHotEncoder(cols=['area1', 'area2'])
booster = xgb.XGBClassifier(n_estimators=100, n_jobs=-1, random_state=42)
pipeline = make_pipeline(area_encoder, booster)

pipeline.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the validation split and report it.
val_accuracy = pipeline.score(X_val, y_val)
print('Validation Accuracy', val_accuracy)

In [None]:
# Class-probability estimates for the validation rows; a later cell reads column 1.
y_proba = pipeline.predict_proba(X_val)

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC on the validation split, scored with the second probability column.
positive_proba = y_proba[:, 1]
roc_auc_score(y_val, positive_proba)

In [None]:
# Confusion matrix: imports used by the heatmap helper in the next cell.
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
# Predicted class labels for the validation rows.
y_pred = pipeline.predict(X_val)

In [None]:
# plot a heatmap
def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of `y_pred` against `y_true` as an annotated heatmap.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with `y_true`.

    Returns
    -------
    The object returned by `seaborn.heatmap`; rows are labelled "Actual ..."
    and columns "Predicted ...".
    """
    # Take the label set from BOTH inputs and hand it to `confusion_matrix`
    # explicitly.  Deriving it from `y_true` alone breaks as soon as the model
    # predicts a class that is absent from `y_true`: the matrix then has more
    # rows/columns than there are axis names and the DataFrame cannot be built.
    labels = unique_labels(y_true, y_pred)
    columns = [f'Predicted {label}' for label in labels]
    index = [f'Actual {label}' for label in labels]
    table = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=labels),
                         columns=columns, index=index)
    return sns.heatmap(table, annot=True, fmt='.0f', cmap='viridis')

# Trailing semicolon suppresses the returned object's repr so only the figure is shown.
plot_confusion_matrix(y_val, y_pred);

In [None]:
from sklearn.metrics import classification_report

# Text summary of the validation-set classification metrics.
report = classification_report(y_val, y_pred)
print(report)