In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# One-hot encode the categorical predictors with pandas.get_dummies, then
# build the feature matrix x and the target vector y.
STEVENS_CSV_PATH = '/Users/michaelcolellajensen/Desktop/Data Sets/Stevens.csv'
FEATURE_COLUMNS = ['Circuit', 'Issue', 'Petitioner', 'Respondent', 'LowerCourt', 'Unconst']

stevens = pd.read_csv(STEVENS_CSV_PATH)

# drop_first=True omits the first level of every encoded column.
stevens_binary = pd.get_dummies(stevens[FEATURE_COLUMNS], drop_first=True)

x = stevens_binary.values
y = stevens['Reverse'].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Baseline decision tree: hyperparameters are fixed by hand, no search.
dt_model = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=123)
dt_model.fit(x_train, y_train)

# Hard class labels feed accuracy and the confusion matrix; the second
# predict_proba column is the score passed to ROC AUC.
dt_predictions = dt_model.predict(x_test)
dt_proba = dt_model.predict_proba(x_test)[:, 1]

# Evaluate on the held-out split.
dt_accuracy = accuracy_score(y_test, dt_predictions)
dt_roc = roc_auc_score(y_test, dt_proba)
dt_matrix = confusion_matrix(y_test, dt_predictions)

print("Decision Tree Accuracy:{:.3f}".format(dt_accuracy))
print("Decision Tree AUC score:{:.3f}".format(dt_roc))
print('Confusion Matrix\n', dt_matrix)

In [None]:
# Tune the decision tree with an exhaustive grid search scored by
# 5-fold cross-validated accuracy on the training split.
dt_params = {
    'max_depth': [2, 3, 4],
    'min_samples_leaf': [0.05, 0.09, 0.13],
    'max_features': [0.2, 0.4, 0.6, 0.8],
}

# dt_model is passed as the estimator template for the search; the grid
# values above replace its max_depth for each candidate.
grid_dt = GridSearchCV(
    estimator=dt_model,
    param_grid=dt_params,
    cv=5,
    scoring='accuracy',
)
grid_dt.fit(x_train, y_train)

dt_best_params = grid_dt.best_params_
print('Best hyperparamerters:\n', dt_best_params)

# Evaluate the winning configuration on the held-out split.
best_dt_model = grid_dt.best_estimator_
best_dt_model_predictions = best_dt_model.predict(x_test)
best_dt_proba = best_dt_model.predict_proba(x_test)[:, 1]

best_dt_accuracy = accuracy_score(y_test, best_dt_model_predictions)
best_dt_auc = roc_auc_score(y_test, best_dt_proba)
best_dt_matrix = confusion_matrix(y_test, best_dt_model_predictions)

print('Best Decision Tree Accuracy:{:.3f}'.format(best_dt_accuracy))
print('Best Decision Tree AUC Score:{:.3f}'.format(best_dt_auc))
print('Best Decision Tree Matrix:\n', best_dt_matrix)


In [None]:
# Baseline random forest: hyperparameters are fixed by hand, no search.
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_leaf=0.1,
    random_state=123,
)

rf_model.fit(x_train, y_train)
rf_predictions = rf_model.predict(x_test)

# Use plain probabilities for the AUC score, as the other models in this
# notebook do. predict_log_proba yields -inf whenever a class probability is
# exactly 0, and roc_auc_score rejects non-finite scores. AUC depends only on
# the ranking of the scores, so the reported value is the same as with
# log-probabilities whenever those were finite.
rf_proba = rf_model.predict_proba(x_test)[:, 1]

rf_accuracy = accuracy_score(y_test, rf_predictions)
print('Random Forest Accuracy:{:.3f}'.format(rf_accuracy))

rf_auc = roc_auc_score(y_test, rf_proba)
print('Random Forest AUC Score:{:.3f}'.format(rf_auc))

rf_matrix = confusion_matrix(y_test, rf_predictions)
print('Random Forest Confusion Matrix:\n', rf_matrix)

In [None]:
# Tune the random forest with an exhaustive grid search scored by
# 5-fold cross-validated accuracy on the training split.
rf_params = {
    'n_estimators': [90, 100, 110],
    'max_depth': [3, 4, 5],
    # 'auto' used to be listed here as well. For RandomForestClassifier it was
    # only an alias of 'sqrt' (so it duplicated every candidate), and it was
    # deprecated in scikit-learn 1.1 and removed in 1.3, where it makes the
    # search fail parameter validation.
    'max_features': ['sqrt'],
    'min_samples_leaf': [0.06, 0.07, 0.08]
    }
rf_grid = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_params,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1  # run the candidate fits on all available cores
    )
rf_grid.fit(x_train, y_train)
best_rf_params = rf_grid.best_params_
print('Best Random Forest Hyperparamers:\n', (best_rf_params))

# Evaluate the winning configuration on the held-out split.
best_rf_model = rf_grid.best_estimator_
best_rf_predictions = best_rf_model.predict(x_test)
best_rf_proba = best_rf_model.predict_proba(x_test)[:, 1]
best_rf_accuracy = accuracy_score(y_test, best_rf_predictions)
best_rf_auc = roc_auc_score(y_test, best_rf_proba)

print('Best Random Forest Accuracy:{:.3f}'.format(best_rf_accuracy))
print('Best Random Forest AUC Score:{:.3f}'.format(best_rf_auc))

best_rf_matrix = confusion_matrix(y_test, best_rf_predictions)
print('Best Random Forest Confusion Matrix:\n', best_rf_matrix)
