# Evaluate in detail the best-performing model found with the grid search cv

In [None]:
# the usual
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# my utilities
from crash_utils.zip_code_and_borough_from_coords import zip_code_and_borough_from_coords
from crash_utils.fix_vehicle_names import fix_vehicle_names
from crash_utils.make_crash_features import make_crash_features
from crash_utils.basic_cleaning import basic_cleaning
from crash_utils.prepare_data_for_modelling import prepare_data_for_modelling

In [None]:
# directory holding the input data, and the collisions CSV that is read from it
data_path = "/Users/Mark/brainstation/capstone/nyc_bike_crash_analysis/data/"
crash_csv = data_path + "Motor_Vehicle_Collisions_-_Crashes.csv"
df = pd.read_csv(crash_csv)

In [None]:
# use the lat/lon coordinates to fill in zip codes and boroughs that are missing
df = df.pipe(zip_code_and_borough_from_coords)

In [None]:
## clean up the VEHICLE TYPE CODE columns
# The work is delegated to the project helper imported from
# `crash_utils.fix_vehicle_names`; its result replaces `df`.
# NOTE(review): which columns are touched and how names are normalised is not
# visible from this notebook -- check the helper before relying on specifics.
df = fix_vehicle_names(df)

In [None]:
# perform some basic data munging operations (see `crash_utils/basic_cleaning.py` for details)
# The helper's return value replaces `df`, so every later cell works on the
# cleaned frame rather than on the raw CSV contents.
df = basic_cleaning(df)

In [None]:
# final prep (drops unnecessary columns, feature engineering, count-vectorizer, OHE)
# The cells below take column 0 of the returned frame as the target and all
# remaining columns as features.
# NOTE(review): that column layout is presumably guaranteed by the helper --
# confirm in `crash_utils.prepare_data_for_modelling`.
df = prepare_data_for_modelling(df)

# now evaluate the model

In [None]:
# machine learning stuff
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import plot_roc_curve

from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

## extract the features and targets from the big dataframe

In [None]:
# Column 0 of the prepared frame is the target; every other column is a feature.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# Stratify on the target so both partitions keep the same class proportions.
# NOTE(review): no random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

### fix the class imbalance with upsampling

In [None]:
# Balance the training set by resampling class 0 (treated here as the minority)
# with replacement until it has as many rows as class 1.
minority_mask = y_train == 0
majority_mask = y_train == 1
n_min = np.sum(minority_mask)
n_maj = np.sum(majority_mask)

X_upsampled, y_upsampled = resample(
    X_train.loc[minority_mask],
    y_train.loc[minority_mask],
    replace=True,
    n_samples=n_maj,
)

# Stack the untouched class-1 rows on top of the resampled class-0 rows.
# Both results are plain numpy arrays (column labels are not carried over).
X_train_bal = np.vstack((X_train[majority_mask], X_upsampled))
y_train_bal = np.hstack((y_train[majority_mask], y_upsampled))

## run what was found to be the best classifier

Run the classifier that the grid search found to be the best (results file: `2020-12-10 00:30:04 GridCVresults.pkl`).

Best parameters: `PCA(n_components=20)`, `RandomForestClassifier(max_depth=40, n_estimators=200)`, `scaler: None`

In [None]:
# Reduce the feature space to 20 principal components.
from sklearn.decomposition import PCA

# The projection is learned from the balanced training data only ...
my_pca = PCA(n_components=20)
my_pca.fit(X_train_bal)

# ... and then applied unchanged to both the training and the test features.
X_train_bal_pca = my_pca.transform(X_train_bal)
X_test_pca = my_pca.transform(X_test)

In [None]:
# Train the random forest on the balanced, PCA-projected training data.
from sklearn.ensemble import RandomForestClassifier

# hyper-parameters quoted above as the grid-search winner
rf_params = {"max_depth": 40, "n_estimators": 200}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_bal_pca, y_train_bal)

In [None]:
# Accuracy, as a percentage rounded to two decimals, on the balanced training
# data and on the held-out test data.
train_acc = round(100 * rf.score(X_train_bal_pca, y_train_bal), 2)
test_acc = round(100 * rf.score(X_test_pca, y_test), 2)

print(f"Accuracy on the training set: {train_acc}%")
print(f"Accuracy on test set: {test_acc}%")

## accuracy, precision, recall

In [None]:
# Class predictions from the fitted forest: first for the balanced training
# data, then for the held-out test data.
y_pred_train = rf.predict(X_train_bal_pca)
y_pred_test = rf.predict(X_test_pca)

In [None]:
# Confusion matrix on the test set; normalize="all" makes every cell a
# fraction of the total number of test samples.
conf_matrix = confusion_matrix(y_test, y_pred_test, normalize="all")
conf_df = pd.DataFrame(
    data=conf_matrix,
    index=["True non-injury", "True injury"],
    columns=["Predicted non-injury", "Predicted injury"],
)

# Scale to percent *before* rounding. A method call binds tighter than `*`, so
# `100 * frame.round(2)` rounds the fractions first and yields whole-number
# percentages with floating-point noise instead of two-decimal percentages.
conf_df = (100 * conf_df).round(2)

conf_df

In [None]:
# Draw the raw-count confusion matrix for the test set and save it as a PNG.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; this cell needs an older release to run as written.
confusion_png = "/Users/Mark/brainstation/capstone/nyc_bike_crash_analysis/figs/confusion_matrix.png"

fig, ax = plt.subplots()
plt.rcParams['font.size'] = 16
plot_confusion_matrix(
    rf,
    X_test_pca,
    y_test,
    ax=ax,
    cmap=plt.cm.Blues,
    normalize=None,  # keep absolute counts rather than fractions
    display_labels=["No injury", "Injury"],
)
plt.tight_layout()
fig.savefig(confusion_png, facecolor="w", edgecolor='none')
plt.show()

In [None]:
# Number of test samples per class, ordered by the encoded class value.
# The counts column is named explicitly: the column name that value_counts()
# produces depends on the pandas version ("count" from 2.0 on, the Series name
# before that), so renaming a hard-coded "outcome" column can silently do nothing.
counts_df = y_test.value_counts().sort_index().rename("number").to_frame()

# human-readable label for each encoded class, in sorted (0, 1) order
counts_df["outcome"] = ["no injury", "injury"]
counts_df.index.name = "encoding"
counts_df

In [None]:
# Headline test-set metrics, each rounded to two decimals.
acc = round(accuracy_score(y_test, y_pred_test), 2)
prec = round(precision_score(y_test, y_pred_test), 2)
rec = round(recall_score(y_test, y_pred_test), 2)
f1 = round(f1_score(y_test, y_pred_test), 2)

print(f"accuracy: {acc}")
print(f"precision: {prec}")
print(f"recall: {rec}")
print(f"F1 score: {f1}")

In [None]:
# Per-class precision / recall / F1 summary for the test-set predictions.
report_initial = classification_report(y_true=y_test, y_pred=y_pred_test)
print(report_initial)

## ROC

In [None]:
# Flush any figure that is still pending before starting a new one.
plt.show()

# ROC curve of the random forest on the test set, with the diagonal drawn as
# the "chance" reference, saved as a PNG.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed
# in 1.2; this cell needs an older release to run as written.
roc_png = "/Users/Mark/brainstation/capstone/nyc_bike_crash_analysis/figs/roc_curve.png"

fig, ax = plt.subplots(figsize=(8, 6))
plt.rcParams['font.size'] = 16
plot_roc_curve(rf, X_test_pca, y_test, ax=ax)

diagonal = [0, 1]
ax.plot(diagonal, diagonal, linestyle='--', lw=2, color='r', label='chance')
ax.legend(loc="lower right", fontsize=14)

plt.tight_layout()
fig.savefig(roc_png, facecolor="w", edgecolor='none')
plt.show()

-----------

## Logistic classifier to quantify the relative importance of the factors

In [None]:
# Fit a strongly regularised logistic regression on standardised features.
from sklearn.linear_model import LogisticRegression

# The scaled feature matrices were never created anywhere in the notebook, so
# build them here. The scaler (StandardScaler is imported with the other
# scikit-learn utilities above) is fitted on the balanced training data only
# and then applied unchanged to the test data, so no test-set statistics leak
# into training. The full (non-PCA) feature set is used so that each
# coefficient still corresponds to one original feature column.
scaler = StandardScaler().fit(X_train_bal)
X_train_scaled = scaler.transform(X_train_bal)
# X_train_bal is a plain array, so hand the test features over as an array too
X_test_scaled = scaler.transform(np.asarray(X_test))

lr = LogisticRegression(C=1e-6)
lr.fit(X_train_scaled, y_train_bal)

print("training set accuracy",round(lr.score(X_train_scaled, y_train_bal),3))
print("test set accuracy",round(lr.score(X_test_scaled, y_test),3))

In [None]:
# Sweep the inverse regularisation strength C over powers of ten from 1e-9 to
# 1e4 and record the training and test accuracy of each fit.
Cs = 10**np.arange(-9., 5., 1.)
train_score = []
test_score = []

for C in Cs:
    # `lr` is rebound on every pass, so after the loop it holds the model for
    # the last (largest) C -- not necessarily the best-scoring one.
    lr = LogisticRegression(C=C)
    lr.fit(X_train_scaled, y_train_bal)

    train_score.append(lr.score(X_train_scaled, y_train_bal))
    test_score.append(lr.score(X_test_scaled, y_test))

    # progress indicator, overwritten in place on each iteration
    print(C, end="\r")

In [None]:
# Position in the sweep where the test accuracy peaks (first one on ties).
max_ind = np.argmax(test_score)

best_test_score = round(test_score[max_ind], 3)
best_C = Cs[max_ind]
print("max test score of", best_test_score)
print("max test score at C = ", best_C)

In [None]:
# Accuracy versus C for both data partitions, on a logarithmic C axis, with a
# vertical black line at the C that scored best on the test set.
plt.figure()
for scores, series_label in ((train_score, 'training set'), (test_score, 'test set')):
    plt.plot(Cs, scores, label=series_label, marker='o')
plt.xscale("log")
plt.axvline(Cs[max_ind], color="k")
plt.title("Logistic Regression Classification: impact of regularization")
plt.xlabel('C (inverse regularization strength)')
plt.ylabel('accuracy score')
plt.legend()
plt.show()

In [None]:
# break out the coefficient values and feature names for text columns

# After the C sweep, `lr` is simply the model from the final iteration (the
# largest C). Refit at the C that scored best so that the coefficients being
# interpreted belong to the selected model rather than to an accident of loop
# order.
lr = LogisticRegression(C=Cs[max_ind]).fit(X_train_scaled, y_train_bal)

# NOTE(review): 253 is presumably the position of the first count-vectorizer
# (text) column in the feature matrix -- confirm against the output of
# prepare_data_for_modelling, since it shifts whenever the earlier columns change.
first_text_col = 253
coeffs = lr.coef_.reshape(-1)[first_text_col:]
features = X.columns[first_text_col:]

# the two shapes must agree for the coefficient/word pairing below to be valid
print(features.shape)
print(coeffs.shape)

In [None]:
# Pair every text feature with its coefficient, largest coefficient first.
word_df = pd.DataFrame({"coeffs": coeffs, "word": features}).sort_values(
    by="coeffs", ascending=False
)

In [None]:
# Bar chart of the 20 words with the largest coefficients.
ax = word_df.head(20).plot.bar(x="word", rot=90, figsize=(14, 8), fontsize=16, legend=None)

# In a vertical bar chart the words sit on the x axis and the bar heights (the
# coefficients) on the y axis, so the "Coefficient value" label belongs on y.
# The x axis is left unlabelled because the tick labels already are the words.
ax.set_xlabel("")
ax.set_ylabel("Coefficient value", size=18)
ax.set_title("Words that strongly predict injuries", size=20);