# load the data and prep it for modelling

The basic idea in this notebook is to get a sense of how various models perform. The assessment is based simply on the train and test accuracy.

In another notebook the models will be evaluated with cross validation and compared with grid searching.

In [None]:
# the usual
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# machine learning stuff
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# my utilities
from crash_utils.zip_code_and_borough_from_coords import zip_code_and_borough_from_coords
from crash_utils.fix_vehicle_names import fix_vehicle_names
from crash_utils.make_crash_features import make_crash_features
from crash_utils.basic_cleaning import basic_cleaning
from crash_utils.prepare_data_for_modelling import prepare_data_for_modelling

In [None]:
# directory holding the raw data, and the crash CSV read from it into `df`
data_path = "/Users/Mark/brainstation/capstone/nyc_bike_crash_analysis/data/"
crash_csv = "Motor_Vehicle_Collisions_-_Crashes.csv"
df = pd.read_csv(f"{data_path}{crash_csv}")

In [None]:
# peek at the first ten rows of the freshly loaded data
df.head(n=10)

In [None]:
# fill in missing zip codes and boroughs using lat/lon
# (the helper is imported from `crash_utils`; its implementation is not shown in this notebook)
df = zip_code_and_borough_from_coords(df)

In [None]:
# inspect the first five rows after the zip code / borough fill
df.head(n=5)

In [None]:
# perform some basic data munging operations (see `crash_utils/basic_cleaning.py` for details);
# the frame returned by the helper replaces `df`
df = basic_cleaning(df)

In [None]:
# inspect the first five rows after basic cleaning
df.head(n=5)

In [None]:
# clean up the VEHICLE TYPE CODE columns
# (helper imported from `crash_utils.fix_vehicle_names`; the returned frame replaces `df`)
df = fix_vehicle_names(df)

In [None]:
# inspect the first five rows after the vehicle-name clean-up
df.head(n=5)

In [None]:
# prepare the data for modelling; according to the author's notes this step will:
#   - drop columns
#   - set up the target
#   - run "make_crash_features.py"
#   - one-hot encode (OHE) the text columns
#   - count-vectorize the vehicles and crash factors
# NOTE(review): later cells take column 0 as the target and all remaining
# columns as features — confirm the helper returns the frame in that layout

df = prepare_data_for_modelling(df)

In [None]:
# inspect the first five rows of the model-ready frame
df.head(n=5)

# now test some models!

## extract the features and targets from the big dataframe

In [None]:
# extract target and features and then train-test-split
# also scale the data for those algorithms which would benefit (e.g., KNN)

from sklearn.preprocessing import StandardScaler

# column 0 is used as the target; every remaining column is a feature
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

# stratify so both partitions keep the class proportions of `y`; the fixed
# seed makes the split (and every score computed from it) reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# learn the scaling statistics from the training partition only, so nothing
# about the test rows leaks into the transformed features
# (StandardScaler ignores a target argument, so none is passed)
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Decision Tree Classifier

In [None]:
# instantiate and fit DTC

from sklearn.tree import DecisionTreeClassifier

# fit exactly once: `fit` returns the estimator itself, so a second
# `dtc.fit(...)` call would only repeat the same training work
dtc = DecisionTreeClassifier(max_depth=20).fit(X_train, y_train)

# score model on train and test sets (accuracy, rounded to 3 decimals)
print(round(dtc.score(X_train, y_train), 3))
print(round(dtc.score(X_test, y_test), 3))

In [None]:
# class predictions of the decision tree for both partitions
y_pred_train = dtc.predict(X_train)
y_pred_test = dtc.predict(X_test)

# confusion matrix of the test predictions, wrapped in a labelled table
# (rows = true class, columns = predicted class)
class_ids = range(3)
conf_matrix = confusion_matrix(y_test, y_pred_test)
conf_df = pd.DataFrame(
    conf_matrix,
    index=[f"True Class {k}" for k in class_ids],
    columns=[f"Predicted Class {k}" for k in class_ids],
)

conf_df

In [None]:
# tabulate how many test rows fall into each target class.
# The counts column is named explicitly rather than renamed afterwards: the
# column produced by `value_counts()` carries the Series' own name before
# pandas 2.0 but is called "count" from 2.0 on, so a rename keyed on
# "outcome" can silently do nothing.
counts_df = y_test.value_counts().sort_index().rename("number").to_frame()
# labels are listed in sorted-encoding order
# NOTE(review): assumes exactly three classes encoded 0, 1, 2 — confirm
counts_df["outcome"] = ["no injury", "injury", "fatality"]
counts_df.index.name = "encoding"
counts_df

In [None]:
# plot the most recently computed confusion matrix
ConfusionMatrixDisplay(confusion_matrix=conf_matrix).plot()

__a small hyperparameter search__

In [None]:
# sweep the tree's maximum depth and record accuracy on both partitions
# (an earlier grid was [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000])
depths = np.arange(10, 200, 10)
train_score, test_score = [], []

for max_depth in depths:
    dtc = DecisionTreeClassifier(max_depth=max_depth).fit(X_train, y_train)
    train_score.append(dtc.score(X_train, y_train))
    test_score.append(dtc.score(X_test, y_test))
    # the carriage return keeps the progress counter on a single line
    print(max_depth, end="\r")

In [None]:
# plot train and test accuracy against the swept depths
plt.figure()
for scores, series_label in ((train_score, 'training set'), (test_score, 'test set')):
    plt.plot(depths, scores, label=series_label, marker='o')
plt.title("DTC: impact of max depth")
plt.xlabel('max depth')
plt.ylabel('accuracy score')
plt.legend()
plt.show()

In [None]:
# sweep the minimum number of samples per leaf at a fixed depth cap of 180
# (an earlier grid was [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000])
min_leafs = np.arange(10, 200, 10)
train_score, test_score = [], []

for leaf_size in min_leafs:
    dtc = DecisionTreeClassifier(max_depth=180, min_samples_leaf=leaf_size).fit(X_train, y_train)
    train_score.append(dtc.score(X_train, y_train))
    test_score.append(dtc.score(X_test, y_test))
    # the carriage return keeps the progress counter on a single line
    print(leaf_size, end="\r")

In [None]:
# plot train and test accuracy against the swept leaf sizes
# (a log-scaled x axis, plt.xscale("log"), is currently disabled)
plt.figure()
for scores, series_label in ((train_score, 'training set'), (test_score, 'test set')):
    plt.plot(min_leafs, scores, label=series_label, marker='o')
plt.title("DTC: impact of min. samples per leaf")
plt.xlabel('min samples per leaf')
plt.ylabel('accuracy score')
plt.legend()
plt.show()

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# optimized RF classifier (100 trees, depth capped at 50, all cores)
rf = RandomForestClassifier(n_estimators=100, max_depth=50, n_jobs=-1).fit(X_train, y_train)

# score model on train and test sets
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(round(rf.score(features, target), 3))

In [None]:
# sweep the forest size at a fixed depth cap of 150 and record accuracy
estimators = np.arange(10, 100, 10)
train_score, test_score = [], []

for n_trees in estimators:
    rf = RandomForestClassifier(n_estimators=n_trees, max_depth=150, n_jobs=-1).fit(X_train, y_train)
    train_score.append(rf.score(X_train, y_train))
    test_score.append(rf.score(X_test, y_test))
    # the carriage return keeps the progress counter on a single line
    print(n_trees, end="\r")

In [None]:
# plot train and test accuracy against the number of trees
plt.figure()
for scores, series_label in ((train_score, 'training set'), (test_score, 'test set')):
    plt.plot(estimators, scores, label=series_label, marker='o')
plt.title("Random forest: impact of number of estimators")
plt.xlabel("number of estimators")
plt.ylabel('accuracy score')
plt.legend()
plt.show()

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes fitted on the standardized features
gnb = GaussianNB().fit(X_train_scaled, y_train)

# score model on train and test sets
train_acc = gnb.score(X_train_scaled, y_train)
test_acc = gnb.score(X_test_scaled, y_test)
print(round(train_acc, 3))
print(round(test_acc, 3))

In [None]:
# naive Bayes class predictions for both partitions
y_pred_train = gnb.predict(X_train_scaled)
y_pred_test = gnb.predict(X_test_scaled)

# confusion matrix of the test predictions, wrapped in a labelled table
# (rows = true class, columns = predicted class)
row_labels = ['True Class ' + str(k) for k in (0, 1, 2)]
col_labels = ['Predicted Class ' + str(k) for k in (0, 1, 2)]
conf_matrix = confusion_matrix(y_test, y_pred_test)
conf_df = pd.DataFrame(data=conf_matrix, index=row_labels, columns=col_labels)

conf_df

## Boosting

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# gradient boosting with scikit-learn's default hyperparameters
gb = GradientBoostingClassifier().fit(X_train, y_train)

# score model on train and test sets
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(round(gb.score(features, target), 3))

In [None]:
# sweep the number of boosting stages and record accuracy on both partitions
estimators = np.arange(10, 200, 20)
train_score, test_score = [], []

for n_stages in estimators:
    gb = GradientBoostingClassifier(n_estimators=n_stages).fit(X_train, y_train)
    train_score.append(gb.score(X_train, y_train))
    test_score.append(gb.score(X_test, y_test))
    # the carriage return keeps the progress counter on a single line
    print(n_stages, end="\r")

In [None]:
# Visualize the result of the gradient-boosting sweep:
# train and test accuracy against the number of estimators
plt.figure()
plt.plot(estimators, train_score, label='training set', marker='o')
plt.plot(estimators, test_score, label='test set', marker='o')
plt.xlabel("number of estimators")
plt.ylabel('accuracy score')
# the scores plotted here come from GradientBoostingClassifier, so the title
# must not repeat the random-forest heading it was copied from
plt.title("Gradient boosting: impact of number of estimators")
plt.legend()
plt.show()

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# fitting Adaptive Boosting on the training partition only: fitting on the
# full X, y would let the model see the test rows and inflate the test score
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

# score model on train and test sets (accuracy, rounded to 3 decimals)
print(round(ada.score(X_train, y_train), 3))
print(round(ada.score(X_test, y_test), 3))

## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours (default k) on the standardized features, using all cores
knn = KNeighborsClassifier(n_jobs=-1).fit(X_train_scaled, y_train)

# score model on train and test sets
for features, target in ((X_train_scaled, y_train), (X_test_scaled, y_test)):
    print(round(knn.score(features, target), 3))

## MLP classifier

In [None]:
from sklearn.neural_network import MLPClassifier

# multi-layer perceptron with five hidden layers, trained with the L-BFGS solver
layer_sizes = (20, 100, 500, 100, 20)
nn = MLPClassifier(hidden_layer_sizes=layer_sizes, solver='lbfgs')
nn.fit(X_train_scaled, y_train)

# score model on train and test sets
train_acc = nn.score(X_train_scaled, y_train)
test_acc = nn.score(X_test_scaled, y_test)
print(round(train_acc, 3))
print(round(test_acc, 3))

In [None]:
# MLP class predictions for both partitions
y_pred_train = nn.predict(X_train_scaled)
y_pred_test = nn.predict(X_test_scaled)

# confusion matrix of the test predictions, wrapped in a labelled table
# (rows = true class, columns = predicted class)
conf_matrix = confusion_matrix(y_test, y_pred_test)
conf_df = pd.DataFrame(conf_matrix)
conf_df.index = ['True Class 0', 'True Class 1', 'True Class 2']
conf_df.columns = ['Predicted Class 0', 'Predicted Class 1', 'Predicted Class 2']

conf_df

## Tensorflow/Keras Feedforward NN

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Feed-forward network declared in one go: four ReLU hidden layers followed
# by a 3-unit softmax output layer.
# (a 500-unit middle layer, layers.Dense(500, activation="relu"), is currently disabled)
model = keras.Sequential(
    [
        layers.Dense(20, activation="relu"),
        layers.Dense(100, activation="relu"),
        layers.Dense(100, activation="relu"),
        layers.Dense(20, activation="relu"),
        layers.Dense(3, activation="softmax"),
    ]
)

In [None]:
model.compile(
    # Optimizer
    optimizer=keras.optimizers.Adam(),  
    
    # Loss function to minimize
    loss=keras.losses.SparseCategoricalCrossentropy(),
    
    # Metric used to evaluate model
    metrics=[keras.metrics.BinaryAccuracy()]
)

In [None]:
# extract target and features and then train-test-split
X = df.iloc[:,1:]
y = df.iloc[:,0]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train, y_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

type(X_train_scaled)

In [None]:
history = model.fit(X_train_scaled, y_train, epochs=50, verbose=1)

In [None]:
# Evaluate the network
train_accuracy = history.history["binary_accuracy"][-1]

result = model.evaluate(X_test,y_test, verbose=0)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {result[1]:.4f}") 