# load the data and prep it for modelling

The basic idea in this notebook is to get a sense of how various models perform. The assessment is based simply on train and test accuracy.

In another notebook, the models will be evaluated with cross-validation and compared using grid search.

In [None]:
# the usual
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# some other useful things
import warnings
warnings.filterwarnings("ignore")
import pickle
import datetime
from os import path

# machine learning stuff
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

# my utilities
from crash_utils.zip_code_and_borough_from_coords import zip_code_and_borough_from_coords
from crash_utils.fix_vehicle_names import fix_vehicle_names
from crash_utils.make_crash_features import make_crash_features
from crash_utils.basic_cleaning import basic_cleaning
from crash_utils.prepare_data_for_modelling import prepare_data_for_modelling

In [None]:
# directory holding the raw collision export (absolute, machine-specific path)
data_path = "/Users/Mark/brainstation/capstone/nyc_bike_crash_analysis/data/"

# read the full crash table into a dataframe
df = pd.read_csv(f"{data_path}Motor_Vehicle_Collisions_-_Crashes.csv")

In [None]:
# fill in missing zip codes and boroughs using lat/lon
df = zip_code_and_borough_from_coords(df)

In [None]:
# perform some basic data munging operations (see `crash_utils/basic_cleaning.py` for details);
# the helper's return value replaces `df`
df = basic_cleaning(df)

In [None]:
# clean up the VEHICLE TYPE CODE columns; the helper's return value replaces `df`
df = fix_vehicle_names(df)

In [None]:
# prepare the data for modelling; per the author's notes the helper is expected to:
#   - drop columns
#   - set up the target
#   - run "make_crash_features.py"
#   - one-hot encode (OHE) the text columns
#   - count-vectorize the vehicles and crash factors
# NOTE(review): later cells take column 0 of the result as the target -- confirm in the helper.

df = prepare_data_for_modelling(df)

# extract the features and targets from the big dataframe

In [None]:
# Separate the target from the features, hold out a stratified test set, and
# build standardized copies of both splits for the algorithms that benefit
# from scaling (e.g., KNN).

from sklearn.preprocessing import StandardScaler

# column 0 is taken as the target; every remaining column is a feature
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# learn the scaling statistics from the training rows only, then apply them to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled.shape

# now build the pipeline for grid search

## set up models and parameters

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
%%time
# 2020-12-06, 
# accuracy score of 
# file: 

file_name = "2"

if path.exists(file_name):

    infile = open(file_name,"rb")
    grid_out = pickle.load(infile)
    infile.close()

else:

    # pipeline initiation
    steps = [('dim_reduction', PCA()),            # step 2: PCA
             ('model', RandomForestClassifier())] # step 3: fit a regressor model
        
    model_pipeline = Pipeline(steps)
    
    # model parameters
    forest_params = {'model': [RandomForestClassifier(max_depth=50, n_estimators=100)],
                     'dim_reduction': [PCA()],
                     'dim_reduction__n_components':[70]
                    }


    # create our grid
    parameter_grid = [forest_params]
    grid_out = GridSearchCV(model_pipeline, parameter_grid, cv=5, verbose=1, n_jobs = -1, 
                            scoring = "f1_micro")
    
    # fit
    grid_out.fit(X_remainder, y_remainder)
     
    # write results    
    file_name = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " GridCVresults.pkl"
    pickle.dump(grid_out, open(file_name, "wb") )
    print("wrote:",file_name)

In [None]:
# display the pipeline that achieved the best cross-validated score
grid_out.best_estimator_

In [None]:
# display the parameter combination behind the best cross-validated score
grid_out.best_params_

In [None]:
# predict on the held-out test split (GridSearchCV.predict delegates to the best estimator found)
y_pred_test = grid_out.predict(X_test)

# report plain accuracy on the test split
print(f"Accuracy: {accuracy_score(y_test, y_pred_test)}")

In [None]:
# view performance of all models
grid_out.cv_results_.keys()
#grid_out.cv_results_["params"]

In [None]:
# histogram of the mean cross-validated score across all candidates in the grid
plt.hist(grid_out.cv_results_["mean_test_score"])

# other models to add:

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, trained on the standardized features
gnb = GaussianNB().fit(X_train_scaled, y_train)

# mean accuracy on the train split, then on the test split
for features, labels in ((X_train_scaled, y_train), (X_test_scaled, y_test)):
    print(round(gnb.score(features, labels), 3))

## boosting

### Adaboost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# adaptive boosting with default settings, trained on the unscaled features
ada = AdaBoostClassifier().fit(X_train, y_train)

# mean accuracy on the train split, then on the test split
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(round(ada.score(features, labels), 3))

In [None]:
from sklearn.linear_model import RidgeClassifier

# ridge classifier with default settings, trained on the standardized features
rc = RidgeClassifier().fit(X_train_scaled, y_train)

# mean accuracy on the train split, then on the test split
for features, labels in ((X_train_scaled, y_train), (X_test_scaled, y_test)):
    print(round(rc.score(features, labels), 3))

In [None]:
# inspect the fitted ridge coefficients
rc.coef_

In [None]:
# inspect the class labels the ridge classifier learned from y_train
rc.classes_

## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours (default k) on the standardized features, using all cores
knn = KNeighborsClassifier(n_jobs=-1).fit(X_train_scaled, y_train)

# mean accuracy on the train split, then on the test split
for features, labels in ((X_train_scaled, y_train), (X_test_scaled, y_test)):
    print(round(knn.score(features, labels), 3))

## MLP classifier

In [None]:
from sklearn.neural_network import MLPClassifier

# five hidden layers (20-100-500-100-20) optimised with L-BFGS,
# trained on the standardized features
nn = MLPClassifier(
    hidden_layer_sizes=(20, 100, 500, 100, 20),
    solver='lbfgs',
).fit(X_train_scaled, y_train)

# mean accuracy on the train split, then on the test split
for features, labels in ((X_train_scaled, y_train), (X_test_scaled, y_test)):
    print(round(nn.score(features, labels), 3))

## Tensorflow/Keras Feedforward NN

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Feed-forward network: four ReLU hidden layers followed by a 3-unit softmax output
model = keras.Sequential([
    # hidden layers
    layers.Dense(20, activation="relu"),
    layers.Dense(100, activation="relu"),
    #layers.Dense(500, activation="relu"),
    layers.Dense(100, activation="relu"),
    layers.Dense(20, activation="relu"),
    # output layer
    layers.Dense(3, activation="softmax"),
])

In [None]:
# Configure training for the Keras model.
# NOTE(review): the model's output layer is a 3-unit softmax, but the loss and metric
# below are the binary variants -- confirm this matches how the target is encoded.
# The metric's history key ("binary_accuracy") is read by the evaluation cell.
model.compile(
    # Optimizer
    optimizer=keras.optimizers.Adam(),  
    
    # Loss function to minimize
    loss=keras.losses.BinaryCrossentropy(),
    
    # Metric used to evaluate model
    metrics=[keras.metrics.BinaryAccuracy()]
)

In [None]:
# Redo the stratified train/test split for the Keras model:
# column 0 is the target, all remaining columns are features.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [None]:
from sklearn.preprocessing import StandardScaler

# standardize using statistics learned from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# show the container type returned by the scaler
type(X_train_scaled)

In [None]:
# train for 50 epochs on the standardized training features; keep the returned history for evaluation
history = model.fit(X_train_scaled, y_train, epochs=50, verbose=1)

In [None]:
# Evaluate the network
# training accuracy as logged for the final epoch
train_accuracy = history.history["binary_accuracy"][-1]

# score on the *scaled* test features: the network was fit on X_train_scaled,
# so it must be evaluated on inputs transformed by the same scaler
result = model.evaluate(X_test_scaled, y_test, verbose=0)

print(f"Train Accuracy: {train_accuracy:.4f}")
# result holds [loss, metric]; index 1 is the compiled accuracy metric
print(f"Test Accuracy: {result[1]:.4f}")

# Previous runs

In [None]:
%%time
# 2020-12-06, best is RandomForestClassifier(max_depth = 50, n_estimators = 100)
# accuracy score of 82.6%
# file: 2020-12-06 20:46:08 GridCVresults.pkl

file_name = "2020-12-06 20:46:08 GridCVresults.pkl"

if path.exists(file_name):

    infile = open(file_name,"rb")
    grid_out = pickle.load(infile)
    infile.close()

    else:
        
    # pipeline initiation
    steps = [('model', DecisionTreeClassifier())]
    model_pipeline = Pipeline(steps)



    tree_params = {'model': [DecisionTreeClassifier()],
                   'model__max_depth': np.arange(10,200,10),
                   'model__min_samples_leaf': np.arange(10,200,10)}


    forest_params = {'model': [RandomForestClassifier()],
                     'model__max_depth': np.arange(10,200,10),
                     'model__n_estimators': [20, 50, 100],
                    }

    gboost_params = {'model': [GradientBoostingClassifier()],
                     'model__learning_rate': [0.1, 0.5, 0.9],
                     'model__n_estimators': [20, 50, 100, 200]}

    # create our grid

    parameter_grid = [tree_params, forest_params, gboost_params]
    grid_out = GridSearchCV(model_pipeline, parameter_grid, cv=5, verbose=1, n_jobs = -1, scoring="accuracy")
    
    # fit
    grid_out.fit(X_remainder, y_remainder)
    
    file_name = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " GridCVresults.pkl"
    pickle.dump(grid_out, open(file_name, "wb") )
    print("wrote:",file_name)

In [None]:
%%time
# 2020-12-06, best is PCA(n_components=50), RandomForestClassifier(max_depth=50), no scaler
# accuracy score of 91.7%
# file: 2020-12-06 22:32:26 GridCVresults.pkl

file_name = "2020-12-06 22:32:26 GridCVresults.pkl"

if path.exists(file_name):

    infile = open(file_name,"rb")
    grid_out = pickle.load(infile)
    infile.close()

else:

    # pipeline initiation

    steps = [('scaler', StandardScaler()),  # step 1: scale
             ('dim_reduction', PCA()),      # step 2: PCA
             ('model', RandomForestClassifier())] # step 3: fit a regressor model
        
    model_pipeline = Pipeline(steps)
    
    # model parameters
    forest_params = {'model': [RandomForestClassifier(max_depth=50, n_estimators=100)],
                     'scaler': [None, MinMaxScaler(), StandardScaler()],
                     'dim_reduction': [PCA()],
                     'dim_reduction__n_components':[1, 2, 5, 10, 20, 50, 100, 200],
                     
                    }


    # create our grid
    parameter_grid = [forest_params]
    grid_out = GridSearchCV(model_pipeline, parameter_grid, cv=5, verbose=1, n_jobs = -1, scoring="accuracy")
    
    # fit
    grid_out.fit(X_remainder, y_remainder)
     
    # write results    
    file_name = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " GridCVresults.pkl"
    pickle.dump(grid_out, open(file_name, "wb") )
    print("wrote:",file_name)

In [None]:
%%time
# 2020-12-06, best is PCA(n_components=70), RandomForestClassifier(max_depth=50)}
# accuracy score of 92.7
# file: 2020-12-06 22:57:34 GridCVresults.pkl

file_name = "2020-12-06 22:57:34 GridCVresults.pkl"

if path.exists(file_name):

    infile = open(file_name,"rb")
    grid_out = pickle.load(infile)
    infile.close()

else:

    # pipeline initiation
    steps = [('dim_reduction', PCA()),      # step 2: PCA
             ('model', RandomForestClassifier())] # step 3: fit a regressor model
        
    model_pipeline = Pipeline(steps)
    
    # model parameters
    forest_params = {'model': [RandomForestClassifier(max_depth=50, n_estimators=100)],
                     'dim_reduction': [PCA()],
                     'dim_reduction__n_components':[20, 30, 50, 60, 70, 80]
                    }


    # create our grid
    parameter_grid = [forest_params]
    grid_out = GridSearchCV(model_pipeline, parameter_grid, cv=5, verbose=1, n_jobs = -1, scoring="accuracy")
    
    # fit
    grid_out.fit(X_remainder, y_remainder)
     
    # write results    
    file_name = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " GridCVresults.pkl"
    pickle.dump(grid_out, open(file_name, "wb") )
    print("wrote:",file_name)