# Compare the accuracy of various models to select the best one

The basic idea in this notebook is to get a sense of how various models perform. The assessment is based simply on the train and test accuracy.

In another notebook, the models will be evaluated with cross-validation and compared using a grid search.

In [1]:
# the usual
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# some other useful things
import warnings
warnings.filterwarnings("ignore")
import pickle
import datetime
from os import path

# my utilities
from crash_utils.zip_code_and_borough_from_coords import zip_code_and_borough_from_coords
from crash_utils.fix_vehicle_names import fix_vehicle_names
from crash_utils.make_crash_features import make_crash_features
from crash_utils.basic_cleaning import basic_cleaning
from crash_utils.prepare_data_for_modelling import prepare_data_for_modelling

In [2]:
# Directory holding the raw collisions CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact directory exists.
data_path = "/Users/Mark/brainstation/capstone/nyc_bike_crash_analysis/data/"

# Build the full file location first, then read the whole file into a dataframe.
crashes_csv = path.join(data_path, "Motor_Vehicle_Collisions_-_Crashes.csv")
df = pd.read_csv(crashes_csv)

In [3]:
# fill in missing ZIP codes and boroughs using lat/lon
# (see the `crash_utils.zip_code_and_borough_from_coords` module for details)
# NOTE(review): `df` is rebound to the helper's return value, so re-running this
# cell feeds the already-processed frame back into the helper.
df = zip_code_and_borough_from_coords(df)

In [4]:
# clean up the VEHICLE TYPE CODE columns
# (see the `crash_utils.fix_vehicle_names` module for details)
# NOTE(review): `df` is rebound to the helper's return value, so re-running this
# cell feeds the already-processed frame back into the helper.
df = fix_vehicle_names(df)

In [5]:
# perform some basic data munging operations (see `crash_utils/basic_cleaning.py` for details)
# NOTE(review): `df` is rebound to the helper's return value, so re-running this
# cell feeds the already-cleaned frame back into the helper.
df = basic_cleaning(df)

In [6]:
# prepare the data for modelling; per the author's notes the helper will:
#   - drop columns
#   - set up the target
#   - run "make_crash_features.py"
#   - one-hot encode (OHE) the text columns
#   - count-vectorize the vehicles and crash factors
# NOTE(review): the train/test split cell further down takes column 0 as the target
# and every other column as a feature, so it presumably relies on this helper
# returning the target as the first column -- confirm in
# `crash_utils.prepare_data_for_modelling`.

df = prepare_data_for_modelling(df)

# extract the features and targets from the big dataframe

In [7]:
# machine learning stuff
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

In [8]:
# extract target and features and then train-test-split
# (no scaling is done here: scaling is an optional, tunable step of the
# grid-search pipeline built further down)

# column 0 holds the target; every remaining column is a feature
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

# stratify so both classes keep their proportions in train and test, and fix the
# seed so the split (and everything derived from it) survives a kernel restart
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [9]:
# class balance of the training target, ordered by label
train_class_counts = y_train.value_counts().sort_index()
print(train_class_counts)

# number of training rows per label (0 is the smaller class, 1 the larger one)
n_min = (y_train == 0).sum()
n_maj = (y_train == 1).sum()

0.0     8193
1.0    27316
Name: outcome, dtype: int64


In [10]:
# fix the class imbalance by upsampling the minority class (label 0)
from sklearn.utils import resample

minority_mask = y_train == 0
majority_mask = y_train == 1

# draw minority rows with replacement until there are as many as majority rows;
# the fixed seed keeps the balanced training set reproducible between runs
X_upsampled, y_upsampled = resample(X_train.loc[minority_mask],
                                    y_train.loc[minority_mask],
                                    replace=True,
                                    n_samples=n_maj,
                                    random_state=42)

# stack the majority rows followed by the upsampled minority rows (numpy arrays)
X_train_bal = np.vstack((X_train.loc[majority_mask], X_upsampled))
y_train_bal = np.hstack((y_train.loc[majority_mask], y_upsampled))

# NOTE(review): minority rows are duplicated *before* cross-validation, so copies of
# the same row can end up in both the training and the validation folds of the grid
# search that is fitted on X_train_bal. The cross-validated accuracy is therefore
# likely optimistic -- confirm against the untouched X_test / y_test.

# now build the pipeline for grid search

## set up models and parameters

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [12]:
%%time
# 2020-12-11, PCA(n_components=20), RandomForestClassifier(max_depth=40, n_estimators=200), scaler: None
# upsampled the no-injury class in the training set
# accuracy score of 91%
# file: 2020-12-11 01:18:41 GridCVresults.pkl

file_name = "2020-12-11 01:18:41 GridCVresults.pkl"

if path.exists(file_name):

    infile = open(file_name,"rb")
    grid_out = pickle.load(infile)
    infile.close()

else:

    # pipeline initiation
    steps = [('scaler', StandardScaler()),        # step 1: scale
             ('dim_reduction', PCA()),            # step 2: PCA
             ('model', RandomForestClassifier())] # step 3: fit a regressor model
        
    model_pipeline = Pipeline(steps)
    
    # model parameters
    forest_params = {'model': [RandomForestClassifier()],
                     'model__max_depth': [1, 3, 10, 40, 80],
                     'model__n_estimators': [50, 100, 200],
                     'scaler': [None, StandardScaler()],
                     'dim_reduction': [PCA()],
                     'dim_reduction__n_components':[5, 20, 50, 100, 200]
                    }
   

    # create our grid
    parameter_grid = [forest_params]
    grid_out = GridSearchCV(model_pipeline, parameter_grid, cv=5, verbose=1, n_jobs = 6, 
                            scoring = "accuracy")
    
    # fit
    grid_out.fit(X_train_bal, y_train_bal)
     
    # write results    
    file_name = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " GridCVresults.pkl"
    pickle.dump(grid_out, open(file_name, "wb") )
    print("wrote:",file_name)

CPU times: user 73.5 ms, sys: 190 ms, total: 263 ms
Wall time: 268 ms


In [13]:
# the pipeline (steps and their settings) that scored best in the grid search
grid_out.best_estimator_

Pipeline(steps=[('scaler', None), ('dim_reduction', PCA(n_components=20)),
                ('model',
                 RandomForestClassifier(max_depth=40, n_estimators=200))])

In [14]:
# the winning value for every entry of the parameter grid
grid_out.best_params_

{'dim_reduction': PCA(n_components=20),
 'dim_reduction__n_components': 20,
 'model': RandomForestClassifier(max_depth=40, n_estimators=200),
 'model__max_depth': 40,
 'model__n_estimators': 200,
 'scaler': None}

In [15]:
# mean cross-validated score of the winning parameter combination
# (the search in this notebook is configured with scoring = "accuracy" and cv=5)
grid_out.best_score_

0.9091741691961

In [None]:
# view performance of all models: list which result arrays the search recorded
# (e.g. "params" and "mean_test_score", which are indexed in the lines/cells below)
grid_out.cv_results_.keys()
#grid_out.cv_results_["params"]

In [None]:
#grid_out.cv_results_["mean_test_score"]

In [None]:
# distribution of the mean cross-validated score over every parameter combination tried
fig, ax = plt.subplots()
ax.hist(grid_out.cv_results_["mean_test_score"])

# label the figure so it can be read on its own; the trailing `;` hides the repr
ax.set(title="Grid search: mean CV score per parameter combination",
       xlabel="mean test-fold score (accuracy)",
       ylabel="number of parameter combinations");

# Previous runs

In [None]:
%%time
# 2020-12-08, PCA(n_components=20), RandomForestClassifier(max_depth=40, n_estimators = 100, 'scaler': None}
# accuracy score of 73.9%
# file: 2020-12-10 00:30:04 GridCVresults.pkl

file_name = "2020-12-10 00:30:04 GridCVresults.pkl"

if path.exists(file_name):

    infile = open(file_name,"rb")
    grid_out = pickle.load(infile)
    infile.close()

else:

    # pipeline initiation
    steps = [('scaler', StandardScaler()),        # step 1: scale
             ('dim_reduction', PCA()),            # step 2: PCA
             ('model', RandomForestClassifier())] # step 3: fit a regressor model
        
    model_pipeline = Pipeline(steps)
    
    # model parameters
    forest_params = {'model': [RandomForestClassifier()],
                     'model__max_depth': [1, 10, 40, 80],
                     'model__n_estimators': [10, 50, 100],
                     'scaler': [None, StandardScaler()],
                     'dim_reduction': [PCA()],
                     'dim_reduction__n_components':[5, 20, 50, 100],
                    }

    boost_params = {'model': [GradientBoostingClassifier()],
                    'model__max_depth': [1, 3, 80],
                    'model__min_samples_leaf': [1, 50, 200],
                    'model__learning_rate': [0.1, 0.5],
                    'model__n_estimators': [10, 50, 100],
                    'scaler': [None, StandardScaler()],
                    'dim_reduction': [PCA()],
                    'dim_reduction__n_components': [5, 20, 50, 100]
                   }

    # create our grid
    parameter_grid = [forest_params, boost_params]
    grid_out = GridSearchCV(model_pipeline, parameter_grid, cv=5, verbose=1, n_jobs = 6, 
                            scoring = "accuracy")
    
    # fit
    grid_out.fit(X_train_bal, y_train_bal)
     
    # write results    
    file_name = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " GridCVresults.pkl"
    pickle.dump(grid_out, open(file_name, "wb") )
    print("wrote:",file_name)

In [None]:
# try to reproduce the highest accuracy score, this time on the held-out test set

from sklearn.metrics import accuracy_score

# sanity-check the shapes going in
print(X_train_bal.shape)
print(X_test.shape)
print(y_test.shape)

# `grid_out` wraps the whole pipeline (scaler -> PCA -> model), so it must be given the
# raw features. Projecting X_test with a separately fitted PCA first would apply PCA
# twice and hand the pipeline a different number of columns than it was fitted on.
# The search was fitted on a plain numpy array, so the test features are passed the
# same way.
y_pred_test = grid_out.predict(np.asarray(X_test))
print(f"Accuracy: {accuracy_score(y_test, y_pred_test)}")