## Best Model for Predicting NYC Motor Vehicle Collisions -  

Random Forest for Classification is the best model for predicting NYC Collisions. Below is the final code for this model which would be used hereafter for putting as a web service using Flask and local deployment using Docker.

In [1]:
# loading all the basic libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import calendar

Loading the Dataset -

In [2]:
# Importing the data from source as a csv file and converting it into a Pandas DataFrame:
data = pd.read_csv('NYC_Motor_Vehicle_Collisions_to_Person.csv')

# Viewing the snapshot of first 5 rows of the loaded dataset:
data.head(3)

Unnamed: 0,CRASH_DATE,CRASH_TIME,PERSON_INJURY,PERSON_AGE,BODILY_INJURY,SAFETY_EQUIPMENT,PERSON_SEX,PERSON_TYPE,PED_LOCATION,CONTRIBUTING_FACTOR_2,...,COMPLAINT,EMOTIONAL_STATUS,VEHICLE_ID,PERSON_ID,CONTRIBUTING_FACTOR_1,POSITION_IN_VEHICLE,PED_ROLE,UNIQUE_ID,PED_ACTION,COLLISION_ID
0,2021-05-02,21:00,Killed,62.0,Head,,F,Pedestrian,Pedestrian/Bicyclist/Other Pedestrian at Inter...,Pedestrian/Bicyclist/Other Pedestrian Error/Co...,...,Severe Bleeding,Apparent Death,,f2f329b6-2dfc-4bd0-b751-2e4255f1ea06,Traffic Control Disregarded,,Pedestrian,11791937,Crossing Against Signal,4412948
1,2021-05-21,0:00,Killed,24.0,Entire Body,Air Bag Deployed,M,Occupant,,,...,Internal,Apparent Death,19986231.0,e27e12a2-0485-4e22-b692-3f8a765d2582,,Driver,Driver,11819198,,4419608
2,2021-10-15,2:00,Killed,30.0,Head,,M,Occupant,,,...,Internal,Apparent Death,20091024.0,1a6f5aa7-5125-4be2-8499-fe7977cb0d90,,Driver,Driver,11998317,,4467504


Data Cleaning and Formatting -

In [3]:
# dropping irrelevant columns from the dataset and assigning remaining dataset to a variable:
NYC_df = data.drop(['VEHICLE_ID', 'PERSON_ID','UNIQUE_ID','COLLISION_ID'], axis = 1)

# Imputing missing values in PERSON_AGE column with mean PERSON_AGE values:
NYC_df['PERSON_AGE'] = NYC_df['PERSON_AGE'].fillna(np.mean(data['PERSON_AGE']))

# Imputing missing values in Other columns with 'Unknown' or Most common values:
NYC_df['SAFETY_EQUIPMENT'].fillna("Unknown",inplace = True)
NYC_df['PED_LOCATION'].fillna("Unknown",inplace = True)
NYC_df['CONTRIBUTING_FACTOR_2'].fillna("Unspecified",inplace = True)
NYC_df['EJECTION'].fillna("Not Ejected",inplace = True)
NYC_df['CONTRIBUTING_FACTOR_1'].fillna("Unspecified",inplace = True)
NYC_df['POSITION_IN_VEHICLE'].fillna("Unknown",inplace = True)
NYC_df['PED_ACTION'].fillna("Unknown",inplace = True)

# converting the 'Crash Date' column to datetime format:
NYC_df['CRASH_DATE']= pd.to_datetime(NYC_df['CRASH_DATE'])

# converting the 'Person Age' column to integer format:
NYC_df['PERSON_AGE']= NYC_df['PERSON_AGE'].astype('int64')

# Replacing some values in each column with specific values:
# Changing "Does Not Apply" to "Unknown"or "None" 
NYC_df['BODILY_INJURY'].replace('Does Not Apply','None',inplace=True)  
NYC_df['PERSON_SEX'].replace('U','M',inplace=True)
NYC_df['PED_LOCATION'].replace('Does Not Apply','Unknown',inplace=True)
NYC_df['COMPLAINT'].replace('Does Not Apply','Unknown',inplace=True)
NYC_df['EMOTIONAL_STATUS'].replace('Does Not Apply','Unknown',inplace=True)
NYC_df['PED_ACTION'].replace('Does Not Apply','Unknown',inplace=True)
NYC_df['PERSON_INJURY'].replace({'Injured': 0 ,'Killed': 1},inplace=True)

In [4]:
# converting Months to abbreviated Month Names:
month_name = []

# Extracting Months from Crash_Date and using it to get abbreviated month names using for loop:
crash_month = pd.DatetimeIndex(NYC_df['CRASH_DATE']).month
for i in crash_month:
    mnth_abb = calendar.month_abbr[i]
    month_name.append(mnth_abb)

# assigning month name to a column
NYC_df['CRASH_Mnth_Name'] = month_name

Splitting the Data and getting the Feature Matrix & Target variables -

In [5]:
# splitting the dataset using sklearn into 60-20-20:
# Step 1 - splitting dataset into full train and test subsets first:
df_full_train, df_test = train_test_split(NYC_df, test_size=0.2,random_state=1) 

# Step 2 - splitting full train subset again into training set and validation set:
df_train, df_val = train_test_split(df_full_train, test_size=0.25,random_state = 1)

# Resetting indices for each of the subset: 
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Getting our target variable column ('PERSON_INJURY') subsets as respective Numpy arrays:
y_train = df_train.PERSON_INJURY
y_val = df_val.PERSON_INJURY
y_test = df_test.PERSON_INJURY

# Deleting 'PERSON_INJURY' or target column from feature matrix subsets:
del df_train['PERSON_INJURY']
del df_val['PERSON_INJURY']
del df_test['PERSON_INJURY']

# Re-checking the size of 3 subsets after deleting the target column:
df_train.shape, df_val.shape, df_test.shape

((27401, 17), (9134, 17), (9134, 17))

Predicting on Test data using our Final Model (Random Forest for Classification) -

In [6]:
label_encoder = LabelEncoder()
# resetting indices of full_train DataFrame:
df_full_train = df_full_train.reset_index(drop=True)

# Encode string class values of target variable PERSON_INJURY column as integers - using LabelEncoder():
#label_encoder = LabelEncoder()
y_full_train = label_encoder.fit_transform(df_full_train.PERSON_INJURY)

In [7]:
# Converting the CRASH_DATE column as a Timestamp and converting it into an integer data type for 
# df_full_train and df_test subsets:
df_full_train['CRASH_DATE'] = df_full_train['CRASH_DATE'].map(pd.Timestamp.timestamp).astype(int)
df_test['CRASH_DATE'] = df_test['CRASH_DATE'].map(pd.Timestamp.timestamp).astype(int)

In [8]:
# turning the full train df into dictionaries:
dicts_full_train = df_full_train.to_dict(orient='records')

# instantiating the vectorizer instance:
dv = DictVectorizer(sparse=False)

# turning list of dictionaries into full train feature matrix
X_full_train = dv.fit_transform(dicts_full_train)

# turning the test df into dictionaries:
dicts_test = df_test.to_dict(orient='records')

# turning list of dictionaries into testing feature matrix
X_test = dv.transform(dicts_test)

In [9]:
# Using the Final Random Forest Classifier -
rf = RandomForestClassifier(random_state=42, max_depth=15, min_samples_leaf = 1, max_features = 8, n_estimators = 70)

# training our train set with above optimal parameters:
model = rf.fit(X_full_train, y_full_train)

# predicting the Random Forest for Classification model on the testing set:
y_pred2 = rf.predict_proba(X_test)[:,1]

# computing the AUC score on testing set:
print('AUC on test Rand_Forest: %.3f' % roc_auc_score(y_test, y_pred2))

AUC on test Rand_Forest: 0.998


#### Using KFold Cross-Validation on our Final Model for making Predictions - 

(making 5-fold cross-validation)

In [10]:
# Step 1 -
# Function 1 - Creating a function to train our DataFrame:
def train(df_train, y_train):
    dicts = df_train.to_dict(orient='records')
    
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)
    
    model = rf.fit(X_train, y_train)
    
    return dv, model

In [11]:
# Step 2 - 
# Function 2 - Creating another function to predict:
def predict(df, dv, model):
    dicts = df.to_dict(orient='records')  # converts df to list of dictionaries
    
    X = dv.transform(dicts)  # creates a feature matrix using the vectorizer
    y_pred = model.predict(X) # uses the model
    
    return y_pred

In [12]:
# specifying the number of folds to be used:
n_splits = 5

In [13]:
# Performing K-fold Cross validation and evaluating the AUC scores after each iteration:
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    
    # Selecting part of dataset as 3 subsets for model:
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]
    
    y_train = df_train.PERSON_INJURY.values
    y_val = df_val.PERSON_INJURY.values
    
    dv, model = train(df_train, y_train)   # using train function created
    y_pred = predict(df_val, dv, model)   # using predict function created
    
    # compute auc scores for each iteration or fold in KFold:
    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)
    
# Computing mean of AUC scores and spread of AUC score:
print('%.3f +- %.3f' % (np.mean(scores), np.std(scores)))

0.933 +- 0.018


In [14]:
# printing the listing of AUC scores in each fold:
scores

[0.9418604651162791, 0.9605263157894737, 0.9125, 0.9125, 0.9387755102040816]

In [15]:
# Now, Training our Final Model on Full train dataset and evaluating on test dataset -
dv, model = train(df_full_train, y_full_train)  
y_pred = predict(df_test, dv, model)   # using predict function created

# compute auc for ROC Curve:
auc = roc_auc_score(y_test, y_pred)
auc

0.5232558139534884

#### Saving the Model -

In [16]:
import pickle

In [17]:
# Step 1 - taking our model and writing it to a file - 
# creating a file where we'll write it:
output_file = f'model.bin'                  
output_file

'model.bin'

In [18]:
# write a Binary file using pickle - alternative to open and close codes we use with open to automatically open-close a file:
with open(output_file, 'wb') as f_out:    # file output
    pickle.dump((dv, model), f_out)

#### Loading the Model -

In [19]:
import pickle

In [20]:
# creating a variable with our model file:
input_file = 'model.bin'

# loads our model file: 
with open(input_file, 'rb') as f_in:    # file input; rb - used to read the file
    dv, model = pickle.load(f_in)     # load() function reads from the file

In [21]:
model

RandomForestClassifier(max_depth=15, max_features=8, n_estimators=70,
                       random_state=42)

In [22]:
sample_collided_person = {'CRASH_DATE': 1618704000,
                          'CRASH_TIME' : 12,
                          'PERSON_AGE' : 45,
                          'BODILY_INJURY' : 'Head', 
                          'SAFETY_EQUIPMENT' : 'Lap Belt & Harness',
                          'PERSON_SEX' : 'M', 
                          'PERSON_TYPE' : 'Pedestrian',
                          'PED_LOCATION': 'Unknown', 
                          'CONTRIBUTING_FACTOR_2' : 'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion', 
                          'EJECTION' : 'Not Ejected',
                          'COMPLAINT' : 'Internal',
                          'EMOTIONAL_STATUS' : 'Apparent Death', 
                          'CONTRIBUTING_FACTOR_1' : 'Unspecified', 
                          'POSITION_IN_VEHICLE' : 'Unknown',
                          'PED_ROLE' : 'Pedestrian', 
                          'PED_ACTION' : 'Crossing With Signal', 
                          'CRASH_Mnth_Name' : 'Jun'}

In [23]:
# transforming the sample_collided_person's feature details into a dictionary using DictVectorizer:
X = dv.transform([sample_collided_person])

In [24]:
# make prediction on sample person using our model: 
y_pred = model.predict_proba(X)[:,1]

In [25]:
print('input:', sample_collided_person)
print('output:', float(y_pred)) 

input: {'CRASH_DATE': 1618704000, 'CRASH_TIME': 12, 'PERSON_AGE': 45, 'BODILY_INJURY': 'Head', 'SAFETY_EQUIPMENT': 'Lap Belt & Harness', 'PERSON_SEX': 'M', 'PERSON_TYPE': 'Pedestrian', 'PED_LOCATION': 'Unknown', 'CONTRIBUTING_FACTOR_2': 'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion', 'EJECTION': 'Not Ejected', 'COMPLAINT': 'Internal', 'EMOTIONAL_STATUS': 'Apparent Death', 'CONTRIBUTING_FACTOR_1': 'Unspecified', 'POSITION_IN_VEHICLE': 'Unknown', 'PED_ROLE': 'Pedestrian', 'PED_ACTION': 'Crossing With Signal', 'CRASH_Mnth_Name': 'Jun'}
output: 0.3589322914227006


In [27]:
# Specifying the collision_injury decision for our model by specifying the threshold >= 0.55:
collision_injury = float(y_pred) >= 0.55 
collision_injury

False