In [1]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
#!pip install joblib
#!pip install keras
#!pip install tensorflow

In [3]:
# dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# read the cleaned data
df = pd.read_csv("../Data/ksi_cleaned.csv")
df.head()

Unnamed: 0,ACCNUM,DATE,TIME,HOUR,STREET1,STREET2,ROAD_CLASS,LATITUDE,LONGITUDE,LOCCOORD,...,TRUCK,TRSN_CITY_VEH,EMERG_VEH,SPEEDING,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,Hood_ID,Neighbourhood
0,893184,2006/01/01 05:00:00+00,236,2,WOODBINE AVE,O CONNOR DR,Major Arterial,43.699595,-79.318797,Intersection,...,No,No,No,Yes,Yes,No,Yes,No,60,Woodbine-Lumsden (60)
1,893184,2006/01/01 05:00:00+00,236,2,WOODBINE AVE,O CONNOR DR,Major Arterial,43.699595,-79.318797,Intersection,...,No,No,No,Yes,Yes,No,Yes,No,60,Woodbine-Lumsden (60)
2,893184,2006/01/01 05:00:00+00,236,2,WOODBINE AVE,O CONNOR DR,Major Arterial,43.699595,-79.318797,Intersection,...,No,No,No,Yes,Yes,No,Yes,No,60,Woodbine-Lumsden (60)
3,893184,2006/01/01 05:00:00+00,236,2,WOODBINE AVE,O CONNOR DR,Major Arterial,43.699595,-79.318797,Intersection,...,No,No,No,Yes,Yes,No,Yes,No,60,Woodbine-Lumsden (60)
4,893184,2006/01/01 05:00:00+00,236,2,WOODBINE AVE,O CONNOR DR,Major Arterial,43.699595,-79.318797,Intersection,...,No,No,No,Yes,Yes,No,Yes,No,60,Woodbine-Lumsden (60)


# Select the features (columns)

In [5]:
# Select features then assign X and y 
X = df[["HOUR", "ROAD_CLASS" , "TRAFFCTL", "VISIBILITY", "LIGHT", "RDSFCOND"]]
y = np.array(df["ACCLASS"].values.tolist())
print(X.shape, y.shape)
X = X.astype(str)
X

(16093, 6) (16093,)


Unnamed: 0,HOUR,ROAD_CLASS,TRAFFCTL,VISIBILITY,LIGHT,RDSFCOND
0,2,Major Arterial,No Control,Clear,Dark,Wet
1,2,Major Arterial,No Control,Clear,Dark,Wet
2,2,Major Arterial,No Control,Clear,Dark,Wet
3,2,Major Arterial,No Control,Clear,Dark,Wet
4,2,Major Arterial,No Control,Clear,Dark,Wet
...,...,...,...,...,...,...
16088,23,Major Arterial,No Control,Clear,"Dark, artificial",Dry
16089,23,Major Arterial,No Control,Clear,"Dark, artificial",Dry
16090,15,Major Arterial,Traffic Signal,Clear,Daylight,Dry
16091,15,Major Arterial,Traffic Signal,Clear,Daylight,Dry


In [6]:
y

array(['Non-Fatal Injury', 'Non-Fatal Injury', 'Non-Fatal Injury', ...,
       'Non-Fatal Injury', 'Non-Fatal Injury', 'Non-Fatal Injury'],
      dtype='<U16')

In [7]:
from sklearn.preprocessing import LabelEncoder

# label-encode y values
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)
encoded_y


array([1, 1, 1, ..., 1, 1, 1])

In [8]:
# currently, 1 is non-fatal and 0 is fatal. Reverse this by:
# multiplying all by -1
# adding 1 to all of them
new_y = []

for i in encoded_y:
    j = (i*-1)+1
    new_y.append(j)

new_y = np.asarray(new_y)
new_y

array([0, 0, 0, ..., 0, 0, 0])

In [9]:
# generate binary values for features using get_dummies
hour_df = pd.get_dummies(X, columns=["HOUR"], prefix=["Hour"] )
road_class_df = pd.get_dummies(X, columns=["ROAD_CLASS"], prefix=["Road_class"])
traffictl_df = pd.get_dummies(X, columns=["TRAFFCTL"], prefix=["Traffctl"])
visibility_df = pd.get_dummies(X, columns=["VISIBILITY"], prefix=["Visibility"])
light_df = pd.get_dummies(X, columns=["LIGHT"], prefix=["Light"])
rdsfcond_df = pd.get_dummies(X, columns=["RDSFCOND"], prefix=["Rdsfcond"])


In [10]:
# combine with main features (X) on key values
encoded_X = pd.concat([X, hour_df, road_class_df, traffictl_df, visibility_df,
                           light_df, rdsfcond_df],axis=1)
encoded_X

Unnamed: 0,HOUR,ROAD_CLASS,TRAFFCTL,VISIBILITY,LIGHT,RDSFCOND,ROAD_CLASS.1,TRAFFCTL.1,VISIBILITY.1,LIGHT.1,...,Rdsfcond_Dry,Rdsfcond_Ice,Rdsfcond_Loose Sand or Gravel,Rdsfcond_Loose Snow,Rdsfcond_Other,Rdsfcond_Packed Snow,Rdsfcond_Slush,Rdsfcond_Spilled liquid,Rdsfcond_Unknown,Rdsfcond_Wet
0,2,Major Arterial,No Control,Clear,Dark,Wet,Major Arterial,No Control,Clear,Dark,...,0,0,0,0,0,0,0,0,0,1
1,2,Major Arterial,No Control,Clear,Dark,Wet,Major Arterial,No Control,Clear,Dark,...,0,0,0,0,0,0,0,0,0,1
2,2,Major Arterial,No Control,Clear,Dark,Wet,Major Arterial,No Control,Clear,Dark,...,0,0,0,0,0,0,0,0,0,1
3,2,Major Arterial,No Control,Clear,Dark,Wet,Major Arterial,No Control,Clear,Dark,...,0,0,0,0,0,0,0,0,0,1
4,2,Major Arterial,No Control,Clear,Dark,Wet,Major Arterial,No Control,Clear,Dark,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16088,23,Major Arterial,No Control,Clear,"Dark, artificial",Dry,Major Arterial,No Control,Clear,"Dark, artificial",...,1,0,0,0,0,0,0,0,0,0
16089,23,Major Arterial,No Control,Clear,"Dark, artificial",Dry,Major Arterial,No Control,Clear,"Dark, artificial",...,1,0,0,0,0,0,0,0,0,0
16090,15,Major Arterial,Traffic Signal,Clear,Daylight,Dry,Major Arterial,Traffic Signal,Clear,Daylight,...,1,0,0,0,0,0,0,0,0,0
16091,15,Major Arterial,Traffic Signal,Clear,Daylight,Dry,Major Arterial,Traffic Signal,Clear,Daylight,...,1,0,0,0,0,0,0,0,0,0


In [11]:
# drop unnecessary columns
encoded_X.drop(columns=["HOUR", "ROAD_CLASS", "TRAFFCTL", "VISIBILITY", "LIGHT", "RDSFCOND"], inplace=True)
encoded_X

Unnamed: 0,Hour_0,Hour_1,Hour_10,Hour_11,Hour_12,Hour_13,Hour_14,Hour_15,Hour_16,Hour_17,...,Rdsfcond_Dry,Rdsfcond_Ice,Rdsfcond_Loose Sand or Gravel,Rdsfcond_Loose Snow,Rdsfcond_Other,Rdsfcond_Packed Snow,Rdsfcond_Slush,Rdsfcond_Spilled liquid,Rdsfcond_Unknown,Rdsfcond_Wet
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16088,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
16089,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
16090,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
16091,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [12]:
# convert df to array
encoded_X = encoded_X.values
encoded_X

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [13]:
from sklearn.model_selection import train_test_split

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(encoded_X, new_y, random_state=42)

# summarize
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

Train (12069, 74) (12069,)
Test (4024, 74) (4024,)


In [14]:
# Create a support vector machine linear classifer and fit it to the training data

from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [15]:
# Print the model score 
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.8658546689866601
Testing Data Score: 0.8620775347912525


In [16]:
# Calculate the classification report
predictions = model.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, predictions, target_names = ["Non-Fatal", "Fatal"]))

# we note the recall is very low for

              precision    recall  f1-score   support

   Non-Fatal       0.86      1.00      0.93      3467
       Fatal       0.55      0.02      0.04       557

    accuracy                           0.86      4024
   macro avg       0.70      0.51      0.48      4024
weighted avg       0.82      0.86      0.80      4024



# Hyperparameter Tuning

Using `GridSearchCV` to tune the model's parameters

In [17]:
# get the model parameters
print(model.get_params())

{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [18]:
# Create the GridSearchCV model

from sklearn.model_selection import GridSearchCV

# create parameter grid
param_grid2 = {'C': [0.1, 1, 5, 10, 50, 100],
              'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.9]}

#param_grid2 = {'C': [0.1, 1, 5, 10],
#             'gamma': [0.00001, 0.0001, 0.001, 0.01]}

grid2 = GridSearchCV(model, param_grid2, verbose=4, scoring='recall')

In [19]:
y_train

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
# Train the model with GridSearch
grid2.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] C=0.1, gamma=1e-05 ..............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................. C=0.1, gamma=1e-05, score=0.015, total=   2.3s
[CV] C=0.1, gamma=1e-05 ..............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.3s remaining:    0.0s


[CV] .................. C=0.1, gamma=1e-05, score=0.012, total=   2.3s
[CV] C=0.1, gamma=1e-05 ..............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.6s remaining:    0.0s


[CV] .................. C=0.1, gamma=1e-05, score=0.018, total=   2.3s
[CV] C=0.1, gamma=1e-05 ..............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.9s remaining:    0.0s


[CV] .................. C=0.1, gamma=1e-05, score=0.012, total=   2.4s
[CV] C=0.1, gamma=1e-05 ..............................................
[CV] .................. C=0.1, gamma=1e-05, score=0.012, total=   2.3s
[CV] C=0.1, gamma=0.0001 .............................................
[CV] ................. C=0.1, gamma=0.0001, score=0.015, total=   2.3s
[CV] C=0.1, gamma=0.0001 .............................................
[CV] ................. C=0.1, gamma=0.0001, score=0.012, total=   2.3s
[CV] C=0.1, gamma=0.0001 .............................................
[CV] ................. C=0.1, gamma=0.0001, score=0.018, total=   2.2s
[CV] C=0.1, gamma=0.0001 .............................................
[CV] ................. C=0.1, gamma=0.0001, score=0.012, total=   2.2s
[CV] C=0.1, gamma=0.0001 .............................................
[CV] ................. C=0.1, gamma=0.0001, score=0.012, total=   2.2s
[CV] C=0.1, gamma=0.001 ..............................................
[CV] .

[CV] .................... C=5, gamma=1e-05, score=0.033, total=   2.6s
[CV] C=5, gamma=1e-05 ................................................
[CV] .................... C=5, gamma=1e-05, score=0.027, total=   2.8s
[CV] C=5, gamma=1e-05 ................................................
[CV] .................... C=5, gamma=1e-05, score=0.015, total=   2.8s
[CV] C=5, gamma=1e-05 ................................................
[CV] .................... C=5, gamma=1e-05, score=0.039, total=   2.7s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ................... C=5, gamma=0.0001, score=0.030, total=   2.7s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ................... C=5, gamma=0.0001, score=0.033, total=   2.7s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ................... C=5, gamma=0.0001, score=0.027, total=   2.7s
[CV] C=5, gamma=0.0001 ...............................................
[CV] .

[CV] ..................... C=10, gamma=0.9, score=0.039, total=   2.8s
[CV] C=50, gamma=1e-05 ...............................................
[CV] ................... C=50, gamma=1e-05, score=0.030, total=   3.7s
[CV] C=50, gamma=1e-05 ...............................................
[CV] ................... C=50, gamma=1e-05, score=0.033, total=   3.6s
[CV] C=50, gamma=1e-05 ...............................................
[CV] ................... C=50, gamma=1e-05, score=0.027, total=   3.6s
[CV] C=50, gamma=1e-05 ...............................................
[CV] ................... C=50, gamma=1e-05, score=0.015, total=   3.7s
[CV] C=50, gamma=1e-05 ...............................................
[CV] ................... C=50, gamma=1e-05, score=0.039, total=   3.5s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.030, total=   3.6s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .

[CV] .................... C=100, gamma=0.9, score=0.027, total=   4.4s
[CV] C=100, gamma=0.9 ................................................
[CV] .................... C=100, gamma=0.9, score=0.015, total=   4.6s
[CV] C=100, gamma=0.9 ................................................
[CV] .................... C=100, gamma=0.9, score=0.039, total=   4.3s


[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:  9.3min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 5, 10, 50, 100],
                         'gamma': [1e-05, 0.0001, 0.001, 0.01, 0.1, 0.9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='recall', verbose=4)

In [21]:
print(grid2.best_params_)
print(grid2.best_score_)

{'C': 5, 'gamma': 1e-05}
0.029081754096859834


In [22]:
best2 = grid2.best_estimator_

In [23]:
# Make predictions with the hypertuned model
predictions2 = best2.predict(X_test)

In [24]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions2, target_names = ["Non-Fatal", "Fatal"]))

              precision    recall  f1-score   support

   Non-Fatal       0.86      1.00      0.93      3467
       Fatal       0.55      0.02      0.04       557

    accuracy                           0.86      4024
   macro avg       0.70      0.51      0.48      4024
weighted avg       0.82      0.86      0.80      4024

