In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,  make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score

from sklearn.dummy import DummyClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

from sklearn.multioutput import MultiOutputClassifier

import pickle

# Goal: Create an second model

In [2]:
unpickleFile = open('./Data/clean_joined_df.pkl', 'rb')
df = pickle.load(unpickleFile, encoding='bytes')
df.head()

Unnamed: 0,DRIVERS_LICENSE_CLASS,SEX,AGE,SAFETY_EQUIPMENT,UNIT_TYPE,NUM_PASSENGERS,MAKE,MODEL,VEHICLE_DEFECT,VEHICLE_TYPE,...,ROAD_DEFECT,CRASH_TYPE,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,STREET_DIRECTION,BEAT_OF_OCCURRENCE,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,INJURY_LEVEL
0,D,M,25.0,NONE PRESENT,DRIVER,0.0,HONDA,CIVIC,UNKNOWN,PASSENGER,...,NO DEFECTS,NO INJURY / DRIVE AWAY,UNABLE TO DETERMINE,NOT APPLICABLE,N,1531.0,3.0,7.0,9.0,0
1,,M,37.0,SAFETY BELT USED,DRIVER,0.0,BUICK,ENCORE,NONE,PASSENGER,...,NO DEFECTS,NO INJURY / DRIVE AWAY,IMPROPER OVERTAKING/PASSING,FAILING TO REDUCE SPEED TO AVOID CRASH,W,613.0,22.0,2.0,4.0,0
2,,X,,USAGE UNKNOWN,DRIVER,0.0,UNKNOWN,UNKNOWN,NONE,PASSENGER,...,NO DEFECTS,NO INJURY / DRIVE AWAY,DRIVING SKILLS/KNOWLEDGE/EXPERIENCE,NOT APPLICABLE,W,821.0,5.0,1.0,11.0,0
3,,X,,USAGE UNKNOWN,DRIVER,0.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN/NA,...,NO DEFECTS,NO INJURY / DRIVE AWAY,UNABLE TO DETERMINE,UNABLE TO DETERMINE,N,2023.0,8.0,1.0,11.0,0
4,,X,,USAGE UNKNOWN,DRIVER,0.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN/NA,...,NO DEFECTS,NO INJURY / DRIVE AWAY,UNABLE TO DETERMINE,NOT APPLICABLE,S,223.0,10.0,1.0,11.0,0


In [3]:
df.shape

(1088146, 30)

In [5]:
df.dtypes

DRIVERS_LICENSE_CLASS       object
SEX                         object
AGE                        float64
SAFETY_EQUIPMENT            object
UNIT_TYPE                   object
NUM_PASSENGERS             float64
MAKE                        object
MODEL                       object
VEHICLE_DEFECT              object
VEHICLE_TYPE                object
VEHICLE_USE                 object
MANEUVER                    object
FIRST_CONTACT_POINT         object
POSTED_SPEED_LIMIT         float64
WEATHER_CONDITION           object
LIGHTING_CONDITION          object
FIRST_CRASH_TYPE            object
TRAFFICWAY_TYPE             object
ALIGNMENT                   object
ROADWAY_SURFACE_COND        object
ROAD_DEFECT                 object
CRASH_TYPE                  object
PRIM_CONTRIBUTORY_CAUSE     object
SEC_CONTRIBUTORY_CAUSE      object
STREET_DIRECTION            object
BEAT_OF_OCCURRENCE         float64
CRASH_HOUR                 float64
CRASH_DAY_OF_WEEK          float64
CRASH_MONTH         

In [6]:
df.WEATHER_CONDITION.value_counts()

CLEAR                       866704
RAIN                         97693
UNKNOWN                      43677
SNOW                         38557
CLOUDY/OVERCAST              33056
OTHER                         3261
FREEZING RAIN/DRIZZLE         1646
FOG/SMOKE/HAZE                1542
SLEET/HAIL                    1392
BLOWING SNOW                   446
SEVERE CROSS WIND GATE         164
BLOWING SAND, SOIL, DIRT         8
Name: WEATHER_CONDITION, dtype: int64

In [3]:
df.columns

Index(['DRIVERS_LICENSE_CLASS', 'SEX', 'AGE', 'SAFETY_EQUIPMENT', 'UNIT_TYPE',
       'NUM_PASSENGERS', 'MAKE', 'MODEL', 'VEHICLE_DEFECT', 'VEHICLE_TYPE',
       'VEHICLE_USE', 'MANEUVER', 'FIRST_CONTACT_POINT', 'POSTED_SPEED_LIMIT',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT',
       'CRASH_TYPE', 'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE',
       'STREET_DIRECTION', 'BEAT_OF_OCCURRENCE', 'CRASH_HOUR',
       'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'INJURY_LEVEL'],
      dtype='object')

In [4]:
df.isna().sum()

DRIVERS_LICENSE_CLASS      388727
SEX                             0
AGE                        291805
SAFETY_EQUIPMENT                0
UNIT_TYPE                       0
NUM_PASSENGERS                  0
MAKE                            0
MODEL                           0
VEHICLE_DEFECT                  0
VEHICLE_TYPE                    0
VEHICLE_USE                     0
MANEUVER                        0
FIRST_CONTACT_POINT             0
POSTED_SPEED_LIMIT              0
WEATHER_CONDITION               0
LIGHTING_CONDITION              0
FIRST_CRASH_TYPE                0
TRAFFICWAY_TYPE                 0
ALIGNMENT                       0
ROADWAY_SURFACE_COND            0
ROAD_DEFECT                     0
CRASH_TYPE                      0
PRIM_CONTRIBUTORY_CAUSE         0
SEC_CONTRIBUTORY_CAUSE          0
STREET_DIRECTION                0
BEAT_OF_OCCURRENCE              0
CRASH_HOUR                      0
CRASH_DAY_OF_WEEK               0
CRASH_MONTH                     0
INJURY_LEVEL  

For most nan's, we've been able to engineer our data or drop most rows that have nan's.

For the nan's that are leftover, they are in 'DRIVERS_LICENSE_CLASS' and 'AGE'.
In our pipelines, we'll need to impute fixes for these nan's:
- `DRIVERS_LICENSE_CLASS` - Fill with most common(assuming people are driving with a license)
- `AGE` - Fill with mean

We'll need to do this with pipelines to avoid data leakage.

We're only going to use a couple of variables for our target for our FSM.

We're also going to limit the amount of classes we're classifiying upon.

In [19]:
# Drop rows with a cause that has less than 10k total
drop_list = []
for cause, number in zip(df.PRIM_CONTRIBUTORY_CAUSE.value_counts().index, df.PRIM_CONTRIBUTORY_CAUSE.value_counts()):
    if number < 10000:
        drop_list.append(cause)
df.PRIM_CONTRIBUTORY_CAUSE.value_counts()

UNABLE TO DETERMINE                                                                 380362
FAILING TO YIELD RIGHT-OF-WAY                                                       133013
FOLLOWING TOO CLOSELY                                                               132795
IMPROPER OVERTAKING/PASSING                                                          57294
FAILING TO REDUCE SPEED TO AVOID CRASH                                               51012
NOT APPLICABLE                                                                       49737
IMPROPER LANE USAGE                                                                  43608
IMPROPER BACKING                                                                     41129
IMPROPER TURNING/NO SIGNAL                                                           39791
DRIVING SKILLS/KNOWLEDGE/EXPERIENCE                                                  30957
DISREGARDING TRAFFIC SIGNALS                                                         25028

In [20]:
df = df[~df['PRIM_CONTRIBUTORY_CAUSE'].isin(drop_list)]
df.PRIM_CONTRIBUTORY_CAUSE.value_counts()

UNABLE TO DETERMINE                                                                 380362
FAILING TO YIELD RIGHT-OF-WAY                                                       133013
FOLLOWING TOO CLOSELY                                                               132795
IMPROPER OVERTAKING/PASSING                                                          57294
FAILING TO REDUCE SPEED TO AVOID CRASH                                               51012
NOT APPLICABLE                                                                       49737
IMPROPER LANE USAGE                                                                  43608
IMPROPER BACKING                                                                     41129
IMPROPER TURNING/NO SIGNAL                                                           39791
DRIVING SKILLS/KNOWLEDGE/EXPERIENCE                                                  30957
DISREGARDING TRAFFIC SIGNALS                                                         25028

In [21]:
simple_X = df[['WEATHER_CONDITION', 'NUM_PASSENGERS', 'AGE']]
y = df[['PRIM_CONTRIBUTORY_CAUSE']]

X_train, X_test, y_train, y_test = train_test_split(simple_X, y, random_state=42)

In [22]:
# Create pipes
subpipe_num = Pipeline(steps=[
    ('num_impute', SimpleImputer(strategy='median')),
    ('ss', StandardScaler())
])


subpipe_cat = Pipeline(steps=[
    ('cat_impute',SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

CT = ColumnTransformer(transformers=[
    ('subpipe_num', subpipe_num, selector(dtype_include=np.number)),
    ('subpipe_cat', subpipe_cat, selector(dtype_include=object))], remainder='passthrough')

In [23]:
FSM_model_pipe = Pipeline(steps=[
    ('ct', CT),
    ('fsm', MultiOutputClassifier(LogisticRegression()))
])

In [24]:
fsm = FSM_model_pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
fsm.score(X_train, y_train)

0.3698445298725023

In [31]:
fsm.score(X_test, y_test)

0.3701697026311692

That score is a little rough, having only 37% score on unseen data. The 'good' news is that we're getting an almost identical score for the trained data which means the model performs identically on seen vs unseen data.

This score is slightly worse than before, when we ran the model without person/car info.

The next step is to iterate on this model and see how we can improve it.

For the next iteration we'll modify our target, as a target of 'unable to determine' isn't helpful. We may need to apply SMOTE to the imbalanced dataset.

We'll also apply gridsearch to find a more optimal model.