In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,  make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score

from pandas_profiling import ProfileReport

from sklearn.dummy import DummyClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

from sklearn.multioutput import MultiOutputClassifier

from sklearn.tree import DecisionTreeClassifier

import pickle

In [2]:
# Load the cleaned, joined crash data produced earlier in the project.
# NOTE(review): pickle.load can execute arbitrary code while deserialising —
# only load pickle files that this project itself produced.
# The `with` block guarantees the file handle is closed even if loading fails.
with open('./Data/clean_joined_df.pkl', 'rb') as unpickleFile:
    df = pickle.load(unpickleFile, encoding='bytes')
df.head()

Unnamed: 0,DRIVERS_LICENSE_CLASS,SEX,AGE,SAFETY_EQUIPMENT,UNIT_TYPE,NUM_PASSENGERS,MAKE,MODEL,VEHICLE_DEFECT,VEHICLE_TYPE,...,ROAD_DEFECT,CRASH_TYPE,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,STREET_DIRECTION,BEAT_OF_OCCURRENCE,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,INJURY_LEVEL
0,D,M,25.0,NONE PRESENT,DRIVER,0.0,HONDA,CIVIC,UNKNOWN,PASSENGER,...,NO DEFECTS,NO INJURY / DRIVE AWAY,UNABLE TO DETERMINE,NOT APPLICABLE,N,1531.0,3.0,7.0,9.0,0
1,,M,37.0,SAFETY BELT USED,DRIVER,0.0,BUICK,ENCORE,NONE,PASSENGER,...,NO DEFECTS,NO INJURY / DRIVE AWAY,IMPROPER OVERTAKING/PASSING,FAILING TO REDUCE SPEED TO AVOID CRASH,W,613.0,22.0,2.0,4.0,0
2,,X,,USAGE UNKNOWN,DRIVER,0.0,UNKNOWN,UNKNOWN,NONE,PASSENGER,...,NO DEFECTS,NO INJURY / DRIVE AWAY,DRIVING SKILLS/KNOWLEDGE/EXPERIENCE,NOT APPLICABLE,W,821.0,5.0,1.0,11.0,0
3,,X,,USAGE UNKNOWN,DRIVER,0.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN/NA,...,NO DEFECTS,NO INJURY / DRIVE AWAY,UNABLE TO DETERMINE,UNABLE TO DETERMINE,N,2023.0,8.0,1.0,11.0,0
4,,X,,USAGE UNKNOWN,DRIVER,0.0,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN/NA,...,NO DEFECTS,NO INJURY / DRIVE AWAY,UNABLE TO DETERMINE,NOT APPLICABLE,S,223.0,10.0,1.0,11.0,0


In [3]:
# Dimensions (rows, columns) of the frame as loaded.
df.shape

(1088146, 30)

In [4]:
# Unfortunately, the data set is still not in a workable state.
# This will be a final pass over the data to:
# - reduce the scope (focus on the top 3-5 classes)
# - reduce the complexity (engineer new variables that simplify existing ones)
# - reduce the complexity (delete overly complex variables that can't be engineered)

In [5]:
# List the available columns before deciding which to screen on, engineer, or drop.
df.columns

Index(['DRIVERS_LICENSE_CLASS', 'SEX', 'AGE', 'SAFETY_EQUIPMENT', 'UNIT_TYPE',
       'NUM_PASSENGERS', 'MAKE', 'MODEL', 'VEHICLE_DEFECT', 'VEHICLE_TYPE',
       'VEHICLE_USE', 'MANEUVER', 'FIRST_CONTACT_POINT', 'POSTED_SPEED_LIMIT',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT',
       'CRASH_TYPE', 'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE',
       'STREET_DIRECTION', 'BEAT_OF_OCCURRENCE', 'CRASH_HOUR',
       'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'INJURY_LEVEL'],
      dtype='object')

In [6]:
# Count missing values per column.
df.isna().sum()

DRIVERS_LICENSE_CLASS      388727
SEX                             0
AGE                        291805
SAFETY_EQUIPMENT                0
UNIT_TYPE                       0
NUM_PASSENGERS                  0
MAKE                            0
MODEL                           0
VEHICLE_DEFECT                  0
VEHICLE_TYPE                    0
VEHICLE_USE                     0
MANEUVER                        0
FIRST_CONTACT_POINT             0
POSTED_SPEED_LIMIT              0
WEATHER_CONDITION               0
LIGHTING_CONDITION              0
FIRST_CRASH_TYPE                0
TRAFFICWAY_TYPE                 0
ALIGNMENT                       0
ROADWAY_SURFACE_COND            0
ROAD_DEFECT                     0
CRASH_TYPE                      0
PRIM_CONTRIBUTORY_CAUSE         0
SEC_CONTRIBUTORY_CAUSE          0
STREET_DIRECTION                0
BEAT_OF_OCCURRENCE              0
CRASH_HOUR                      0
CRASH_DAY_OF_WEEK               0
CRASH_MONTH                     0
INJURY_LEVEL  

In [7]:
# Distribution of UNIT_TYPE values before any screening.
df.UNIT_TYPE.value_counts()

DRIVER                 1088051
NON-CONTACT VEHICLE         95
Name: UNIT_TYPE, dtype: int64

In [8]:
# list of variables to screen with
# WILL TRY ON TARGET FIRST AND THEN SEE HOW MUCH SCREENING IS NECESSARY

# 'PRIM_CONTRIBUTORY_CAUSE' - limit to top 4:
# FAILING TO YIELD RIGHT-OF-WAY                                                       133013
# FOLLOWING TOO CLOSELY                                                               132795
# IMPROPER OVERTAKING/PASSING                                                          57294
# FAILING TO REDUCE SPEED TO AVOID CRASH                                               51012


# 'DRIVERS_LICENSE_CLASS' - only take people with a class D driver's license
# 'UNIT_TYPE' - only for drivers
# 'AGE' - only people 16 or above can use a drivers license

In [9]:
# list of variables to engineer
# SAFETY_EQUIPMENT - safety belt used/not used
# 'NUM_PASSENGERS' - has/does not have passengers
# 'VEHICLE_DEFECT' - defective/not defective
# 'VEHICLE_TYPE' - motorcycle/passenger/large passenger/large
# 'VEHICLE_USE' - personal/not-personal
# 'MANEUVER' - straight/turn/traffic/other
# 'FIRST_CONTACT_POINT' - LEAVE AS IS FOR NOW
# 'POSTED_SPEED_LIMIT' - BUCKET(LOW/MED/HIGH)
# 'WEATHER_CONDITION' - clear+unknown/rain/snow
# 'ROADWAY_SURFACE_COND' - DRY/WET/OTHER
# 'ROAD_DEFECT' - no defect/possible defect
# 'TRAFFICWAY_TYPE' - not divided/divided/other
# 'ALIGNMENT' - straight/curved


In [10]:
# Overly complicated: remove with prejudice
# 'MAKE', 'MODEL', 'FIRST_CRASH_TYPE', 'SEC_CONTRIBUTORY_CAUSE', 'BEAT_OF_OCCURRENCE'

In [11]:
# list of variables to remove after above steps
# 'DRIVERS_LICENSE_CLASS', 'SAFETY_EQUIPMENT', 'UNIT_TYPE', 'NUM_PASSENGERS', 'VEHICLE_DEFECT', 'VEHICLE_TYPE',
# 'VEHICLE_USE', 'POSTED_SPEED_LIMIT', 'WEATHER_CONDITION', 'ROADWAY_SURFACE_COND', 'TRAFFICWAY_TYPE', 'ALIGNMENT',
# 'ROAD_DEFECT'
# ('MANEUVER' and 'FIRST_CONTACT_POINT' are kept as-is for now)

In [12]:
# SCREEN pass 1: target reduction — keep only rows whose primary cause is one of the selected four
cause_list = [
    'FAILING TO YIELD RIGHT-OF-WAY',
    'FOLLOWING TOO CLOSELY',
    'IMPROPER OVERTAKING/PASSING',
    'FAILING TO REDUCE SPEED TO AVOID CRASH',
]
is_target_cause = df['PRIM_CONTRIBUTORY_CAUSE'].isin(cause_list)
df = df.loc[is_target_cause]

In [13]:
# Dimensions after restricting to the selected primary causes.
df.shape

(374114, 30)

In [14]:
# SCREEN pass 2: keep only rows with a class D driver's license
has_class_d = df['DRIVERS_LICENSE_CLASS'] == 'D'
df = df.loc[has_class_d]
df['DRIVERS_LICENSE_CLASS'].value_counts()

D    240793
Name: DRIVERS_LICENSE_CLASS, dtype: int64

In [15]:
# SCREEN pass 3: keep only rows whose UNIT_TYPE is DRIVER
is_driver = df['UNIT_TYPE'] == 'DRIVER'
df = df.loc[is_driver]
df['UNIT_TYPE'].value_counts()

DRIVER    240784
Name: UNIT_TYPE, dtype: int64

In [16]:
# SCREEN pass 4: keep ages 16 and above (rows with a missing AGE fail the comparison and are dropped too)
of_driving_age = df['AGE'].ge(16)
df = df.loc[of_driving_age]
df['AGE'].value_counts()

25.0     7099
27.0     7023
26.0     6925
28.0     6742
24.0     6689
         ... 
97.0        4
103.0       3
108.0       2
102.0       1
104.0       1
Name: AGE, Length: 89, dtype: int64

In [17]:
# Dimensions after all four screening passes.
df.shape

(230514, 30)

In [18]:
# DELETE pass: drop the columns judged too complex to engineer
overly_complex_cols = [
    'MAKE',
    'MODEL',
    'FIRST_CRASH_TYPE',
    'SEC_CONTRIBUTORY_CAUSE',
    'BEAT_OF_OCCURRENCE',
]
df.drop(columns=overly_complex_cols, inplace=True)
df.columns

Index(['DRIVERS_LICENSE_CLASS', 'SEX', 'AGE', 'SAFETY_EQUIPMENT', 'UNIT_TYPE',
       'NUM_PASSENGERS', 'VEHICLE_DEFECT', 'VEHICLE_TYPE', 'VEHICLE_USE',
       'MANEUVER', 'FIRST_CONTACT_POINT', 'POSTED_SPEED_LIMIT',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'TRAFFICWAY_TYPE',
       'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'CRASH_TYPE',
       'PRIM_CONTRIBUTORY_CAUSE', 'STREET_DIRECTION', 'CRASH_HOUR',
       'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'INJURY_LEVEL'],
      dtype='object')

In [19]:
# Distribution of CRASH_TYPE in the screened data.
df.CRASH_TYPE.value_counts()

NO INJURY / DRIVE AWAY              160364
INJURY AND / OR TOW DUE TO CRASH     70150
Name: CRASH_TYPE, dtype: int64

In [20]:
# ENGINEER pass: create function to engineer new variables for all required cols

def Engineer_Vars(data):
    """Add simplified ``*_simple`` categorical columns to ``data`` in place.

    Each new column collapses one raw column into a small set of coarse
    string labels. The raw columns themselves are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns SAFETY_EQUIPMENT, NUM_PASSENGERS,
        VEHICLE_DEFECT, VEHICLE_TYPE, VEHICLE_USE, POSTED_SPEED_LIMIT,
        WEATHER_CONDITION, ROADWAY_SURFACE_COND, ROAD_DEFECT,
        TRAFFICWAY_TYPE and ALIGNMENT.

    Returns
    -------
    None
        ``data`` is modified in place.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    # 'SAFETY_EQUIPMENT' - safety belt or helmet used / not used + unknown
    protected = data['SAFETY_EQUIPMENT'].isin(
        ['SAFETY BELT USED', 'DOT COMPLIANT MOTORCYCLE HELMET'])
    data['se_simple'] = np.where(
        protected, 'safety belt/helmet used', 'NO safety belt/helmet + unknown')

    # 'NUM_PASSENGERS' - has / does not have passengers
    data['passengers_simple'] = np.where(
        data['NUM_PASSENGERS'] > 0, 'has passengers', 'no passengers')

    # 'VEHICLE_DEFECT' - not defective / defective + unknown
    data['defect_simple'] = np.where(
        data['VEHICLE_DEFECT'] == 'NONE', 'not defective', 'defective/unknown')

    # 'VEHICLE_TYPE' - passenger / other
    data['vehicletype_simple'] = np.where(
        data['VEHICLE_TYPE'] == 'PASSENGER', 'passenger car', 'other')

    # 'VEHICLE_USE' - personal / not-personal
    data['vehicleuse_simple'] = np.where(
        data['VEHICLE_USE'] == 'PERSONAL', 'personal vehicle', 'non-personal vehicle')

    # 'MANEUVER' - left as is for now (too complicated to bucket)
    # 'FIRST_CONTACT_POINT' - left as is for now

    # 'POSTED_SPEED_LIMIT' - bucket into low (<30) / med (30-39) / high (>=40)
    speed = data['POSTED_SPEED_LIMIT']
    data['speedlimit_simple'] = np.select(
        [speed < 30, (speed >= 30) & (speed < 40), speed >= 40],
        ['low', 'med', 'high'],
        # The three conditions cover every non-missing number, so the default
        # is only reached when the speed limit is missing.
        default='low')

    # 'WEATHER_CONDITION' - clear+unknown / rain / snow / other
    # NOTE(review): CLOUDY/OVERCAST is bucketed as 'rain' — confirm this is intended.
    weather = data['WEATHER_CONDITION']
    data['weather_simple'] = np.select(
        [weather.isin(['CLEAR', 'UNKNOWN']),
         weather.isin(['RAIN', 'CLOUDY/OVERCAST']),
         weather == 'SNOW'],
        ['clear/unknown', 'rain', 'snow'],
        default='other')

    # 'ROADWAY_SURFACE_COND' - dry / H20 (wet, snow or slush, ice) / other
    surface = data['ROADWAY_SURFACE_COND']
    data['roadcond_simple'] = np.select(
        [surface == 'DRY', surface.isin(['WET', 'SNOW OR SLUSH', 'ICE'])],
        ['dry', 'H20'],
        default='other')

    # 'ROAD_DEFECT' - no defect / possible defect
    data['roaddef_simple'] = np.where(
        data['ROAD_DEFECT'] == 'NO DEFECTS', 'no road defect', 'possible road defect')

    # 'TRAFFICWAY_TYPE' - not divided / divided / other
    trafficway = data['TRAFFICWAY_TYPE']
    data['trafficway_simple'] = np.select(
        [trafficway.isin(['NOT DIVIDED', 'ONE-WAY']),
         trafficway.isin(['DIVIDED - W/MEDIAN (NOT RAISED)', 'DIVIDED - W/MEDIAN BARRIER'])],
        ['not divided', 'divided'],
        default='other')

    # 'ALIGNMENT' - straight / curved
    # na=False keeps the mask strictly boolean: without it a missing value
    # becomes NaN in the mask, which np.where treats as truthy, so missing
    # alignments would be labelled 'straight'.
    is_straight = data['ALIGNMENT'].str.contains('STRAIGHT', na=False)
    data['alignment_simple'] = np.where(is_straight, 'straight', 'curved')
    
    
    

In [21]:
# Engineer_Vars adds the *_simple columns to df in place; list the columns to confirm they were added.
Engineer_Vars(df)
df.columns

Index(['DRIVERS_LICENSE_CLASS', 'SEX', 'AGE', 'SAFETY_EQUIPMENT', 'UNIT_TYPE',
       'NUM_PASSENGERS', 'VEHICLE_DEFECT', 'VEHICLE_TYPE', 'VEHICLE_USE',
       'MANEUVER', 'FIRST_CONTACT_POINT', 'POSTED_SPEED_LIMIT',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'TRAFFICWAY_TYPE',
       'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'CRASH_TYPE',
       'PRIM_CONTRIBUTORY_CAUSE', 'STREET_DIRECTION', 'CRASH_HOUR',
       'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'INJURY_LEVEL', 'se_simple',
       'passengers_simple', 'defect_simple', 'vehicletype_simple',
       'vehicleuse_simple', 'speedlimit_simple', 'weather_simple',
       'roadcond_simple', 'roaddef_simple', 'trafficway_simple',
       'alignment_simple'],
      dtype='object')

In [22]:
# DELETE pass: remove unnecessary variables
def Remove_complicated_cols(data):
    """Drop the raw columns that are no longer needed from ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that still contains every column named below.

    Returns
    -------
    None
        ``data`` is modified in place.

    Raises
    ------
    KeyError
        If any listed column is absent (for example when this is run a
        second time on the same frame).
    """
    # 'MANEUVER' and 'FIRST_CONTACT_POINT' are intentionally not listed, so they are kept.
    obsolete_columns = (
        'DRIVERS_LICENSE_CLASS',
        'SAFETY_EQUIPMENT',
        'UNIT_TYPE',
        'NUM_PASSENGERS',
        'VEHICLE_DEFECT',
        'VEHICLE_TYPE',
        'VEHICLE_USE',
        'POSTED_SPEED_LIMIT',
        'WEATHER_CONDITION',
        'ROADWAY_SURFACE_COND',
        'TRAFFICWAY_TYPE',
        'ALIGNMENT',
        'ROAD_DEFECT',
    )
    data.drop(columns=list(obsolete_columns), inplace=True)

In [23]:
# Remove_complicated_cols drops the listed raw columns from df in place; list what remains.
Remove_complicated_cols(df)
df.columns

Index(['SEX', 'AGE', 'MANEUVER', 'FIRST_CONTACT_POINT', 'LIGHTING_CONDITION',
       'CRASH_TYPE', 'PRIM_CONTRIBUTORY_CAUSE', 'STREET_DIRECTION',
       'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'INJURY_LEVEL',
       'se_simple', 'passengers_simple', 'defect_simple', 'vehicletype_simple',
       'vehicleuse_simple', 'speedlimit_simple', 'weather_simple',
       'roadcond_simple', 'roaddef_simple', 'trafficway_simple',
       'alignment_simple'],
      dtype='object')

In [24]:
# Final dimensions after screening, engineering and column removal.
df.shape

(230514, 23)

In [25]:
# Final prep complete: persist the prepared frame to disk for the next grid-search attempt.
df.to_pickle("./Data/final_data.pkl")