In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,  make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score

from sklearn.dummy import DummyClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

from sklearn.multioutput import MultiOutputClassifier

In [2]:
# Load the raw "People" crash export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass instead of chunk by chunk, which avoids mixed-type columns and
# the DtypeWarning pandas emits for them.
df = pd.read_csv('../Datasets/Traffic_Crashes_-_People.csv', low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# List the column names of the raw frame.
df.columns

Index(['PERSON_ID', 'PERSON_TYPE', 'CRASH_RECORD_ID', 'RD_NO', 'VEHICLE_ID',
       'CRASH_DATE', 'SEAT_NO', 'CITY', 'STATE', 'ZIPCODE', 'SEX', 'AGE',
       'DRIVERS_LICENSE_STATE', 'DRIVERS_LICENSE_CLASS', 'SAFETY_EQUIPMENT',
       'AIRBAG_DEPLOYED', 'EJECTION', 'INJURY_CLASSIFICATION', 'HOSPITAL',
       'EMS_AGENCY', 'EMS_RUN_NO', 'DRIVER_ACTION', 'DRIVER_VISION',
       'PHYSICAL_CONDITION', 'PEDPEDAL_ACTION', 'PEDPEDAL_VISIBILITY',
       'PEDPEDAL_LOCATION', 'BAC_RESULT', 'BAC_RESULT VALUE',
       'CELL_PHONE_USE'],
      dtype='object')

In [4]:
# (rows, columns) of the raw frame.
df.shape

(1412636, 30)

In [5]:
# Columns retained from the raw frame: the three identifier columns plus the
# person-level attributes used downstream.
PEOPLE_COLUMNS = [
    'PERSON_ID', 'CRASH_RECORD_ID', 'VEHICLE_ID',
    'DRIVERS_LICENSE_CLASS', 'SEX', 'AGE', 'SAFETY_EQUIPMENT',
]

# Take an explicit copy so that later cleaning steps operate on an independent
# frame rather than on a selection of `df` (avoids SettingWithCopyWarning and
# keeps the raw frame untouched).
clean_people = df[PEOPLE_COLUMNS].copy()

In [6]:
# Show the dtype pandas inferred for each retained column.
clean_people.dtypes

PERSON_ID                 object
CRASH_RECORD_ID           object
VEHICLE_ID               float64
DRIVERS_LICENSE_CLASS     object
SEX                       object
AGE                      float64
SAFETY_EQUIPMENT          object
dtype: object

In [7]:
# (rows, columns) of the column subset.
clean_people.shape

(1412636, 7)

In [116]:
# Drop rows whose PERSON_ID starts with "P" (passenger records).
# Assumption: the driver is the person at fault for the crash.
# NOTE(review): the remaining rows may still include non-drivers (the
# SAFETY_EQUIPMENT values contain bicycle-helmet categories) -- confirm against
# PERSON_TYPE in the raw frame.
#
# str.startswith matches the stated intent exactly; str.contains would treat
# "P" as a regex and match it anywhere in the ID. na=True keeps the mask
# boolean and drops rows with a missing PERSON_ID, as the original
# `== False` comparison did.
is_passenger = clean_people["PERSON_ID"].str.startswith("P", na=True)

# .copy() gives an independent frame so later column assignments do not raise
# SettingWithCopyWarning.
clean_people = clean_people[~is_passenger].copy()

In [117]:
# Show the column dtypes of the filtered frame.
clean_people.dtypes

PERSON_ID                 object
CRASH_RECORD_ID           object
VEHICLE_ID               float64
DRIVERS_LICENSE_CLASS     object
SEX                       object
AGE                      float64
SAFETY_EQUIPMENT          object
dtype: object

In [118]:
# Frequency of each AGE value, most common first; value_counts() excludes NaN
# by default, so missing ages are not shown here.
clean_people.AGE.value_counts()

 27.0     24384
 25.0     24334
 26.0     24206
 28.0     23835
 29.0     23369
          ...  
 102.0        2
-47.0         1
-177.0        1
-1.0          1
-49.0         1
Name: AGE, Length: 113, dtype: int64

In [119]:
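# Drop negative AGE values (they are invalid).
# This cannot be done with a plain `AGE > 0` filter yet: NaN > 0 evaluates to
# False, so the filter below would also delete every row whose AGE is missing.
# A NaN-preserving mask (AGE.isna() | (AGE > 0)) would avoid that.
# clean_people = clean_people[clean_people["AGE"] > 0]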

In [120]:
# Re-display the AGE frequencies (NaN excluded by default).
clean_people.AGE.value_counts()

 27.0     24384
 25.0     24334
 26.0     24206
 28.0     23835
 29.0     23369
          ...  
 102.0        2
-47.0         1
-177.0        1
-1.0          1
-49.0         1
Name: AGE, Length: 113, dtype: int64

In [121]:
# Count missing values per column.
clean_people.isna().sum()

PERSON_ID                     0
CRASH_RECORD_ID               0
VEHICLE_ID                27644
DRIVERS_LICENSE_CLASS    418809
SEX                        1015
AGE                      297453
SAFETY_EQUIPMENT           3793
dtype: int64

In [122]:
# Not assuming the sex of the driver: missing SEX values are filled with 'X'.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on clean_people['SEX']: the in-place call mutates a temporary column
# selection, triggers chained-assignment warnings on a filtered frame, and has
# no effect when pandas Copy-on-Write is enabled.
clean_people = clean_people.assign(SEX=clean_people['SEX'].fillna('X'))

In [123]:
# Missing DRIVERS_LICENSE_CLASS values will need to be imputed with the most
# common class. Inspect the class frequencies to see which one that is
# (value_counts() excludes NaN by default).
clean_people.DRIVERS_LICENSE_CLASS.value_counts()

D     616038
A      24453
C      19321
B      18854
DM     11055
       ...  
MA         1
?          1
BX         1
A2         1
DP         1
Name: DRIVERS_LICENSE_CLASS, Length: 257, dtype: int64

In [124]:
# Placeholder target; only referenced by the commented-out SimpleImputer
# experiment in the following cell.
y = 0

In [125]:
# SI = SimpleImputer(missing_values=np.nan, )
# clean_people.AGE = SI.fit_transform(clean_people.AGE, y)

In [126]:
# Frequency of each SAFETY_EQUIPMENT category (NaN excluded by default).
clean_people.SAFETY_EQUIPMENT.value_counts()

SAFETY BELT USED                              541871
USAGE UNKNOWN                                 531166
NONE PRESENT                                   33655
HELMET NOT USED                                 5848
SAFETY BELT NOT USED                            4579
BICYCLE HELMET (PEDACYCLIST INVOLVED ONLY)      2198
HELMET USED                                     1332
DOT COMPLIANT MOTORCYCLE HELMET                  846
NOT DOT COMPLIANT MOTORCYCLE HELMET              169
SHOULD/LAP BELT USED IMPROPERLY                  122
WHEELCHAIR                                        82
CHILD RESTRAINT USED                              11
CHILD RESTRAINT NOT USED                           5
STRETCHER                                          3
CHILD RESTRAINT - TYPE UNKNOWN                     2
CHILD RESTRAINT - FORWARD FACING                   2
BOOSTER SEAT                                       1
Name: SAFETY_EQUIPMENT, dtype: int64

In [127]:
# Re-count missing values per column.
clean_people.isna().sum()

PERSON_ID                     0
CRASH_RECORD_ID               0
VEHICLE_ID                27644
DRIVERS_LICENSE_CLASS    418809
SEX                           0
AGE                      297453
SAFETY_EQUIPMENT           3793
dtype: int64

In [128]:
# (rows, columns) of the frame at this point in the cleaning.
clean_people.shape

(1125685, 7)

In [129]:
# DRIVERS_LICENSE_CLASS - impute w/ most common
# AGE - impute w/ mean
# SAFETY_EQUIPMENT - can drop rows missing safety equipment

In [130]:
# Drop the rows with no SAFETY_EQUIPMENT value. Reassigning the result
# (rather than inplace=True) avoids mutating a frame that earlier cells
# displayed and stays correct under pandas Copy-on-Write.
clean_people = clean_people.dropna(subset=['SAFETY_EQUIPMENT'])

# Remaining missing values per column (SAFETY_EQUIPMENT should now be 0).
clean_people.isna().sum()

PERSON_ID                     0
CRASH_RECORD_ID               0
VEHICLE_ID                23862
DRIVERS_LICENSE_CLASS    415017
SEX                           0
AGE                      296536
SAFETY_EQUIPMENT              0
dtype: int64

In [131]:
# Numeric branch: replace missing values with the column mean, then
# standardise with StandardScaler.
subpipe_num = Pipeline([
    ('num_impute', SimpleImputer(strategy='mean')),
    ('ss', StandardScaler()),
])

# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode into a dense array; categories not seen during fit are
# ignored instead of raising.
subpipe_cat = Pipeline([
    ('cat_impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Route columns to a branch by dtype; columns matched by neither selector are
# passed through unchanged.
# NOTE(review): the dtype selectors will also pick up identifier columns if
# they are still present when this is fit (VEHICLE_ID is float64; PERSON_ID and
# CRASH_RECORD_ID are object) -- confirm they are dropped beforehand.
CT = ColumnTransformer(
    [
        ('subpipe_num', subpipe_num, selector(dtype_include=np.number)),
        ('subpipe_cat', subpipe_cat, selector(dtype_include=object)),
    ],
    remainder='passthrough',
)

In [132]:
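# Do not fit the transformer here: fitting the imputers and scaler on the full
# dataset before the train/test split would leak test-set statistics into
# training (data leakage).
# CT.fit_transform(clean_people, )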

In [133]:
# Pickle the cleaned people frame so it can be combined with the crash and
# vehicle DataFrames.
# NOTE(review): the path is relative to the working directory and assumes the
# ./Data directory already exists -- confirm; the raw CSV is read from
# ../Datasets instead.
clean_people.to_pickle("./Data/clean_people.pkl")