# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, plot_roc_curve
from sklearn.preprocessing import OneHotEncoder, StandardScaler
%matplotlib inline
import seaborn as sns
import folium 

In [None]:
# Load the three CSV extracts (crashes, people, vehicles) from the parent directory.
df = pd.read_csv('../Traffic_Crashes_-_Crashes.csv')
df_people = pd.read_csv('../Traffic_Crashes_-_People.csv')
df_vehicles = pd.read_csv('../Traffic_Crashes_-_Vehicles.csv')
# Preview the first rows of the crash-level table.
df.head()

In [None]:
# Preview the first rows of the vehicles table.
df_vehicles.head()

In [None]:
# Column dtypes and non-null counts for the crash table.
df.info()

In [None]:
# Column dtypes and non-null counts for the people table.
df_people.info()

In [None]:
# Column dtypes and non-null counts for the vehicles table.
df_vehicles.info()

In [None]:
# (rows, columns) of the crash table before any merging.
df.shape

# Data Preparation

In [None]:
# Merge the crash, vehicle and people tables on CRASH_RECORD_ID.
# pd.merge on a column already returns a fresh 0..n-1 index, so the former
# .reset_index() calls only added stray 'index' / 'level_0' columns.
df_merge = pd.merge(df, df_vehicles, on='CRASH_RECORD_ID')
df_merge_2 = pd.merge(df_merge, df_people, on='CRASH_RECORD_ID')
# Drop duplicates: keep only the first merged row for each CRASH_RECORD_ID.
df_dropped = df_merge_2.drop_duplicates(subset=['CRASH_RECORD_ID'], keep='first')
# Columns carried forward into the analysis.
columns = ['CRASH_RECORD_ID', 'RD_NO_x', 'CRASH_DATE_x', 'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'LANE_CNT', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'CRASH_TYPE', 'INTERSECTION_RELATED_I', 'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO', 'STREET_DIRECTION', 'STREET_NAME',  'DOORING_I', 'WORK_ZONE_I', 'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 'NUM_UNITS', 'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL', 'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION', 'INJURIES_UNKNOWN', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'LATITUDE', 'LONGITUDE', 'LOCATION', 'RD_NO_y', 'UNIT_NO', 'UNIT_TYPE', 'NUM_PASSENGERS', 'VEHICLE_ID_x', 'CMRC_VEH_I', 'MAKE', 'MODEL', 'VEHICLE_YEAR', 'VEHICLE_DEFECT', 'VEHICLE_TYPE', 'TRAVEL_DIRECTION', 'MANEUVER', 'OCCUPANT_CNT', 'EXCEED_SPEED_LIMIT_I', 'FIRST_CONTACT_POINT', 'PERSON_TYPE', 'CITY', 'STATE', 'ZIPCODE', 'SEX', 'AGE',  'AIRBAG_DEPLOYED', 'EJECTION', 'INJURY_CLASSIFICATION', 'DRIVER_VISION','PHYSICAL_CONDITION', 'PEDPEDAL_ACTION', 'PEDPEDAL_VISIBILITY', 'PEDPEDAL_LOCATION', 'BAC_RESULT', 'BAC_RESULT VALUE', 'CELL_PHONE_USE']
# .copy() gives an independent frame, so adding columns to it later does not
# trigger SettingWithCopyWarning.
df_comb = df_dropped[columns].copy()

In [None]:
# Parse the crash timestamp once and derive the crash year from it.
# assign() returns a new frame, so nothing is written into a slice of df_dropped.
crash_dates = pd.to_datetime(df_comb['CRASH_DATE_x'])
df_comb = df_comb.assign(date=crash_dates, Crash_year=crash_dates.dt.year)
# Keep only rows whose person record is the driver.
# NOTE(review): rows were de-duplicated to one per crash before this filter, so a
# crash whose retained row is not a DRIVER is dropped here — confirm this is intended.
df_driver = df_comb[df_comb['PERSON_TYPE']=='DRIVER'].copy()
# Columns kept for the driver-level analysis.
new_columns = ['CRASH_RECORD_ID', 'CRASH_DATE_x', 'POSTED_SPEED_LIMIT',
       'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION',
       'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE',
       'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'CRASH_TYPE',
       'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE','NUM_UNITS',
       'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL',
       'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING',
       'INJURIES_REPORTED_NOT_EVIDENT','CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH',
       'LATITUDE', 'LONGITUDE', 'MAKE', 'MODEL',
       'VEHICLE_YEAR', 'VEHICLE_DEFECT', 'VEHICLE_TYPE',
       'MANEUVER', 'OCCUPANT_CNT', 
       'FIRST_CONTACT_POINT', 'PERSON_TYPE','SEX',
       'AGE', 'AIRBAG_DEPLOYED', 'EJECTION', 'INJURY_CLASSIFICATION','DRIVER_VISION',
       'PHYSICAL_CONDITION', 'Crash_year']

In [None]:
# Null count per column, largest first; columns with many gaps are candidates to drop.
df_driver.isna().sum().sort_values(ascending=False)

In [None]:
# Restrict the driver frame to the selected columns.
# .copy() because the next cell drops rows from this frame; without it pandas
# may treat the frame as a slice and emit SettingWithCopyWarning.
df_driver = df_driver[new_columns].copy()
# Null count per column, largest first.
pd.isnull(df_driver).sum().sort_values(ascending=False)

In [None]:
# Drop rows missing any of the fields below. One dropna over the whole subset
# (default how='any') removes the same rows as five sequential single-column
# drops, and rebinding avoids inplace mutation of a possible slice.
required_columns = ['LATITUDE', 'LONGITUDE', 'MODEL', 'MAKE', 'FIRST_CONTACT_POINT']
df_driver = df_driver.dropna(axis=0, subset=required_columns)

In [None]:
# The five columns with the most remaining nulls.
df_driver.isna().sum().sort_values(ascending=False).head(5)

In [None]:
# Distribution of the per-crash fatality count.
df_driver['INJURIES_FATAL'].value_counts()

In [None]:
# Distribution of AGE values.
df_driver['AGE'].value_counts()

In [None]:
# Distribution of SEX values.
df_driver['SEX'].value_counts()

In [None]:
# Distribution of vehicle model years.
df_driver['VEHICLE_YEAR'].value_counts()

In [None]:
# Summary statistics for VEHICLE_YEAR (used to pick the valid-year range).
df_driver['VEHICLE_YEAR'].describe()

In [None]:
# Handling nulls for AGE, SEX, and VEHICLE_YEAR.
# Keep vehicles with a model year in [1970, 2021]; rows with a missing
# VEHICLE_YEAR fail the range test and are dropped as well.
df_driver = df_driver[df_driver['VEHICLE_YEAR'].between(1970, 2021)].copy()
# Assign results back instead of `df.col.replace(..., inplace=True)`: the chained
# inplace form does not update the frame under pandas Copy-on-Write and warns in
# pandas 2.2, and the `np.NAN` alias it relied on was removed in NumPy 2.0.
df_driver['AGE'] = df_driver['AGE'].fillna(df_driver['AGE'].median())
df_driver['SEX'] = df_driver['SEX'].fillna('X')
# Collapse the fatality count to a 0/1 flag. clip(upper=1) covers any count,
# not just 0-4 as the former mapping did, and leaves NaN untouched.
# assumes INJURIES_FATAL is a non-negative count — TODO confirm
df_driver['INJURIES_FATAL'] = df_driver['INJURIES_FATAL'].clip(upper=1)
df_driver.shape

In [None]:
# Remaining null count per column.
df_driver.isna().sum()

In [None]:
# df_driver.hist(figsize=(20,20), bins='auto');

In [None]:
# Columns that are not used from here on.
columns_to_drop = [
    'CRASH_RECORD_ID', 'CRASH_DATE_x',
    'INJURIES_TOTAL', 'INJURIES_INCAPACITATING',
    'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT',
    'MAKE', 'MOST_SEVERE_INJURY', 'MODEL', 'VEHICLE_YEAR',
    'OCCUPANT_CNT', 'PERSON_TYPE', 'INJURY_CLASSIFICATION', 'CRASH_TYPE',
    'LONGITUDE', 'LATITUDE', 'Crash_year', 'CRASH_MONTH',
]

df_driver = df_driver.drop(columns=columns_to_drop)

In [None]:
# Dtypes and non-null counts of the remaining columns.
df_driver.info()

In [None]:
# Summary statistics of the driver frame.
df_driver.describe()

In [None]:
# (rows, columns) at this stage.
df_driver.shape

In [None]:
# Frequency of each primary contributory cause.
df_driver['PRIM_CONTRIBUTORY_CAUSE'].value_counts()

In [None]:
# Keep only crashes whose primary contributory cause was determined.
prim_cause_known = df_driver['PRIM_CONTRIBUTORY_CAUSE'].ne('UNABLE TO DETERMINE')
df_driver = df_driver[prim_cause_known]
df_driver.shape

In [None]:
# Frequency of each secondary contributory cause.
df_driver['SEC_CONTRIBUTORY_CAUSE'].value_counts()

In [None]:
# Keep only crashes whose secondary contributory cause is both determined and applicable.
uninformative_sec_causes = ['UNABLE TO DETERMINE', 'NOT APPLICABLE']
sec_cause_informative = ~df_driver['SEC_CONTRIBUTORY_CAUSE'].isin(uninformative_sec_causes)
df_driver = df_driver[sec_cause_informative]
df_driver.shape

# Decision Tree

In [None]:
# Columns still available for modelling.
df_driver.columns

In [None]:
# Predictor columns for the models, followed by the target column.
predictors = [
    'AGE', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'POSTED_SPEED_LIMIT',
    'PRIM_CONTRIBUTORY_CAUSE', 'WEATHER_CONDITION', 'LIGHTING_CONDITION',
    'TRAFFIC_CONTROL_DEVICE', 'DRIVER_VISION', 'PHYSICAL_CONDITION',
    'ROADWAY_SURFACE_COND', 'VEHICLE_DEFECT', 'EJECTION',
]
features = predictors + ['INJURIES_FATAL']
test_df = df_driver[features]

### Train-Test Split

In [None]:
# Separate predictors from the INJURIES_FATAL target.
X = test_df.drop(columns='INJURIES_FATAL')
y = test_df['INJURIES_FATAL']

# Stratify on the target so the train and test sets keep the same class
# proportions; a plain random split can leave very few positives in one set
# when the positive class is rare.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
print(len(X_train), len(X_test), len(y_train), len(y_test))

### One Hot Encoding

In [None]:
# One-hot encode the categorical predictors of the train and test sets with a
# single encoder fitted on the training data; numeric predictors pass through as-is.
X_train_cat = X_train.select_dtypes('object')
X_test_cat = X_test.select_dtypes('object')

# Use the default sparse output plus .toarray() rather than `sparse=False`:
# that keyword was renamed to `sparse_output` in scikit-learn 1.2 and later
# removed, whereas this form runs on old and new releases alike.
ohe = OneHotEncoder(drop='first')

dums_train = ohe.fit_transform(X_train_cat).toarray()
# NOTE(review): transform() raises if the test set holds a category that was
# not seen during fit (default handle_unknown='error').
dums_test = ohe.transform(X_test_cat).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# present. Passing the column names yields 'COLUMN_value' labels on both paths.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_columns = ohe.get_feature_names_out(list(X_train_cat.columns))
else:
    ohe_columns = ohe.get_feature_names(list(X_train_cat.columns))

dums_train_df = pd.DataFrame(dums_train,
                       columns=ohe_columns,
                       index=X_train_cat.index)
dums_test_df = pd.DataFrame(dums_test,
                       columns=ohe_columns,
                       index=X_test_cat.index)

nums_train_df = X_train.select_dtypes(['int64','float64'])
nums_test_df = X_test.select_dtypes(['int64','float64'])

# Encoded design matrices used by the tree-based models.
X_train_clean = pd.concat([nums_train_df, dums_train_df], axis=1)
X_test_clean = pd.concat([nums_test_df, dums_test_df], axis=1)

In [None]:
# X_train_clean.columns

### First Decision Tree

In [None]:
# Fixed seed so the tree (and the grid search built on it) is reproducible on
# re-run; 42 matches the seed used for the train/test split.
tree = DecisionTreeClassifier(random_state=42)

In [None]:
# Hyper-parameter candidates for the decision-tree grid search.
tree_grid = dict(
    max_leaf_nodes=list(range(2, 10)),
    min_samples_split=[2, 3, 4],
    max_depth=list(range(5, 10)),
)

In [None]:
# 5-fold cross-validated search over tree_grid.
tree_grid_search = GridSearchCV(estimator=tree, param_grid=tree_grid, cv=5)

In [None]:
# Run the grid search on the encoded training set.
tree_grid_search.fit(X_train_clean, y_train)

In [None]:
# Parameter combination selected by the search.
tree_grid_search.best_params_

In [None]:
# Mean cross-validated score of the selected combination.
tree_grid_search.best_score_

In [None]:
# Score of the best tree on the held-out test set.
tree_grid_search.best_estimator_.score(X_test_clean, y_test)

In [None]:
# Per-candidate cross-validation results as a DataFrame.
tree_cv_df = pd.DataFrame(tree_grid_search.cv_results_)
tree_cv_df.head()

In [None]:
# Keep a handle on the best tree found by the search.
best_tree = tree_grid_search.best_estimator_

In [None]:
def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model
        Fitted estimator exposing a ``feature_importances_`` attribute.
    feature_names : sequence of str, optional
        Bar labels, in the same order as the model's features. When omitted,
        the columns of the notebook-level ``X_train_clean`` frame are used, so
        existing ``plot_feature_importances(model)`` calls behave as before.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of importances.
    """
    importances = np.asarray(model.feature_importances_)
    if feature_names is None:
        # Backward-compatible default: the frame the tree-based models were fit on.
        feature_names = X_train_clean.columns.values
    if len(feature_names) != len(importances):
        raise ValueError(
            f"got {len(feature_names)} feature names for {len(importances)} importances"
        )
    # Bar positions come from the model itself rather than a global frame.
    positions = np.arange(len(importances))
    plt.figure(figsize=(30,30))
    plt.barh(positions, importances, align='center')
    plt.yticks(positions, feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

In [None]:
# Feature importances of the tuned decision tree.
plot_feature_importances(best_tree)

# Random Forest

In [None]:
# Fixed seed so the forest's bootstrap and feature sampling are reproducible on
# re-run; 42 matches the seed used for the train/test split.
forest = RandomForestClassifier(random_state=42)

In [None]:
# Hyper-parameter candidates for the random-forest grid search.

# Number of trees in random forest (left disabled)
# n_estimators = [100,150,200]

max_features = ['sqrt', 'log2']   # features considered when looking for the best split
criteria = ['gini', 'entropy']    # split-quality functions; NOTE: not included in forrest_grid below
min_samples_split = [2, 5, 10]    # minimum samples required to split a node
min_samples_leaf = [1, 2, 4]      # minimum samples required at each leaf node

# Grid handed to GridSearchCV
forrest_grid = dict(
    max_features=max_features,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
# 5-fold cross-validated search over forrest_grid.
forrest_grid_search = GridSearchCV(forest, forrest_grid, cv=5)

In [None]:
# Run the grid search on the encoded training set.
forrest_grid_search.fit(X_train_clean, y_train)

In [None]:
# Parameter combination selected by the search.
forrest_grid_search.best_params_

In [None]:
# Mean cross-validated score of the selected combination.
forrest_grid_search.best_score_

In [None]:
# Score of the best forest on the held-out test set.
forrest_grid_search.best_estimator_.score(X_test_clean, y_test)

In [None]:
# Keep a handle on the best forest found by the search.
best_forest = forrest_grid_search.best_estimator_

In [None]:
# Feature importances of the tuned random forest.
plot_feature_importances(best_forest)

In [None]:
# Per-candidate cross-validation results as a DataFrame.
forest_cv_df = pd.DataFrame(forrest_grid_search.cv_results_)
forest_cv_df.head()

# K Nearest Neighbor

In [None]:
# Split the train and test predictors into categorical and numeric parts.
X_train_cat = X_train.select_dtypes('object')
X_test_cat = X_test.select_dtypes('object')
nums_train = X_train.select_dtypes(['int64','float64'])
nums_test = X_test.select_dtypes(['int64','float64'])

# One-hot encode the categoricals (encoder fitted on the training data only).
# Default sparse output plus .toarray() replaces `sparse=False`, a keyword that
# was renamed to `sparse_output` in scikit-learn 1.2 and later removed.
ohe = OneHotEncoder(drop='first')

dums_train = ohe.fit_transform(X_train_cat).toarray()
# NOTE(review): transform() raises if the test set holds a category that was
# not seen during fit (default handle_unknown='error').
dums_test = ohe.transform(X_test_cat).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# present. Passing the column names yields 'COLUMN_value' labels on both paths.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_columns = ohe.get_feature_names_out(list(X_train_cat.columns))
else:
    ohe_columns = ohe.get_feature_names(list(X_train_cat.columns))

dums_train_df = pd.DataFrame(dums_train,
                       columns=ohe_columns,
                       index=X_train_cat.index)
dums_test_df = pd.DataFrame(dums_test,
                       columns=ohe_columns,
                       index=X_test_cat.index)

# Standardise the numerics; the scaler is fitted on the training data only and
# then applied to the test data.
scaler = StandardScaler()

nums_train_scaled = scaler.fit_transform(nums_train)
nums_train_df = pd.DataFrame(nums_train_scaled,
                       columns=nums_train.columns,
                       index=nums_train.index)

nums_test_scaled = scaler.transform(nums_test)
nums_test_df = pd.DataFrame(nums_test_scaled,
                       columns=nums_test.columns,
                       index=nums_test.index)

# Scaled and encoded train and test data
X_train_scaled = pd.concat([nums_train_df, dums_train_df], axis=1)
X_test_scaled = pd.concat([nums_test_df, dums_test_df], axis=1)

In [None]:
# Preview the scaled and encoded training matrix.
X_train_scaled.head()

In [None]:
# Base k-nearest-neighbours classifier; its hyper-parameters are set by the grid search.
knn = KNeighborsClassifier()

In [None]:
# Hyper-parameter candidates for KNeighborsClassifier. Kept under its own name
# so `knn_grid` refers only to the search object and never to a plain dict.
knn_param_grid = {'n_neighbors': [3, 5, 7, 9, 11, 21],
                  'weights': ['uniform', 'distance'],
                  'metric': ['euclidean', 'manhattan']}

# 5-fold cross-validated grid search; later cells use it as `knn_grid`.
knn_grid = GridSearchCV(knn, knn_param_grid, cv=5)

In [None]:
# fit x_train and y_train to grid 
# Run the grid search on the scaled and encoded training set.
knn_grid.fit(X_train_scaled,y_train)

In [None]:
# observe combination of best params 
# Parameter combination selected by the search.
knn_grid.best_params_

In [None]:
# Per-candidate cross-validation results as a DataFrame.
knn_cv_df = pd.DataFrame(knn_grid.cv_results_)
knn_cv_df.head()

In [None]:
# Score of the best KNN model on the held-out test set. The scaled test matrix
# is named X_test_scaled; `scaled_data_test` is never defined in this notebook
# and raised NameError.
knn_grid.best_estimator_.score(X_test_scaled, y_test)

In [None]:
# Test-set predictions from the best KNN model. The scaled test matrix is named
# X_test_scaled; `scaled_data_test` is never defined in this notebook and
# raised NameError.
test_preds = knn_grid.best_estimator_.predict(X_test_scaled)