# In this notebook we will continue our project and start creating our ML model. Our goal is to properly classify the severity of traffic accidents based on various categorical variables

In [519]:
#importing our datasets from our ETL / EDA file
import import_ipynb
import tempe_traffic_EDA_ETL
import sklearn
import pandas as pd
import seaborn as sns
import numpy as np
from pandasql import sqldf
from tempe_traffic_EDA_ETL import traffic_dataset
from tempe_traffic_EDA_ETL import fatal_frame
from tempe_traffic_EDA_ETL import nonfatal_frame
from tempe_traffic_EDA_ETL import fatal_drv_info 
from tempe_traffic_EDA_ETL import non_fatal_drv_info 
from tempe_traffic_EDA_ETL import time_location_fatal 
from tempe_traffic_EDA_ETL import time_location_non_fatal 

In [520]:
#importing our split function + dropping null values
from sklearn.model_selection import train_test_split
traffic_dataset = traffic_dataset.dropna()

In [521]:
## call this function to find counts for any categorical variable, requires a column that you want counts for and table name
def count_query(column, table):
    q = (f"SELECT {column}, COUNT({column}) FROM {table} GROUP BY {column}")
    query = sqldf(q, globals())
    return query

In [522]:
#getting rid of age outliers
def clean_outliers(frame, cat1):
    Q1 = frame[cat1].quantile(0.25)
    Q3 = frame[cat1].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_lim = Q1 - (1.5 * IQR)
    upper_lim = Q3 + (1.5 * IQR)
    
    outliers_lower = (frame[cat1] < lower_lim)
    outliers_upper = (frame[cat1] > upper_lim)
    
    frame = frame[~(outliers_lower | outliers_upper)]
    
    return frame

clean1 = clean_outliers(traffic_dataset, 'Age_Drv1')
cleanFinal = clean_outliers(clean1, 'Age_Drv2')

In [523]:
#checking to make sure our cleaning worked
cleanFinal.isnull().sum()

X                      0
Y                      0
OBJECTID               0
Incidentid             0
DateTime               0
Year                   0
StreetName             0
CrossStreet            0
Distance               0
JunctionRelation       0
Totalinjuries          0
Totalfatalities        0
Injuryseverity         0
Collisionmanner        0
Lightcondition         0
Weather                0
SurfaceCondition       0
Unittype_One           0
Age_Drv1               0
Gender_Drv1            0
Traveldirection_One    0
Unitaction_One         0
Violation1_Drv1        0
AlcoholUse_Drv1        0
DrugUse_Drv1           0
Unittype_Two           0
Age_Drv2               0
Gender_Drv2            0
Traveldirection_Two    0
Unitaction_Two         0
Violation1_Drv2        0
AlcoholUse_Drv2        0
DrugUse_Drv2           0
Latitude               0
Longitude              0
dtype: int64

In [524]:
#categorical features that we will use in our classification model
features = cleanFinal.select_dtypes(include='O').keys()
features

Index(['DateTime', 'StreetName', 'CrossStreet', 'JunctionRelation',
       'Injuryseverity', 'Collisionmanner', 'Lightcondition', 'Weather',
       'SurfaceCondition', 'Unittype_One', 'Gender_Drv1',
       'Traveldirection_One', 'Unitaction_One', 'Violation1_Drv1',
       'AlcoholUse_Drv1', 'DrugUse_Drv1', 'Unittype_Two', 'Gender_Drv2',
       'Traveldirection_Two', 'Unitaction_Two', 'Violation1_Drv2',
       'AlcoholUse_Drv2', 'DrugUse_Drv2'],
      dtype='object')

## Basic Exploration of Categorical Variables

In [525]:
cleanFinal['Lightcondition'].value_counts(normalize=True).head(50)


Daylight                 0.756226
Dark Lighted             0.193043
Dusk                     0.032507
Dawn                     0.007727
Unknown 51               0.004525
Dark Not Lighted         0.004248
Dark Unknown Lighting    0.000985
51                       0.000739
Name: Lightcondition, dtype: float64

In [526]:
cleanFinal['StreetName'].value_counts(normalize=True).head(50)

Rural Rd             0.112760
Baseline Rd          0.096845
University Dr        0.086286
Broadway Rd          0.084347
McClintock Dr        0.070155
Southern Ave         0.064337
Priest Dr            0.060397
Mill Ave             0.057226
Elliot Rd            0.036909
Price Rd             0.032969
Scottsdale Rd        0.031984
Warner Rd            0.027890
Apache Blvd          0.027613
Rio Salado Pkwy      0.027213
Guadalupe Rd         0.019948
Mcclintock Dr        0.019024
Kyrene Rd            0.015576
Hardy Dr             0.015361
48th St              0.013052
5th St               0.004925
College Ave          0.004217
Curry Rd             0.004125
52nd St              0.003940
Ash Ave              0.003386
Grove Pkwy           0.002863
Washington St        0.002863
Dorsey Ln            0.002771
Roosevelt St         0.002340
McKellips Rd         0.002216
Ray Rd               0.002062
Lemon St             0.001939
Alameda Dr           0.001878
Lakeshore Dr         0.001755
6th St    

In [527]:
cleanFinal['Violation1_Drv1'].value_counts(normalize=True).head(50)

Speed To Fast For Conditions                        0.301431
Failed To Yield Right Of Way                        0.229244
Unsafe Lane Change                                  0.082223
Disregarded Traffic Signal                          0.073511
Followed Too Closely                                0.072803
Other                                               0.053779
Unknown                                             0.047776
Made Improper Turn                                  0.043128
Inattention Distraction                             0.039649
Failed To Keep In Proper Lane                       0.016561
No Improper Action                                  0.012159
Drove Rode In Opposing Traffic Lane                 0.009543
Ran Stop Sign                                       0.004156
Exceeded Lawful Speed                               0.003632
Other Unsafe Passing                                0.003140
Did Not Use Crosswalk                               0.002647
Disregarded Pavement Mar

In [528]:
cleanFinal['Collisionmanner'].value_counts(normalize=True).head(30)

Rear End                                       0.407511
Left Turn                                      0.219917
ANGLE (Front To Side)(Other Than Left Turn)    0.146745
Sideswipe Same Direction                       0.127720
Angle - Other Than Left Turn 2                 0.035216
Other                                          0.034847
Head On                                        0.010774
Sideswipe Opposite Direction                   0.005572
Unknown                                        0.005079
Rear To Side                                   0.003263
U Turn                                         0.001447
Rear To Rear                                   0.001170
10                                             0.000739
Name: Collisionmanner, dtype: float64

## Encoding the data for the first RF classification model

In [529]:
#encoding our features, from categorical to discrete values w/ one hot encoding, for our first model
#features being chosen from domain research
x = ['Lightcondition', 'StreetName', 'Violation1_Drv1', 'Collisionmanner', 'Injuryseverity']
features = cleanFinal[x]

bad_labels = ["Unknown 51", "51", "108", "109", "10"]


In [530]:
features['Lightcondition'].value_counts().head(30)

Daylight                 24566
Dark Lighted              6271
Dusk                      1056
Dawn                       251
Unknown 51                 147
Dark Not Lighted           138
Dark Unknown Lighting       32
51                          24
Name: Lightcondition, dtype: int64

In [531]:
q = """SELECT *
    FROM features
    WHERE Lightcondition != "51" AND
    Lightcondition != "Unknown 51" AND
    Violation1_Drv1 != "108" AND 
    Violation1_Drv1 != "109" AND
    Violation1_Drv1 != "Unknown" AND 
    Violation1_Drv1 != "OTHER" AND
    Collisionmanner != "OTHER" AND 
    Collisionmanner != "Unknown" AND
    Collisionmanner != "10"
    """
    
features = (sqldf(q))

In [532]:
features

Unnamed: 0,Lightcondition,StreetName,Violation1_Drv1,Collisionmanner,Injuryseverity
0,Daylight,Mcclintock Dr,Speed To Fast For Conditions,Rear End,Non Incapacitating Injury
1,Daylight,Priest Dr,Unsafe Lane Change,Sideswipe Same Direction,No Injury
2,Daylight,Baseline Rd,Inattention Distraction,Rear End,No Injury
3,Daylight,Broadway Rd,Inattention Distraction,ANGLE (Front To Side)(Other Than Left Turn),No Injury
4,Daylight,Priest Dr,Other,ANGLE (Front To Side)(Other Than Left Turn),No Injury
...,...,...,...,...,...
30644,Dark Lighted,University Dr,Speed To Fast For Conditions,Rear End,No Injury
30645,Daylight,Rural Rd,Disregarded Traffic Signal,Angle - Other Than Left Turn 2,No Injury
30646,Daylight,Broadway Rd,Failed To Yield Right Of Way,Left Turn,Possible Injury
30647,Daylight,Rural Rd,Failed To Yield Right Of Way,U Turn,Suspected Minor Injury


In [558]:
#building out our classificaition model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

Y = features['Injuryseverity']
X = pd.get_dummies(features[['Lightcondition','StreetName', 'Violation1_Drv1', 'Collisionmanner']], columns=['Lightcondition','StreetName', 'Violation1_Drv1', 'Collisionmanner' ])
Y = pd.get_dummies(Y, columns=['Injuryseverity'])

x_train, x_test, y_train, y_test= train_test_split(X, Y)

classifier = RandomForestClassifier()
classifier = classifier.fit(x_train, y_train)
predicted = classifier.predict(x_test)

In [534]:
accuracy_score(y_test, predicted, normalize=True)

0.6042020096567924

In [535]:
sklearn.metrics.f1_score(y_test, predicted, average='micro')

0.6414519257412026

## Creating a second RF classification model with, chi square feature selection

In [711]:
new_set = cleanFinal[['StreetName', 'CrossStreet', 'JunctionRelation',
        'Collisionmanner', 'Lightcondition', 'Weather',
       'SurfaceCondition', 'Unittype_One', 'Gender_Drv1',
       'Traveldirection_One', 'Unitaction_One', 'Violation1_Drv1',
       'AlcoholUse_Drv1', 'DrugUse_Drv1', 'Unittype_Two', 'Gender_Drv2',
       'Traveldirection_Two', 'Unitaction_Two', 'Violation1_Drv2',
       'AlcoholUse_Drv2', 'DrugUse_Drv2']]

q2 = """SELECT *
    FROM new_set
    WHERE Lightcondition != "51" AND
    Lightcondition != "Unknown 51" AND
    Violation1_Drv1 != "108" AND 
    Violation1_Drv1 != "109" AND
    Violation1_Drv1 != "Unknown" AND 
    Violation1_Drv1 != "OTHER" AND
    Collisionmanner != "OTHER" AND 
    Collisionmanner != "Unknown" AND
    Collisionmanner != "10"
    """

X = sqldf(q2)
Y = features['Injuryseverity']

X = pd.get_dummies(X, columns = ['StreetName', 'CrossStreet', 'JunctionRelation',
        'Collisionmanner', 'Lightcondition', 'Weather',
       'SurfaceCondition', 'Unittype_One', 'Gender_Drv1',
       'Traveldirection_One', 'Unitaction_One', 'Violation1_Drv1',
       'AlcoholUse_Drv1', 'DrugUse_Drv1', 'Unittype_Two', 'Gender_Drv2',
       'Traveldirection_Two', 'Unitaction_Two', 'Violation1_Drv2',
       'AlcoholUse_Drv2', 'DrugUse_Drv2'])
Y = pd.get_dummies(Y, columns = ['Injuryseverity'])
X, y = X, Y



In [712]:
#Feature Engineering, with the 4 'best' categorical variables
selector = SelectKBest(chi2, k=4)
new = selector.fit_transform(X, y)

cols = selector.get_support(indices=True)

X = pd.DataFrame(data=X)

features_df_new = X.iloc[:,cols]
features_df_new

Unnamed: 0,JunctionRelation_Intersection 4 Way 202,Collisionmanner_Angle - Other Than Left Turn 2,Unittype_One_Pedestrian,Unitaction_One_Crossing Road
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
30644,0,0,0,0
30645,1,1,0,0
30646,1,0,0,0
30647,0,0,0,0


In [713]:
#creating our new model
X = new
x_train, x_test, y_train, y_test= train_test_split(X, Y)
classifier = classifier.fit(x_train, y_train)
predicted = classifier.predict(x_test)

In [714]:
#accuracy score
accuracy_score(y_test, predicted, normalize=True)

0.6616207751533342

In [715]:
#f1 score
sklearn.metrics.f1_score(y_test, predicted, average='micro')

0.6676323413220964

In [716]:
#precision score
sklearn.metrics.precision_score(y_test, predicted, average='micro')

0.6737541528239203