In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [224]:
from sklearn.metrics import mean_squared_error, r2_score

from sklearn import tree

from sklearn.preprocessing import LabelEncoder 

from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split, cross_val_score

In [3]:
df = pd.read_csv("US_Accidents_June20.csv",index_col='ID',parse_dates=['Start_Time','End_Time'])

In [225]:
from datetime import datetime, date

In [231]:
# adding time variables

#df['Month'] = df['Start_Time'].dt.month
#df['Year'] = df['Start_Time'].dt.year
#df['Hour'] = df['Start_Time'].dt.hour
#df['Weekday'] = df['Start_Time'].dt.weekday
#df['Day'] = df['Start_Time'].dt.day
#df['Impact'] = (df['End_Time'] - df['Start_Time']).dt.total_seconds()/60
#df["YMD"] = df["Start_Time"].dt.date


## Preprocessing for classification
# Drop variables

# select inputs

drop_variables1 = ['Start_Time','End_Time','Start_Lat','Start_Lng','End_Lat','End_Lng']

drop_variables2 = ['Civil_Twilight','Nautical_Twilight','Astronomical_Twilight']

drop_variables3 = ['Civil_Twilight','Nautical_Twilight','Astronomical_Twilight']

drop_variables4 = ['Description','Number','Zipcode','Airport_Code','Weather_Timestamp','Wind_Chill(F)']

drop_variables5 = ['Street','Side','City','County','Country']

drop_variables6 = ['Source','TMC','Timezone']

drop_variables7 = ['Temperature(F)','Humidity(%)','Precipitation(in)','Pressure(in)','Wind_Direction','Wind_Speed(mph)']

drop_variables8 = ['Amenity','Bump','Crossing','Give_Way','Junction','No_Exit','Railway','Roundabout','Station',\
                  'Station','Stop','Traffic_Calming','Traffic_Signal','Turning_Loop']
                   
df_dropped = df.drop(drop_variables1+drop_variables2+drop_variables3+drop_variables4+\
                     drop_variables5+drop_variables6+drop_variables7+drop_variables8 , axis='columns')


df_reduced = df_dropped.dropna(how='any')


#df_MA = df_reduced[df_reduced.loc[:,'State'] == ["MA",'NY','CA']]

df_MA = df_reduced[df_reduced.State.isin(['MA'])]

## Select inputs and output (target)
X_MA = df_MA.drop(['State'],axis='columns')

## We only leave Distance, Visibility, Weather_Condition, Sunrise_Sunset as features for classification

#X_MA.head()

print(X_MA)



           Severity  Distance(mi)  Visibility(mi) Weather_Condition  \
ID                                                                    
A-194264          2         0.000             3.0        Light Rain   
A-194268          2         0.000             5.0              Rain   
A-194269          3         0.010             5.0              Rain   
A-194270          4         0.010             3.0        Light Rain   
A-194271          3         0.010             6.0        Light Rain   
...             ...           ...             ...               ...   
A-3512289         2         0.418            10.0        Light Rain   
A-3512823         2         0.327             1.0               Fog   
A-3513364         2         0.163            10.0              Fair   
A-3513370         2         0.442            10.0              Fair   
A-3513380         2         0.545            10.0              Fair   

          Sunrise_Sunset  
ID                        
A-194264             D

In [232]:
## Find max and min values in non-categorical values

visibility_max = X_MA['Visibility(mi)'].max()
visibility_min = X_MA['Visibility(mi)'].min()

distance_max = X_MA['Distance(mi)'].max()
distance_min = X_MA['Distance(mi)'].min()

max_min = [visibility_max, visibility_min, distance_max, distance_min  ]

print(max_min)

#X_MA.head()


[10.5, 0.0, 79.946, 0.0]


In [228]:
## Bin numerical data

bins_visibility = [1.1*i-0.01 for i in range(0,11)]
bins_distance = [10*i-0.01 for i in range(0,11)]

X_MA['Distance(mi)_b'] = pd.cut(X_MA['Distance(mi)'],bins_distance)
X_MA['Visibility(mi)_b'] = pd.cut(X_MA['Visibility(mi)'],bins_visibility)

X_MA_drop = X_MA.drop(['Distance(mi)','Visibility(mi)'], axis='columns')


print(bins_distance)
print(bins_visibility)

#print(X_MA_drop)


[-0.01, 9.99, 19.99, 29.99, 39.99, 49.99, 59.99, 69.99, 79.99, 89.99, 99.99]
[-0.01, 1.09, 2.1900000000000004, 3.2900000000000005, 4.390000000000001, 5.49, 6.590000000000001, 7.690000000000001, 8.790000000000001, 9.89, 10.99]


In [233]:

X_MA_d = X_MA_drop.drop( ['Severity'], axis='columns')
Y_MA = X_MA_drop['Severity']

#X_MA_d = X_MA_x.dropna(how='any')
#Y_MA = X_MA_x['Severity']


print(Y_MA)


## Encoding

X_MA_d.head()


le = {}
for col in X_MA_d.columns:
    le[col] = LabelEncoder()
    le[col].fit(X_MA_d[col].unique())
    print('{0:12s} => {1}'.format(col, le[col].classes_))
    X_MA_d[col] = le[col].transform(X_MA_d.loc[:,col])
    

#Y_MA.head()





ID
A-194264     2
A-194268     2
A-194269     3
A-194270     4
A-194271     3
            ..
A-3512289    2
A-3512823    2
A-3513364    2
A-3513370    2
A-3513380    2
Name: Severity, Length: 36384, dtype: int64
Weather_Condition => ['Blowing Snow' 'Clear' 'Cloudy' 'Cloudy / Windy' 'Fair' 'Fair / Windy'
 'Fog' 'Haze' 'Heavy Rain' 'Heavy Rain / Windy' 'Heavy Snow'
 'Heavy T-Storm' 'Heavy T-Storm / Windy' 'Heavy Thunderstorms and Rain'
 'Ice Pellets' 'Light Drizzle' 'Light Drizzle / Windy'
 'Light Freezing Drizzle' 'Light Freezing Fog' 'Light Freezing Rain'
 'Light Rain' 'Light Rain / Windy' 'Light Rain with Thunder' 'Light Snow'
 'Light Snow / Windy' 'Light Snow and Sleet'
 'Light Thunderstorms and Rain' 'Mist' 'Mostly Cloudy'
 'Mostly Cloudy / Windy' 'N/A Precipitation' 'Overcast' 'Partly Cloudy'
 'Partly Cloudy / Windy' 'Patches of Fog' 'Rain' 'Rain / Windy'
 'Scattered Clouds' 'Shallow Fog' 'Small Hail' 'Smoke' 'Snow' 'Squalls'
 'T-Storm' 'Thunder' 'Thunder in the Vicinity' 'Thunders

In [239]:

## Data range selection

#Y_MA.shape[0]

index = 27000

X_MA_training = X_MA_d.iloc[1:index,:]
Y_MA_training = Y_MA.iloc[1:index]

X_MA_testing = X_MA_d.iloc[index:36384,:]
Y_MA_testing = Y_MA.iloc[index:36384]




In [240]:
## Train classifier

model = tree.DecisionTreeClassifier()
model.fit(X_MA_training, Y_MA_training)



DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [241]:
## Scores


model.score(X_MA_testing, Y_MA_testing)



0.5752344416027281

In [181]:
# Gauss Naive Bayes

gnb = GaussianNB()

# Train classifier

gnb.fit( X_MA_training, Y_MA_training)

y_pred = gnb.predict(X_MA_testing)

print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          X_MA_testing.shape[0],
          (Y_MA_testing != y_pred).sum(),
          100*(1-(Y_MA_testing != y_pred).sum()/X_MA_testing.shape[0])
))





Number of mislabeled points out of a total 9384 points : 3958, performance 57.82%
