## This notebook is used for Data Exploration

In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import matplotlib.ticker as ticker
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from scipy import stats
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [54]:
df = pd.read_csv('https://s3.us.cloud-object-storage.appdomain.cloud/cf-courses-data/CognitiveClass/DP0701EN/version-2/Data-Collisions.csv',header=0)
df.dtypes

SEVERITYCODE        int64
X                 float64
Y                 float64
OBJECTID            int64
INCKEY              int64
COLDETKEY           int64
REPORTNO           object
STATUS             object
ADDRTYPE           object
INTKEY            float64
LOCATION           object
EXCEPTRSNCODE      object
EXCEPTRSNDESC      object
SEVERITYCODE.1      int64
SEVERITYDESC       object
COLLISIONTYPE      object
PERSONCOUNT         int64
PEDCOUNT            int64
PEDCYLCOUNT         int64
VEHCOUNT            int64
INCDATE            object
INCDTTM            object
JUNCTIONTYPE       object
SDOT_COLCODE        int64
SDOT_COLDESC       object
INATTENTIONIND     object
UNDERINFL          object
WEATHER            object
ROADCOND           object
LIGHTCOND          object
PEDROWNOTGRNT      object
SDOTCOLNUM        float64
SPEEDING           object
ST_COLCODE         object
ST_COLDESC         object
SEGLANEKEY          int64
CROSSWALKKEY        int64
HITPARKEDCAR       object
dtype: objec

In [55]:
#Exploring the severity of the collision included in the data set
df['SEVERITYCODE'].value_counts().to_frame()

Unnamed: 0,SEVERITYCODE
1,136485
2,58188


In [56]:
#It seemed liked the dataset only included only two level of severity, let's look at their descpriction
df['SEVERITYDESC'].value_counts().to_frame()

Unnamed: 0,SEVERITYDESC
Property Damage Only Collision,136485
Injury Collision,58188


In [57]:
#So it looks like the dataset only specifies 2 type of severity which are injury and not-injury
#In this project, I will chooses several facor such as COLLISIONTYPE, WEATHER, ROADCOND,LIGHTCOND,
#INATTENTIONIND, UNDERINFL, SPEEDING as the main focus factors
#Let's build a data set contain those columns
df_new = df[['COLLISIONTYPE','WEATHER','ROADCOND',
             'LIGHTCOND','INATTENTIONIND','UNDERINFL','SPEEDING','SEVERITYCODE']]
df_new.dtypes

COLLISIONTYPE     object
WEATHER           object
ROADCOND          object
LIGHTCOND         object
INATTENTIONIND    object
UNDERINFL         object
SPEEDING          object
SEVERITYCODE       int64
dtype: object

In [58]:
df_new['WEATHER'].value_counts().to_frame()

Unnamed: 0,WEATHER
Clear,111135
Raining,33145
Overcast,27714
Unknown,15091
Snowing,907
Other,832
Fog/Smog/Smoke,569
Sleet/Hail/Freezing Rain,113
Blowing Sand/Dirt,56
Severe Crosswind,25


In [59]:
df_new['ROADCOND'].value_counts().to_frame()

Unnamed: 0,ROADCOND
Dry,124510
Wet,47474
Unknown,15078
Ice,1209
Snow/Slush,1004
Other,132
Standing Water,115
Sand/Mud/Dirt,75
Oil,64


In [60]:
df_new['LIGHTCOND'].value_counts().to_frame()

Unnamed: 0,LIGHTCOND
Daylight,116137
Dark - Street Lights On,48507
Unknown,13473
Dusk,5902
Dawn,2502
Dark - No Street Lights,1537
Dark - Street Lights Off,1199
Other,235
Dark - Unknown Lighting,11


In [61]:
df_new['COLLISIONTYPE'].value_counts().to_frame()

Unnamed: 0,COLLISIONTYPE
Parked Car,47987
Angles,34674
Rear Ended,34090
Other,23703
Sideswipe,18609
Left Turn,13703
Pedestrian,6608
Cycles,5415
Right Turn,2956
Head On,2024


In [62]:
#Drop rows with null values
df_new = df_new.dropna(axis='rows')
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 695 entries, 1320 to 194309
Data columns (total 8 columns):
COLLISIONTYPE     695 non-null object
WEATHER           695 non-null object
ROADCOND          695 non-null object
LIGHTCOND         695 non-null object
INATTENTIONIND    695 non-null object
UNDERINFL         695 non-null object
SPEEDING          695 non-null object
SEVERITYCODE      695 non-null int64
dtypes: int64(1), object(7)
memory usage: 48.9+ KB


In [105]:
#label encoded the dataset since all features are catogories types
categorical_cols = ['COLLISIONTYPE','WEATHER','ROADCOND',
             'LIGHTCOND','INATTENTIONIND','UNDERINFL','SPEEDING']
labelencoder = LabelEncoder()
df_new[categorical_cols] = df_new[categorical_cols].apply(lambda col: labelencoder.fit_transform(col))
df_new.head(10)

Unnamed: 0,COLLISIONTYPE,WEATHER,ROADCOND,LIGHTCOND,INATTENTIONIND,UNDERINFL,SPEEDING,SEVERITYCODE
1320,0,0,0,4,0,2,0,2
1572,0,0,0,4,0,2,0,1
2918,2,0,0,4,0,2,0,2
3045,4,6,2,4,0,2,0,2
3499,4,3,0,2,0,0,0,1
3517,7,4,5,2,0,2,0,2
3974,7,0,0,4,0,0,0,1
4472,5,0,0,4,0,0,0,2
4902,4,4,5,2,0,1,0,1
4969,7,0,0,4,0,0,0,2


In [93]:
X_dataset = df_new.drop('SEVERITYCODE', axis = 1)
y_dataset = df_new['SEVERITYCODE']