# Capstone Project

### Import required libraries

In [29]:
import numpy as np
#in case we need to repeat experiment
np.random.seed(255)

import pandas as pd
pd.options.display.max_rows = 25

import matplotlib.pyplot as plt
plt.style.use('classic')

import seaborn as sns
sns.set()

from IPython.display import display, HTML

%matplotlib inline
%precision 3
debug = False

## Utility Functions for this project

In [30]:
def plotSimpleHist(data, title, hight=10, width=10):
    plot_enabled = True
    if(plot_enabled):
        shist = data.value_counts().plot(figsize=(width,hight), kind='bar',title=title, stacked = False)
        shist.set(xlabel=data.name, ylabel='Accident Count')

In [31]:
# list the accident severity count
def severity_count(data):
    sv_count = data.value_counts()
    sv_count.index.name = 'Severity'
    df_sv_count = pd.DataFrame({'Count':sv_count.values}, index = sv_count.index)
    df_sv_count.rename(index={'1':'non-fatal','2':'injury', '3':'fatal'},inplace=True)
    return df_sv_count

In [32]:
# list the accident severity count by dependent variable
def by_severity_count(data, ind, dep):
    bg_count = data.groupby([data[dep],data[ind]])
    sv_count = bg_count.size().unstack() 
    sv_count.index.name = 'Severity'
    sv_count.columns.name = ''
    sv_count.rename(index={'1':'non-fatal','2':'injury', '3':'fatal'},inplace=True)
    sv_count.fillna(value=0, inplace=True)
    sv_count.transpose()
    return sv_count

In [33]:
#plot histogram
def checkHistogram(df, feature, pred, plot):
    by_level = df.groupby([feature,pred]).size().unstack()

    if(plot):
        plot = by_level.plot(figsize=(20,15), kind='bar',title='Weather Configuration', stacked = False);
            
    return by_level

In [34]:
#functions
debug2 = False

def selectBin(start, end, P, leadingZero = False):
    x = np.random.choice(np.arange(start, end), p = P)
    if debug2:
        print("Selected bin: {0}".format(x))
    if leadingZero:
        if x < 10:
            x = '0' + str(x)
    return str(x)

In [35]:
#compute the probability of each bin
def computeP(dataFrm):
    N = dataFrm[dataFrm.index.astype('str').str.contains('[0-9]')].sum().sum()
    if debug:
        print(dataFrm[dataFrm.index.astype('str').str.contains('[0-9]')])
        print(N)
    
    maxCol = dataFrm.index[dataFrm.index.astype('str').str.contains('[0-9]')].max()

    #calculate the probability of the each category
    P=[]
    cur = 1;
    for ind in dataFrm.index[dataFrm.index.astype('str').str.contains('[0-9]')]:
        if debug:
            print("ind: {0}".format(ind))
            print("cur: {0}".format(cur))
            
        # some categoris may not have any values
        # we need to build the P matrix for the same number of colums
        end = int(ind)
        if cur < end:
            for i in range(cur, end):
                if debug:
                    print("i: {0}".format(i))
                P.append(0)
                cur = cur +1
                
        p = dataFrm.loc[ind].sum()/N
        P.append(p)
        cur = cur +1
    
    if debug:
        print(P)
        print(np.sum(P))
    
    return P      


In [36]:
# remove unwanted data from the dataset
def cleanData(df, ind, dep, replace, binStart, binLast, appendZero=False):
    #display(by_severity_count(df, dep, ind))
    print("{count} non-numeric rows observed".format(count = df[ind].astype('str').str.contains('[^0-9]').sum()))
    gby = checkHistogram(df, ind, dep, False)
    P = computeP(gby)
    for item in replace:     
        data_to_impute = df[ind].astype('str').str.contains(item)
        df[ind][data_to_impute] = df[ind][data_to_impute].apply(lambda x: x.replace(item, selectBin(binStart, binLast, P, appendZero)))
    print("{count} non-numeric rows observed".format(count = df[ind].astype('str').str.contains('[^0-9]').sum()))
    #display(by_severity_count(df, dep, ind))

### Read data file

In [37]:
#df = pd.read_csv('NCDB_2016.csv', engine = 'python')
df = pd.read_csv('NCDB_1999_to_2016.csv', engine = 'python')

### Number of rows and columns

In [38]:
print(df.shape)

(6486831, 23)


### List dataset columns

In [39]:
print(df.columns)

Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_SEV', 'C_VEHS', 'C_CONF',
       'C_RCFG', 'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_ID', 'V_TYPE',
       'V_YEAR', 'P_ID', 'P_SEX', 'P_AGE', 'P_PSN', 'P_ISEV', 'P_SAFE',
       'P_USER', 'C_CASE'],
      dtype='object')


In [40]:
print(df.head(2))

   C_YEAR C_MNTH C_WDAY C_HOUR  C_SEV C_VEHS C_CONF C_RCFG C_WTHR C_RSUR  \
0    1999     01      1     20      2     02     34     UU      1      5   
1    1999     01      1     20      2     02     34     UU      1      5   

   ...   V_TYPE V_YEAR P_ID P_SEX P_AGE P_PSN P_ISEV P_SAFE P_USER C_CASE  
0  ...       06   1990   01     M    41    11      1     UU      1    752  
1  ...       01   1987   01     M    19    11      1     UU      1    752  

[2 rows x 23 columns]


In [41]:
## Inital type of each independent variable

In [42]:
print(df.dtypes)

C_YEAR     int64
C_MNTH    object
C_WDAY    object
C_HOUR    object
C_SEV      int64
C_VEHS    object
C_CONF    object
C_RCFG    object
C_WTHR    object
C_RSUR    object
C_RALN    object
C_TRAF    object
V_ID      object
V_TYPE    object
V_YEAR    object
P_ID      object
P_SEX     object
P_AGE     object
P_PSN     object
P_ISEV    object
P_SAFE    object
P_USER    object
C_CASE     int64
dtype: object


### Backup a copy of the dataset

In [43]:
# backup the origial data set
dfcopy = df.copy()

### Select Pasenger Severity as the Dependent Variable
#### Move the dependent variable as the last column

In [44]:
#Move the dependent variable to the outside and drop C_CASE column
P_ISEV = df['P_ISEV']
df.drop(columns = ['P_ISEV'], inplace = True)
df['P_ISEV'] = P_ISEV

### Remove rows with more than 4 missing values

In [45]:
### drop rows that have more than 4 missing values and P_ISEV is not fatal
missing_value_strings = ['U', 'UU', 'UUU', 'UUUU', 'N', 'NN', 'NNN', 'NNNN', 'Q', 'QQ', 'QQQ', 'QQQQ', 'X', 'XX', 'XXX', 'XXXX']
non_fatal_with_more_than_4_missing = ((df.isin(missing_value_strings).sum(axis=1) > 3) & ~(df['P_ISEV'] == '3')).sum()
#df.drop(df.index[(df.isin(missing_value_strings).sum(axis=1) > 3) & ~(df['P_ISEV'] == '3')], inplace = True)
df.drop(df.index[(df.isin(missing_value_strings).sum(axis=1) > 4)], inplace = True)
df = df.reset_index(drop=True)
print("Dropped {drop_count} rows from dataset".format(drop_count = non_fatal_with_more_than_4_missing))

Dropped 432980 rows from dataset


### Drop columns C_SEV and C_CASE
#### We will not be using them

In [46]:
#Move the dependent variable to the outside and drop C_CASE column
df.drop(columns = ['C_SEV', 'C_CASE'], inplace = True)

### Drop rows where Passenger Severity is unknown

In [47]:
#Drop rows where Passenger severity is unkown
drop_count = df['P_ISEV'].isin(['U', 'N',]).sum()
df.drop(df.index[df['P_ISEV'].isin(['U', 'N'])], inplace=True)
print("Dropped {drop_count} rows from dataset where P_ISEV contained unkown values".format(drop_count = drop_count))
df = df.reset_index(drop=True)

Dropped 265119 rows from dataset where P_ISEV contained unkown values


### Check for missing values

In [None]:
print("This dataset has {0} Null values".format(df.isnull().sum().sum()))
print("This dataset has {0} Not a Number (NaN) values".format(df.isna().sum().sum()))

## Histogram of each variable

In [None]:
# Histogram of Accident Severity of Vehicle Occupant
plotSimpleHist(df['P_ISEV'], 'Accident Severity of Vehicle Occupant', hight=10, width=20)
severity_count(df['P_ISEV']).transpose()
#severity_count(df['P_ISEV'])

In [None]:
#Year of Accident
plotSimpleHist(df['C_YEAR'], 'Year of Accident', hight=10, width=20)
by_severity_count(df, 'C_YEAR', 'P_ISEV')

In [None]:
# Accident Month
plotSimpleHist(df['C_MNTH'], 'Accident Month', hight=10, width=20)
by_severity_count(df, 'C_MNTH', 'P_ISEV')

In [None]:
# Weekday of Accident
plotSimpleHist(df['C_WDAY'], 'Weekday of Accident', hight=10, width=20)
by_severity_count(df, 'C_WDAY', 'P_ISEV')

In [None]:
# Accident Hour
plotSimpleHist(df['C_HOUR'], 'Accident Hour', hight=10, width=20)
by_severity_count(df, 'C_HOUR', 'P_ISEV')

In [None]:
# Number of vehicles involved in collision
plotSimpleHist(df['C_VEHS'], 'Number of vehicles involved in collision', hight=10, width=20)
by_severity_count(df, 'C_VEHS', 'P_ISEV')

In [None]:
# Collision configuration
plotSimpleHist(df['C_CONF'], 'Collision configuration', hight=10, width=20)
by_severity_count(df, 'C_CONF', 'P_ISEV')

In [None]:
# Roadway configuration
plotSimpleHist(df['C_RCFG'], 'Roadway configuration', hight=10, width=20)
by_severity_count(df, 'C_RCFG', 'P_ISEV')

In [None]:
# Weather condition
plotSimpleHist(df['C_WTHR'], 'Weather condition', hight=10, width=20)
by_severity_count(df, 'C_WTHR', 'P_ISEV')

In [None]:
# Road surface
plotSimpleHist(df['C_RSUR'], 'Road surface', hight=10, width=20)
by_severity_count(df, 'C_RSUR', 'P_ISEV')

In [None]:
# Road alignment
plotSimpleHist(df['C_RALN'], 'Road alignment', hight=10, width=20)
by_severity_count(df, 'C_RALN', 'P_ISEV')

In [None]:
# Traffic control
plotSimpleHist(df['C_TRAF'], 'Traffic control', hight=10, width=20)
by_severity_count(df, 'C_TRAF', 'P_ISEV')

In [None]:
# Vehicle sequence number
# 99 = not a vehile
plotSimpleHist(df['V_ID'], 'Vehicle sequence number', hight=10, width=20)
by_severity_count(df, 'V_ID', 'P_ISEV')

In [None]:
# Vehicle type
plotSimpleHist(df['V_TYPE'], 'Vehicle type', hight=10, width=20)
by_severity_count(df, 'V_TYPE', 'P_ISEV')

In [None]:
# Vehicle model year
plotSimpleHist(df['V_YEAR'], 'Vehicle model year', hight=10, width=20)
by_severity_count(df, 'V_YEAR', 'P_ISEV')

In [None]:
# Person sequence number
plotSimpleHist(df['P_ID'], 'Person sequence number', hight=10, width=20)
by_severity_count(df, 'P_ID', 'P_ISEV')

In [None]:
# Person sex
plotSimpleHist(df['P_SEX'], 'Person sex', hight=10, width=20)
by_severity_count(df, 'P_SEX', 'P_ISEV')

In [None]:
# Person age
plotSimpleHist(df['P_AGE'], 'Person age', hight=10, width=20)
by_severity_count(df, 'P_AGE', 'P_ISEV')

In [None]:
# Person position
plotSimpleHist(df['P_PSN'], 'Person position', hight=10, width=20)
by_severity_count(df, 'P_PSN', 'P_ISEV')

In [None]:
# Safety device used
plotSimpleHist(df['P_SAFE'], 'Safety device used', hight=10, width=20)
by_severity_count(df, 'P_SAFE', 'P_ISEV')

In [None]:
# Road user class
plotSimpleHist(df['P_USER'], 'Road user class', hight=10, width=20)
by_severity_count(df, 'P_USER', 'P_ISEV')

### Copy dataset

In [None]:
dfcopy2 = df.copy()

## Clean dataset

#### Clean: P_ISEV

In [None]:
#P_ISEV
#df = dfcopy2.copy()
display(severity_count(df['P_ISEV']).transpose())
plotSimpleHist(df['P_ISEV'], 'Accident Severity of Vehicle Occupant', hight=10, width=20)
print("Dropped {drop_count} rows from dataset".format(drop_count = drop_count))

#### Clean: C_YEAR

In [None]:
#C_YEAR
display(by_severity_count(df, 'C_YEAR', 'P_ISEV'))
plotSimpleHist(df['C_YEAR'], 'Year of Accident', hight=10, width=20)
print("{count} non-numeric rows observed".format(count = df['C_YEAR'].astype('str').str.contains('[^0-9]').sum()))

#### Clean: C_MNTH

In [None]:
display(by_severity_count(df, 'C_MNTH', 'P_ISEV'))
print("{count} non-numeric rows observed".format(count = df['C_WTHR'].astype('str').str.contains('[^0-9]').sum()))
gbC_MNTH = checkHistogram(df, 'C_MNTH', 'P_ISEV', False)
P = computeP(gbC_MNTH)
#data_to_impute = df['C_MNTH'].astype('str').str.contains('UU')
#df[data_to_impute] = df.loc[data_to_impute].apply(lambda x: x.replace('UU', selectBin(1, 13, P, True)))

data_to_impute = df['C_MNTH'].astype('str').str.contains('UU')
df['C_MNTH'][data_to_impute] = df['C_MNTH'][data_to_impute].apply(lambda x: x.replace('UU', selectBin(1, 13, P, True)))

print("{count} non-numeric rows observed".format(count = df['C_MNTH'].astype('str').str.contains('[^0-9]').sum()))
plotSimpleHist(df['C_MNTH'], 'Accident Month', hight=10, width=20)
display(by_severity_count(df, 'C_MNTH', 'P_ISEV'))

#### Clean: C_WDAY

In [None]:
# C_WDAY
display(by_severity_count(df, 'C_WDAY', 'P_ISEV')) 
cleanData(df, 'C_WDAY', 'P_ISEV', ['U'], 1, 8, False)
plotSimpleHist(df['C_WDAY'], 'Weekday of Accident', hight=10, width=20)
display(by_severity_count(df, 'C_WDAY', 'P_ISEV'))

#### Clean: C_HOUR

In [None]:
#C_HOUR
display(by_severity_count(df, 'C_HOUR', 'P_ISEV')) 
cleanData(df, 'C_HOUR', 'P_ISEV', ['UU'], 1, 25, True)
plotSimpleHist(df['C_HOUR'], 'Time of Accident', hight=10, width=20)
display(by_severity_count(df, 'C_HOUR', 'P_ISEV'))

#### Clean: C_VEHS

In [None]:
# C_VEHS
display(by_severity_count(df, 'C_VEHS', 'P_ISEV')) 
cleanData(df, 'C_VEHS', 'P_ISEV', ['UU'], 1, 78, True)
plotSimpleHist(df['C_VEHS'], 'Number of vehicles involved in collision', hight=10, width=20)
display(by_severity_count(df, 'C_VEHS', 'P_ISEV'))

#### Clean: C_CONF

In [None]:
display(by_severity_count(df, 'C_CONF', 'P_ISEV')) 
cleanData(df, 'C_CONF', 'P_ISEV', ['UU', 'QQ'], 1, 42, True)
plotSimpleHist(df['C_CONF'], 'Collision configuration', hight=10, width=20)
display(by_severity_count(df, 'C_CONF', 'P_ISEV'))

#### Clean: C_RCFG

In [None]:
display(by_severity_count(df, 'C_RCFG', 'P_ISEV')) 
cleanData(df, 'C_RCFG', 'P_ISEV', ['UU', 'QQ'], 1, 11, True)
plotSimpleHist(df['C_RCFG'], 'Roadway configuration', hight=10, width=20)
display(by_severity_count(df, 'C_RCFG', 'P_ISEV'))

#### Clean: C_WTHR

In [None]:
#C_WTHR
display(by_severity_count(df, 'C_WTHR', 'P_ISEV')) 
cleanData(df, 'C_WTHR', 'P_ISEV', ['U', 'Q'], 1, 8, False)
plotSimpleHist(df['C_WTHR'], 'Weather condition', hight=10, width=20)
display(by_severity_count(df, 'C_WTHR', 'P_ISEV'))

#### Clean: Road Surface (C_RSUR)

In [None]:
#C_RSUR
display(by_severity_count(df, 'C_RSUR', 'P_ISEV')) 
cleanData(df, 'C_RSUR', 'P_ISEV', ['U', 'Q'], 1, 10, False)
plotSimpleHist(df['C_RSUR'], 'Road surface', hight=10, width=20)
display(by_severity_count(df, 'C_RSUR', 'P_ISEV'))

#### Clean: Road alignment (C_RALN)

In [None]:
#C_RALN
display(by_severity_count(df, 'C_RALN', 'P_ISEV')) 
cleanData(df, 'C_RALN', 'P_ISEV', ['U', 'Q'], 1, 7, False)
plotSimpleHist(df['C_RALN'], 'Road alignment', hight=10, width=20)
display(by_severity_count(df, 'C_RALN', 'P_ISEV'))

#### Clean: Traffic control (C_TRAF)

In [None]:
#C_TRAF
display(by_severity_count(df, 'C_TRAF', 'P_ISEV')) 
cleanData(df, 'C_TRAF', 'P_ISEV', ['UU', 'QQ'], 1, 19, True)
plotSimpleHist(df['C_TRAF'], 'Traffic control', hight=10, width=20)
display(by_severity_count(df, 'C_TRAF', 'P_ISEV'))

In [None]:
#### Clean: Vehicle sequence number (V_ID)

In [None]:
display(by_severity_count(df, 'V_ID', 'P_ISEV')) 
cleanData(df, 'V_ID', 'P_ISEV', ['UU'], 1, 100, True)
plotSimpleHist(df['V_ID'], 'Vehicle sequence number', hight=10, width=20)
display(by_severity_count(df, 'V_ID', 'P_ISEV'))

#### Clean: Vehicle type (V_TYPE)

In [None]:
#V_TYPE
display(by_severity_count(df, 'V_TYPE', 'P_ISEV')) 
cleanData(df, 'V_TYPE', 'P_ISEV', ['UU', 'QQ', 'NN'], 1, 24, True)
plotSimpleHist(df['V_TYPE'], 'Vehicle type', hight=10, width=20)
display(by_severity_count(df, 'V_TYPE', 'P_ISEV'))

#### Clean: Vehicle model year ('V_YEAR)

In [None]:
#V_YEAR
display(by_severity_count(df, 'V_YEAR', 'P_ISEV')) 
#cleanData(df, 'V_YEAR', 'P_ISEV', ['UUUU', 'NNNN'], 1, 117, False)

#V_YEAR - Vehicle Year
data_to_impute = df['V_YEAR'].astype('str').str.contains('NNNN')
df['V_YEAR'][data_to_impute] = '2017'

#For now set UUUU to year 2018, unti we find out why the code is not working
data_to_impute = df['V_YEAR'].astype('str').str.contains('UUUU')
df['V_YEAR'][data_to_impute] = '2018'

plotSimpleHist(df['V_YEAR'], 'Vehicle model year', hight=10, width=20)
display(by_severity_count(df, 'V_YEAR', 'P_ISEV'))

In [None]:
#df = dfcopy.copy()
display(by_severity_count(df, 'P_SEX', 'P_ISEV')) 

data_to_impute = df['P_SEX'].astype('str').str.contains('F')
df['P_SEX'][data_to_impute] = '0'
data_to_impute = df['P_SEX'].astype('str').str.contains('M')
df['P_SEX'][data_to_impute] = '1'

cleanData(df, 'P_SEX', 'P_ISEV', ['U', 'N'], 0,2, False)

In [None]:
display(by_severity_count(df, 'P_SEX', 'P_ISEV')) 

#### Clean: Person sequence number (P_ID)

In [None]:
#P_ID
display(by_severity_count(df, 'P_ID', 'P_ISEV')) 
cleanData(df, 'P_ID', 'P_ISEV', ['UU'], 1, 100, True)
plotSimpleHist(df['P_ID'], 'Person sequence number', hight=10, width=20)
display(by_severity_count(df, 'P_ID', 'P_ISEV'))

#### Clean: Person sex (P_SEX)

In [None]:
#P_SEX
display(by_severity_count(df, 'P_SEX', 'P_ISEV')) 

data_to_impute = df['P_SEX'].astype('str').str.contains('F')
df['P_SEX'][data_to_impute] = '0'
data_to_impute = df['P_SEX'].astype('str').str.contains('M')
df['P_SEX'][data_to_impute] = '1'

cleanData(df, 'P_SEX', 'P_ISEV', ['U', 'N'], 1,3, False)
plotSimpleHist(df['P_SEX'], 'Person sex', hight=10, width=20)
display(by_severity_count(df, 'P_SEX', 'P_ISEV'))

#### Clean: Person age (P_AGE)

In [None]:
#P_AGE
display(by_severity_count(df, 'P_AGE', 'P_ISEV')) 
cleanData(df, 'P_AGE', 'P_ISEV', ['UU', 'NN'], 1, 100, True)
plotSimpleHist(df['P_AGE'], 'Person age', hight=10, width=20)
display(by_severity_count(df, 'P_AGE', 'P_ISEV'))

#### Clean: Person position

In [None]:
#P_PSN
display(by_severity_count(df, 'P_PSN', 'P_ISEV')) 
cleanData(df, 'P_PSN', 'P_ISEV', ['UU','NN', 'QQ'], 1, 100, True)
plotSimpleHist(df['P_PSN'], 'Person position', hight=10, width=20)
display(by_severity_count(df, 'P_PSN', 'P_ISEV'))

#### Clean: Safety device used

In [None]:
#P_SAFE
display(by_severity_count(df, 'P_SAFE', 'P_ISEV')) 
cleanData(df, 'P_SAFE', 'P_ISEV', ['UU','NN', 'QQ'], 1, 14, True)
plotSimpleHist(df['P_SAFE'], 'Safety device used', hight=10, width=20)
display(by_severity_count(df, 'P_SAFE', 'P_ISEV'))

#### Clean: Road user class

In [None]:
#P_USER
display(by_severity_count(df, 'P_USER', 'P_ISEV')) 
cleanData(df, 'P_USER', 'P_ISEV', ['U'], 1, 6, False)
plotSimpleHist(df['P_USER'], 'Road user class', hight=10, width=20)
display(by_severity_count(df, 'P_USER', 'P_ISEV'))

#### Check that we have no non numeric values

In [None]:
print("This dataset has {0} missing value".format(df.isnull().sum().sum()))
print("This dataset has {0} NA value".format(df.isna().sum().sum()))
print("This dataset has {0} non numeric value".format(df[df.index.astype('str').str.contains('[^0-9]')].sum().sum()))

### Write Dataset to clean 

In [None]:
df.to_csv('data01_clean.csv', encoding='utf-8', index=False)

In [None]:
print(df.shape)

In [None]:
#### Clean Accident month column
#import random


#Collisions by Month
#print("{count} non-numeric rows observed".format(count = df['C_MNTH'].astype('str').str.contains('[^0-9]').sum()))
#by_month = df.groupby(['C_MNTH','P_ISEV']).size()
#plt_enabled = False
#if (plt_enabled):
#    plot = by_month.plot(figsize=(20,15), kind='bar',title='Collisions by Year');
#
#tmp = df.index[df['C_MNTH'].astype('str').str.contains('[^0-9]')]
    
## Not much variations accross the months, we are going to randomly (with equal probability)
#months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']

#df['C_MNTH'] = df['C_MNTH'].apply(lambda x: x.replace('UU', np.random.choice(months)))

#print("{count} non-numeric rows observed".format(count = df['C_MNTH'].astype('str').str.contains('[^0-9]').sum()))



In [None]:
#### Clean Accident Hour column

#Collisions by weekday
#print("{count} non-numeric rows observed".format(count = df['C_HOUR'].astype('str').str.contains('[^0-9]').sum()))
#by_weekday = df.groupby(['C_HOUR','P_ISEV']).size().unstack()
#plt_enabled = False
#if (plt_enabled):
#    plot = by_weekday.plot(figsize=(20,15), kind='bar',title='Collisions by HOUR', stacked = False);

#To many levels, lets reduce the number of levels by combining the hours
#[8pm-6am], [7am-7pm]
#night = ['22', '23', '00', '01', '02', '03', '04', '05']
#morning = ['06', '07', '08', '09', '10', '11']
#activeHours = ['12', '13', '14', '15', '16', '17', '18', '19', '20', '21'] 

#df['C_HOUR'] = df['C_HOUR'].apply(lambda x: x.replace(Night, '1'))
#df['C_HOUR'] = df['C_HOUR'].apply(lambda x: x.replace(Night, '1'))
#df['C_HOUR'] = df['C_HOUR'].apply(lambda x: '1' if x in night else '2' if x in morning else '3')
#print("{count} non-numeric rows observed".format(count = df['C_HOUR'].astype('str').str.contains('[^0-9]').sum()))

#if (plt_enabled):
#    plot = by_weekday.plot(figsize=(20,15), kind='bar',title='Collisions by HOUR', stacked = False);


In [None]:
#set the correct type for each variable
#df = df.astype('category')
#df['C_YEAR'] = df['C_YEAR'].astype('int64')
#df['C_VEHS'] = df['C_VEHS'].astype('int64')
#df['V_YEAR'] = df['V_YEAR'].astype('int64')
#df['P_AGE'] = df['P_AGE'].astype('int64')

In [None]:
#df of categorical and numberic varaibales
#df_cat = df.select_dtypes(include=['category']).copy()
#df_int = df.select_dtypes(include=['int64']).copy()

In [None]:
#print(df_cat.head())
#print(df_int.head())

In [None]:
#pd.options.display.float_format = '{:.4g}'.format
#df_int.describe()

In [None]:
#df_int[['C_VEHS']].boxplot()

In [None]:
#df_int[['P_AGE']].boxplot()

In [None]:
if debug:
    #The overall diagram
    plot1 = df1.plot(figsize=(10,10),title='Collision overall statistics')
    plot1.set_xlabel("Year")
    plot1.set_ylabel("Number of collisions");

In [None]:
if debug:
    #Collisions by weekday
    by_weekday = df_b.groupby('C_WDAY')['P_ISEV'].count()
    by_weekday.index = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    plot2 = by_weekday.plot(kind='bar',title='Collisions by day');

In [None]:
if debug:
    #Collisions by hour
    plt.figure(figsize=(15,5))
    by_hour = df_b.groupby('C_HOUR')['P_ISEV'].count()
    plot3 = by_hour.plot(kind='bar',title='Collisions by hour',color='B');

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['C_YEAR', 'P_ISEV']).size().unstack().plot(figsize=(10,10), kind='bar', title = 'Year vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['C_MNTH', 'P_ISEV']).size().unstack().plot(figsize=(10,10), kind='bar', title = 'Month vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['C_WDAY', 'P_ISEV']).size().unstack().plot(figsize=(10,10), kind='bar', title = 'Day vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['C_HOUR', 'P_ISEV']).size().unstack().plot(figsize=(10,10), kind='bar', title = 'HOUR vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['C_VEHS', 'P_ISEV']).size().unstack().plot(figsize=(10,10), kind='bar', title = 'C_VEHS vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['C_CONF', 'P_ISEV']).size().unstack().plot(figsize=(10,20), kind='bar', title = 'C_CONF vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['C_RCFG', 'P_ISEV']).size().unstack().plot(figsize=(10,20), kind='bar', title = 'C_RCFG vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['C_WTHR', 'P_ISEV']).size().unstack().plot(figsize=(10,20), kind='bar', title = 'C_WTHR vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['C_RSUR', 'P_ISEV']).size().unstack().plot(figsize=(10,20), kind='bar', title = 'C_WTHR vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['C_RALN', 'P_ISEV']).size().unstack().plot(figsize=(10,20), kind='bar', title = 'C_RALN vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['C_TRAF', 'P_ISEV']).size().unstack().plot(figsize=(10,20), kind='bar', title = 'C_TRAF vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['V_ID', 'P_ISEV']).size().unstack().plot(figsize=(10,20), kind='bar', title = 'V_ID vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['V_TYPE', 'P_ISEV']).size().unstack().plot(figsize=(10,20), kind='bar', title = 'V_TYPE vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['V_YEAR', 'P_ISEV']).size().unstack().plot(figsize=(25,20), kind='bar', title = 'V_YEAR vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['P_ID', 'P_ISEV']).size().unstack().plot(figsize=(25,20), kind='bar', title = 'P_ID vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['P_SEX', 'P_ISEV']).size().unstack().plot(figsize=(10,20), kind='bar', title = 'P_SEX vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['P_AGE', 'P_ISEV']).size().unstack().plot(figsize=(25,20), kind='bar', title = 'P_AGE vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['P_PSN', 'P_ISEV']).size().unstack().plot(figsize=(10,20), kind='bar', title = 'P_PSN vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['P_SAFE', 'P_ISEV']).size().unstack().plot(figsize=(10,20), kind='bar', title = 'P_SAFE vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['P_USER', 'P_ISEV']).size().unstack().plot(figsize=(10,15), kind='bar', title = 'P_USER vs Accident Severity', stacked = True)

In [None]:
if debug:
    #Year vs Severity
    df.groupby(['P_ISEV', 'P_ISEV']).size().unstack().plot(figsize=(8,10), kind='bar', title = 'P_ISEV vs Accident Severity', stacked = True)

In [None]:
#%matplotlib inline
#df_int[['C_VEHS', 'P_AGE']].boxplot()

In [None]:
#check if dataset contains nulls
if df.isnull().sum().sum():
    print("Dataset contains Null values")
else:
    print("No Null values in the Dataset")

In [None]:
if df.isna().sum().sum():
    print("Dataset contains missing values")
else:
    print("No missing values in the Dataset")