In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

  from pandas.core import datetools


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
acc_cas = {}
for i in range(2012, 2017):
    idx = str(i)
    acc = pd.read_excel('./input/{0}/Acc_cas {0}.xls'.format(idx))
    acc = acc[['ACC_TIME', 'WEEK_DAY', 'RAIN', 'SPEED_LMT', 'TRAFF_CONG', 'JCN_CTRL',
               'RD_TYPE', 'CAS_AGE', 'CAS_SEX', 'INJURY', 'ROLE', 'SB_WORN', 'SEAT', 
               'PED_LOCATN']]

    acc['YEAR'] = idx
    acc_cas[idx] = acc
    print('The Year %s is read.' % idx)

The Year 2012 is read.
The Year 2013 is read.
The Year 2014 is read.
The Year 2015 is read.
The Year 2016 is read.


In [4]:
acc = acc_cas['2012'].copy()
for i in range(2013, 2017):
    acc = acc.append(acc_cas[str(i)].copy())

In [5]:
def str_append(x):
    if isinstance(x, str):
        counter = ['0' for i in range(4 - len(x))]
        counter = ''.join(counter)
        x = counter + x
        return x
    else:
        raise TypeError('The function only support string-type input.')


acc['ACC_TIME'] = acc['ACC_TIME'].map(
    lambda x: str_append(str(x)))
acc['HOUR'] = pd.to_datetime(
    acc['ACC_TIME'], format='%H%M').dt.hour
t0 = acc['HOUR'][acc['HOUR'].isin(
    [7, 8, 9])].count()  # 700am-959am
t1 = acc['HOUR'][acc['HOUR'].isin(
    [10, 11, 12, 13, 14, 15])].count()  # 1000am-0359pm
t2 = acc['HOUR'][acc['HOUR'].isin(
    [16, 17, 18])].count()  # 0400pm-0659pm
t3 = acc['HOUR'][acc['HOUR'].isin(
    [19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6])].count()  # 0700pm-0659am
print('Time: Count(Proportion)')
print('7:00 - 9:59 a.m.: %d(%.1f%%)' %
      (t0, t0/acc['HOUR'].count()*100))
print('10:00 a.m. - 3:59 p.m.: %d(%.1f%%)' %
      (t1, t1/acc['HOUR'].count()*100))
print('4:00 - 6:59 p.m.: %d(%.1f%%)' %
      (t2, t2/acc['HOUR'].count()*100))
print('7:00 p.m. - 6:59 a.m.: %d(%.1f%%)' %
      (t3, t3/acc['HOUR'].count()*100))

Time: Count(Proportion)
7:00 - 9:59 a.m.: 15596(15.4%)
10:00 a.m. - 3:59 p.m.: 33959(33.6%)
4:00 - 6:59 p.m.: 20287(20.1%)
7:00 p.m. - 6:59 a.m.: 31331(31.0%)


In [6]:
weekday_row = acc['WEEK_DAY'].isin([1, 2, 3, 4, 5])
weekday = acc['WEEK_DAY'][weekday_row].count()
weekend = acc['WEEK_DAY'][~weekday_row].count()
print('Day of week: Count(Proportion)')
print('Monday - Friday: %d(%.1f%%)' %
      (weekday, weekday/acc['WEEK_DAY'].count()*100))
print('Weekend: %d(%.1f%%)' %
      (weekend, weekend/acc['WEEK_DAY'].count()*100))

Day of week: Count(Proportion)
Monday - Friday: 70624(69.8%)
Weekend: 30549(30.2%)


In [7]:
r0 = acc['RAIN'][acc['RAIN'] == 1].count()
r1 = acc['RAIN'][acc['RAIN'] == 2].count()
r2 = acc['RAIN'][acc['RAIN'] == 3].count()
r3 = acc['RAIN'][acc['RAIN'].isin([9, 0])].count()

print('Rain: Count(Proportion)')
print('No Rain: %d(%.1f%%)' % (r0, r0/acc['RAIN'].count()*100))
print('Light Rain: %d(%.1f%%)' % (r1, r1/acc['RAIN'].count()*100))
print('Heavy Rain: %d(%.1f%%)' % (r2, r2/acc['RAIN'].count()*100))
print('Unknown: %d(%.1f%%)' % (r3, r3/acc['RAIN'].count()*100))

Rain: Count(Proportion)
No Rain: 86443(85.4%)
Light Rain: 12315(12.2%)
Heavy Rain: 2158(2.1%)
Unknown: 257(0.3%)


In [8]:
s0 = acc['SPEED_LMT'][acc['SPEED_LMT'] == 50].count()
s1 = acc['SPEED_LMT'][acc['SPEED_LMT'] < 50].count()
s2 = acc['SPEED_LMT'][acc['SPEED_LMT'] > 50].count()

print('Speed Limit: Count(Proportion)')
print('50km/h: %d(%.1f%%)' % (s0, s0/acc['SPEED_LMT'].count()*100))
print('Under 50km/h: %d(%.1f%%)' % (s1, s1/acc['SPEED_LMT'].count()*100))
print('Above 50km/h: %d(%.1f%%)' % (s2, s2/acc['SPEED_LMT'].count()*100))

Speed Limit: Count(Proportion)
50km/h: 86065(85.1%)
Under 50km/h: 2039(2.0%)
Above 50km/h: 13069(12.9%)


In [9]:
t0 = acc['TRAFF_CONG'][acc['TRAFF_CONG'] == 3].count()
t1 = acc['TRAFF_CONG'][acc['TRAFF_CONG'] == 1].count()
t2 = acc['TRAFF_CONG'][acc['TRAFF_CONG'] == 2].count()
t3 = acc['TRAFF_CONG'][acc['TRAFF_CONG'].isin([9, 0])].count()

print('Traffic Congestion: Count(Proportion)')
print('None: %d(%.1f%%)' % (t0, t0/acc['TRAFF_CONG'].count()*100))
print('Severe Congestion: %d(%.1f%%)' % (t1, t1/acc['TRAFF_CONG'].count()*100))
print('Moderate Congestion: %d(%.1f%%)' % (t2, t2/acc['TRAFF_CONG'].count()*100))
print('Unknown: %d(%.1f%%)' % (t3, t3/acc['TRAFF_CONG'].count()*100))

Traffic Congestion: Count(Proportion)
None: 55863(55.2%)
Severe Congestion: 15458(15.3%)
Moderate Congestion: 28381(28.1%)
Unknown: 1471(1.5%)


In [10]:
j0 = acc['JCN_CTRL'][acc['JCN_CTRL'] == 6].count()
j1 = acc['JCN_CTRL'][acc['JCN_CTRL'] == 4].count()
j2 = acc['JCN_CTRL'][acc['JCN_CTRL'].isin([2, 3, 5])].count()
j3 = acc['JCN_CTRL'][acc['JCN_CTRL'] == 1].count()
j4 = acc['JCN_CTRL'][acc['JCN_CTRL'] == 0].count()

print('Traffic Congestion: Count(Proportion)')
print('Non Junction: %d(%.1f%%)' % (j0, j0/acc['JCN_CTRL'].count()*100))
print('Traffic Signal: %d(%.1f%%)' % (j1, j1/acc['JCN_CTRL'].count()*100))
print('Other Control Types: %d(%.1f%%)' % (j2, j2/acc['JCN_CTRL'].count()*100))
print('No Control: %d(%.1f%%)' % (j3, j3/acc['JCN_CTRL'].count()*100))
print('Unknown: %d(%.1f%%)' % (j4, j4/acc['JCN_CTRL'].count()*100))

Traffic Congestion: Count(Proportion)
Non Junction: 77440(76.5%)
Traffic Signal: 13228(13.1%)
Other Control Types: 4762(4.7%)
No Control: 5711(5.6%)
Unknown: 32(0.0%)


In [11]:
r0 = acc['RD_TYPE'][acc['RD_TYPE'].isin([3, 4])].count()
r1 = acc['RD_TYPE'][acc['RD_TYPE'] == 1].count()
r2 = acc['RD_TYPE'][acc['RD_TYPE'] == 2].count()
r3 = acc['RD_TYPE'][acc['RD_TYPE'] == 0].count()

print('Traffic Congestion: Count(Proportion)')
print('Multi-/dual Carriageway: %d(%.1f%%)' % (r0, r0/acc['RD_TYPE'].count()*100))
print('One-way Carriageway: %d(%.1f%%)' % (r1, r1/acc['RD_TYPE'].count()*100))
print('Two-way Carriageway: %d(%.1f%%)' % (r2, r2/acc['RD_TYPE'].count()*100))
print('Unknown: %d(%.1f%%)' % (r3, r3/acc['RD_TYPE'].count()*100))

Traffic Congestion: Count(Proportion)
Multi-/dual Carriageway: 24412(24.1%)
One-way Carriageway: 41196(40.7%)
Two-way Carriageway: 35558(35.1%)
Unknown: 7(0.0%)


In [12]:
# split out the values of unknown
unknown_row = acc['CAS_AGE'].isin([0, 99, 999])
unknown = acc['CAS_AGE'][unknown_row]
dta_age = acc['CAS_AGE'][~unknown_row]

age_grouped = pd.cut(dta_age, [0, 15, 65, 120], labels=['<15', '15-65', '>65'])
age0 = age_grouped.value_counts()['<15']
age1 = age_grouped.value_counts()['15-65']
age2 = age_grouped.value_counts()['>65']
ageunknown = unknown.count()
print('Age: Count(Proportion)')
print('Under 15: %d(%.1f%%)' %
      (age0, age0/acc['CAS_AGE'].count()*100))
print('15 - 65: %d(%.1f%%)' %
      (age1, age1/acc['CAS_AGE'].count()*100))
print('Above 65: %d(%.1f%%)' %
      (age2, age2/acc['CAS_AGE'].count()*100))
print('Unknown: %d(%.1f%%)' %
      (ageunknown, ageunknown/acc['CAS_AGE'].count()*100))

Age: Count(Proportion)
Under 15: 6674(6.6%)
15 - 65: 84305(83.3%)
Above 65: 9125(9.0%)
Unknown: 1066(1.1%)


In [13]:
sex0 = acc['CAS_SEX'].value_counts()[1]
sex1 = acc['CAS_SEX'].value_counts()[2]
sex2 = acc['CAS_SEX'].value_counts()[9]
print('Sex: Count(Proportion)')
print('Male: %d(%.1f%%)' %
      (sex0, sex0/acc['CAS_SEX'].count()*100))
print('Famale: %d(%.1f%%)' %
      (sex1, sex1/acc['CAS_SEX'].count()*100))
print('Unknown: %d(%.1f%%)' %
      (sex2, sex2/acc['CAS_SEX'].count()*100))

Sex: Count(Proportion)
Male: 63618(62.9%)
Famale: 37439(37.0%)
Unknown: 116(0.1%)


In [14]:
injury0 = acc['INJURY'].value_counts()[1] + acc['INJURY'].value_counts()[2]
injury1 = acc['INJURY'].value_counts()[3]
print('Injury Severity: Count(Proportion)')
print('Killed or severe injury: %d(%.1f%%)' %
      (injury0, injury0/acc['INJURY'].count()*100))
print('Slight Injury: %d(%.1f%%)' %
      (injury1, injury1/acc['INJURY'].count()*100))

Injury Severity: Count(Proportion)
Killed or severe injury: 13565(13.4%)
Slight Injury: 87608(86.6%)


In [15]:
sb0 = acc['SB_WORN'][acc['SB_WORN'] == 1].count()
sb1 = acc['SB_WORN'][acc['SB_WORN'] == 2].count()
sb2 = acc['SB_WORN'][acc['SB_WORN'].isin([9, 0])].count()

print('Seat belt or crash helmet worn: Count(Proportion)')
print('Yes: %d(%.1f%%)' % (sb0, sb0/acc['SB_WORN'].count()*100))
print('No: %d(%.1f%%)' % (sb1, sb1/acc['SB_WORN'].count()*100))
print('Unknown: %d(%.1f%%)' % (sb2, sb2/acc['SB_WORN'].count()*100))

Seat belt or crash helmet worn: Count(Proportion)
Yes: 63143(62.4%)
No: 18650(18.4%)
Unknown: 19380(19.2%)


In [16]:
# Split into three group to check these variables proportion
accd = acc[acc['ROLE'] == 1]
accp = acc[acc['ROLE'] == 2]
accpd = acc[acc['ROLE'] == 3]

In [17]:
sb0 = accd['SB_WORN'][accd['SB_WORN'] == 1].count()
sb1 = accd['SB_WORN'][accd['SB_WORN'] == 2].count()
sb2 = accd['SB_WORN'][accd['SB_WORN'].isin([9, 0])].count()

print('Seat belt or crash helmet worn: Count(Proportion)')
print('Yes: %d(%.1f%%)' % (sb0, sb0/accd['SB_WORN'].count()*100))
print('No: %d(%.1f%%)' % (sb1, sb1/accd['SB_WORN'].count()*100))
print('Unknown: %d(%.1f%%)' % (sb2, sb2/accd['SB_WORN'].count()*100))

Seat belt or crash helmet worn: Count(Proportion)
Yes: 39749(80.9%)
No: 8868(18.0%)
Unknown: 544(1.1%)


In [18]:
sb0 = accp['SB_WORN'][accp['SB_WORN'] == 1].count()
sb1 = accp['SB_WORN'][accp['SB_WORN'] == 2].count()
sb2 = accp['SB_WORN'][accp['SB_WORN'].isin([9, 0])].count()

print('Seat belt or crash helmet worn: Count(Proportion)')
print('Yes: %d(%.1f%%)' % (sb0, sb0/accp['SB_WORN'].count()*100))
print('No: %d(%.1f%%)' % (sb1, sb1/accp['SB_WORN'].count()*100))
print('Unknown: %d(%.1f%%)' % (sb2, sb2/accp['SB_WORN'].count()*100))

Seat belt or crash helmet worn: Count(Proportion)
Yes: 23394(68.9%)
No: 9782(28.8%)
Unknown: 774(2.3%)


In [19]:
sb0 = accpd['SB_WORN'][accpd['SB_WORN'] == 1].count()
sb1 = accpd['SB_WORN'][accpd['SB_WORN'] == 2].count()
sb2 = accpd['SB_WORN'][accpd['SB_WORN'].isin([9, 0])].count()

print('Seat belt or crash helmet worn: Count(Proportion)')
print('Yes: %d(%.1f%%)' % (sb0, sb0/accpd['SB_WORN'].count()*100))
print('No: %d(%.1f%%)' % (sb1, sb1/accpd['SB_WORN'].count()*100))
print('Unknown: %d(%.1f%%)' % (sb2, sb2/accpd['SB_WORN'].count()*100))

Seat belt or crash helmet worn: Count(Proportion)
Yes: 0(0.0%)
No: 0(0.0%)
Unknown: 18062(100.0%)


In [20]:
s0 = acc['SEAT'][acc['SEAT'] == 3].count()
s1 = acc['SEAT'][acc['SEAT'] == 2].count()
s2 = acc['SEAT'][acc['SEAT'] == 1].count()
s3 = acc['SEAT'][acc['SEAT'].isin([4, 8])].count()
s4 = accd['SEAT'][accd['SEAT'].isin([6, 7])].count()
s5 = accd['SEAT'][accd['SEAT'].isin([0, 9])].count()

print('Seat Occupied: Count(Proportion)')
print('Driver/Rider: %d(%.1f%%)' % (s0, s0/acc['SEAT'].count()*100))
print('Front Nearside: %d(%.1f%%)' % (s1, s1/acc['SEAT'].count()*100))
print('Rear: %d(%.1f%%)' % (s2, s2/acc['SEAT'].count()*100))
print('Standing: %d(%.1f%%)' % (s3, s3/acc['SEAT'].count()*100))
print('Others: %d(%.1f%%)' % (s4, s4/acc['SEAT'].count()*100))
print('Unknown: %d(%.1f%%)' % (s5, s5/acc['SEAT'].count()*100))

Seat Occupied: Count(Proportion)
Driver/Rider: 49161(48.6%)
Front Nearside: 6202(6.1%)
Rear: 20230(20.0%)
Standing: 6408(6.3%)
Others: 0(0.0%)
Unknown: 0(0.0%)


In [21]:
s0 = accd['SEAT'][accd['SEAT'] == 3].count()
s1 = accd['SEAT'][accd['SEAT'] == 2].count()
s2 = accd['SEAT'][accd['SEAT'] == 1].count()
s3 = accd['SEAT'][accd['SEAT'].isin([4, 8])].count()
s4 = accd['SEAT'][accd['SEAT'].isin([6, 7])].count()
s5 = accd['SEAT'][accd['SEAT'].isin([0, 9])].count()

print('Seat Occupied: Count(Proportion)')
print('Driver/Rider: %d(%.1f%%)' % (s0, s0/accd['SEAT'].count()*100))
print('Front Nearside: %d(%.1f%%)' % (s1, s1/accd['SEAT'].count()*100))
print('Rear: %d(%.1f%%)' % (s2, s2/accd['SEAT'].count()*100))
print('Standing: %d(%.1f%%)' % (s3, s3/accd['SEAT'].count()*100))
print('Others: %d(%.1f%%)' % (s4, s4/accd['SEAT'].count()*100))
print('Unknown: %d(%.1f%%)' % (s5, s5/accd['SEAT'].count()*100))

Seat Occupied: Count(Proportion)
Driver/Rider: 49161(100.0%)
Front Nearside: 0(0.0%)
Rear: 0(0.0%)
Standing: 0(0.0%)
Others: 0(0.0%)
Unknown: 0(0.0%)


In [22]:
s0 = accp['SEAT'][accp['SEAT'] == 3].count()
s1 = accp['SEAT'][accp['SEAT'] == 2].count()
s2 = accp['SEAT'][accp['SEAT'] == 1].count()
s3 = accp['SEAT'][accp['SEAT'].isin([4, 8])].count()
s4 = accp['SEAT'][accp['SEAT'].isin([6, 7])].count()
s5 = accp['SEAT'][accp['SEAT'].isin([0, 9])].count()

print('Seat Occupied: Count(Proportion)')
print('Driver/Rider: %d(%.1f%%)' % (s0, s0/accp['SEAT'].count()*100))
print('Front Nearside: %d(%.1f%%)' % (s1, s1/accp['SEAT'].count()*100))
print('Rear: %d(%.1f%%)' % (s2, s2/accp['SEAT'].count()*100))
print('Standing: %d(%.1f%%)' % (s3, s3/accp['SEAT'].count()*100))
print('Others: %d(%.1f%%)' % (s4, s4/accp['SEAT'].count()*100))
print('Unknown: %d(%.1f%%)' % (s5, s5/accp['SEAT'].count()*100))

Seat Occupied: Count(Proportion)
Driver/Rider: 0(0.0%)
Front Nearside: 6202(18.3%)
Rear: 20230(59.6%)
Standing: 6408(18.9%)
Others: 20(0.1%)
Unknown: 1090(3.2%)


In [23]:
s0 = accpd['SEAT'][accpd['SEAT'] == 3].count()
s1 = accpd['SEAT'][accpd['SEAT'] == 2].count()
s2 = accpd['SEAT'][accpd['SEAT'] == 1].count()
s3 = accpd['SEAT'][accpd['SEAT'].isin([4, 8])].count()
s4 = accpd['SEAT'][accpd['SEAT'].isin([6, 7])].count()
s5 = accpd['SEAT'][accpd['SEAT'].isin([0, 9])].count()

print('Seat Occupied: Count(Proportion)')
print('Driver/Rider: %d(%.1f%%)' % (s0, s0/accpd['SEAT'].count()*100))
print('Front Nearside: %d(%.1f%%)' % (s1, s1/accpd['SEAT'].count()*100))
print('Rear: %d(%.1f%%)' % (s2, s2/accpd['SEAT'].count()*100))
print('Standing: %d(%.1f%%)' % (s3, s3/accpd['SEAT'].count()*100))
print('Others: %d(%.1f%%)' % (s4, s4/accpd['SEAT'].count()*100))
print('Unknown: %d(%.1f%%)' % (s5, s5/accpd['SEAT'].count()*100))

Seat Occupied: Count(Proportion)
Driver/Rider: 0(0.0%)
Front Nearside: 0(0.0%)
Rear: 0(0.0%)
Standing: 0(0.0%)
Others: 0(0.0%)
Unknown: 18062(100.0%)


In [24]:
p0 = acc['PED_LOCATN'][acc['PED_LOCATN'] == 3].count()
p1 = acc['PED_LOCATN'][acc['PED_LOCATN'] == 4].count()
p2 = acc['PED_LOCATN'][acc['PED_LOCATN'].isin([1, 2, 5, 8])].count()
p3 = acc['PED_LOCATN'][acc['PED_LOCATN'].isin([0, 9])].count()

print('Pedestrian Location: Count(Proportion)')
print('On controlled crossing: %d(%.1f%%)' % (p0, p0/acc['PED_LOCATN'].count()*100))
print('Within 15M of controlled crossing: %d(%.1f%%)' % (p1, p1/acc['PED_LOCATN'].count()*100))
print('Others: %d(%.1f%%)' % (p2, p2/acc['PED_LOCATN'].count()*100))
print('Unknown: %d(%1.f%%)' % (p3, p3/acc['PED_LOCATN'].count()*100))

Pedestrian Location: Count(Proportion)
On controlled crossing: 2591(2.6%)
Within 15M of controlled crossing: 1059(1.0%)
Others: 14134(14.0%)
Unknown: 83389(82%)


In [25]:
pp0 = accpd['PED_LOCATN'][accpd['PED_LOCATN'] == 3].count()
pp1 = accpd['PED_LOCATN'][accpd['PED_LOCATN'] == 4].count()
pp2 = accpd['PED_LOCATN'][accpd['PED_LOCATN'].isin([1, 2, 5, 8])].count()
pp3 = accpd['PED_LOCATN'][accpd['PED_LOCATN'].isin([0, 9])].count()

print('Pedestrian Location: Count(Proportion)')
print('On controlled crossing: %d(%.1f%%)' % (pp0, pp0/accpd['PED_LOCATN'].count()*100))
print('Within 15M of controlled crossing: %d(%.1f%%)' % (pp1, pp1/accpd['PED_LOCATN'].count()*100))
print('Others: %d(%.1f%%)' % (pp2, pp2/accpd['PED_LOCATN'].count()*100))
print('Unkown: %d(%1.f%%)' % (pp3, pp3/accpd['PED_LOCATN'].count()*100))

Pedestrian Location: Count(Proportion)
On controlled crossing: 2591(14.3%)
Within 15M of controlled crossing: 1059(5.9%)
Others: 14134(78.3%)
Unkown: 278(2%)


In [26]:
print(acc['YEAR'].value_counts())
print(acc['YEAR'].value_counts(normalize=True))

2013    20596
2015    20381
2012    20210
2016    20132
2014    19854
Name: YEAR, dtype: int64
2013    0.203572
2015    0.201447
2012    0.199757
2016    0.198986
2014    0.196238
Name: YEAR, dtype: float64


In [27]:
# ACC_TIME
acc['ACC_TIME'] = acc['ACC_TIME'].map(
    lambda x: str_append(str(x)))
acc['HOUR'] = pd.to_datetime(acc['ACC_TIME'], format='%H%M').dt.hour
# Split into four group,
acc['HOUR'][acc['HOUR'].isin([7, 8, 9])] = '7:00 - 9:59 a.m.'
acc['HOUR'][acc['HOUR'].isin(
    [10, 11, 12, 13, 14, 15])] = '10:00 a.m. - 3:59 p.m.'
acc['HOUR'][acc['HOUR'].isin([16, 17, 18])] = '4:00 - 6:59 p.m.'
acc['HOUR'][acc['HOUR'].isin(
    [19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6])] = '7:00 p.m. - 6:59 a.m.'
# Convert to dummy variables
acc['HOUR'][acc['HOUR'] == '7:00 p.m. - 6:59 a.m.'] = 0
acc['HOUR'][acc['HOUR'] == '7:00 - 9:59 a.m.'] = 1
acc['HOUR'][acc['HOUR'] == '10:00 a.m. - 3:59 p.m.'] = 2
acc['HOUR'][acc['HOUR'] == '4:00 - 6:59 p.m.'] = 3
# Merge into acc
dummy = pd.get_dummies(acc['HOUR'], drop_first=True).rename(columns={1: '7:00 - 9:59 a.m.',
                                                                     2: '10:00 a.m. - 3:59 p.m.',
                                                                     3: '4:00 - 6:59 p.m.'})
acc = pd.concat([acc, dummy], axis=1).drop(['HOUR', 'ACC_TIME'], axis=1)

In [28]:
# Similarly, group and dummy WEEK_DAY,
# Split into two group, weekday and weekend, and Convert to dummy variables
acc['WEEK_DAY'][acc['WEEK_DAY'].isin([1, 2, 3, 4, 5])] = 1
acc['WEEK_DAY'][acc['WEEK_DAY'].isin([6, 7])] = 0
# Merge into acc
dummy = pd.get_dummies(acc['WEEK_DAY'], drop_first=True).rename(columns={1: 'Monday - Friday'})
acc = pd.concat([acc, dummy], axis=1).drop('WEEK_DAY', axis=1)

In [29]:
# Similarly, group and dummy RAIN
# Remove Unknown rows,
acc = acc[~acc['RAIN'].isin([9, 0])]
# Convert to dummy variables
acc['RAIN'][acc['RAIN'] == 1] = 0
acc['RAIN'][acc['RAIN'] == 2] = 1
acc['RAIN'][acc['RAIN'] == 3] = 2
# Merge into acc
dummy = pd.get_dummies(acc['RAIN'], drop_first=True).rename(columns={1: 'Light rain',
                                                                     2: 'Heavy rain'})
acc = pd.concat([acc, dummy], axis=1).drop('RAIN', axis=1)

In [30]:
# Similarly, group and dummy SPEED_LMT
# Convert to dummy variables
acc['SPEED_LMT'][acc['SPEED_LMT'] > 50] = 0
acc['SPEED_LMT'][acc['SPEED_LMT'] < 50] = 1
acc['SPEED_LMT'][acc['SPEED_LMT'] == 50] = 2
# Merge into acc
dummy = pd.get_dummies(acc['SPEED_LMT'], drop_first=True).rename(columns={1: 'Below 50 km/h',
                                                                          2: '50 km/h'})
acc = pd.concat([acc, dummy], axis=1).drop('SPEED_LMT', axis=1)

In [31]:
# Similarly, group and dummy TRAFF_CONG
# Remove Unknown rows,
acc = acc[~acc['TRAFF_CONG'].isin([9, 0])]
# Convert to dummy variables
acc['TRAFF_CONG'][acc['TRAFF_CONG'] == 3] = 0
# Merge into acc
dummy = pd.get_dummies(acc['TRAFF_CONG'], drop_first=True).rename(columns={1: 'Severe Congestion',
                                                                           2: 'Moderate Congestion'})
acc = pd.concat([acc, dummy], axis=1).drop('TRAFF_CONG', axis=1)

In [32]:
# Similarly, group and dummy JCN_CTRL
# Remove Unknown rows,
acc = acc[~acc['JCN_CTRL'].isin([9, 0])]
# Convert to dummy variables
acc['JCN_CTRL'][acc['JCN_CTRL'] == 6] = 0
acc['JCN_CTRL'][acc['JCN_CTRL'] == 4] = 1
acc['JCN_CTRL'][acc['JCN_CTRL'].isin([2, 3, 5])] = 2
acc['JCN_CTRL'][acc['JCN_CTRL'] == 1] = 3
# Merge into acc
dummy = pd.get_dummies(acc['JCN_CTRL'], drop_first=True).rename(columns={1: 'Non Junction',
                                                                         2: 'Traffic Signal',
                                                                         3: 'Other Control Types',
                                                                         4: 'No Control'})
acc = pd.concat([acc, dummy], axis=1).drop('JCN_CTRL', axis=1)

In [33]:
# Similarly, group and dummy RD_TYPE
# Remove Unknown rows
acc = acc[~acc['RD_TYPE'].isin([9, 0])]
# Convert to dummy variables
acc['RD_TYPE'][acc['RD_TYPE'].isin([3, 4])] = 0
# Merge into acc
dummy = pd.get_dummies(acc['RD_TYPE'], drop_first=True).rename(columns={1: 'One-way Carriageway',
                                                                        2: 'Two-way Carriageway'})
acc = pd.concat([acc, dummy], axis=1).drop('RD_TYPE', axis=1)

In [34]:
# Similarly, group and dummy CAS_AGE
# Remove Unknown rows
acc = acc[~acc['CAS_AGE'].isin([0, 99, 999])]
# Convert to dummy variables
acc['CAS_AGE'] = pd.cut(acc['CAS_AGE'], [0, 15, 66, 120], labels=[0, 1, 2])
# Merge into acc
dummy = pd.get_dummies(acc['CAS_AGE']).rename(columns={0: 'Under 15',
                                                       1: '15 - 65',
                                                       2: 'Above 65'})
acc = pd.concat([acc, dummy], axis=1).drop(['CAS_AGE', '15 - 65'], axis=1)

In [35]:
# Similarly, group and dummy CAS_SEX
# Remove Unknown rows
acc = acc[~acc['CAS_SEX'].isin([0, 9])]
# Convert to dummy variables
acc['CAS_SEX'][acc['CAS_SEX'] == 2] = 0
# Merge into acc
dummy = pd.get_dummies(acc['CAS_SEX'], drop_first=True).rename(columns={1: 'Male'})
acc = pd.concat([acc, dummy], axis=1).drop('CAS_SEX', axis=1)

In [36]:
# Similarly, group and dummy Year
dummy = pd.get_dummies(acc['YEAR'], prefix='YEAR').drop('YEAR_2016', axis=1)
acc = pd.concat([acc, dummy], axis=1).drop(['YEAR'], axis=1)

In [37]:
# Group INJURY for binary classification
acc['INJURY'][acc['INJURY'].isin([1, 2])] = 1
acc['INJURY'][acc['INJURY'] == 3] = 0
# which means the model would find the prob of seriously injury with the given conditions.

In [38]:
acc.head()

Unnamed: 0,INJURY,ROLE,SB_WORN,SEAT,PED_LOCATN,7:00 - 9:59 a.m.,10:00 a.m. - 3:59 p.m.,4:00 - 6:59 p.m.,Monday - Friday,Light rain,...,Other Control Types,One-way Carriageway,Two-way Carriageway,Under 15,Above 65,Male,YEAR_2012,YEAR_2013,YEAR_2014,YEAR_2015
0,0,3,0,0,2,0,0,0,0,0,...,0,0,1,0,0,1,1,0,0,0
1,0,2,1,2,0,0,0,0,0,0,...,1,1,0,0,0,1,1,0,0,0
2,0,1,1,3,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0,0
3,0,1,1,3,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0,0
4,0,1,1,3,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [39]:
acc.columns

Index(['INJURY', 'ROLE', 'SB_WORN', 'SEAT', 'PED_LOCATN', '7:00 - 9:59 a.m.',
       '10:00 a.m. - 3:59 p.m.', '4:00 - 6:59 p.m.', 'Monday - Friday',
       'Light rain', 'Heavy rain', '50 km/h', 'Severe Congestion',
       'Moderate Congestion', 'Traffic Signal', 'Other Control Types',
       'One-way Carriageway', 'Two-way Carriageway', 'Under 15', 'Above 65',
       'Male', 'YEAR_2012', 'YEAR_2013', 'YEAR_2014', 'YEAR_2015'],
      dtype='object')

In [40]:
# split dataset to into 3 group, driver, passenger and pedestrian

In [41]:
acc_driver = acc[acc['ROLE'] == 1].drop(['ROLE', 'SEAT', 'PED_LOCATN'], axis=1)
acc_passen = acc[acc['ROLE'] == 2].drop(['ROLE', 'PED_LOCATN'], axis=1)
acc_pedest = acc[acc['ROLE'] == 3].drop(['ROLE', 'SEAT', 'SB_WORN'], axis=1)

In [42]:
# Similarly, group and dummy SB_WORN, but this is only for driver and passen
# Remove Unknown rows
acc_driver = acc_driver[~acc_driver['SB_WORN'].isin([0, 9])]
acc_passen = acc_passen[~acc_passen['SB_WORN'].isin([0, 9])]
# Convert to dummy variables
acc_driver['SB_WORN'][acc_driver['SB_WORN'] == 2] = 0
acc_passen['SB_WORN'][acc_passen['SB_WORN'] == 2] = 0
# Merge into acc
dummy = pd.get_dummies(acc_driver['SB_WORN'], drop_first=True).rename(columns={1: 'With belt or helmet'})
acc_driver = pd.concat([acc_driver, dummy], axis=1).drop('SB_WORN', axis=1)
dummy = pd.get_dummies(acc_passen['SB_WORN'], drop_first=True).rename(columns={1: 'With belt or helmet'})
acc_passen = pd.concat([acc_passen, dummy], axis=1).drop('SB_WORN', axis=1)

In [43]:
# Similarly, group and dummy SEAT, but this is only for passen
# Convert to dummy variables
acc_passen['SEAT'][acc_passen['SEAT'].isin([0, 6, 7, 9])] = 5
acc_passen['SEAT'][acc_passen['SEAT'].isin([4, 8])] = 4
acc_passen['SEAT'][acc_passen['SEAT'] == 3] = 0
# Merge into acc
# rmk Control variables is Driver/Rider
dummy = pd.get_dummies(acc_passen['SEAT'], drop_first=True).rename(
    columns={1: 'Rear', 2: 'Front nearside', 4: 'Standing', 5: 'Others'})
acc_passen = pd.concat([acc_passen, dummy], axis=1).drop('SEAT', axis=1)

In [44]:
# Similarly, group and dummy PED_LOCATN, but this is only for pedest
acc_pedest['PED_LOCATN'][acc_pedest['PED_LOCATN'].isin([0, 1, 2, 5, 8, 9])] = 0
acc_pedest['PED_LOCATN'][acc_pedest['PED_LOCATN'] == 3] = 1
acc_pedest['PED_LOCATN'][acc_pedest['PED_LOCATN'] == 4] = 2
# Merge into acc
# rmk Control variables is other
dummy = pd.get_dummies(acc_pedest['PED_LOCATN'], drop_first=True).rename(
    columns={1: 'On controlled crossing', 2: 'Within 15M of controlled crossing'})
acc_pedest = pd.concat([acc_pedest, dummy], axis=1).drop('PED_LOCATN', axis=1)

In [45]:
X_driver, y_driver = acc_driver.drop('INJURY', axis=1), acc_driver['INJURY']
X_passen, y_passen = acc_passen.drop('INJURY', axis=1), acc_passen['INJURY']
X_pedest, y_pedest = acc_pedest.drop('INJURY', axis=1), acc_pedest['INJURY']

In [46]:
X_driver = sm.add_constant(X_driver)
m_driver = sm.Logit(y_driver, X_driver)
r_driver = m_driver.fit()

Optimization terminated successfully.
         Current function value: 0.389944
         Iterations 6


In [47]:
r_driver.summary()

0,1,2,3
Dep. Variable:,INJURY,No. Observations:,47838.0
Model:,Logit,Df Residuals:,47816.0
Method:,MLE,Df Model:,21.0
Date:,"Sat, 10 Feb 2018",Pseudo R-squ.:,0.02473
Time:,22:55:58,Log-Likelihood:,-18654.0
converged:,True,LL-Null:,-19127.0
,,LLR p-value:,7.977999999999999e-187

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.2983,0.071,-18.215,0.000,-1.438,-1.159
7:00 - 9:59 a.m.,-0.2100,0.044,-4.818,0.000,-0.295,-0.125
10:00 a.m. - 3:59 p.m.,-0.1367,0.033,-4.124,0.000,-0.202,-0.072
4:00 - 6:59 p.m.,-0.2102,0.039,-5.444,0.000,-0.286,-0.135
Monday - Friday,-0.0996,0.029,-3.489,0.000,-0.156,-0.044
Light rain,-0.1419,0.043,-3.320,0.001,-0.226,-0.058
Heavy rain,-0.1377,0.100,-1.383,0.167,-0.333,0.057
50 km/h,-0.1851,0.038,-4.873,0.000,-0.260,-0.111
Severe Congestion,-0.2038,0.042,-4.846,0.000,-0.286,-0.121


In [48]:
p_driver = r_driver.params
c_driver = r_driver.conf_int()
c_driver['OR'] = p_driver
c_driver.columns = ['2.5%', '97.5%', 'Odds Ratio']
print(np.exp(c_driver))

                            2.5%     97.5%  Odds Ratio
const                   0.237408  0.313932    0.273002
7:00 - 9:59 a.m.        0.744208  0.882862    0.810576
10:00 a.m. - 3:59 p.m.  0.817346  0.930774    0.872218
4:00 - 6:59 p.m.        0.751341  0.874132    0.810414
Monday - Friday         0.855918  0.957276    0.905180
Light rain              0.797944  0.943507    0.867678
Heavy rain              0.716905  1.059166    0.871390
50 km/h                 0.771344  0.895233    0.830983
Severe Congestion       0.751066  0.885699    0.815609
Moderate Congestion     0.753412  0.860422    0.805141
Traffic Signal          0.645952  0.855301    0.743293
Other Control Types     0.655348  0.780512    0.715197
One-way Carriageway     0.848681  0.986005    0.914770
Two-way Carriageway     1.259799  1.460042    1.356230
Under 15                0.512257  0.705961    0.601360
Above 65                1.612554  2.149036    1.861568
Male                    1.122256  1.315800    1.215181
YEAR_2012 

In [49]:
#X_passen = sm.add_constant(X_passen)
m_passen = sm.Logit(y_passen, X_passen)
r_passen = m_passen.fit()

Optimization terminated successfully.
         Current function value: 0.273087
         Iterations 8


In [50]:
r_passen.summary()

0,1,2,3
Dep. Variable:,INJURY,No. Observations:,31905.0
Model:,Logit,Df Residuals:,31881.0
Method:,MLE,Df Model:,23.0
Date:,"Sat, 10 Feb 2018",Pseudo R-squ.:,-0.000183
Time:,22:55:59,Log-Likelihood:,-8712.8
converged:,True,LL-Null:,-8711.2
,,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
7:00 - 9:59 a.m.,-0.6272,0.067,-9.423,0.000,-0.758,-0.497
10:00 a.m. - 3:59 p.m.,-0.3960,0.048,-8.258,0.000,-0.490,-0.302
4:00 - 6:59 p.m.,-0.5656,0.060,-9.383,0.000,-0.684,-0.447
Monday - Friday,-0.3434,0.041,-8.420,0.000,-0.423,-0.263
Light rain,-0.0564,0.063,-0.900,0.368,-0.179,0.066
Heavy rain,-0.5154,0.161,-3.202,0.001,-0.831,-0.200
50 km/h,-0.6233,0.049,-12.781,0.000,-0.719,-0.528
Severe Congestion,-0.1200,0.058,-2.056,0.040,-0.234,-0.006
Moderate Congestion,-0.3827,0.048,-7.949,0.000,-0.477,-0.288


In [51]:
p_passen = r_passen.params
c_passen = r_passen.conf_int()
c_passen['OR'] = p_passen
c_passen.columns = ['2.5%', '97.5%', 'Odds Ratio']
print(np.exp(c_passen))

                            2.5%     97.5%  Odds Ratio
7:00 - 9:59 a.m.        0.468741  0.608492    0.534065
10:00 a.m. - 3:59 p.m.  0.612646  0.739343    0.673019
4:00 - 6:59 p.m.        0.504704  0.639230    0.567998
Monday - Friday         0.654891  0.768407    0.709382
Light rain              0.835879  1.068672    0.945135
Heavy rain              0.435646  0.818791    0.597246
50 km/h                 0.487280  0.589933    0.536155
Severe Congestion       0.791058  0.994438    0.886938
Moderate Congestion     0.620606  0.749509    0.682019
Traffic Signal          0.740601  1.141789    0.919571
Other Control Types     0.671463  0.841297    0.751598
One-way Carriageway     0.628805  0.762373    0.692376
Two-way Carriageway     0.819954  1.005671    0.908077
Under 15                0.203975  0.339195    0.263034
Above 65                1.858333  2.320094    2.076417
Male                    0.775930  0.920249    0.845014
YEAR_2012               0.448411  0.568926    0.505087
YEAR_2013 

In [52]:
#X_pedest = sm.add_constant(X_pedest)
m_pedest = sm.Logit(y_pedest, X_pedest)
r_pedest = m_pedest.fit()

Optimization terminated successfully.
         Current function value: 0.524771
         Iterations 5


In [53]:
r_pedest.summary()

0,1,2,3
Dep. Variable:,INJURY,No. Observations:,17450.0
Model:,Logit,Df Residuals:,17428.0
Method:,MLE,Df Model:,21.0
Date:,"Sat, 10 Feb 2018",Pseudo R-squ.:,0.04114
Time:,22:56:00,Log-Likelihood:,-9157.3
converged:,True,LL-Null:,-9550.1
,,LLR p-value:,9.56e-153

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
7:00 - 9:59 a.m.,-0.0570,0.056,-1.011,0.312,-0.167,0.053
10:00 a.m. - 3:59 p.m.,-0.3060,0.047,-6.557,0.000,-0.397,-0.215
4:00 - 6:59 p.m.,-0.3433,0.054,-6.342,0.000,-0.449,-0.237
Monday - Friday,-0.0506,0.040,-1.263,0.207,-0.129,0.028
Light rain,0.0139,0.061,0.228,0.820,-0.106,0.133
Heavy rain,0.1886,0.131,1.438,0.150,-0.068,0.446
50 km/h,-0.6500,0.061,-10.600,0.000,-0.770,-0.530
Severe Congestion,-0.0388,0.052,-0.746,0.455,-0.141,0.063
Moderate Congestion,-0.1675,0.042,-3.955,0.000,-0.251,-0.084


In [54]:
p_pedest = r_pedest.params
c_pedest = r_pedest.conf_int()
c_pedest['OR'] = p_pedest
c_pedest.columns = ['2.5%', '97.5%', 'Odds Ratio']
print(np.exp(c_pedest))

                                       2.5%     97.5%  Odds Ratio
7:00 - 9:59 a.m.                   0.845872  1.054907    0.944625
10:00 a.m. - 3:59 p.m.             0.672036  0.806926    0.736399
4:00 - 6:59 p.m.                   0.637973  0.788813    0.709395
Monday - Friday                    0.878949  1.028315    0.950703
Light rain                         0.899819  1.142621    1.013978
Heavy rain                         0.933831  1.561499    1.207549
50 km/h                            0.462927  0.588713    0.522045
Severe Congestion                  0.868847  1.065075    0.961971
Moderate Congestion                0.778409  0.918975    0.845777
Traffic Signal                     0.673050  0.942121    0.796300
Other Control Types                0.771804  0.920788    0.843011
One-way Carriageway                0.539796  0.657437    0.595720
Two-way Carriageway                0.781510  0.958647    0.865559
Under 15                           0.825336  1.060566    0.935587
Above 65  