# Analysis of accidents along road 90

In [21]:
import pandas as pd
from siuba import *
import plotly.express as px
import numpy as np

## Read in the accident count data and clean it

In [82]:
df = pd.read_excel("../data/accidents_with_casualties/file_6123bff4-2115-49ca-8ced-0e62569f28fc.xlsx",
header=22, usecols="A:F")

df.columns = ['location', 'year', 'total', 'fatal', 'severe', 'light']

total_str = df.iloc[0,0]
unknown_str = df.iloc[16, 0]

df_clean = (
    df
    .replace(total_str, np.nan)
    .replace(unknown_str, np.nan)
    .dropna(how='any', axis=0, subset=['location', 'year'])
    .fillna(0)
)
df_clean.location = df_clean.location.str.replace(".x", ".0")
df_clean.location = df_clean.location.astype(float)
df_clean


The default value of regex will change from True to False in a future version.



Unnamed: 0,location,year,total,fatal,severe,light
18,9.9,2021.0,1,0.0,0.0,1.0
20,10.0,2017.0,1,0.0,1.0,0.0
22,14.0,2011.0,1,0.0,0.0,1.0
24,14.3,2020.0,1,0.0,0.0,1.0
26,14.9,2011.0,1,0.0,0.0,1.0
...,...,...,...,...,...,...
1183,475.0,2020.0,1,0.0,0.0,1.0
1185,476.0,2011.0,1,0.0,0.0,1.0
1187,476.2,2019.0,1,0.0,1.0,0.0
1189,476.6,2015.0,1,0.0,0.0,1.0


## Distinguish the different parts of Road 90 based on km

In [86]:
single_lane = [62.7, 183]
double_lane = [14.8, 62.7]

In [87]:
df_place = (
    df_clean
    >> mutate(place=if_else((_.location >= single_lane[0]) & (_.location<=single_lane[1]), 'Single Lane', 
        if_else((_.location < double_lane[1]) & (_.location >= single_lane[0]), 'Double Lane', 'other')))
    >> filter(_.place != 'other')
)
df_place

Unnamed: 0,location,year,total,fatal,severe,light,place
129,64.0,2018.0,1,0.0,1.0,0.0,Single Lane
131,64.3,2010.0,1,0.0,0.0,1.0,Single Lane
133,65.5,2011.0,1,0.0,0.0,1.0,Single Lane
135,67.9,2010.0,1,0.0,0.0,1.0,Single Lane
137,68.0,2016.0,1,1.0,0.0,0.0,Single Lane
...,...,...,...,...,...,...,...
287,180.0,2011.0,1,0.0,0.0,1.0,Single Lane
288,180.0,2013.0,1,0.0,1.0,0.0,Single Lane
289,180.0,2017.0,2,0.0,2.0,0.0,Single Lane
291,181.0,2020.0,1,0.0,1.0,0.0,Single Lane


In [80]:
length_single_lane = single_lane[1] - single_lane[0]
length_double_lane = double_lane[1] - double_lane[0]
df_agg = (
    df_place
    >> group_by('year', 'place')
    >> summarize(total=_.total.sum(), fatal=_.fatal.sum(), severe=_.severe.sum(),
        light=_.light.sum())
    >> mutate(per_km_total=if_else(_.place=='Single Lane', _.total / length_single_lane,
                                                    _.total / length_double_lane))
    >> mutate(per_km_fatal=if_else(_.place=='Single Lane', _.fatal / length_single_lane,
                                                    _.fatal / length_double_lane))
)

df_agg

Unnamed: 0,year,place,total,fatal,severe,light,per_km_total,per_km_fatal
0,2010.0,Arava,11,2.0,2.0,7.0,0.091438,0.016625
1,2010.0,South,5,0.0,2.0,3.0,0.104384,0.0
2,2011.0,Arava,7,0.0,2.0,5.0,0.058188,0.0
3,2011.0,South,8,2.0,0.0,6.0,0.167015,0.041754
4,2012.0,Arava,6,2.0,0.0,4.0,0.049875,0.016625
5,2012.0,South,10,1.0,2.0,7.0,0.208768,0.020877
6,2013.0,Arava,7,1.0,4.0,2.0,0.058188,0.008313
7,2013.0,South,1,0.0,0.0,1.0,0.020877,0.0
8,2014.0,Arava,8,2.0,1.0,5.0,0.0665,0.016625
9,2014.0,South,7,1.0,0.0,6.0,0.146138,0.020877


In [81]:
px.line(df_agg, x='year', y='per_km_fatal', color='place')