# Preprocess CBS extracts

This notebook prepares the CBS Excel extracts for easier analysis.

In [2]:
import pandas as pd
import numpy as np
from siuba import *

## Accident count

In [3]:
# Road sections as [start, end] location markers (units are not stated here).
double_lane = [11.9, 60.4]   # Eilot junction -> Ktora junction
single_lane = [60.4, 180.5]  # Ktora junction -> Arava junction

# Length of each section: end marker minus start marker.
length_single_lane, length_double_lane = (
    section[1] - section[0] for section in (single_lane, double_lane)
)

In [15]:

# Load the accident-count extract: the header row is at index 20, only
# columns A:F are read, and they are renamed to the short labels below.
df = pd.read_excel(
    "../data/accidents_with_casualties/file_084b50e9-6039-406d-9600-32d5122ca3d0_longer.xlsx",
    header=20, usecols="A:F",
    names=['location', 'year', 'total', 'fatal', 'severe', 'light'],
)

# Marker labels are picked up from fixed cells of the sheet; they are blanked
# out below so that rows carrying them in 'location'/'year' get dropped.
# NOTE(review): row label 44 is hard-coded -- presumably the "unknown location"
# row of this particular extract; confirm whenever the file is re-exported.
total_str = df.iloc[0, 1]
unknown_str = df.loc[44, 'location']

df = (
    df
    .replace(total_str, np.nan)
    .replace(unknown_str, np.nan)
    .dropna(how='any', axis=0, subset=['location', 'year'])  # rows lacking a location or a year
    .fillna(0)  # any blanks left in the other columns become 0
)
# Replace the literal text ".x" with ".0" so the column can be parsed as float.
# regex=False is passed explicitly: as a regular expression "." would match
# any character, and the default value of `regex` changed between pandas
# versions, so relying on it makes the result version-dependent.
df.location = df.location.str.replace(".x", ".0", regex=False)
df.location = df.location.astype(float)
df = df[df.location > 0.0]   # 0 is a summary location
# df = df >> filter(_.location >= double_lane[0], _.location <= single_lane[1])
# df = df >> mutate(place = if_else(
#     _.location >= single_lane[0], 'single lane', 'double lane'
# ))
df

  df.location = df.location.str.replace(".x", ".0")


Unnamed: 0,location,year,total,fatal,severe,light
84,0.1,2003.0,7.0,0.0,2.0,5.0
85,0.1,2004.0,8.0,0.0,1.0,7.0
86,0.1,2005.0,9.0,0.0,1.0,8.0
87,0.1,2006.0,5.0,0.0,1.0,4.0
88,0.1,2007.0,6.0,0.0,0.0,6.0
...,...,...,...,...,...,...
23833,476.0,2011.0,1.0,0.0,0.0,1.0
23835,476.2,2009.0,1.0,1.0,0.0,0.0
23836,476.2,2019.0,1.0,0.0,1.0,0.0
23838,476.6,2015.0,1.0,0.0,0.0,1.0


In [16]:
# Persist the cleaned accident counts in two formats; the CSV is written
# without the index column (index=False).
df.to_parquet(path="../data/preprocessed/accident_counts_2003-2022.parquet")
df.to_csv(path_or_buf="../data/preprocessed/accident_counts_2003-2022.csv", index=False)


In [17]:
# Show the first lines of the exported CSV as a quick sanity check.
! head ../data/preprocessed/accident_counts_2003-2022.csv

location,year,total,fatal,severe,light
0.1,2003.0,7.0,0.0,2.0,5.0
0.1,2004.0,8.0,0.0,1.0,7.0
0.1,2005.0,9.0,0.0,1.0,8.0
0.1,2006.0,5.0,0.0,1.0,4.0
0.1,2007.0,6.0,0.0,0.0,6.0
0.1,2008.0,3.0,0.0,0.0,3.0
0.1,2009.0,10.0,0.0,0.0,10.0
0.1,2010.0,4.0,0.0,0.0,4.0
0.1,2011.0,7.0,0.0,2.0,5.0


## Casualty count

In [18]:

# Load the casualty-count extract: the header row is at index 22, only
# columns A:F are read, and they are renamed to the short labels below.
df_casualties = pd.read_excel(
    "../data/casualties_in_accidents/file_5c6cf05f-2fa2-4ffb-b06a-a721caca2d1e_more_years_real.xlsx",
    header=22, usecols="A:F",
    names=['year', 'location', 'total', 'killed', 'severe_injury', 'light_injury'],
)

# Marker labels are taken from the first data row (its 'year' and 'location'
# cells) and blanked out below so rows carrying them can be dropped.
# NOTE(review): this assumes the first row of the extract is a summary row --
# confirm whenever the file is re-exported.
total_str = df_casualties.iloc[0, 0]
unknown_str = df_casualties.iloc[0, 1]
df_casualties = (
    df_casualties
    .replace(total_str, np.nan)
    .replace(unknown_str, np.nan)
    .dropna(how='any', axis=0, subset=['location', 'year'])  # rows lacking a location or a year
    .fillna(0)  # any blanks left in the other columns become 0
)
# Replace the literal text ".x" with ".0" so the column can be parsed as float.
# regex=False is passed explicitly: as a regular expression "." would match
# any character, and the default value of `regex` changed between pandas
# versions, so relying on it makes the result version-dependent.
df_casualties.location = df_casualties.location.str.replace(".x", ".0", regex=False)
df_casualties.location = df_casualties.location.astype(float)
df_casualties

  df_casualties.location = df_casualties.location.str.replace(".x", ".0")


Unnamed: 0,year,location,total,killed,severe_injury,light_injury
820,2003.0,14.6,4,0.0,0.0,4.0
821,2003.0,16.5,4,0.0,0.0,4.0
822,2003.0,24.3,11,0.0,0.0,11.0
823,2003.0,39.9,4,0.0,0.0,4.0
824,2003.0,43.0,27,1.0,4.0,22.0
...,...,...,...,...,...,...
2271,2022.0,436.3,6,0.0,0.0,6.0
2272,2022.0,441.2,1,0.0,0.0,1.0
2273,2022.0,443.6,6,0.0,0.0,6.0
2274,2022.0,450.5,4,0.0,0.0,4.0


In [13]:
# Persist the cleaned casualty counts in two formats; the CSV is written
# without the index column (index=False).
df_casualties.to_parquet(path="../data/preprocessed/casualty_counts_2003-2022.parquet")
df_casualties.to_csv(path_or_buf="../data/preprocessed/casualty_counts_2003-2022.csv", index=False)

In [14]:
# Show the first lines of the exported CSV as a quick sanity check.
! head ../data/preprocessed/casualty_counts_2003-2022.csv

year,location,total,killed,severe_injury,light_injury
2003.0,14.6,4,0.0,0.0,4.0
2003.0,16.5,4,0.0,0.0,4.0
2003.0,24.3,11,0.0,0.0,11.0
2003.0,39.9,4,0.0,0.0,4.0
2003.0,43.0,27,1.0,4.0,22.0
2003.0,45.1,2,1.0,0.0,1.0
2003.0,47.5,4,0.0,0.0,4.0
2003.0,48.7,4,0.0,0.0,4.0
2003.0,51.0,9,1.0,4.0,4.0


# Preprocess the new Excel table from the map generator

In [31]:
# Replacement column labels; they are assigned positionally to the sheet's
# columns, so their order must match the sheet.
# NOTE(review): 'severly_injured' is misspelled but kept verbatim, since the
# exported files (and anything reading them) carry this exact label.
columns = [
    'year', 'month', 'day of week', 'day/night',
    'accident_severity', 'accident_type',
    'killed', 'severly_injured', 'lightly_injured', 'injured_pedestrians',
    'casualties_ages_0-19', 'casualties_ages_20-64', 'casualties_ages_65_plus',
    'total_casualties', 'vehicle_count', 'drivers', 'road_type',
    'localization_quality', 'settlement',
    'road1', 'location', 'road2', 'road3', 'road4',
]

# Read the sheet with the row at index 1 as header, then overwrite the labels.
df = pd.read_excel("../data/accidents_90_from_maps.xlsx", header=1)
df.columns = columns
df

Unnamed: 0,year,month,day of week,day/night,accident_severity,accident_type,killed,severly_injured,lightly_injured,injured_pedestrians,...,vehicle_count,drivers,road_type,localization_quality,settlement,road1,location,road2,road3,road4
0,2003,אפריל,ראשון,לילה,קשה,התנגשות חזית בצד,0,1,8,0,...,2,2,לא-עירונית לא בצומת,עיגון מדויק,,90,100.3,,,
1,2003,מאי,שישי,לילה,קלה,התהפכות,0,0,3,0,...,1,1,לא-עירונית לא בצומת,עיגון מדויק,,90,75.1,,,
2,2003,מאי,שבת,לילה,קלה,התהפכות,0,0,4,0,...,1,1,לא-עירונית לא בצומת,עיגון מדויק,,90,39.9,,,
3,2003,יוני,חמישי,לילה,קלה,התנגשות חזית באחור,0,0,2,0,...,2,2,לא-עירונית לא בצומת,עיגון מדויק,,90,135.8,,,
4,2003,מרס,שני,לילה,קטלנית,פגיעה בהולך רגל,1,4,22,1,...,1,1,לא-עירונית לא בצומת,עיגון מדויק,,90,43.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,2021,אפריל,שלישי,יום,קשה,התהפכות,0,3,0,0,...,1,1,לא-עירונית לא בצומת,עיגון מדויק,,90,131.0,,,
344,2021,אפריל,רביעי,לילה,קטלנית,התנגשות חזית בחזית,2,0,1,0,...,2,2,לא-עירונית לא בצומת,עיגון מדויק,,90,118.5,,,
345,2022,מרס,ראשון,יום,קלה,התנגשות חזית בצד,0,0,4,0,...,3,3,לא-עירונית בצומת,עיגון מדויק,,90,,1093.0,,
346,2022,מרס,שבת,לילה,קלה,התנגשות עם עצם דומם,0,0,4,0,...,2,2,לא-עירונית לא בצומת,עיגון מדויק,,90,38.9,,,


In [33]:
# Persist the relabelled map-generator table in two formats; the CSV is
# written without the index column (index=False).
df.to_csv(path_or_buf="../data/preprocessed/accidents_from_mapper.csv", index=False)
df.to_parquet(path="../data/preprocessed/accidents_from_mapper.parquet")

# Junctions dataframe

In [36]:

# Junction names paired with their location marker, listed by increasing
# location along the road.
junction_points = [
    ('Eilot', 11.9),
    ('Meches', 15.6),
    ('Beer Ora', 27),
    ('Timna', 36),
    ('Samar', 41.3),
    ('Yotvata', 49.5),
    ('Grofit', 54),
    ('Ktora', 60.4),
    ('Yahel', 72.3),
    ('Menuha', 97.6),
    ('Paran', 105.9),
    ('Tsukim', 120.4),
    ('Tzofar', 128.9),
    ('Sapir', 134.8),
    ('Ein Yahav', 138.6),
    ('Hazeva', 153.4),
    ('Idan', 158.2),
    ('Haarava Junc', 180.5),
]

# Split the pairs back into the two columns of the junction table.
df_junctions = pd.DataFrame({
    'name': [point[0] for point in junction_points],
    'location': [point[1] for point in junction_points],
})

# Write the junction table to CSV without the index column (index=False).
df_junctions.to_csv(
    path_or_buf="../data/preprocessed/junctions.csv",
    index=False,
)