In [1]:
# Tell pyarrow to ignore timezones by setting its environment flag.
# NOTE(review): this runs before `databricks.koalas` is imported in the next
# cell; presumably the flag has to be in place by then - confirm.
import os

os.environ.update({"PYARROW_IGNORE_TIMEZONE": "1"})

In [2]:
import databricks.koalas as ks
import opendatablend as odb

In [3]:
# Location of the dataset metadata (datapackage.json). Find this using the 'Get metadata' button on a dataset e.g. https://www.opendatablend.io/dataset?name=open-data-blend-road-safety
dataset_path = 'https://packages.opendatablend.io/v1/open-data-blend-road-safety/datapackage.json'

# Supply your access key through the ODB_ACCESS_KEY environment variable instead of pasting it here, so the key is never saved inside the notebook.
# Note: leaving it unset (blank) will result in anonymous/public calls and will consume your free API call allowance.
access_key = os.environ.get('ODB_ACCESS_KEY', '')

# Cap the number of rows Koalas renders for a DataFrame at 20
ks.set_option('display.max_rows', 20)

In [4]:
# Download the date dimension, then read only the columns needed for the
# analysis into a Koalas DataFrame
resource_name = 'date-parquet'
output = odb.get_data(dataset_path, resource_name, access_key=access_key)
df_date = ks.read_parquet(
    output.data_file_name,
    columns=[
        'drv_date_key',
        'drv_date',
        'drv_month_name',
        'drv_month_number',
        'drv_quarter_name',
        'drv_quarter_number',
        'drv_year',
    ],
)

In [5]:
# Preview the first five rows of the date dimension
df_date.head(5)

Unnamed: 0,drv_date_key,drv_date,drv_month_name,drv_month_number,drv_quarter_name,drv_quarter_number,drv_year
0,-999999,9999-12-31,**Not Provided**,-999999,**Not Provided**,-999999,-999999
1,18000101,1800-01-01,January 1800,180001,Quarter 1 1800,180001,1800
2,18000102,1800-01-02,January 1800,180001,Quarter 1 1800,180001,1800
3,18000103,1800-01-03,January 1800,180001,Quarter 1 1800,180001,1800
4,18000104,1800-01-04,January 1800,180001,Quarter 1 1800,180001,1800


In [6]:
# Download the road safety accident info dimension, then read only the
# columns needed for the analysis into a Koalas DataFrame
resource_name = 'road-safety-accident-info-parquet'
output = odb.get_data(dataset_path, resource_name, access_key=access_key)
df_accident_info = ks.read_parquet(
    output.data_file_name,
    columns=[
        'drv_road_safety_accident_info_key',
        'src_road_surface_condition',
        'src_speed_limit',
        'src_weather_condition',
        'src_police_force',
    ],
)

In [7]:
# Preview the first five rows of the accident info dimension
df_accident_info.head(5)

Unnamed: 0,drv_road_safety_accident_info_key,src_road_surface_condition,src_speed_limit,src_weather_condition,src_police_force
0,-999999,**Not Provided**,**Not Provided**,**Not Provided**,**Not Provided**
1,-999998,**Not Applicable**,**Not Applicable**,**Not Applicable**,**Not Applicable**
2,1,Dry,30,Fine no high winds,Thames Valley
3,2,Dry,30,Fine no high winds,Staffordshire
4,3,Frost or ice,30,Unknown,South Wales


In [8]:
# Download the 2017 road safety accident facts and read the key and measure
# columns into a Koalas DataFrame
resource_name = 'road-safety-accident-2017-parquet'
output = odb.get_data(dataset_path, resource_name, access_key=access_key)
df_accidents_2017 = ks.read_parquet(
    output.data_file_name,
    columns=[
        'drv_accident_date_key',
        'drv_road_safety_accident_info_key',
        'src_number_of_casualties',
        'src_number_of_vehicles',
    ],
)

In [9]:
# Download the 2018 road safety accident facts and read the key and measure
# columns into a Koalas DataFrame
resource_name = 'road-safety-accident-2018-parquet'
output = odb.get_data(dataset_path, resource_name, access_key=access_key)
df_accidents_2018 = ks.read_parquet(
    output.data_file_name,
    columns=[
        'drv_accident_date_key',
        'drv_road_safety_accident_info_key',
        'src_number_of_casualties',
        'src_number_of_vehicles',
    ],
)

In [10]:
# Download the 2019 road safety accident facts and read the key and measure
# columns into a Koalas DataFrame
resource_name = 'road-safety-accident-2019-parquet'
output = odb.get_data(dataset_path, resource_name, access_key=access_key)
df_accidents_2019 = ks.read_parquet(
    output.data_file_name,
    columns=[
        'drv_accident_date_key',
        'drv_road_safety_accident_info_key',
        'src_number_of_casualties',
        'src_number_of_vehicles',
    ],
)

In [11]:
# Stack the 2017, 2018 and 2019 accident facts into a single DataFrame
df_accidents_combined = ks.concat(
    [
        df_accidents_2017,
        df_accidents_2018,
        df_accidents_2019,
    ]
)
df_accidents_combined

Unnamed: 0,drv_accident_date_key,drv_road_safety_accident_info_key,src_number_of_casualties,src_number_of_vehicles
0,20170101,361843,1,2
1,20170101,352612,1,3
2,20170101,366882,1,1
3,20170101,126468,1,1
4,20170101,35211,2,2
5,20170101,49883,1,2
6,20170101,51576,1,1
7,20170101,370419,2,3
8,20170101,34364,1,2
9,20170101,77298,2,2


In [12]:
# Enrich the accident facts with the date dimension (joined on the date key)
# and the road safety accident info dimension (joined on the info key)
df_accidents = (
    df_date
    .merge(df_accidents_combined, left_on='drv_date_key', right_on='drv_accident_date_key')
    .merge(df_accident_info, on='drv_road_safety_accident_info_key')
)
df_accidents

Unnamed: 0,drv_date_key,drv_date,drv_month_name,drv_month_number,drv_quarter_name,drv_quarter_number,drv_year,drv_accident_date_key,drv_road_safety_accident_info_key,src_number_of_casualties,src_number_of_vehicles,src_road_surface_condition,src_speed_limit,src_weather_condition,src_police_force
0,20170101,2017-01-01,January 2017,201701,Quarter 1 2017,201701,2017,20170101,361843,1,2,Wet or damp,30,Raining no high winds,West Mercia
1,20170101,2017-01-01,January 2017,201701,Quarter 1 2017,201701,2017,20170101,352612,1,3,Wet or damp,30,Raining no high winds,Bedfordshire
2,20170101,2017-01-01,January 2017,201701,Quarter 1 2017,201701,2017,20170101,366882,1,1,Dry,30,Fine no high winds,Sussex
3,20170101,2017-01-01,January 2017,201701,Quarter 1 2017,201701,2017,20170101,126468,1,1,Wet or damp,50,Fine no high winds,Sussex
4,20170101,2017-01-01,January 2017,201701,Quarter 1 2017,201701,2017,20170101,35211,2,2,Wet or damp,30,Raining no high winds,Lancashire
5,20170101,2017-01-01,January 2017,201701,Quarter 1 2017,201701,2017,20170101,49883,1,2,Wet or damp,30,Raining no high winds,Devon and Cornwall
6,20170101,2017-01-01,January 2017,201701,Quarter 1 2017,201701,2017,20170101,51576,1,1,Wet or damp,30,Fine no high winds,Metropolitan Police
7,20170101,2017-01-01,January 2017,201701,Quarter 1 2017,201701,2017,20170101,370419,2,3,Dry,30,Fine no high winds,Nottinghamshire
8,20170101,2017-01-01,January 2017,201701,Quarter 1 2017,201701,2017,20170101,34364,1,2,Dry,40,Fine no high winds,Strathclyde
9,20170101,2017-01-01,January 2017,201701,Quarter 1 2017,201701,2017,20170101,77298,2,2,Wet or damp,60,Raining no high winds,Suffolk


In [13]:
# Count the accident rows in each year and present them ordered by year
(
    df_accidents
    .groupby('drv_year')
    .agg({"drv_year": 'count'})
    .rename(columns={"drv_year": "total_accidents"})
    .sort_index()
)

Unnamed: 0_level_0,total_accidents
drv_year,Unnamed: 1_level_1
2017,129982
2018,122635
2019,117536


In [14]:
# Give every accident row a constant 1 so that summing the column counts accidents
df_accidents = df_accidents.assign(number_of_accidents=1)

In [15]:
# Yearly totals of accidents, casualties and vehicles
(
    df_accidents[['drv_year', 'number_of_accidents', 'src_number_of_casualties', 'src_number_of_vehicles']]
    .groupby(by=['drv_year'])
    .sum()
    .sort_index()
)

Unnamed: 0_level_0,number_of_accidents,src_number_of_casualties,src_number_of_vehicles
drv_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017,129982,170993,238926
2018,122635,160597,226409
2019,117536,153158,216381


In [16]:
# Totals of accidents, casualties and vehicles for each year and police force
(
    df_accidents[
        [
            'drv_year',
            'src_police_force',
            'number_of_accidents',
            'src_number_of_casualties',
            'src_number_of_vehicles',
        ]
    ]
    .groupby(by=['drv_year', 'src_police_force'])
    .sum()
    .sort_index()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_accidents,src_number_of_casualties,src_number_of_vehicles
drv_year,src_police_force,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,Avon and Somerset,2774,3747,5306
2017,Bedfordshire,1659,2319,3155
2017,Cambridgeshire,2004,2791,3768
2017,Central,406,528,755
2017,Cheshire,2210,2835,4065
2017,City of London,313,347,522
2017,Cleveland,679,941,1266
2017,Cumbria,1291,1770,2289
2017,Derbyshire,1643,2126,3030
2017,Devon and Cornwall,3568,5005,6403


In [17]:
# Totals of accidents, casualties and vehicles for each year and speed limit
(
    df_accidents[
        [
            'drv_year',
            'src_speed_limit',
            'number_of_accidents',
            'src_number_of_casualties',
            'src_number_of_vehicles',
        ]
    ]
    .groupby(by=['drv_year', 'src_speed_limit'])
    .sum()
    .sort_index()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_accidents,src_number_of_casualties,src_number_of_vehicles
drv_year,src_speed_limit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017,20,9633,11208,16334
2017,30,79569,98574,142710
2017,40,10615,14966,20725
2017,50,5286,8015,10927
2017,60,16723,25212,30507
2017,70,8156,13018,17723
2018,20,10661,12235,18293
2018,30,73479,90896,132425
2018,40,10229,14547,19965
2018,50,5053,7496,10380
