In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [8]:
year_min = 2012
year_max = 2020
columns = ["REPORT_ID", "Year", "Month", "Day", "Time", "ACCLOC_X", "ACCLOC_Y"]
# Read one crash file per year (keeping only the columns listed above) and
# concatenate them in a single call. DataFrame.append was removed in
# pandas 2.0, and growing a frame inside a loop copies it on every iteration.
yearly_frames = [
    pd.read_csv('data/{}_DATA_SA_Crash.csv'.format(year))[columns]
    for year in range(year_min, year_max + 1)
]
result = pd.concat(yearly_frames, ignore_index=True)
# Drop rows with a NaN in any of the selected columns.
result = result.dropna()
# Transform to integer coordinates: scale by 100, then round to the nearest
# integer BEFORE casting. A bare astype(int) truncates, so a product such as
# 133065970.99999999 would be stored one unit too low.
# NOTE(review): assumes the raw coordinates carry two decimal places - confirm.
result = result.assign(
    ACCLOC_X=(result['ACCLOC_X'] * 100).round().astype(int),
    ACCLOC_Y=(result['ACCLOC_Y'] * 100).round().astype(int),
)
# Rename columns.
result = result.rename(columns={'REPORT_ID': 'ID', 'ACCLOC_X': 'x', 'ACCLOC_Y': 'y'})
# Save to file (the row index is not written).
result.to_csv('data/crash_data.csv', index=False)

In [10]:
# Reload the cleaned crash data from disk, report its row count and preview
# the first rows.
df = pd.read_csv('data/crash_data.csv')
print('Number of records: {}'.format(df.shape[0]))
df.head()

Number of records: 135333


Unnamed: 0,ID,Year,Month,Day,Time,x,y
0,2012-1-27/05/2021,2012,January,Sunday,04:30 pm,133065971,167179587
1,2012-2-27/05/2021,2012,January,Sunday,09:10 am,132940015,166846266
2,2012-3-27/05/2021,2012,January,Wednesday,11:30 am,131374822,162424128
3,2012-4-27/05/2021,2012,January,Wednesday,10:20 am,132532677,167242555
4,2012-5-27/05/2021,2012,January,Wednesday,03:30 pm,132605645,167302842


In [11]:
# Extremes of the x and y columns, i.e. the bounding rectangle of all records.
xmin, xmax = df['x'].min(), df['x'].max()
ymin, ymax = df['y'].min(), df['y'].max()
print('xmin = {}\nymin = {}\nxmax = {}\nymax = {}\n'.format(xmin, ymin, xmax, ymax))

xmin = 41223737
ymin = 131498456
xmax = 156569723
ymax = 266016550



Example of a query: find all accidents whose position $(x, y)$ is in the range $50000000 \leq x \leq 60000000 \wedge 200000000 \leq y \leq 210000000$.

In [12]:
# Keep the rows whose (x, y) lies inside the closed rectangle
# [50000000, 60000000] x [200000000, 210000000]; between() includes both ends.
result = df[df['x'].between(50000000, 60000000) &
            df['y'].between(200000000, 210000000)]
print('N. of records: {}'.format(len(result)))
result

N. of records: 18


Unnamed: 0,ID,Year,Month,Day,Time,x,y
5352,2012-6232-27/05/2021,2012,April,Thursday,01:30 pm,51982880,203573093
26623,2013-7448-27/05/2021,2013,June,Wednesday,04:15 pm,56030307,203750673
47760,2014-12075-27/05/2021,2014,August,Saturday,04:30 pm,58071299,204089529
49461,2014-13784-27/05/2021,2014,November,Tuesday,05:30 pm,53564018,203708523
59760,2015-7930-27/05/2021,2015,July,Wednesday,01:30 am,58980918,204264541
74204,2016-6959-27/05/2021,2016,April,Wednesday,10:00 pm,51670965,203544806
86431,2017-2426-27/05/2021,2017,March,Friday,03:00 pm,52154293,203588525
86702,2017-2697-27/05/2021,2017,March,Saturday,10:20 am,51014726,203360432
93146,2017-9143-27/05/2021,2017,September,Friday,08:00 pm,57766960,204067526
104540,2018-7301-27/05/2021,2018,July,Wednesday,04:00 pm,52498175,203619499


In [13]:
def range_query(df, lx, ly, ux, uy, x_col='x', y_col='y'):
    """Return the rows of ``df`` whose position lies in a closed rectangle.

    A row is kept when ``lx <= x <= ux`` and ``ly <= y <= uy`` (all four
    bounds are inclusive).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the coordinate columns.
    lx, ly : number
        Lower bounds on the x and y coordinates.
    ux, uy : number
        Upper bounds on the x and y coordinates.
    x_col, y_col : str, optional
        Names of the coordinate columns. They default to ``'x'`` and ``'y'``,
        the names used by ``data/crash_data.csv`` and by ``bounding_rect``;
        pass ``'ACCLOC_X'`` / ``'ACCLOC_Y'`` to query a frame that still has
        the raw column names.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    in_x = (df[x_col] >= lx) & (df[x_col] <= ux)
    in_y = (df[y_col] >= ly) & (df[y_col] <= uy)
    return df[in_x & in_y]

def bounding_rect(df):
    """Return ``(xmin, ymin, xmax, ymax)`` for the 'x' and 'y' columns of ``df``."""
    xs = df['x']
    ys = df['y']
    return (xs.min(), ys.min(), xs.max(), ys.max())

# Generate test data sets

Generate test data sets with sizes $10^3 \leq m \leq 10^5$. 
Each data set contains $m$ random records selected from <code>crash_data.csv</code>.

In [14]:
# 25 evenly spaced data-set sizes from 1000 to 100000 (both included),
# converted to plain Python ints.
sizes = list(map(int, np.linspace(1000, 100000, 25)))
print(sizes)

[1000, 5125, 9250, 13375, 17500, 21625, 25750, 29875, 34000, 38125, 42250, 46375, 50500, 54625, 58750, 62875, 67000, 71125, 75250, 79375, 83500, 87625, 91750, 95875, 100000]


In [15]:
# Fixed seed so that re-running the notebook regenerates identical sample
# files; without it every run writes a different random subset.
SAMPLE_SEED = 42
for m in sizes:
    # Draw m rows from df, renumber them from 0, and write them to their own
    # file named after the sample size.
    sampled_df = df.sample(n=m, random_state=SAMPLE_SEED).reset_index(drop=True)
    sampled_df.to_csv('data/crash_data_{}.csv'.format(m), index=False)
print('Done!')

Done!


In [16]:
# One extra data set of 30000 rows. The fixed seed makes the written file
# reproducible across notebook runs.
SAMPLE_SEED = 42
m = 30000
sampled_df = df.sample(n=m, random_state=SAMPLE_SEED).reset_index(drop=True)
sampled_df.to_csv('data/crash_data_{}.csv'.format(m), index=False)
print('Done!')

Done!
