In [51]:
from pathlib import Path
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress

In [52]:
# Load the two cleaned source tables from the "resources" folder.

# Locations of the crash-level and fatality-level CSV files
crashes_file, fatalities_file = (
    Path("resources/Cleaned_Fatal_Crash_Data.csv"),
    Path("resources/Cleaned_Fatalities_Data.csv"),
)

# Read each file into its own DataFrame (same order as the paths above)
crashes_data, fatalities_data = map(pd.read_csv, (crashes_file, fatalities_file))

In [53]:
# Columns retained from each source table; 'Crash ID' appears in both lists
# because it is the key used to join them.
crashes_columns = [
    'Crash ID', 'State', 'Month', 'Year', 'Dayweek', 'Time',
    'Crash Type', 'Number of Fatalities', 'Speed Limit',
    'National Remoteness Areas', 'SA4 Name 2021', 'National LGA Name 2021',
    'National Road Type', 'Christmas Period', 'Easter Period',
    'Day of week', 'Time of Day',
]
fatalities_columns = ['Crash ID', 'Road User', 'Gender', 'Age', 'Age Group']

In [54]:
# Join the selected crash columns to the selected fatality columns.
# An inner join keeps only rows whose 'Crash ID' is present in both tables.
crash_subset = crashes_data[crashes_columns]
fatality_subset = fatalities_data[fatalities_columns]
merged_data = crash_subset.merge(fatality_subset, how='inner', on='Crash ID')

In [55]:
# Normalise text casing: road types in Title Case, state codes in UPPER CASE
road_type_titled = merged_data['National Road Type'].str.title()
state_uppercased = merged_data['State'].str.upper()

merged_data['National Road Type'] = road_type_titled
merged_data['State'] = state_uppercased

In [56]:
# Displaying the result df: the cell's last expression is rendered by the notebook,
# and pandas shows only a truncated preview (first/last rows, elided middle columns)
merged_data

Unnamed: 0,Crash ID,State,Month,Year,Dayweek,Time,Crash Type,Number of Fatalities,Speed Limit,National Remoteness Areas,...,National LGA Name 2021,National Road Type,Christmas Period,Easter Period,Day of week,Time of Day,Road User,Gender,Age,Age Group
0,20235016,WA,9,2023,Wednesday,17:10,Single,3,70,Undetermined,...,Undetermined,Undetermined,No,No,Weekday,Day,Passenger,Male,24,17_to_25
1,20235016,WA,9,2023,Wednesday,17:10,Single,3,70,Undetermined,...,Undetermined,Undetermined,No,No,Weekday,Day,Driver,Male,24,17_to_25
2,20235016,WA,9,2023,Wednesday,17:10,Single,3,70,Undetermined,...,Undetermined,Undetermined,No,No,Weekday,Day,Passenger,Male,21,17_to_25
3,20231205,NSW,9,2023,Saturday,0:01,Single,1,100,Outer Regional Australia,...,Murray River,Arterial Road,No,No,Weekend,Night,Driver,Male,20,17_to_25
4,20233054,QLD,9,2023,Saturday,23:00,Single,1,80,Major Cities of Australia,...,Gold Coast,Sub-Arterial Road,No,No,Weekend,Night,Driver,Male,25,17_to_25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55259,19892548,VIC,1,1989,Sunday,1:00,Multiple,1,60,Undetermined,...,Undetermined,Undetermined,No,No,Weekend,Night,Pedestrian,Male,23,17_to_25
55260,19891122,NSW,1,1989,Tuesday,14:10,Multiple,1,60,Undetermined,...,Undetermined,Undetermined,No,No,Weekday,Day,Pedestrian,Male,71,65_to_74
55261,19894151,SA,1,1989,Wednesday,11:45,Multiple,1,60,Undetermined,...,Undetermined,Undetermined,No,No,Weekday,Day,Pedestrian,Male,73,65_to_74
55262,19892576,VIC,1,1989,Friday,17:15,Multiple,1,60,Undetermined,...,Undetermined,Undetermined,No,No,Weekday,Day,Pedestrian,Female,6,0_to_16


In [57]:
# Summarise column names, non-null counts and dtypes of the merged frame.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would also print a stray "None".
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55264 entries, 0 to 55263
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Crash ID                   55264 non-null  int64 
 1   State                      55264 non-null  object
 2   Month                      55264 non-null  int64 
 3   Year                       55264 non-null  int64 
 4   Dayweek                    55264 non-null  object
 5   Time                       55264 non-null  object
 6   Crash Type                 55264 non-null  object
 7   Number of Fatalities       55264 non-null  int64 
 8   Speed Limit                55264 non-null  int64 
 9   National Remoteness Areas  55264 non-null  object
 10  SA4 Name 2021              55264 non-null  object
 11  National LGA Name 2021     55264 non-null  object
 12  National Road Type         55264 non-null  object
 13  Christmas Period           55264 non-null  object
 14  Easter

In [58]:
# Specifying the output folder
output_folder = "output_folder/Leaflet_Output"

# Creating the output folder (exist_ok=True makes this a no-op if it is already there).
# `os` is imported in the notebook's first cell, so it is not re-imported here.
os.makedirs(output_folder, exist_ok=True)

# Defining the file path for the CSV
output_csv_file = os.path.join(output_folder, "final_merged_csv.csv")

# Saving the DataFrame as a CSV file; index=False leaves the row index out of the file
merged_data.to_csv(output_csv_file, index=False)

In [59]:
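# NOTE: disabled code, kept for reference only. It refers to `data_from_2014`,
# which is not defined in any cell shown above, so it cannot be re-enabled as written.
# # Remove duplicate fatalities for the same 'Crash ID' while keeping the first occurrence
# fatality_duplicates_dropped = data_from_2014.drop_duplicates(subset='Crash ID', keep='first')
# fatality_duplicates_dropped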

In [61]:
# Joining for geojson
# Attach a latitude/longitude to each fatality record by matching the crash's
# LGA name against the `city` column of the postcode reference file.

# Defining the file paths
final_file = Path("output_folder/Leaflet_Output/final_merged_csv.csv")
coordinates_file = Path("resources/au_postcodes.csv")

# Reading the CSV files into Pandas DataFrames
final_data = pd.read_csv(final_file)
coordinates_data = pd.read_csv(coordinates_file)

# Adding another column named city to final data, duplicating the LGA name column,
# so both frames share a join key with the same name
final_data['city'] = final_data['National LGA Name 2021']

# Columns to keep
coordinates_data_columns = ['city','lat','lng']

# Keep exactly one coordinate row per city name. If the reference file lists a
# city more than once, merging on `city` would otherwise repeat every matching
# fatality row once per reference row and inflate the counts.
# NOTE(review): keeping the first occurrence is arbitrary when the same name
# exists in several places — TODO confirm whether the reference file has a
# state column that could be added to the join key.
city_coordinates = (
    coordinates_data[coordinates_data_columns]
    .drop_duplicates(subset='city', keep='first')
)

# merging the CSVs
# how='inner' keeps only records whose LGA name exactly matches a city name;
# everything else is dropped (the recorded run went from 55264 rows to 1918).
# validate='many_to_one' makes pandas raise if `city` is not unique on the right.
merged_geojsondata = pd.merge(
    final_data,
    city_coordinates,
    on='city',
    how='inner',
    validate='many_to_one',
)

In [62]:
# Preview the frame produced by the city-name merge (adds the city, lat and lng columns)
merged_geojsondata

Unnamed: 0,Crash ID,State,Month,Year,Dayweek,Time,Crash Type,Number of Fatalities,Speed Limit,National Remoteness Areas,...,Easter Period,Day of week,Time of Day,Road User,Gender,Age,Age Group,city,lat,lng
0,20233054,QLD,9,2023,Saturday,23:00,Single,1,80,Major Cities of Australia,...,No,Weekend,Night,Driver,Male,25,17_to_25,Gold Coast,-28.0167,153.4000
1,20233098,QLD,9,2023,Friday,13:00,Single,1,60,Major Cities of Australia,...,No,Weekday,Day,Pedestrian,Male,62,40_to_64,Gold Coast,-28.0167,153.4000
2,20233067,QLD,9,2023,Tuesday,11:00,Multiple,1,70,Inner Regional Australia,...,No,Weekday,Day,Motorcycle rider,Male,71,65_to_74,Gold Coast,-28.0167,153.4000
3,20233131,QLD,8,2023,Saturday,2:00,Multiple,1,60,Major Cities of Australia,...,No,Weekend,Night,Passenger,Male,17,17_to_25,Gold Coast,-28.0167,153.4000
4,20233171,QLD,8,2023,Wednesday,10:00,Multiple,1,70,Major Cities of Australia,...,No,Weekday,Day,Driver,Female,80,75_or_older,Gold Coast,-28.0167,153.4000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1914,20145004,WA,4,2014,Thursday,2:10,Multiple,2,60,Major Cities of Australia,...,No,Weekday,Night,Passenger,Male,20,17_to_25,Fremantle,-32.0542,115.7476
1915,20194027,SA,10,2019,Friday,0:10,Single,1,50,Outer Regional Australia,...,No,Weekday,Night,Pedestrian,Male,51,40_to_64,Mount Gambier,-37.8294,140.7828
1916,20194028,SA,5,2019,Saturday,17:46,Single,1,50,Outer Regional Australia,...,No,Weekend,Day,Pedestrian,Male,20,17_to_25,Mount Gambier,-37.8294,140.7828
1917,20181238,NSW,4,2018,Sunday,15:30,Single,1,100,Very Remote Australia,...,No,Weekend,Day,Motorcycle rider,Male,62,40_to_64,Bourke,-30.1000,145.9333


In [63]:
# Desired column layout: the location fields (city, lat, lng) are placed right
# after the LGA name instead of at the end of the frame.
new_column_order = [
    'Crash ID', 'State', 'Month', 'Year', 'Dayweek', 'Time',
    'Crash Type', 'Number of Fatalities', 'Speed Limit',
    'National Remoteness Areas', 'SA4 Name 2021', 'National LGA Name 2021',
    'city', 'lat', 'lng',
    'National Road Type', 'Christmas Period', 'Easter Period',
    'Day of week', 'Time of Day',
    'Road User', 'Gender', 'Age', 'Age Group',
]

# Select every row with the columns in the order listed above, then preview
merged_geojsondata = merged_geojsondata.loc[:, new_column_order]
merged_geojsondata

Unnamed: 0,Crash ID,State,Month,Year,Dayweek,Time,Crash Type,Number of Fatalities,Speed Limit,National Remoteness Areas,...,lng,National Road Type,Christmas Period,Easter Period,Day of week,Time of Day,Road User,Gender,Age,Age Group
0,20233054,QLD,9,2023,Saturday,23:00,Single,1,80,Major Cities of Australia,...,153.4000,Sub-Arterial Road,No,No,Weekend,Night,Driver,Male,25,17_to_25
1,20233098,QLD,9,2023,Friday,13:00,Single,1,60,Major Cities of Australia,...,153.4000,Local Road,No,No,Weekday,Day,Pedestrian,Male,62,40_to_64
2,20233067,QLD,9,2023,Tuesday,11:00,Multiple,1,70,Inner Regional Australia,...,153.4000,Sub-Arterial Road,No,No,Weekday,Day,Motorcycle rider,Male,71,65_to_74
3,20233131,QLD,8,2023,Saturday,2:00,Multiple,1,60,Major Cities of Australia,...,153.4000,Sub-Arterial Road,No,No,Weekend,Night,Passenger,Male,17,17_to_25
4,20233171,QLD,8,2023,Wednesday,10:00,Multiple,1,70,Major Cities of Australia,...,153.4000,Sub-Arterial Road,No,No,Weekday,Day,Driver,Female,80,75_or_older
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1914,20145004,WA,4,2014,Thursday,2:10,Multiple,2,60,Major Cities of Australia,...,115.7476,Arterial Road,No,No,Weekday,Night,Passenger,Male,20,17_to_25
1915,20194027,SA,10,2019,Friday,0:10,Single,1,50,Outer Regional Australia,...,140.7828,Collector Road,No,No,Weekday,Night,Pedestrian,Male,51,40_to_64
1916,20194028,SA,5,2019,Saturday,17:46,Single,1,50,Outer Regional Australia,...,140.7828,Local Road,No,No,Weekend,Day,Pedestrian,Male,20,17_to_25
1917,20181238,NSW,4,2018,Sunday,15:30,Single,1,100,Very Remote Australia,...,145.9333,Arterial Road,No,No,Weekend,Day,Motorcycle rider,Male,62,40_to_64


In [64]:
# Specifying the output folder
output_folder = "output_folder/Leaflet_Output"

# Creating the output folder (exist_ok=True makes this a no-op if it is already there).
# `os` is imported in the notebook's first cell, so it is not re-imported here.
os.makedirs(output_folder, exist_ok=True)

# Defining the file path for the CSV
output_csv_file = os.path.join(output_folder, "merged_geojsondata.csv")

# Saving the DataFrame as a CSV file; index=False leaves the row index out of the file
merged_geojsondata.to_csv(output_csv_file, index=False)