In [None]:
# import libraries, other imports are as we go through encoding
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw accident records from CSV and preview the first rows.
accident = pd.read_csv('dataset/accident_data.csv')
accident.head()

## Data Cleaning

In [None]:
# (rows, columns) of the raw dataset
accident.shape

In [None]:
# List the dtype of every column.
# The accident date is stored as a string; it is parsed in the 'Accident Date' section.
accident.dtypes

In [None]:
# Report how many distinct values each column holds, one "name: count" line per column.
for col in accident:
  distinct_count = accident[col].nunique()
  print(f'{col}: {distinct_count}')

In [None]:
# Work on a copy so the raw `accident` frame stays unchanged;
# later cleaning cells still read their input columns from `accident`.
accident_copy = accident.copy()

In [None]:
# Summary statistics for the numerical columns, used to eyeball min/max values
# (the author observed no anomalies).
accident_copy.describe()

Accident Date

In [None]:
# Parse 'Accident Date' (day-month-year strings) once, then derive the Day, Month
# and Year columns from the parsed values instead of re-splitting the string
# three times.
parsed_dates = pd.to_datetime(accident['Accident Date'], format="%d-%m-%Y")

# Cast the date parts to int64 explicitly: the correlation heatmap further down
# only selects float64/int64 columns.
accident_copy = accident_copy.assign(
    Day=parsed_dates.dt.day.astype('int64'),
    Month=parsed_dates.dt.month.astype('int64'),
    Year=parsed_dates.dt.year.astype('int64'),
    Accident_Date=parsed_dates,
).drop(columns=['Accident Date'])  # the raw string column is replaced by Accident_Date

Latitude and Longitude

In [None]:
# Drop the rows that are missing Latitude or Longitude (the columns themselves are kept).
accident_copy = accident_copy.dropna(subset=['Latitude', 'Longitude'])

Light_Conditions

In [None]:
# Treat 'Darkness - lighting unknown' as a missing value.
# The column is read from the raw `accident` frame; the assignment aligns on the
# index, so only rows still present in accident_copy receive a value.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
accident_copy['Light_Conditions'] = accident['Light_Conditions'].replace('Darkness - lighting unknown', np.nan)

District Area

In [None]:
# Replace the space in 'District Area' with an underscore so the name follows
# the same format as the other columns.
column_renames = {'District Area': 'District_Area'}
accident_copy = accident_copy.rename(columns=column_renames)


Weather_Conditions

In [None]:
# Weather conditions are combinations of labels, so each raw value is split into
# a list of labels that can be multi-hot encoded later.
# The labels are 'Fine', 'High winds', 'Raining', 'Snowing', 'Fog or mist', 'Other'.
weather_label_map = {
    'Fine no high winds': ['Fine'],
    'Raining no high winds': ['Raining'],
    'Fine + high winds': ['Fine', 'High winds'],
    'Raining + high winds': ['Raining', 'High winds'],
    'Snowing no high winds': ['Snowing'],
    'Fog or mist': ['Fog or mist'],
    'Snowing + high winds': ['Snowing', 'High winds'],
    'Other': ['Other'],
}

# Any value not in the table (including missing values) becomes NaN.
# A fresh list is built per row so rows never share one list object.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
accident_copy['Weather_Conditions'] = accident['Weather_Conditions'].apply(
    lambda raw: list(weather_label_map[raw]) if raw in weather_label_map else np.nan
)

Vehicle_Type

In [None]:
# Treat 'Data missing or out of range' as a missing value.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
accident_copy['Vehicle_Type'] = accident['Vehicle_Type'].replace('Data missing or out of range', np.nan)

Urban_or_Rural_Area

In [None]:
# Treat 'Unallocated' as a missing value.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
accident_copy['Urban_or_Rural_Area'] = accident['Urban_or_Rural_Area'].replace('Unallocated', np.nan)

In [None]:
# Count the missing values per column after the replacements above.
accident_copy.isna().sum()

In [None]:
# Remove every row that still contains a missing value, then report the row
# count of the raw frame next to the row count of the cleaned frame.
accident_copy = accident_copy.dropna(axis=0, how='any')
rows_before = len(accident)
rows_after = len(accident_copy)
print(f'Before dropping missing values: {rows_before} rows')
print(f'After dropping missing values: {rows_after} rows')

In [None]:
# Preview the cleaned data; the categorical columns are not encoded yet.
accident_copy.head()

## Exploring Data

In [None]:
# Correlation heatmap over the float64/int64 columns of the cleaned data.
numeric_columns = accident_copy.select_dtypes(include=['float64', 'int64'])
correlations = numeric_columns.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1)
plt.show()

In [None]:
# Number of records per severity level; Slight dominates the data.
severity_order = ['Slight', 'Serious', 'Fatal']
sns.countplot(x='Accident_Severity', data=accident_copy, order=severity_order)
plt.title('Counts of Accident Severity')
plt.show()

In [None]:
# Overlay one filled density curve of Number_of_Casualties per severity level.
fig, ax = plt.subplots()

for severity, colour in [("Slight", "blue"), ("Serious", "green"), ("Fatal", "orange")]:
  casualties = accident_copy.loc[accident_copy["Accident_Severity"] == severity, "Number_of_Casualties"]
  sns.kdeplot(casualties, fill=True, color=colour, label=severity, ax=ax)

ax.set_xlabel("Number of Casualties")
ax.set_ylabel("Density")
fig.suptitle("Number of Casualties vs. Road Accident Severity")

ax.legend()
plt.show()

In [None]:
# Overlay one filled density curve of Number_of_Vehicles per severity level.
fig, ax = plt.subplots()

for severity, colour in [("Slight", "blue"), ("Serious", "green"), ("Fatal", "orange")]:
  vehicles = accident_copy.loc[accident_copy["Accident_Severity"] == severity, "Number_of_Vehicles"]
  sns.kdeplot(vehicles, fill=True, color=colour, label=severity, ax=ax)

ax.set_xlabel("Number of Vehicles")
ax.set_ylabel("Density")
fig.suptitle("Number of Vehicles vs. Road Accident Severity")

ax.legend()
plt.show()

In [None]:
# Bar chart of Number_of_Casualties by Light_Conditions, split by severity.
sns.catplot(
    data=accident_copy,
    kind="bar",
    x="Light_Conditions",
    y="Number_of_Casualties",
    hue='Accident_Severity',
    hue_order=['Slight', 'Serious', 'Fatal'],
)
plt.title('Number of Casualties For Each Light Condition')
plt.xticks(rotation=20)  # tilt the category labels
plt.show()

In [None]:
# Bar chart of Number_of_Casualties by Road_Surface_Conditions, split by severity.
sns.catplot(
    data=accident_copy,
    kind="bar",
    x="Road_Surface_Conditions",
    y="Number_of_Casualties",
    hue='Accident_Severity',
    hue_order=['Slight', 'Serious', 'Fatal'],
)
plt.title('Number of Casualties For Each Road Condition')
plt.xticks(rotation=20)  # tilt the category labels
plt.show()

In [None]:
# Bar chart of Number_of_Casualties by Road_Type, split by severity.
sns.catplot(
    data=accident_copy,
    kind="bar",
    x="Road_Type",
    y="Number_of_Casualties",
    hue='Accident_Severity',
    hue_order=['Slight', 'Serious', 'Fatal'],
)
plt.title('Number of Casualties For Each Road Type')
plt.xticks(rotation=20)  # tilt the category labels
plt.show()

In [None]:
# Bar chart of Number_of_Casualties by Urban_or_Rural_Area, split by severity.
sns.catplot(
    data=accident_copy,
    kind="bar",
    x="Urban_or_Rural_Area",
    y="Number_of_Casualties",
    hue='Accident_Severity',
    hue_order=['Slight', 'Serious', 'Fatal'],
)
plt.title('Number of Casualties For Each Area Type')
plt.xticks(rotation=20)  # tilt the category labels
plt.show()

In [None]:
# Number_of_Casualties per severity level, with one bar per Year inside each level.
sns.barplot(
    data=accident_copy,
    x='Accident_Severity',
    y='Number_of_Casualties',
    order=['Slight', 'Serious', 'Fatal'],
    hue='Year',
)
plt.title('Distribution of Casualty vs Severity Over Years')
plt.xlabel('Accident Severity')
plt.ylabel('Number of Casualties')
plt.show()

In [None]:
# Number_of_Casualties across the months, one line per severity level
# (Slight = green, Serious = blue, Fatal = red).
sns.lineplot(
    data=accident_copy,
    x='Month',
    y='Number_of_Casualties',
    hue='Accident_Severity',
    hue_order=['Slight', 'Serious', 'Fatal'],
    palette=['green', 'blue', 'red'],
)
plt.title('Distribution of Casualty vs Severity Over Months')
plt.xlabel('Months')
plt.ylabel('Number of Casualties')
plt.xticks(np.arange(1, 13))  # one tick per month, 1 through 12
plt.show()

## Encoding

In [None]:
# One-hot encode the single-label categorical columns listed in `categorical`.
# Weather_Conditions holds lists of labels and is therefore not handled here;
# Light_Conditions and District_Area are not in the list either, so this cell
# leaves them untouched.
from sklearn.preprocessing import OneHotEncoder

categorical = ['Road_Surface_Conditions', 'Road_Type', 'Urban_or_Rural_Area', 'Vehicle_Type']

for col in categorical:
  ohe = OneHotEncoder()
  # fit_transform is given a one-column frame; .toarray() turns its result into a dense array
  categorical_ohe = ohe.fit_transform(accident_copy[[col]]).toarray()
  # Each category label becomes its own indicator column.
  # NOTE(review): the labels are used as column names without a prefix, so a label
  # shared with an existing column would overwrite it - confirm labels are unique.
  accident_copy[ohe.categories_[0]] = categorical_ohe
  # the original categorical column is replaced by its indicator columns
  accident_copy = accident_copy.drop(columns=[col])

In [None]:
# Multi-hot encode Weather_Conditions: each row holds a list of weather labels,
# and every distinct label gets its own indicator column.
from sklearn.preprocessing import MultiLabelBinarizer

weather_labels = accident_copy['Weather_Conditions']
mlb = MultiLabelBinarizer()
weather = mlb.fit_transform(weather_labels)

# one indicator column per label that appeared in the weather condition that day
accident_copy[mlb.classes_] = weather
accident_copy = accident_copy.drop(columns=['Weather_Conditions'])

In [None]:
# Split the data into one frame per year: Slight accidents change little over
# the years, but Fatal ones change a lot.
accident_19, accident_20, accident_21, accident_22 = (
    accident_copy[accident_copy['Year'] == year]
    for year in (2019, 2020, 2021, 2022)
)

In [None]:
# display the 2020 subset for inspection
accident_20