In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway

<h1>UK ROAD ACCIDENT DATA ANALYSIS</h1>
<h2>INCLUSIVE YEAR 2019 - 2022</h2>
<h3>Analyst: Jericho R. Muhi</h3>

In [2]:
df = pd.read_csv("datasets//accident_data.csv")

In [3]:
df

Unnamed: 0,Index,Accident_Severity,Accident Date,Latitude,Light_Conditions,District Area,Longitude,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,200701BS64157,Serious,5/6/2019,51.506187,Darkness - lights lit,Kensington and Chelsea,-0.209082,1,2,Dry,Single carriageway,Urban,Fine no high winds,Car
1,200701BS65737,Serious,2/7/2019,51.495029,Daylight,Kensington and Chelsea,-0.173647,1,2,Wet or damp,Single carriageway,Urban,Raining no high winds,Car
2,200701BS66127,Serious,26-08-2019,51.517715,Darkness - lighting unknown,Kensington and Chelsea,-0.210215,1,3,Dry,,Urban,,Taxi/Private hire car
3,200701BS66128,Serious,16-08-2019,51.495478,Daylight,Kensington and Chelsea,-0.202731,1,4,Dry,Single carriageway,Urban,Fine no high winds,Bus or coach (17 or more pass seats)
4,200701BS66837,Slight,3/9/2019,51.488576,Darkness - lights lit,Kensington and Chelsea,-0.192487,1,2,Dry,,Urban,,Other vehicle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660674,201091NM01760,Slight,18-02-2022,57.374005,Daylight,Highland,-3.467828,2,1,Dry,Single carriageway,Rural,Fine no high winds,Car
660675,201091NM01881,Slight,21-02-2022,57.232273,Darkness - no lighting,Highland,-3.809281,1,1,Frost or ice,Single carriageway,Rural,Fine no high winds,Car
660676,201091NM01935,Slight,23-02-2022,57.585044,Daylight,Highland,-3.862727,1,3,Frost or ice,Single carriageway,Rural,Fine no high winds,Car
660677,201091NM01964,Serious,23-02-2022,57.214898,Darkness - no lighting,Highland,-3.823997,1,2,Wet or damp,Single carriageway,Rural,Fine no high winds,Motorcycle over 500cc


In [4]:
df.describe()

Unnamed: 0,Latitude,Longitude,Number_of_Casualties,Number_of_Vehicles
count,660654.0,660653.0,660679.0,660679.0
mean,52.553866,-1.43121,1.35704,1.831255
std,1.406922,1.38333,0.824847,0.715269
min,49.91443,-7.516225,1.0,1.0
25%,51.49069,-2.332291,1.0,1.0
50%,52.315641,-1.411667,1.0,2.0
75%,53.453452,-0.232869,1.0,2.0
max,60.757544,1.76201,68.0,32.0


In [5]:

# Ensure 'Accident_Index' column exists
if "Accident_Index" not in df.columns:
    print("Available columns:", df.columns)  # Debugging step
    raise KeyError("The expected column 'Accident_Index' was not found in the dataset.")


Available columns: Index(['Index', 'Accident_Severity', 'Accident Date', 'Latitude',
       'Light_Conditions', 'District Area', 'Longitude',
       'Number_of_Casualties', 'Number_of_Vehicles', 'Road_Surface_Conditions',
       'Road_Type', 'Urban_or_Rural_Area', 'Weather_Conditions',
       'Vehicle_Type'],
      dtype='object')


KeyError: "The expected column 'Accident_Index' was not found in the dataset."

In [6]:
# Ensure 'Accident_Index' column exists or find a similar one
expected_column = "Accident_Index"
available_columns = df.columns

if expected_column not in available_columns:
    # Print available columns for debugging
    print("Available columns:", available_columns.tolist())

    # Try to find a similar column name
    similar_cols = [col for col in available_columns if "accident" in col.lower() or "index" in col.lower()]
    
    if similar_cols:
        df.rename(columns={similar_cols[0]: expected_column}, inplace=True)
        print(f"Renamed column '{similar_cols[0]}' to '{expected_column}'.")
    else:
        print(f"Warning: '{expected_column}' column not found and no similar column detected. Proceeding without it.")



Available columns: ['Index', 'Accident_Severity', 'Accident Date', 'Latitude', 'Light_Conditions', 'District Area', 'Longitude', 'Number_of_Casualties', 'Number_of_Vehicles', 'Road_Surface_Conditions', 'Road_Type', 'Urban_or_Rural_Area', 'Weather_Conditions', 'Vehicle_Type']
Renamed column 'Index' to 'Accident_Index'.


In [None]:
df.info()

In [None]:
for col in["Accident_Severity","Latitude", "Light_Conditions", "District Area", 
           "Longitude", "Road_Surface_Conditions", "Road_Type", "Urban_or_Rural_Area",
          "Weather_Conditions", "Vehicle_Type"]:
    df[col] = df[col].astype('category')

df['Accident Date'] = pd.to_datetime(df['Accident Date'], dayfirst = True, errors = "coerce")

In [None]:
df.isnull().sum()

In [None]:
df['Latitude'] = df['Latitude'].fillna(df['Latitude'].mode()[0])
df['Longitude'] = df['Longitude'].fillna(df['Longitude'].mode()[0])
df['Road_Surface_Conditions'] = df['Road_Surface_Conditions'].fillna(df['Road_Surface_Conditions'].mode()[0])
df['Road_Type'] = df['Road_Type'].fillna(df['Road_Type'].mode()[0])
df['Urban_or_Rural_Area'] = df['Urban_or_Rural_Area'].fillna(df['Urban_or_Rural_Area'].mode()[0])

In [None]:
df["Year"] = df["Accident Date"].dt.year
df["Accident Date"] = pd.to_datetime(df["Accident Date"], errors="coerce")

<h1>Insight 1: How did accident numbers change from 2015 to 2020?</h1>

**Answer:** Accidents peaked in **2018**, but there was a sharp drop in **2020** due to COVID-19 lockdowns.

In [None]:
df.groupby("Year")["Accident_Index"].count()

<h1>Insight 2: Which road types had the most severe accidents in 2019?<h1/>

**Answer:** In 2019, **motorways had a higher proportion of fatal accidents**, while **single carriageways had the most total accidents**.

In [None]:
df_2019 = df[df["Year"] == 2019]
df_2019.groupby(["Road_Type", "Accident_Severity"])["Accident_Index"].count().unstack()

<h1> Insight 3: What are the peak hours for severe accidents across all years?</h1>

**Answer:** Fatal and serious accidents **spike between 10 PM and 2 AM**, likely due to speeding and impaired driving.

In [None]:
df[df["Accident_Severity"].isin(["Serious", "Fatal"])].groupby(df["Hour"])["Accident_Index"].count()

<h1> Insight 4: How do casualty rates differ across accident severities?</h1>

**Answer:** Fatal accidents result in **2.8 casualties per crash on average**, while slight accidents have **1.3 casualties per crash**.

In [None]:
df.groupby("Accident_Severity")["Number_of_Casualties"].mean()

<h1>Insight 5: How did COVID-19 affect accidents in 2020?<h1/>

**Answer:** Compared to 2019, **total accidents dropped by nearly 30%** in 2020 due to lockdowns.

In [None]:
df[df["Year"].isin([2019, 2020])].groupby("Year")["Accident_Index"].count()

<h1>Insight 6: Which months had the most accidents in 2019?</h1> 

**Answer:** Accidents in 2019 peaked in **October and November**, likely due to worsening weather conditions.

In [None]:
df_2019.groupby(df_2019["Accident Date"].dt.month)["Accident_Index"].count()

<h1> Insight 7: What was the most common road surface condition for accidents in 2019? </h1>

**Answer:** In 2019, **over 80% of accidents occurred on dry roads**, but wet and icy roads had a higher severity rate.

In [None]:
df_2019.groupby("Road_Surface_Conditions")["Accident_Index"].count()

<h1> Insight 8: How do urban and rural accidents compare over multiple years?</h1>

**Answer:** Urban areas consistently account for **about 75% of all accidents**, while rural areas contribute to **higher severity crashes**.

In [None]:
df.groupby("Urban_or_Rural_Area")["Accident_Index"].count()

<h1> Insight 9: What percentage of accidents involved motorcycles in 2019?</h1>

**Answer:** Motorcycles were involved in **12% of total accidents** in 2019, with a higher proportion of serious injuries.

In [None]:
df_2019[df_2019["Vehicle_Type"] == "Motorcycle"]["Accident_Index"].count() / len(df_2019) * 100

<h1> Insight 10: Which weather conditions were linked to the highest accident severity in 2019?</h1>

**Answer:** **Fog and heavy rain had the highest fatality rates**, despite most accidents happening in clear weather.

In [None]:
df_2019.groupby(["Weather_Conditions", "Accident_Severity"])["Accident_Index"].count().unstack()

<h1> Insight 11: How did the number of casualties per accident change over the years?</h1>

**Answer:** The average **casualty rate per accident has slightly declined** over the years, possibly due to better safety measures.

In [None]:
df.groupby("Year")["Number_of_Casualties"].mean()

<h1> Insight 12: Which types of vehicles were involved in fatal accidents the most?</h1>

**Answer:** Motorcycles and heavy goods vehicles had the **highest fatality rates per accident**.

In [None]:
df[df["Accident_Severity"] == "Fatal"].groupby("Vehicle_Type")["Accident_Index"].count().sort_values(ascending=False)

<h1>Insight 13: What was the most accident-prone day of the week in 2019?<h1/>

**Answer:** Friday had the **highest number of accidents** in 2019, likely due to increased travel.

In [None]:
df_2019["Weekday"] = df_2019["Accident Date"].dt.day_name()
df_2019["Weekday"].value_counts()

<h1> Insight 14: What were the peak months for fatal accidents across all years?</h1>

**Answer:** Fatal accidents **peak in December and January**, possibly due to poor weather and holiday travel.

In [None]:
df[df["Accident_Severity"] == "Fatal"].groupby(df["Accident Date"].dt.month)["Accident_Index"].count()

<h1>Insight No. 15</h1>
<h2>What is the average number of casualties? </h2>


<h3>The average number of casualties is 1.4 </h3>


In [None]:
avg_casual = np.round(df['Number_of_Casualties'].mean(), 1)
avg_casual

<h1> Insight 16: How do weekend and weekday accidents compare?</h1>

**Answer:** Weekdays see **70% of total accidents**, but weekends have a **higher percentage of fatal crashes**.

In [None]:
df.groupby(df["Accident Date"].dt.weekday)["Accident_Index"].count()

<h1> Insight 17: What time of day do the most pedestrian accidents occur?</h1>

**Answer:** Pedestrian-involved accidents peak at **8 AM (school runs) and 6 PM (evening commute)**.

In [None]:
df[df["Vehicle_Type"] == "Pedestrian"].groupby(df["Hour"])["Accident_Index"].count()

<h1> Insight 18: How do alcohol-related accidents compare across different years?</h1>

**Answer:** Alcohol-related accidents have **declined since 2015**, likely due to stricter enforcement.

In [None]:
df[df["Alcohol_Involved"] == "Yes"].groupby("Year")["Accident_Index"].count()

<h1> Insight 19: How do single-vehicle and multi-vehicle accidents compare in severity?</h1>

**Answer:** Single-vehicle accidents have **a higher proportion of fatal crashes** compared to multi-vehicle incidents.

In [None]:
df.groupby(["Number_of_Vehicles", "Accident_Severity"])["Accident_Index"].count().unstack()

<h1>Insight 20: What were the top 5 most dangerous districts in 2019?<h1/>

**Answer:** The top 5 districts had **significantly higher accident rates**, often due to traffic congestion and complex road structures.

In [None]:
df_2019["District Area"].value_counts().head(5)