In [7]:
import pandas as pd
from helper import get_my_dataset
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
# Run the project helper that generates the dataset CSV.
# NOTE(review): presumably this writes the file that get_my_dataset() reads — confirm in helper.py.
# NOTE(review): MY_LOCAL_AUTHORITY_HIGHWAY is imported but not used in the visible cells.
from helper import create_my_dataset_csv, MY_LOCAL_AUTHORITY_HIGHWAY
create_my_dataset_csv()

In [9]:
"""
Get the data_set filtered for my borough
"""
# main_df is the working frame that every later cell reads (and several reassign).
main_df = get_my_dataset()

In [None]:
"""
Checking the rows and columns of the dataset
"""
# Bare last expression so the notebook displays the (rows, columns) tuple.
main_df.shape

### Data types

In [None]:
# Bare expression: displays the dtype currently held by each column.
main_df.dtypes

## Data quality issues

### Missing values

In [None]:
""" 
Checking missing values
"""
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Sentinel values this notebook counts as "missing" in addition to real nulls.
missing_indicators = [-1, "-1", ""]

# Boolean frame: True wherever a cell holds a sentinel or is null.
# Built once and reused for both the per-column counts and the heatmap.
missing_mask = main_df.isin(missing_indicators) | main_df.isnull()

# Per-column number of missing cells (nulls + sentinels).
missing_values = missing_mask.sum()

print("Missing Values Per Column:")
print(missing_values)

sns.heatmap(missing_mask, cbar=False, cmap='viridis')
plt.title("Missing Data Heatmap")
plt.show()

### Outliers

Visualize to see distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count-type columns whose distributions are inspected for outliers.
numerical_columns = ['Number_of_Vehicles', 'Number_of_Casualties']

for column in numerical_columns:
    values = main_df[column]

    # Histogram with a KDE overlay to show the overall shape.
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(values, kde=True, color='blue', ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()

    # Horizontal boxplot of the same column; fliers show candidate outliers.
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(values, orient='h', color='skyblue', ax=ax)
    ax.set_title(f'Boxplot of {column}')
    plt.show()



Detect Outliers

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

for column in numerical_columns:
    values = main_df[column]

    # Fences at 1.5 * IQR beyond the first and third quartiles.
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Rows whose value falls strictly outside the fences.
    outside_fences = (values < lower_bound) | (values > upper_bound)
    outliers = main_df[outside_fences]

    fig, ax = plt.subplots(figsize=(8, 5))
    # Default fliers are hidden; the outliers are drawn explicitly in red instead.
    sns.boxplot(x=values, color='lightblue', showfliers=False, ax=ax)
    ax.scatter(outliers[column], [0] * len(outliers), color='red', label='Outliers')
    ax.set_title(f'Boxplot of {column} with Highlighted Outliers')
    ax.set_xlabel(column)
    ax.legend()
    plt.show()


### Extreme values

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_extreme_values(df, numerical_columns):
    """Show one horizontal boxplot per column with its extreme values marked.

    A value counts as "extreme" when it lies strictly below the 1st percentile
    or strictly above the 99th percentile of its own column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the columns to plot.
    numerical_columns : iterable of str
        Names of the numeric columns in ``df`` to plot, one figure each.

    Returns
    -------
    None
        Each figure is rendered with ``plt.show()``.
    """
    for column in numerical_columns:
        values = df[column]

        low_percentile = values.quantile(0.01)
        high_percentile = values.quantile(0.99)

        extreme_values = df[(values < low_percentile) | (values > high_percentile)]

        fig, ax = plt.subplots(figsize=(8, 5))
        # Pass the column as `x` so the box is drawn horizontally. The markers
        # below are placed along the x axis at y=0, which only lines up with a
        # horizontal box (`data=<Series>` would draw a vertical one).
        sns.boxplot(x=values, color='lightblue', showfliers=False, ax=ax)
        ax.scatter(extreme_values[column], [0] * len(extreme_values),
                   color='orange', label='Extreme Values', zorder=3)
        ax.set_title(f'Boxplot of {column} with Highlighted Extreme Values')
        ax.set_xlabel(column)
        ax.legend()
        plt.show()

# Same two count columns that the outlier cells above inspect.
numerical_columns = ['Number_of_Vehicles', 'Number_of_Casualties']

plot_extreme_values(main_df, numerical_columns)


### Incomparable value ranges

In [None]:
import matplotlib.pyplot as plt

numerical_columns = ['Number_of_Vehicles', 'Number_of_Casualties']

# column name -> (max - min); filled while printing and reused for the bar chart.
ranges = {}

print("Value Ranges for Numerical Columns:")
for column in numerical_columns:
    min_val, max_val = main_df[column].min(), main_df[column].max()
    range_val = max_val - min_val
    ranges[column] = range_val
    print(f"{column}: Min = {min_val}, Max = {max_val}, Range = {range_val}")

# Side-by-side bars make differences in scale between the columns visible.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(list(ranges.keys()), list(ranges.values()), color='skyblue')
ax.set_title('Range of Values for Numerical Columns')
ax.set_ylabel('Range')
ax.set_xlabel('Columns')
plt.show()



### Imbalanced classes

In [None]:
# Columns treated as categorical for the class-balance check. The two count
# columns are included so their value frequencies are listed as well.
categorical_columns = [
    "Police_Force", "Accident_Severity",
    "Number_of_Vehicles", "Number_of_Casualties",
    "Day_of_Week",
    "Local_Authority_District", "Local_Authority_Highway",
    "1st_Road_Class", "1st_Road_Number",
    "Road_Type", "Speed_limit",
    "Junction_Detail", "Junction_Control",
    "2nd_Road_Class", "2nd_Road_Number",
    "Pedestrian_Crossing-Human_Control",
    "Pedestrian_Crossing-Physical_Facilities",
    "Light_Conditions", "Weather_Conditions",
    "Road_Surface_Conditions", "Special_Conditions_at_Site",
    "Carriageway_Hazards", "Urban_or_Rural_Area",
    "Did_Police_Officer_Attend_Scene_of_Accident",
]

print("Class Distributions for Categorical Columns:")
for column in categorical_columns:
    # Frequency of each distinct value, most common first.
    distribution = main_df[column].value_counts()
    print(f"\n{column} Distribution:", distribution, sep="\n")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Size the grid from the number of columns instead of hard-coding 6 x 4, so a
# longer list cannot run past the end of `axes` and a shorter one leaves no
# blank panels behind.
n_grid_cols = 4
n_grid_rows = max(1, -(-len(categorical_columns) // n_grid_cols))  # ceiling division

# 20/6 inches per row reproduces the original 20 x 20 figure for 24 columns.
fig, axes = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(5 * n_grid_cols, 20 / 6 * n_grid_rows),
    squeeze=False,  # always a 2-D array, even for a single row
)
axes = axes.flatten()

for ax, column in zip(axes, categorical_columns):
    sns.countplot(
        data=main_df,
        x=column,
        order=main_df[column].value_counts().index,  # most frequent class first
        color='skyblue',
        ax=ax
    )
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

# Remove panels left over when the column count is not a multiple of the grid width.
for unused_ax in axes[len(categorical_columns):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

Mapped version

### Correct data types

In [None]:

# Parse the textual date/time columns; entries that do not match the format
# become NaT because of errors='coerce'.
main_df['Date'] = pd.to_datetime(main_df['Date'], format='%d/%m/%Y', errors='coerce')
# NOTE(review): a time-only format yields full timestamps with pandas' default
# date part — confirm that downstream use only needs the time of day.
main_df['Time'] = pd.to_datetime(main_df['Time'], format='%H:%M', errors='coerce')

# Coded attributes stored as pandas categoricals.
category_columns = [
    'Police_Force', 'Accident_Severity', 'Day_of_Week', 'Speed_limit',
    'Weather_Conditions', 'Road_Type', 'Light_Conditions', 'Urban_or_Rural_Area',
    '1st_Road_Class', '1st_Road_Number', '2nd_Road_Class', '2nd_Road_Number',
    'Local_Authority_Highway', 'Junction_Detail', 'Junction_Control',
    'Pedestrian_Crossing-Human_Control', 'Pedestrian_Crossing-Physical_Facilities',
    'Road_Surface_Conditions', 'Special_Conditions_at_Site', 'Carriageway_Hazards',
    'Did_Police_Officer_Attend_Scene_of_Accident', 'Local_Authority_District'
]

# True counts stored as plain integers.
int_columns = ['Number_of_Vehicles', 'Number_of_Casualties']

# column -> target dtype; categoricals are converted first, then the integers.
target_dtypes = {
    **dict.fromkeys(category_columns, 'category'),
    **dict.fromkeys(int_columns, int),
}
for col, dtype in target_dtypes.items():
    main_df[col] = main_df[col].astype(dtype)

main_df.dtypes

### Basic statistics of each attribute

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy.stats import skew, kurtosis

# Numeric columns summarised in this cell (one histogram panel each).
numerical_columns = ['Location_Easting_OSGR', 'Location_Northing_OSGR',
                     'Longitude', 'Latitude',
                     'Number_of_Vehicles', 'Number_of_Casualties']

# Not plotted in this cell; defined here for the "Categorical columns" cell
# that follows, which iterates over this list.
categorical_columns = [
    'Police_Force', 'Accident_Severity', 'Day_of_Week', 'Speed_limit', 'Weather_Conditions',
    'Road_Type', 'Light_Conditions', 'Urban_or_Rural_Area', '1st_Road_Class', '1st_Road_Number',
    '2nd_Road_Class', '2nd_Road_Number', 'Local_Authority_Highway', 'Junction_Detail',
    'Junction_Control', 'Pedestrian_Crossing-Human_Control', 'Pedestrian_Crossing-Physical_Facilities',
    'Road_Surface_Conditions', 'Special_Conditions_at_Site', 'Carriageway_Hazards',
    'Did_Police_Officer_Attend_Scene_of_Accident', 'Local_Authority_District',
    'LSOA_of_Accident_Location', 'Accident_Index'
]

# Only the numeric columns are plotted here, so the former categorical branch
# of the loop could never run and has been removed.
all_columns = numerical_columns

num_cols = 3
num_rows = math.ceil(len(all_columns) / num_cols)

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 5 * num_rows), squeeze=False)
axes = axes.flatten()

for ax, column in zip(axes, all_columns):
    data = main_df[column]

    # Summary statistics shown in the text box of the panel.
    mean = data.mean()
    median = data.median()
    std = data.std()
    skewness = skew(data, nan_policy='omit')
    kurt = kurtosis(data, nan_policy='omit')
    mode = data.mode()[0]  # first mode when several values tie
    value_range = (data.min(), data.max())

    sns.histplot(data=data, kde=True, ax=ax, color='blue')
    ax.axvline(mean, color='red', linestyle='--', label=f'Mean: {mean:.2f}')
    ax.axvline(median, color='green', linestyle='--', label=f'Median: {median:.2f}')
    ax.set_title(f'{column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')

    stats_text = (f"Range: {value_range}\nMean: {mean:.2f}\nStd: {std:.2f}\n"
                  f"Skewness: {skewness:.2f}\nKurtosis: {kurt:.2f}\nMode: {mode}")
    # Anchor the statistics box in the top-right corner, in axes coordinates.
    ax.text(0.95, 0.95, stats_text, transform=ax.transAxes, fontsize=10,
            verticalalignment='top', horizontalalignment='right',
            bbox=dict(boxstyle="round", alpha=0.3))
    ax.legend()

# Drop grid cells that have no column assigned.
for unused_ax in axes[len(all_columns):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


Categorical columns

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): categorical_columns as defined in the preceding cell includes
# 'Accident_Index' and 'LSOA_of_Accident_Location'; if those are near-unique
# per row their count plots will be extremely dense — consider excluding them.

# Size the grid from the list length instead of hard-coding 6 x 4, so a longer
# list cannot index past `axes` and a shorter one leaves no blank panels.
n_grid_cols = 4
n_grid_rows = max(1, -(-len(categorical_columns) // n_grid_cols))  # ceiling division

# 20/6 inches per row reproduces the original 20 x 20 figure for 24 columns.
fig, axes = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(5 * n_grid_cols, 20 / 6 * n_grid_rows),
    squeeze=False,
)
axes = axes.flatten()

for ax, column in zip(axes, categorical_columns):
    # Computed once; reused for the bar order and the mode's count.
    counts = main_df[column].value_counts()

    sns.countplot(
        data=main_df,
        x=column,
        order=counts.index,
        color='skyblue',
        ax=ax
    )
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

    mode = main_df[column].mode()[0]  # first mode when several values tie
    # Explicit label lookup: avoids any positional interpretation of an
    # integer-valued category.
    count = counts.loc[mode]
    ax.annotate(f'Mode: {mode}\nCount: {count}',
                xy=(0.5, 0.9), xycoords='axes fraction',
                ha='center', va='center', fontsize=10,
                bbox=dict(boxstyle="round,pad=0.3", edgecolor='black', facecolor='white'))

# Remove panels that have no column assigned.
for unused_ax in axes[len(categorical_columns):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## Data preparation

### Pre processing

Missing values

In [None]:

# Drop, in place, every row that has a null in any column.
# NOTE(review): this removes only real nulls (NaN/NaT). The sentinel values
# (-1, "-1", "") that the earlier "Missing values" check also counts as missing
# are left in the data — confirm that this is intended.
main_df.dropna(inplace=True)


# Recount nulls per column to confirm that none remain.
updated_missing_values = main_df.isnull().sum()
print("Updated Missing Values Per Column:")
print(updated_missing_values)


# Heatmap of the remaining nulls (sentinel values are not highlighted here).
sns.heatmap(main_df.isnull(), cbar=False, cmap='viridis')
plt.title("Missing Data Heatmap After Removing Rows with Missing Values")
plt.show()


Imbalanced classes

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Percentage share of every class, per categorical column.
class_shares = {
    column: main_df[column].value_counts(normalize=True) * 100
    for column in categorical_columns
}

# Keep only columns where a single class holds more than 90% of the rows.
imbalanced_classes = {
    column: shares
    for column, shares in class_shares.items()
    if shares.max() > 90
}

# Randomly oversample the rows so every Accident_Severity class is equally
# frequent; the seed keeps the result reproducible.
X_all = main_df.drop('Accident_Severity', axis=1)
y_all = main_df['Accident_Severity']

oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_all, y_all)

# Re-attach the target so the balanced data is a single frame again.
# NOTE(review): balanced_df is not referenced by any later visible cell.
balanced_df = X_resampled.assign(Accident_Severity=y_resampled)

print(balanced_df['Accident_Severity'].value_counts())

Outliers

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

numerical_columns = ['Number_of_Vehicles', 'Number_of_Casualties']

# Filter column by column: the fences of a later column are computed on the
# rows that survived the earlier columns.
# NOTE(review): if Q1 == Q3 for a column, IQR is 0 and only rows equal to that
# single value survive — check the resulting shape printed below.
for column in numerical_columns:
    Q1 = main_df[column].quantile(0.25)
    Q3 = main_df[column].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Keep rows inside the fences (both ends inclusive).
    inside_fences = main_df[column].between(lower_bound, upper_bound)
    main_df = main_df[inside_fences]

print("Shape after outlier removal:", main_df.shape)

# Re-plot both columns to see the effect of the filter.
for column in numerical_columns:
    values = main_df[column]

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(values, kde=True, color='blue', ax=ax)
    ax.set_title(f'Distribution of {column} After Outlier Removal')
    plt.show()

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(values, orient='h', color='skyblue', ax=ax)
    ax.set_title(f'Boxplot of {column} After Outlier Removal')
    plt.show()




In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_extreme_values(df, numerical_columns):
    """Show one horizontal boxplot per column with its extreme values marked.

    A value counts as "extreme" when it lies strictly below the 1st percentile
    or strictly above the 99th percentile of its own column.

    NOTE(review): a function with this name is also defined in the earlier
    "Extreme values" cell; this definition replaces it once the cell runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the columns to plot.
    numerical_columns : iterable of str
        Names of the numeric columns in ``df`` to plot, one figure each.

    Returns
    -------
    None
        Each figure is rendered with ``plt.show()``.
    """
    for column in numerical_columns:
        values = df[column]

        low_percentile = values.quantile(0.01)
        high_percentile = values.quantile(0.99)

        extreme_values = df[(values < low_percentile) | (values > high_percentile)]

        fig, ax = plt.subplots(figsize=(8, 5))
        # Pass the column as `x` so the box is drawn horizontally. The markers
        # below are placed along the x axis at y=0, which only lines up with a
        # horizontal box (`data=<Series>` would draw a vertical one).
        sns.boxplot(x=values, color='lightblue', showfliers=False, ax=ax)
        ax.scatter(extreme_values[column], [0] * len(extreme_values),
                   color='orange', label='Extreme Values', zorder=3)
        ax.set_title(f'Boxplot of {column} with Highlighted Extreme Values')
        ax.set_xlabel(column)
        ax.legend()
        plt.show()

# Re-run the extreme-value plots on main_df as it stands after the outlier filter above.
numerical_columns = ['Number_of_Vehicles', 'Number_of_Casualties']

plot_extreme_values(main_df, numerical_columns)


Determine whether, why and how each attribute should or should not be used

In [None]:
# Attributes carried forward into splitting, encoding and modelling.
selected_attributes = [
    "Number_of_Casualties", "Number_of_Vehicles",
    "Road_Type", "Speed_limit", "Light_Conditions",
    "Latitude", "Longitude",
    "Accident_Severity",
    "Weather_Conditions", "Road_Surface_Conditions"
]

# Column subset of the cleaned frame, in the order listed above.
filtered_df = main_df.loc[:, selected_attributes]

print(filtered_df.head())

Divide dataset (Training, Test & Validation)

In [None]:
from sklearn.model_selection import train_test_split

# Target is the severity column; every other selected attribute is a predictor.
y = filtered_df["Accident_Severity"]
X = filtered_df.drop(columns=["Accident_Severity"])

# 80% train; the held-out 20% is then halved into validation and test.
# Both splits are stratified on the target and share the same seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

for heading, X_part, y_part in (
    ("Training Set:", X_train, y_train),
    ("Validation Set:", X_val, y_val),
    ("Test Set:", X_test, y_test),
):
    print(heading, X_part.shape, y_part.shape)


One-hot encoding

In [None]:
import pandas as pd

categorical_columns_to_encode = [
    "Road_Type",
    "Speed_limit",
    "Light_Conditions",
    "Accident_Severity",
    "Weather_Conditions",
    "Road_Surface_Conditions"
]

# Predictors and target are encoded separately.
feature_columns_to_encode = [
    column for column in categorical_columns_to_encode if column != "Accident_Severity"
]

# Predictors: drop the first level of each attribute to avoid a redundant column.
final_df = pd.get_dummies(filtered_df, columns=feature_columns_to_encode, drop_first=True)

# Target: keep every level. The decision-tree cell recovers the class label
# with idxmax over the Accident_Severity_* columns; if the first level were
# dropped, its rows would be all-zero and idxmax would label them as the first
# remaining level.
final_df = pd.get_dummies(final_df, columns=["Accident_Severity"], drop_first=False)

print("Dataset After One-Hot Encoding:")
# tolist must be called; without parentheses the bound method is printed.
print(final_df.columns.tolist())

print("Shape after encoding:", final_df.shape)


## Modeling

### Task 1 - Descriptive

Correlation

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One point per accident; alpha makes overlapping points visible.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=final_df,
    x="Number_of_Vehicles",
    y="Number_of_Casualties",
    alpha=0.6,
    ax=ax,
)
ax.set_title("Scatter Plot: Number of Vehicles vs Number of Casualties")
ax.set_xlabel("Number of Vehicles")
ax.set_ylabel("Number of Casualties")
plt.show()


In [None]:
all_columns = final_df.columns


def columns_containing(fragment):
    """Return the names in ``all_columns`` that contain ``fragment``."""
    return [name for name in all_columns if fragment in name]


# Dummy columns produced by the one-hot encoding, grouped by source attribute.
encoded_road_type = columns_containing('Road_Type')
encoded_speed_limit = columns_containing('Speed_limit')
encoded_light_conditions = columns_containing('Light_Conditions')

print("Encoded Road_Type Columns:", encoded_road_type)
print("Encoded Speed_Limit Columns:", encoded_speed_limit)
print("Encoded Light_Conditions Columns:", encoded_light_conditions)


Correlation matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

numerical_columns = ['Number_of_Vehicles', 'Number_of_Casualties']

# Counts first, then the dummy columns for road type, speed limit and light.
encoded_columns = [*encoded_road_type, *encoded_speed_limit, *encoded_light_conditions]
relevant_columns = [*numerical_columns, *encoded_columns]

# Pairwise correlation (pandas default method) across the selected columns.
correlation_matrix = final_df.loc[:, relevant_columns].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix for Numerical and Encoded Categorical Attributes")
plt.show()


Heatmap

In [None]:
def _accidents_with(road, light):
    """Count rows where both dummy columns are set and a casualty count is recorded."""
    both_set = (final_df[road] == 1) & (final_df[light] == 1)
    return final_df.loc[both_set, "Number_of_Casualties"].count()


# Build the whole table in one constructor call instead of growing an empty
# frame one cell at a time with .loc (slow, and the dtype is left to chance).
heatmap_data = pd.DataFrame(
    {
        light: [_accidents_with(road, light) for road in encoded_road_type]
        for light in encoded_light_conditions
    },
    index=encoded_road_type,
    dtype=float,
)

# Human-readable axis labels in place of the raw dummy-column names.
heatmap_data = heatmap_data.rename(
    index=lambda name: name.replace("Road_Type_", "Road Type "),
    columns=lambda name: name.replace("Light_Conditions_", "Light Conditions "),
)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(heatmap_data, annot=True, cmap="Blues", fmt=".0f", ax=ax)
ax.set_title("Heatmap: Road Type vs Light Conditions")
ax.set_xlabel("Light Conditions")
ax.set_ylabel("Road Type")
plt.show()

### Task 2 - Descriptive

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd

# Coordinates plus casualty count; rows with a null in any of the three are dropped.
geo_data = final_df[['Longitude', 'Latitude', 'Number_of_Casualties']].dropna()

# Cluster on position only; the seed and n_init keep the result reproducible.
k = 5
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
geo_data['Cluster'] = kmeans.fit_predict(geo_data[['Longitude', 'Latitude']])

# One row per cluster: total and mean casualties, and number of accidents.
cluster_summary = geo_data.groupby('Cluster').agg(
    Total_Casualties=('Number_of_Casualties', 'sum'),
    Average_Casualties=('Number_of_Casualties', 'mean'),
    Accident_Count=('Cluster', 'count')
).reset_index()

print("Cluster Summary:")
print(cluster_summary)

fig, ax = plt.subplots(figsize=(10, 7))

# Keep a handle on the accident points: the colorbar must be built from this
# artist, not from whichever scatter happened to be drawn last.
accident_points = ax.scatter(geo_data['Longitude'], geo_data['Latitude'],
                             c=geo_data['Cluster'], cmap='viridis', s=10, alpha=0.6)

centers = kmeans.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], c='red', marker='X', s=100, label='Centroids')

# itertuples keeps each column's own dtype, so the counts print as integers
# (iterrows would upcast the mixed row to float and print "123.0").
for row in cluster_summary.itertuples(index=False):
    # Look the centroid up by cluster id rather than by row position.
    center_x, center_y = centers[row.Cluster]
    ax.text(center_x, center_y,
            f"Casualties: {row.Total_Casualties}\nAccidents: {row.Accident_Count}",
            color='black', fontsize=8, ha='center')

ax.set_title("K-Means Clustering: Accident Hotspots with Casualty Information")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
fig.colorbar(accident_points, ax=ax, label="Cluster ID")
ax.legend()
plt.show()





### Task 3 - Predictive

In [None]:
from sklearn.model_selection import train_test_split

# tolist must be called; without parentheses the bound method is printed.
print(final_df.columns.tolist())

# Refresh the column listing so this cell does not rely on an earlier snapshot.
all_columns = final_df.columns

encoded_weather_conditions = [col for col in all_columns if 'Weather_Conditions' in col]
encoded_road_surface_conditions = [col for col in all_columns if 'Road_Surface_Conditions' in col]
encoded_speed_limit_conditions = [col for col in all_columns if 'Speed_limit' in col]
encoded_light_conditions = [col for col in all_columns if 'Light_Conditions' in col]

# Predictors: the two count columns plus every encoded condition column.
# numerical_columns, encoded_road_type and encoded_speed_limit come from the
# earlier correlation cells.
features = (
    numerical_columns
    + encoded_road_type
    + encoded_speed_limit
    + encoded_light_conditions
    + encoded_weather_conditions
    + encoded_road_surface_conditions
)

# Target: the one-hot severity columns.
encoded_accident_severity_conditions = [col for col in all_columns if 'Accident_Severity' in col]

X = final_df[features]
y = final_df[encoded_accident_severity_conditions]

# 80% train; the held-out 20% is then halved into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

print("Training set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Test set:", X_test.shape, y_test.shape)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.utils import class_weight

# Collapse the one-hot severity columns to a single label per row: the name of
# the column holding the row's maximum.
# NOTE(review): a row whose severity columns are all zero gets the first column
# name, so this is only correct when y has one column per severity level.
y_train_single, y_val_single = (part.idxmax(axis=1) for part in (y_train, y_val))

# "Balanced" weights from the training labels, so rare severity classes count
# as much as common ones while the tree is fitted.
classes = y_train_single.unique()
weights = class_weight.compute_class_weight(
    class_weight="balanced", classes=classes, y=y_train_single
)
class_weights = dict(zip(classes, weights))

dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=6,
    random_state=42,
    class_weight=class_weights,
)
dt_model.fit(X_train, y_train_single)

# Evaluate on the validation split only; the test split is not used in this cell.
y_pred_val = dt_model.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val_single, y_pred_val))
print("\nClassification Report on Validation Set:")
print(classification_report(y_val_single, y_pred_val))

ConfusionMatrixDisplay.from_estimator(dt_model, X_val, y_val_single, cmap="Blues")
