In [1]:
import pandas as pd
import numpy as np

# Data Preparation, Modeling, and Evaluation (SKLearn Modules)
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import LabelBinarizer,OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler,RobustScaler, StandardScaler,  PolynomialFeatures
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer

# Class Imbalance
from imblearn.over_sampling import SMOTE

## Model Selection and Tuning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier

# Machine Learning Model Evaluation Methods
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
data = pd.read_csv("road-casualty-statistics-collision.csv", low_memory=False)
data.shape

(104258, 37)

In [3]:
data.head()

Unnamed: 0,accident_index,accident_year,accident_reference,location_easting_osgr,location_northing_osgr,longitude,latitude,police_force,accident_severity,number_of_vehicles,...,light_conditions,weather_conditions,road_surface_conditions,special_conditions_at_site,carriageway_hazards,urban_or_rural_area,did_police_officer_attend_scene_of_accident,trunk_road_flag,lsoa_of_accident_location,enhanced_severity_collision
0,2023010419171,2023,10419171,525060.0,170416.0,-0.202878,51.418974,1,3,1,...,4,8,2,0,0,1,1,2,E01003383,-1
1,2023010419183,2023,10419183,535463.0,198745.0,-0.042464,51.671155,1,3,3,...,4,1,1,0,0,1,1,2,E01001547,-1
2,2023010419189,2023,10419189,508702.0,177696.0,-0.435789,51.487777,1,3,2,...,4,1,1,0,0,1,1,2,E01002448,-1
3,2023010419191,2023,10419191,520341.0,190175.0,-0.263972,51.597575,1,3,2,...,4,9,1,0,0,1,1,2,E01000129,-1
4,2023010419192,2023,10419192,527255.0,176963.0,-0.168976,51.477324,1,3,2,...,4,1,1,0,0,1,1,2,E01004583,-1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104258 entries, 0 to 104257
Data columns (total 37 columns):
 #   Column                                       Non-Null Count   Dtype  
---  ------                                       --------------   -----  
 0   accident_index                               104258 non-null  object 
 1   accident_year                                104258 non-null  int64  
 2   accident_reference                           104258 non-null  object 
 3   location_easting_osgr                        104246 non-null  float64
 4   location_northing_osgr                       104246 non-null  float64
 5   longitude                                    104246 non-null  float64
 6   latitude                                     104246 non-null  float64
 7   police_force                                 104258 non-null  int64  
 8   accident_severity                            104258 non-null  int64  
 9   number_of_vehicles                           104258 non-nul

In [5]:
data.columns.tolist()

['accident_index',
 'accident_year',
 'accident_reference',
 'location_easting_osgr',
 'location_northing_osgr',
 'longitude',
 'latitude',
 'police_force',
 'accident_severity',
 'number_of_vehicles',
 'number_of_casualties',
 'date',
 'day_of_week',
 'time',
 'local_authority_district',
 'local_authority_ons_district',
 'local_authority_highway',
 'first_road_class',
 'first_road_number',
 'road_type',
 'speed_limit',
 'junction_detail',
 'junction_control',
 'second_road_class',
 'second_road_number',
 'pedestrian_crossing_human_control',
 'pedestrian_crossing_physical_facilities',
 'light_conditions',
 'weather_conditions',
 'road_surface_conditions',
 'special_conditions_at_site',
 'carriageway_hazards',
 'urban_or_rural_area',
 'did_police_officer_attend_scene_of_accident',
 'trunk_road_flag',
 'lsoa_of_accident_location',
 'enhanced_severity_collision']

## **Observations**

### Target Variable: accident_severity

*Features not relevant at:*

- accident_index (id)
- accident_year (same)
- accident_reference (id)
- number_of_vehicles
- number_of_casualties
- local_authority_district
- local_authority_highway
- did_police_officer_attend_scene_of_accident
- trunk_road_flag
- lsoa_of_accident_location
- enhanced_severity_collision
- location_easting_osgr
- location_northing_osgr

*Features relevant:*

**Location based:**
- longitude
- latitude 
- local_authority_ons_district
- urban_or_rural_area

**Time based:**
- time
- day_of_week
- date(day,month),

**Road features:**
- first_road_class
- first_road_number
- second_road_class
- second_road_number
- road_type
- junction_detail
- junction_control
- pedestrian_crossing_human_control
- pedestrian_crossing_physical_facilities
- road_surface_conditions
- special_conditions_at_site
- carriageway_hazards

**Others**
- speed_limit
- light_conditions
- weather_conditions

In [6]:
data.shape

(104258, 37)

In [7]:
data_used= data [['longitude', 'latitude',
            'local_authority_ons_district',
            'urban_or_rural_area','accident_severity',
            'time', 'day_of_week','date',
            'first_road_class','first_road_number',
            'second_road_class', 'second_road_number',
            'road_type', 'junction_detail',
            'junction_control', 'pedestrian_crossing_human_control',
            'pedestrian_crossing_physical_facilities','road_surface_conditions',
            'special_conditions_at_site', 'carriageway_hazards',
            'speed_limit','light_conditions', 'weather_conditions'
]]

In [8]:
data.shape

(104258, 37)

In [9]:
data_used.shape

(104258, 23)

In [10]:
data_used.head()

Unnamed: 0,longitude,latitude,local_authority_ons_district,urban_or_rural_area,accident_severity,time,day_of_week,date,first_road_class,first_road_number,...,junction_detail,junction_control,pedestrian_crossing_human_control,pedestrian_crossing_physical_facilities,road_surface_conditions,special_conditions_at_site,carriageway_hazards,speed_limit,light_conditions,weather_conditions
0,-0.202878,51.418974,E09000024,1,3,01:24,1,01/01/2023,5,0,...,9,4,2,5,2,0,0,20,4,8
1,-0.042464,51.671155,E09000010,1,3,02:25,1,01/01/2023,6,0,...,3,4,0,1,1,0,0,30,4,1
2,-0.435789,51.487777,E09000017,1,3,03:50,1,01/01/2023,3,437,...,1,4,0,0,1,0,0,30,4,1
3,-0.263972,51.597575,E09000003,1,3,02:13,1,01/01/2023,3,5,...,3,4,0,0,1,0,0,30,4,9
4,-0.168976,51.477324,E09000032,1,3,01:42,1,01/01/2023,3,3220,...,8,4,0,0,1,0,0,30,4,1


In [11]:
data_used.tail()

Unnamed: 0,longitude,latitude,local_authority_ons_district,urban_or_rural_area,accident_severity,time,day_of_week,date,first_road_class,first_road_number,...,junction_detail,junction_control,pedestrian_crossing_human_control,pedestrian_crossing_physical_facilities,road_surface_conditions,special_conditions_at_site,carriageway_hazards,speed_limit,light_conditions,weather_conditions
104253,-4.44749,55.819059,S12000038,1,2,08:43,5,07/12/2023,6,0,...,0,-1,-1,-1,2,-1,-1,30,5,2
104254,-4.7522,55.09792,S12000028,2,3,10:10,6,06/10/2023,6,0,...,9,4,0,0,1,0,0,60,1,9
104255,-2.276957,57.148422,S12000034,1,3,17:00,4,15/11/2023,6,0,...,0,-1,-1,-1,2,-1,-1,30,4,2
104256,-4.047591,55.775637,S12000029,1,3,21:40,3,07/02/2023,6,0,...,9,4,-1,-1,1,-1,-1,20,4,8
104257,-3.163102,55.88835,S12000019,2,3,16:17,6,07/07/2023,3,701,...,1,2,0,0,1,0,0,50,1,1


In [12]:
data_used.isnull().sum()

longitude                                  12
latitude                                   12
local_authority_ons_district                0
urban_or_rural_area                         0
accident_severity                           0
time                                        0
day_of_week                                 0
date                                        0
first_road_class                            0
first_road_number                           0
second_road_class                           0
second_road_number                          0
road_type                                   0
junction_detail                             0
junction_control                            0
pedestrian_crossing_human_control           0
pedestrian_crossing_physical_facilities     0
road_surface_conditions                     0
special_conditions_at_site                  0
carriageway_hazards                         0
speed_limit                                 0
light_conditions                  

In [13]:
(data_used.isnull().sum()/ len(data_used))*100

longitude                                  0.01151
latitude                                   0.01151
local_authority_ons_district               0.00000
urban_or_rural_area                        0.00000
accident_severity                          0.00000
time                                       0.00000
day_of_week                                0.00000
date                                       0.00000
first_road_class                           0.00000
first_road_number                          0.00000
second_road_class                          0.00000
second_road_number                         0.00000
road_type                                  0.00000
junction_detail                            0.00000
junction_control                           0.00000
pedestrian_crossing_human_control          0.00000
pedestrian_crossing_physical_facilities    0.00000
road_surface_conditions                    0.00000
special_conditions_at_site                 0.00000
carriageway_hazards            

**Observation**

% missing is so minimal, so impute with mode value as it is a categorical value column

In [14]:
missing_data_col= ['longitude', 'latitude']

for col in missing_data_col:
    data_used[col] = data_used[col].fillna(data_used[col].mode()[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_used[col] = data_used[col].fillna(data_used[col].mode()[0])


In [15]:
data_used.isnull().sum()

longitude                                  0
latitude                                   0
local_authority_ons_district               0
urban_or_rural_area                        0
accident_severity                          0
time                                       0
day_of_week                                0
date                                       0
first_road_class                           0
first_road_number                          0
second_road_class                          0
second_road_number                         0
road_type                                  0
junction_detail                            0
junction_control                           0
pedestrian_crossing_human_control          0
pedestrian_crossing_physical_facilities    0
road_surface_conditions                    0
special_conditions_at_site                 0
carriageway_hazards                        0
speed_limit                                0
light_conditions                           0
weather_co

**Data Cleaning**

In [16]:
data_used.head()

Unnamed: 0,longitude,latitude,local_authority_ons_district,urban_or_rural_area,accident_severity,time,day_of_week,date,first_road_class,first_road_number,...,junction_detail,junction_control,pedestrian_crossing_human_control,pedestrian_crossing_physical_facilities,road_surface_conditions,special_conditions_at_site,carriageway_hazards,speed_limit,light_conditions,weather_conditions
0,-0.202878,51.418974,E09000024,1,3,01:24,1,01/01/2023,5,0,...,9,4,2,5,2,0,0,20,4,8
1,-0.042464,51.671155,E09000010,1,3,02:25,1,01/01/2023,6,0,...,3,4,0,1,1,0,0,30,4,1
2,-0.435789,51.487777,E09000017,1,3,03:50,1,01/01/2023,3,437,...,1,4,0,0,1,0,0,30,4,1
3,-0.263972,51.597575,E09000003,1,3,02:13,1,01/01/2023,3,5,...,3,4,0,0,1,0,0,30,4,9
4,-0.168976,51.477324,E09000032,1,3,01:42,1,01/01/2023,3,3220,...,8,4,0,0,1,0,0,30,4,1


In [17]:
# Identify numerical columns
numerical_cols = data_used.select_dtypes(include=[np.number]).columns.tolist()
# Remove target from numerical analysis
if 'accident_severity' in numerical_cols:
    numerical_cols.remove('accident_severity')

print(f"Numerical columns: {numerical_cols}")
print(f"Number of numerical features: {len(numerical_cols)}")

Numerical columns: ['longitude', 'latitude', 'urban_or_rural_area', 'day_of_week', 'first_road_class', 'first_road_number', 'second_road_class', 'second_road_number', 'road_type', 'junction_detail', 'junction_control', 'pedestrian_crossing_human_control', 'pedestrian_crossing_physical_facilities', 'road_surface_conditions', 'special_conditions_at_site', 'carriageway_hazards', 'speed_limit', 'light_conditions', 'weather_conditions']
Number of numerical features: 19


In [18]:
# Identify categorical columns (including encoded ones)
categorical_cols = []
for col in data_used.columns:
    if col not in numerical_cols and col != 'accident_severity':
        categorical_cols.append(col)

print(f"Categorical/Encoded columns: {categorical_cols}")
print(f"Number of categorical features: {len(categorical_cols)}")

# Unique values count for each categorical variable
print("\nUnique Values Count in Categorical Features:")
for col in categorical_cols:
    unique_count = data[col].nunique()
    print(f"{col}: {unique_count} unique values")


Categorical/Encoded columns: ['local_authority_ons_district', 'time', 'date']
Number of categorical features: 3

Unique Values Count in Categorical Features:
local_authority_ons_district: 351 unique values
time: 1440 unique values
date: 365 unique values


In [19]:
from scipy.stats import chi2_contingency
for col in categorical_cols:
    table = pd.crosstab(data[col], data['accident_severity'])
    chi2, p, dof, ex = chi2_contingency(table)
    print(f"{col}: p={p}")

local_authority_ons_district: p=0.0
time: p=4.470185787298401e-82
date: p=5.631704306722273e-11


In [20]:
data_used.head()

Unnamed: 0,longitude,latitude,local_authority_ons_district,urban_or_rural_area,accident_severity,time,day_of_week,date,first_road_class,first_road_number,...,junction_detail,junction_control,pedestrian_crossing_human_control,pedestrian_crossing_physical_facilities,road_surface_conditions,special_conditions_at_site,carriageway_hazards,speed_limit,light_conditions,weather_conditions
0,-0.202878,51.418974,E09000024,1,3,01:24,1,01/01/2023,5,0,...,9,4,2,5,2,0,0,20,4,8
1,-0.042464,51.671155,E09000010,1,3,02:25,1,01/01/2023,6,0,...,3,4,0,1,1,0,0,30,4,1
2,-0.435789,51.487777,E09000017,1,3,03:50,1,01/01/2023,3,437,...,1,4,0,0,1,0,0,30,4,1
3,-0.263972,51.597575,E09000003,1,3,02:13,1,01/01/2023,3,5,...,3,4,0,0,1,0,0,30,4,9
4,-0.168976,51.477324,E09000032,1,3,01:42,1,01/01/2023,3,3220,...,8,4,0,0,1,0,0,30,4,1


In [21]:
#Dealing with longitude, latitude columns
from sklearn.cluster import KMeans
import numpy as np

coords = data_used[["longitude", "latitude"]].dropna()
kmeans = KMeans(n_clusters=10, random_state=42)
data_used["location_cluster"] = kmeans.fit_predict(coords)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_used["location_cluster"] = kmeans.fit_predict(coords)


In [22]:
# Extract coordinates again
coords = data_used[["longitude", "latitude"]].dropna()

# Fit KMeans with 10 clusters
kmeans = KMeans(n_clusters=10, random_state=42, n_init=10)
data_used.loc[coords.index, "region_cluster"] = kmeans.fit_predict(coords)

# Create a mapping: cluster -> centroid
cluster_centroids = pd.DataFrame(kmeans.cluster_centers_, columns=["longitude_centroid", "latitude_centroid"])
cluster_centroids["region_cluster"] = cluster_centroids.index

print("Cluster Mapping (region_cluster -> centroid coords):")
print(cluster_centroids)

# Optional: convert to dictionary for easy decoding later
cluster_dict = cluster_centroids.set_index("region_cluster")[["longitude_centroid", "latitude_centroid"]].to_dict("index")
print("\nCluster dictionary:\n", cluster_dict)


Cluster Mapping (region_cluster -> centroid coords):
   longitude_centroid  latitude_centroid  region_cluster
0           -0.158269          51.499878               0
1           -2.699442          53.546116               1
2           -2.685578          51.496981               2
3           -3.731186          56.110013               3
4           -0.359383          53.238561               4
5           -4.234061          50.850118               5
6           -1.613406          52.619560               6
7            0.880361          51.747092               7
8           -1.508546          54.046970               8
9           -1.320280          51.110596               9

Cluster dictionary:
 {0: {'longitude_centroid': -0.15826926218984938, 'latitude_centroid': 51.49987836305786}, 1: {'longitude_centroid': -2.6994416063084103, 'latitude_centroid': 53.54611621063983}, 2: {'longitude_centroid': -2.6855776567018705, 'latitude_centroid': 51.496980539561335}, 3: {'longitude_centroid': -3.73

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_used.loc[coords.index, "region_cluster"] = kmeans.fit_predict(coords)


In [23]:
data_used.tail()

Unnamed: 0,longitude,latitude,local_authority_ons_district,urban_or_rural_area,accident_severity,time,day_of_week,date,first_road_class,first_road_number,...,pedestrian_crossing_human_control,pedestrian_crossing_physical_facilities,road_surface_conditions,special_conditions_at_site,carriageway_hazards,speed_limit,light_conditions,weather_conditions,location_cluster,region_cluster
104253,-4.44749,55.819059,S12000038,1,2,08:43,5,07/12/2023,6,0,...,-1,-1,2,-1,-1,30,5,2,5,3.0
104254,-4.7522,55.09792,S12000028,2,3,10:10,6,06/10/2023,6,0,...,0,0,1,0,0,60,1,9,5,3.0
104255,-2.276957,57.148422,S12000034,1,3,17:00,4,15/11/2023,6,0,...,-1,-1,2,-1,-1,30,4,2,5,3.0
104256,-4.047591,55.775637,S12000029,1,3,21:40,3,07/02/2023,6,0,...,-1,-1,1,-1,-1,20,4,8,5,3.0
104257,-3.163102,55.88835,S12000019,2,3,16:17,6,07/07/2023,3,701,...,0,0,1,0,0,50,1,1,5,3.0


In [24]:
data_used.isnull().sum()

longitude                                  0
latitude                                   0
local_authority_ons_district               0
urban_or_rural_area                        0
accident_severity                          0
time                                       0
day_of_week                                0
date                                       0
first_road_class                           0
first_road_number                          0
second_road_class                          0
second_road_number                         0
road_type                                  0
junction_detail                            0
junction_control                           0
pedestrian_crossing_human_control          0
pedestrian_crossing_physical_facilities    0
road_surface_conditions                    0
special_conditions_at_site                 0
carriageway_hazards                        0
speed_limit                                0
light_conditions                           0
weather_co

In [25]:
counts = data_used["local_authority_ons_district"].value_counts()
rare_districts = counts[counts < 100].index
data_used["ons_district_grouped"] = data_used["local_authority_ons_district"].replace(rare_districts, "Other")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_used["ons_district_grouped"] = data_used["local_authority_ons_district"].replace(rare_districts, "Other")


In [26]:
data["ons_district_grouped"] = data["local_authority_ons_district"].replace(
    rare_districts, "Other")

In [27]:
data_used.tail()

Unnamed: 0,longitude,latitude,local_authority_ons_district,urban_or_rural_area,accident_severity,time,day_of_week,date,first_road_class,first_road_number,...,pedestrian_crossing_physical_facilities,road_surface_conditions,special_conditions_at_site,carriageway_hazards,speed_limit,light_conditions,weather_conditions,location_cluster,region_cluster,ons_district_grouped
104253,-4.44749,55.819059,S12000038,1,2,08:43,5,07/12/2023,6,0,...,-1,2,-1,-1,30,5,2,5,3.0,S12000038
104254,-4.7522,55.09792,S12000028,2,3,10:10,6,06/10/2023,6,0,...,0,1,0,0,60,1,9,5,3.0,Other
104255,-2.276957,57.148422,S12000034,1,3,17:00,4,15/11/2023,6,0,...,-1,2,-1,-1,30,4,2,5,3.0,S12000034
104256,-4.047591,55.775637,S12000029,1,3,21:40,3,07/02/2023,6,0,...,-1,1,-1,-1,20,4,8,5,3.0,S12000029
104257,-3.163102,55.88835,S12000019,2,3,16:17,6,07/07/2023,3,701,...,0,1,0,0,50,1,1,5,3.0,Other


In [28]:
# Identify numerical columns
numerical_cols = data_used.select_dtypes(include=[np.number]).columns.tolist()
# Remove target from numerical analysis
if 'accident_severity' in numerical_cols:
    numerical_cols.remove('accident_severity')

print(f"Numerical columns: {numerical_cols}")
print(f"Number of numerical features: {len(numerical_cols)}")

# Identify categorical columns (including encoded ones)
categorical_cols = []
for col in data_used.columns:
    if col not in numerical_cols and col != 'accident_severity':
        categorical_cols.append(col)

print(f"Categorical/Encoded columns: {categorical_cols}")
print(f"Number of categorical features: {len(categorical_cols)}")

Numerical columns: ['longitude', 'latitude', 'urban_or_rural_area', 'day_of_week', 'first_road_class', 'first_road_number', 'second_road_class', 'second_road_number', 'road_type', 'junction_detail', 'junction_control', 'pedestrian_crossing_human_control', 'pedestrian_crossing_physical_facilities', 'road_surface_conditions', 'special_conditions_at_site', 'carriageway_hazards', 'speed_limit', 'light_conditions', 'weather_conditions', 'location_cluster', 'region_cluster']
Number of numerical features: 21
Categorical/Encoded columns: ['local_authority_ons_district', 'time', 'date', 'ons_district_grouped']
Number of categorical features: 4


In [29]:

# Unique values count for each categorical variable
print("Unique Values Count in Categorical Features:")
for col in categorical_cols:
    unique_count = data[col].nunique()
    print(f"{col}: {unique_count} unique values")

Unique Values Count in Categorical Features:
local_authority_ons_district: 351 unique values
time: 1440 unique values
date: 365 unique values
ons_district_grouped: 311 unique values


In [30]:
data_used = data_used.drop('local_authority_ons_district', axis=1)

In [31]:
data_used.head()

Unnamed: 0,longitude,latitude,urban_or_rural_area,accident_severity,time,day_of_week,date,first_road_class,first_road_number,second_road_class,...,pedestrian_crossing_physical_facilities,road_surface_conditions,special_conditions_at_site,carriageway_hazards,speed_limit,light_conditions,weather_conditions,location_cluster,region_cluster,ons_district_grouped
0,-0.202878,51.418974,1,3,01:24,1,01/01/2023,5,0,5,...,5,2,0,0,20,4,8,1,0.0,E09000024
1,-0.042464,51.671155,1,3,02:25,1,01/01/2023,6,0,6,...,1,1,0,0,30,4,1,1,0.0,E09000010
2,-0.435789,51.487777,1,3,03:50,1,01/01/2023,3,437,6,...,0,1,0,0,30,4,1,1,0.0,E09000017
3,-0.263972,51.597575,1,3,02:13,1,01/01/2023,3,5,6,...,0,1,0,0,30,4,9,1,0.0,E09000003
4,-0.168976,51.477324,1,3,01:42,1,01/01/2023,3,3220,6,...,0,1,0,0,30,4,1,1,0.0,E09000032


** **pin on longitude/latitude col**

In [32]:
#urban/rural col

# Count unique values in one column
print(data_used["urban_or_rural_area"].value_counts())

# If you also want percentages
print(data_used["urban_or_rural_area"].value_counts(normalize=True) * 100)


urban_or_rural_area
 1    70312
 2    33934
-1        8
 3        4
Name: count, dtype: int64
urban_or_rural_area
 1    67.440388
 2    32.548102
-1     0.007673
 3     0.003837
Name: proportion, dtype: float64


In [33]:
# Drop rows where urban_or_rural_area is -1 or 3
data = data[~data["urban_or_rural_area"].isin([-1, 3])]

# Check again
print(data["urban_or_rural_area"].value_counts(normalize=True) * 100)


urban_or_rural_area
1    67.448151
2    32.551849
Name: proportion, dtype: float64


In [34]:
# Keep only rows where urban_or_rural_area is 1 or 2
data = data[data["urban_or_rural_area"].isin([1, 2])]

# Verify cleanup
print(data["urban_or_rural_area"].value_counts(normalize=True) * 100)


urban_or_rural_area
1    67.448151
2    32.551849
Name: proportion, dtype: float64


In [35]:
data_used.head()

Unnamed: 0,longitude,latitude,urban_or_rural_area,accident_severity,time,day_of_week,date,first_road_class,first_road_number,second_road_class,...,pedestrian_crossing_physical_facilities,road_surface_conditions,special_conditions_at_site,carriageway_hazards,speed_limit,light_conditions,weather_conditions,location_cluster,region_cluster,ons_district_grouped
0,-0.202878,51.418974,1,3,01:24,1,01/01/2023,5,0,5,...,5,2,0,0,20,4,8,1,0.0,E09000024
1,-0.042464,51.671155,1,3,02:25,1,01/01/2023,6,0,6,...,1,1,0,0,30,4,1,1,0.0,E09000010
2,-0.435789,51.487777,1,3,03:50,1,01/01/2023,3,437,6,...,0,1,0,0,30,4,1,1,0.0,E09000017
3,-0.263972,51.597575,1,3,02:13,1,01/01/2023,3,5,6,...,0,1,0,0,30,4,9,1,0.0,E09000003
4,-0.168976,51.477324,1,3,01:42,1,01/01/2023,3,3220,6,...,0,1,0,0,30,4,1,1,0.0,E09000032


In [36]:
# Robust Time Conversion
print("Converting time column from 'HH:MM' format...")

# Convert the column to datetime, specifying the exact format
data_used['accident_time'] = pd.to_datetime(data_used['time'], format='%H:%M', errors='coerce')

# Time Engineering
print("Creating time-based features...")
# Extract components
data_used['hour'] = data_used['accident_time'].dt.hour
data_used['minute'] = data_used['accident_time'].dt.minute

# Create cyclical time features (CRITICAL for models to understand 23:00 is close to 00:00)
data_used['hour_sin'] = np.sin(2 * np.pi * data_used['hour'] / 24)
data_used['hour_cos'] = np.cos(2 * np.pi * data_used['hour'] / 24)

# Create interpretable binary features
data_used['is_rush_hour'] = ((data_used['hour'] >= 7) & (data_used['hour'] <= 9)) | ((data_used['hour'] >= 16) & (data_used['hour'] <= 18))
data_used['is_night'] = (data_used['hour'] >= 19) | (data_used['hour'] <= 6)  # Simple definition of night

# May not need the original 'time' and 'accident_time' column for modeling and consider dropping.
data_used = data_used.drop(columns=['time', 'accident_time'])

# Verify the new features
print(data_used[['hour', 'hour_sin', 'hour_cos', 'is_rush_hour', 'is_night']].tail())


Converting time column from 'HH:MM' format...
Creating time-based features...
        hour  hour_sin  hour_cos  is_rush_hour  is_night
104253     8  0.866025 -0.500000          True     False
104254    10  0.500000 -0.866025         False     False
104255    17 -0.965926 -0.258819          True     False
104256    21 -0.707107  0.707107         False      True
104257    16 -0.866025 -0.500000          True     False


In [37]:
data_used.head()

Unnamed: 0,longitude,latitude,urban_or_rural_area,accident_severity,day_of_week,date,first_road_class,first_road_number,second_road_class,second_road_number,...,weather_conditions,location_cluster,region_cluster,ons_district_grouped,hour,minute,hour_sin,hour_cos,is_rush_hour,is_night
0,-0.202878,51.418974,1,3,1,01/01/2023,5,0,5,0,...,8,1,0.0,E09000024,1,24,0.258819,0.965926,False,True
1,-0.042464,51.671155,1,3,1,01/01/2023,6,0,6,0,...,1,1,0.0,E09000010,2,25,0.5,0.866025,False,True
2,-0.435789,51.487777,1,3,1,01/01/2023,3,437,6,0,...,1,1,0.0,E09000017,3,50,0.707107,0.707107,False,True
3,-0.263972,51.597575,1,3,1,01/01/2023,3,5,6,0,...,9,1,0.0,E09000003,2,13,0.5,0.866025,False,True
4,-0.168976,51.477324,1,3,1,01/01/2023,3,3220,6,0,...,1,1,0.0,E09000032,1,42,0.258819,0.965926,False,True


In [38]:
drop_cols=['hour','minute','hour_sin','hour_cos']
data_used= data_used.drop(drop_cols, axis=1)
data_used.head()

Unnamed: 0,longitude,latitude,urban_or_rural_area,accident_severity,day_of_week,date,first_road_class,first_road_number,second_road_class,second_road_number,...,special_conditions_at_site,carriageway_hazards,speed_limit,light_conditions,weather_conditions,location_cluster,region_cluster,ons_district_grouped,is_rush_hour,is_night
0,-0.202878,51.418974,1,3,1,01/01/2023,5,0,5,0,...,0,0,20,4,8,1,0.0,E09000024,False,True
1,-0.042464,51.671155,1,3,1,01/01/2023,6,0,6,0,...,0,0,30,4,1,1,0.0,E09000010,False,True
2,-0.435789,51.487777,1,3,1,01/01/2023,3,437,6,0,...,0,0,30,4,1,1,0.0,E09000017,False,True
3,-0.263972,51.597575,1,3,1,01/01/2023,3,5,6,0,...,0,0,30,4,9,1,0.0,E09000003,False,True
4,-0.168976,51.477324,1,3,1,01/01/2023,3,3220,6,0,...,0,0,30,4,1,1,0.0,E09000032,False,True


In [39]:
# Identify numerical columns
numerical_cols = data_used.select_dtypes(include=[np.number]).columns.tolist()
# Remove target from numerical analysis
if 'accident_severity' in numerical_cols:
    numerical_cols.remove('accident_severity')

print(f"Numerical columns: {numerical_cols}")
print(f"Number of numerical features: {len(numerical_cols)}")


Numerical columns: ['longitude', 'latitude', 'urban_or_rural_area', 'day_of_week', 'first_road_class', 'first_road_number', 'second_road_class', 'second_road_number', 'road_type', 'junction_detail', 'junction_control', 'pedestrian_crossing_human_control', 'pedestrian_crossing_physical_facilities', 'road_surface_conditions', 'special_conditions_at_site', 'carriageway_hazards', 'speed_limit', 'light_conditions', 'weather_conditions', 'location_cluster', 'region_cluster']
Number of numerical features: 21


In [40]:
# Unique values count for each numerical variable
print("Unique Values Count in Numerical Features:")
for col in numerical_cols:
    unique_count = data_used[col].nunique()
    print(f"{col}: {unique_count} unique values")


Unique Values Count in Numerical Features:
longitude: 102157 unique values
latitude: 100922 unique values
urban_or_rural_area: 4 unique values
day_of_week: 7 unique values
first_road_class: 6 unique values
first_road_number: 3082 unique values
second_road_class: 8 unique values
second_road_number: 2360 unique values
road_type: 6 unique values
junction_detail: 11 unique values
junction_control: 6 unique values
pedestrian_crossing_human_control: 5 unique values
pedestrian_crossing_physical_facilities: 8 unique values
road_surface_conditions: 7 unique values
special_conditions_at_site: 10 unique values
carriageway_hazards: 8 unique values
speed_limit: 6 unique values
light_conditions: 5 unique values
weather_conditions: 9 unique values
location_cluster: 10 unique values
region_cluster: 10 unique values


In [41]:
data_used.head()

Unnamed: 0,longitude,latitude,urban_or_rural_area,accident_severity,day_of_week,date,first_road_class,first_road_number,second_road_class,second_road_number,...,special_conditions_at_site,carriageway_hazards,speed_limit,light_conditions,weather_conditions,location_cluster,region_cluster,ons_district_grouped,is_rush_hour,is_night
0,-0.202878,51.418974,1,3,1,01/01/2023,5,0,5,0,...,0,0,20,4,8,1,0.0,E09000024,False,True
1,-0.042464,51.671155,1,3,1,01/01/2023,6,0,6,0,...,0,0,30,4,1,1,0.0,E09000010,False,True
2,-0.435789,51.487777,1,3,1,01/01/2023,3,437,6,0,...,0,0,30,4,1,1,0.0,E09000017,False,True
3,-0.263972,51.597575,1,3,1,01/01/2023,3,5,6,0,...,0,0,30,4,9,1,0.0,E09000003,False,True
4,-0.168976,51.477324,1,3,1,01/01/2023,3,3220,6,0,...,0,0,30,4,1,1,0.0,E09000032,False,True


In [42]:
#date col

# Handle date
data_used["date"] = pd.to_datetime(data["date"], format="%d/%m/%Y", errors="coerce")
#data_used["date"] = pd.to_datetime(data_used["date"], errors="coerce")
data_used["month"] = data_used["date"].dt.month
# data_used["day"] = data_used["date"].dt.day   # optional if you want day granularity
data_used["weekday"] = data_used["date"].dt.weekday
data_used["is_weekend"] = data_used["weekday"].apply(lambda x: 1 if x >= 5 else 0)

# Drop unused raw columns
data_used = data_used.drop(columns=["date",])


In [43]:
data_used.head()

Unnamed: 0,longitude,latitude,urban_or_rural_area,accident_severity,day_of_week,first_road_class,first_road_number,second_road_class,second_road_number,road_type,...,light_conditions,weather_conditions,location_cluster,region_cluster,ons_district_grouped,is_rush_hour,is_night,month,weekday,is_weekend
0,-0.202878,51.418974,1,3,1,5,0,5,0,2,...,4,8,1,0.0,E09000024,False,True,1.0,6.0,1
1,-0.042464,51.671155,1,3,1,6,0,6,0,6,...,4,1,1,0.0,E09000010,False,True,1.0,6.0,1
2,-0.435789,51.487777,1,3,1,3,437,6,0,1,...,4,1,1,0.0,E09000017,False,True,1.0,6.0,1
3,-0.263972,51.597575,1,3,1,3,5,6,0,6,...,4,9,1,0.0,E09000003,False,True,1.0,6.0,1
4,-0.168976,51.477324,1,3,1,3,3220,6,0,6,...,4,1,1,0.0,E09000032,False,True,1.0,6.0,1


In [44]:
#first road class/second

# Unique value counts for the road features
print("First Road Class:")
print(data_used["first_road_class"].value_counts(), "\n")

print("Second Road Class:")
print(data_used["second_road_class"].value_counts(dropna=False), "\n")


First Road Class:
first_road_class
3    45872
6    37315
4    13117
5     4596
1     3121
2      237
Name: count, dtype: int64 

Second Road Class:
second_road_class
 6    43596
 0    42962
 3    10277
 4     4005
 5     2978
 1      312
-1       84
 2       44
Name: count, dtype: int64 



In [45]:
# Drop rows where second_road_class is -1
data_used = data_used[data_used["second_road_class"] != -1]

# # Reset index
# data_used = data_used.reset_index(drop=True)

print("Remaining unique values in second_road_class:")
print(data_used["second_road_class"].value_counts())


Remaining unique values in second_road_class:
second_road_class
6    43596
0    42962
3    10277
4     4005
5     2978
1      312
2       44
Name: count, dtype: int64


In [46]:
print("First Road Number:")
print(data_used["first_road_number"].nunique(), "unique values\n")

print("Second Road Number:")
print(data_used["second_road_number"].nunique(), "unique values\n")


First Road Number:
3081 unique values

Second Road Number:
2360 unique values



In [47]:
# Drop high-cardinality road number columns
data_used = data_used.drop(columns=["first_road_number", "second_road_number"])

(data_used.columns)


Index(['longitude', 'latitude', 'urban_or_rural_area', 'accident_severity',
       'day_of_week', 'first_road_class', 'second_road_class', 'road_type',
       'junction_detail', 'junction_control',
       'pedestrian_crossing_human_control',
       'pedestrian_crossing_physical_facilities', 'road_surface_conditions',
       'special_conditions_at_site', 'carriageway_hazards', 'speed_limit',
       'light_conditions', 'weather_conditions', 'location_cluster',
       'region_cluster', 'ons_district_grouped', 'is_rush_hour', 'is_night',
       'month', 'weekday', 'is_weekend'],
      dtype='object')

In [48]:
data_used.head()

Unnamed: 0,longitude,latitude,urban_or_rural_area,accident_severity,day_of_week,first_road_class,second_road_class,road_type,junction_detail,junction_control,...,light_conditions,weather_conditions,location_cluster,region_cluster,ons_district_grouped,is_rush_hour,is_night,month,weekday,is_weekend
0,-0.202878,51.418974,1,3,1,5,5,2,9,4,...,4,8,1,0.0,E09000024,False,True,1.0,6.0,1
1,-0.042464,51.671155,1,3,1,6,6,6,3,4,...,4,1,1,0.0,E09000010,False,True,1.0,6.0,1
2,-0.435789,51.487777,1,3,1,3,6,1,1,4,...,4,1,1,0.0,E09000017,False,True,1.0,6.0,1
3,-0.263972,51.597575,1,3,1,3,6,6,3,4,...,4,9,1,0.0,E09000003,False,True,1.0,6.0,1
4,-0.168976,51.477324,1,3,1,3,6,6,8,4,...,4,1,1,0.0,E09000032,False,True,1.0,6.0,1


In [49]:
#junction cols

print(data_used["junction_detail"].value_counts(), "\n")

print(data_used["junction_control"].value_counts(),"")

junction_detail
 0     42978
 3     29013
 6      9497
 1      7537
 9      6167
 99     2393
 8      2253
 7      1629
 2      1579
 5      1127
-1         1
Name: count, dtype: int64 

junction_control
 4    45154
-1    43790
 2    11629
 9     2220
 3      763
 1      618
Name: count, dtype: int64 


In [50]:
data_used['junction_detail'] = data_used['junction_detail'].replace(-1, data_used['junction_detail'].mode()[0])
data_used['junction_control'] = data_used['junction_control'].replace(-1, data_used['junction_control'].mode()[0])

In [51]:
#check again
print(data_used["junction_detail"].value_counts(), "\n")

print(data_used["junction_control"].value_counts(),"")

junction_detail
0     42979
3     29013
6      9497
1      7537
9      6167
99     2393
8      2253
7      1629
2      1579
5      1127
Name: count, dtype: int64 

junction_control
4    88944
2    11629
9     2220
3      763
1      618
Name: count, dtype: int64 


** **NOTE: 99 is unknown**

In [52]:
data_used.head()

Unnamed: 0,longitude,latitude,urban_or_rural_area,accident_severity,day_of_week,first_road_class,second_road_class,road_type,junction_detail,junction_control,...,light_conditions,weather_conditions,location_cluster,region_cluster,ons_district_grouped,is_rush_hour,is_night,month,weekday,is_weekend
0,-0.202878,51.418974,1,3,1,5,5,2,9,4,...,4,8,1,0.0,E09000024,False,True,1.0,6.0,1
1,-0.042464,51.671155,1,3,1,6,6,6,3,4,...,4,1,1,0.0,E09000010,False,True,1.0,6.0,1
2,-0.435789,51.487777,1,3,1,3,6,1,1,4,...,4,1,1,0.0,E09000017,False,True,1.0,6.0,1
3,-0.263972,51.597575,1,3,1,3,6,6,3,4,...,4,9,1,0.0,E09000003,False,True,1.0,6.0,1
4,-0.168976,51.477324,1,3,1,3,6,6,8,4,...,4,1,1,0.0,E09000032,False,True,1.0,6.0,1


In [53]:
data_used.columns

Index(['longitude', 'latitude', 'urban_or_rural_area', 'accident_severity',
       'day_of_week', 'first_road_class', 'second_road_class', 'road_type',
       'junction_detail', 'junction_control',
       'pedestrian_crossing_human_control',
       'pedestrian_crossing_physical_facilities', 'road_surface_conditions',
       'special_conditions_at_site', 'carriageway_hazards', 'speed_limit',
       'light_conditions', 'weather_conditions', 'location_cluster',
       'region_cluster', 'ons_district_grouped', 'is_rush_hour', 'is_night',
       'month', 'weekday', 'is_weekend'],
      dtype='object')

In [54]:
#Pedestrian cols

print(data_used["pedestrian_crossing_human_control"].value_counts(), "\n")

print(data_used["pedestrian_crossing_physical_facilities"].value_counts(),"")

pedestrian_crossing_human_control
 0    93781
 9     5876
-1     2788
 2     1294
 1      435
Name: count, dtype: int64 

pedestrian_crossing_physical_facilities
 0    74625
 5     8131
 4     6120
 9     5110
 1     4489
-1     2782
 8     2659
 7      258
Name: count, dtype: int64 


In [55]:
#replacing -1
data_used['pedestrian_crossing_human_control'] = data_used['pedestrian_crossing_human_control'].replace(-1, -99)
data_used['pedestrian_crossing_physical_facilities'] = data_used['pedestrian_crossing_physical_facilities'].replace(-1, -99)

In [56]:
#Pedestrian cols check again
print(data_used["pedestrian_crossing_human_control"].value_counts(), "\n")
print(data_used["pedestrian_crossing_physical_facilities"].value_counts(),"")

pedestrian_crossing_human_control
 0     93781
 9      5876
-99     2788
 2      1294
 1       435
Name: count, dtype: int64 

pedestrian_crossing_physical_facilities
 0     74625
 5      8131
 4      6120
 9      5110
 1      4489
-99     2782
 8      2659
 7       258
Name: count, dtype: int64 


In [57]:
data_used.head()

Unnamed: 0,longitude,latitude,urban_or_rural_area,accident_severity,day_of_week,first_road_class,second_road_class,road_type,junction_detail,junction_control,...,light_conditions,weather_conditions,location_cluster,region_cluster,ons_district_grouped,is_rush_hour,is_night,month,weekday,is_weekend
0,-0.202878,51.418974,1,3,1,5,5,2,9,4,...,4,8,1,0.0,E09000024,False,True,1.0,6.0,1
1,-0.042464,51.671155,1,3,1,6,6,6,3,4,...,4,1,1,0.0,E09000010,False,True,1.0,6.0,1
2,-0.435789,51.487777,1,3,1,3,6,1,1,4,...,4,1,1,0.0,E09000017,False,True,1.0,6.0,1
3,-0.263972,51.597575,1,3,1,3,6,6,3,4,...,4,9,1,0.0,E09000003,False,True,1.0,6.0,1
4,-0.168976,51.477324,1,3,1,3,6,6,8,4,...,4,1,1,0.0,E09000032,False,True,1.0,6.0,1


In [58]:
#light conditions col

print(data_used["light_conditions"].value_counts(),"")

light_conditions
1    74186
4    21978
6     5625
7     1634
5      751
Name: count, dtype: int64 


In [59]:
#weather conditions col
print(data_used["weather_conditions"].value_counts(),"")

weather_conditions
1    81962
2    12830
9     3332
8     3081
5     1224
4      975
3      395
7      333
6       42
Name: count, dtype: int64 


In [60]:
#light conditions col
print(data_used["road_surface_conditions"].value_counts(),"")

road_surface_conditions
 1    72700
 2    26914
 9     1615
 4     1461
-1     1064
 3      241
 5      179
Name: count, dtype: int64 


In [61]:
data_used['road_surface_conditions'] = data_used['road_surface_conditions'].replace(-1, -9)


In [62]:
data_used.isnull().sum()

longitude                                   0
latitude                                    0
urban_or_rural_area                         0
accident_severity                           0
day_of_week                                 0
first_road_class                            0
second_road_class                           0
road_type                                   0
junction_detail                             0
junction_control                            0
pedestrian_crossing_human_control           0
pedestrian_crossing_physical_facilities     0
road_surface_conditions                     0
special_conditions_at_site                  0
carriageway_hazards                         0
speed_limit                                 0
light_conditions                            0
weather_conditions                          0
location_cluster                            0
region_cluster                              0
ons_district_grouped                        0
is_rush_hour                      

In [63]:
data["date"] = pd.to_datetime(data["date"], errors="coerce")
print(data["date"].isna().sum())


63082


In [64]:
data.isnull().sum()

accident_index                                     0
accident_year                                      0
accident_reference                                 0
location_easting_osgr                              0
location_northing_osgr                             0
longitude                                          0
latitude                                           0
police_force                                       0
accident_severity                                  0
number_of_vehicles                                 0
number_of_casualties                               0
date                                           63082
day_of_week                                        0
time                                               0
local_authority_district                           0
local_authority_ons_district                       0
local_authority_highway                            0
first_road_class                                   0
first_road_number                             

In [65]:
#month
print(data_used["month"].value_counts(),"")

month
6.0     9736
7.0     9159
11.0    9149
10.0    9133
5.0     9069
9.0     9008
1.0     8746
8.0     8414
12.0    8269
3.0     8179
4.0     7744
2.0     7556
Name: count, dtype: int64 


In [66]:
#weekday
print(data_used["weekday"].value_counts(),"")

weekday
4.0    17097
3.0    15781
2.0    15738
1.0    15419
0.0    14278
5.0    14161
6.0    11688
Name: count, dtype: int64 


In [67]:
for col in data_used.columns:
    data_used[col] = data_used[col].fillna(data_used[col].mode()[0])

In [68]:
data_used.isnull().sum()

longitude                                  0
latitude                                   0
urban_or_rural_area                        0
accident_severity                          0
day_of_week                                0
first_road_class                           0
second_road_class                          0
road_type                                  0
junction_detail                            0
junction_control                           0
pedestrian_crossing_human_control          0
pedestrian_crossing_physical_facilities    0
road_surface_conditions                    0
special_conditions_at_site                 0
carriageway_hazards                        0
speed_limit                                0
light_conditions                           0
weather_conditions                         0
location_cluster                           0
region_cluster                             0
ons_district_grouped                       0
is_rush_hour                               0
is_night  

In [69]:
data.shape

(104246, 38)

In [70]:
data_used.shape

(104174, 26)

In [71]:
#save to new csv
data_used.to_csv('roadsafety_cleaned.csv', index=False)