## Pre-processing for machine learning

For the machine learning task, we will only use the collisions dataset, and only features that might be known before an accident happens. Morover, we will only use the data from years 2000 - 2022 for this task. Thus, the following steps will be taken in this notebook:
1. Read in the data
2. Filter the data based on years
3. Remove columns
4. Rename categorical values
5. Binning (speed limits and time)
6. Remove unknowns
7. Split the data into train, validation, and test sets
8. One-hot encoding
9. Balance the train data
10. Save the train, validation, and test sets as .csv files

#### 1. Read in the data

In [1]:
# Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from joblib import dump
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [None]:
acc_data_ML = pd.read_csv('../00_data/UK_road_casualty_collision_1979_2022.csv') 

In [3]:
acc_data_ML.shape

(8809915, 36)

#### 2. Filter the data based on years

In [4]:
# Remove all records prior to year 2000
acc_data_ML = acc_data_ML[acc_data_ML['accident_year'] >= 2000]

In [5]:
acc_data_ML.shape

(3691651, 36)

#### 3. Remove columns

In [6]:
acc_data_ML.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3691651 entries, 5118264 to 8809914
Data columns (total 36 columns):
 #   Column                                       Dtype  
---  ------                                       -----  
 0   accident_index                               object 
 1   accident_year                                int64  
 2   accident_reference                           object 
 3   location_easting_osgr                        float64
 4   location_northing_osgr                       float64
 5   longitude                                    float64
 6   latitude                                     float64
 7   police_force                                 int64  
 8   accident_severity                            int64  
 9   number_of_vehicles                           int64  
 10  number_of_casualties                         int64  
 11  date                                         object 
 12  day_of_week                                  int64  
 13  time  

In [7]:
# Remove the columns with (for ML) non-relevant information and columns with information that's unknown before an 
# accident occurs
cols_remove = ['accident_index', 'accident_year', 'accident_reference', 'location_easting_osgr', 'location_northing_osgr', 
               'longitude', 'latitude', 'police_force', 'number_of_vehicles', 'number_of_casualties', 'date',
               'local_authority_district', 'local_authority_ons_district', 'local_authority_highway', 'first_road_number', 
               'second_road_number', 'pedestrian_crossing_human_control', 'pedestrian_crossing_physical_facilities',
               'did_police_officer_attend_scene_of_accident', 'trunk_road_flag', 'lsoa_of_accident_location']


acc_data_ML = acc_data_ML.drop(cols_remove, axis = 1)

# Replace nan values with -1
acc_data_ML.fillna(-1, inplace = True)

In [8]:
acc_data_ML.shape

(3691651, 15)

In [9]:
# Check all unimportant features were removed
acc_data_ML.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3691651 entries, 5118264 to 8809914
Data columns (total 15 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   accident_severity           int64  
 1   day_of_week                 int64  
 2   time                        object 
 3   first_road_class            int64  
 4   road_type                   int64  
 5   speed_limit                 float64
 6   junction_detail             int64  
 7   junction_control            int64  
 8   second_road_class           int64  
 9   light_conditions            int64  
 10  weather_conditions          int64  
 11  road_surface_conditions     int64  
 12  special_conditions_at_site  int64  
 13  carriageway_hazards         int64  
 14  urban_or_rural_area         int64  
dtypes: float64(1), int64(13), object(1)
memory usage: 450.6+ MB


#### 4. Rename categorical values

In [10]:
# Dictionary for renaming the features

# Only interested in predicting fatal or non-fatal accidents, thus grouping severity 2 and 3 (severe and slight) into one
# group (0) and fatal as 1
accident_severity_dict = { 
    1: 1, 2: 0, 3: 0
    }
day_of_week_dict = {
    1: 'sunday', 2: 'monday', 3: 'tuesday', 4: 'wednesday', 5: 'thursday', 6: 'friday', 7: 'saturday'
    }
first_road_class_dict = {
    1: 'motorway', 2: 'motorway', 3: 'A', 4: 'B', 5: 'C', 6: 'unknown', -1: 'unknown'
    }
road_type_dict = {
    1: 'roundabout', 2: 'one_way_or_slip_road', 3: 'dual_carriageway', 6: 'single_carriageway', 7: 'one_way_or_slip_road', 
    9: 'unknown', 12: 'one_way_or_slip_road', -1: 'unknown'
    }
junction_detail_dict = {
    0: 'not_a_junction', 1: 'roundabout', 2: 'roundabout', 3: 'tor_staggered_junction', 5: 'slip_road', 6: 'crossroads', 
    7: 'more_than_4_arms', 8: 'private_drive_or_entrance', 9: 'other_junction', 99: 'unknown', -1: 'unknown'
    }
junction_control_dict = {
    0: 'not_a_junction', 1: 'authorised_person', 2: 'auto_traffic_signal', 3: 'stop_sign', 4: 'give_way_or_uncontrolled', 
    -1: 'unknown', 9: 'unknown'
    }
second_road_class_dict = {
    0: 'not_a_junction', 1: 'motorway', 2: 'motorway', 3: 'A', 4: 'B', 5: 'C', 6: 'unknown', 9: 'unknown', -1: 'unknown'
    }
light_conditions_dict = {
    1: 'daylight', 4: 'dark', 5: 'dark', 6: 'dark', 7: 'dark', -1: 'unknown'
    }
weather_conditions_dict = {
    1: 'fine_no_high_winds', 2: 'raining_no_high_winds', 3: 'snowing_no_high_winds', 4: 'fine_high_winds', 
    5: 'raining_high_winds', 6: 'snowing_high_winds', 7: 'fog_or_mist', 8: 'other_weather', 9: 'unknown', -1: 'unknown'
    }
road_surface_conditions_dict = {
    1: 'dry', 2: 'wet_or_damp', 3: 'snow', 4: 'frost_or_ice', 5: 'flood_over_3cm', 6: 'oil_or_diesel_road', 7: 'mud_road', 
    -1: 'unknown', 9: 'unknown'
    }
special_conditions_at_site_dict = {
    0: 'no_special_conditions_at_site', 1: 'auto_traffic_signal_out', 2: 'auto_signal_part_defective', 
    3: 'road_sign_or_marking_defective_or_obscured', 4: 'roadworks', 5: 'road_surface_defective', 6: 'oil_or_diesel_site', 
    7: 'mud_site', -1: 'unknown', 9: 'unknown'
    }
carriageway_hazards_dict = {
    0: 'none', 1: 'vehicle_load_on_road', 2: 'other_object_on_road', 3: 'previous_accident', 4: 'dog_on_road', 
    5: 'other_animal_on_road', 6: 'pedestrian_in_carriageway', 7: 'any_animal_in_carriageway_except_ridden_horse', 
    -1: 'unknown', 9: 'unknown'
    }
urban_or_rural_area_dict = {
    1: 'urban', 2: 'rural', 3: 'unknown', 4: 'unknown'
    }

# Dictionary of dictionaries for renaming
rename_dict = {
    'accident_severity': accident_severity_dict,
    'day_of_week': day_of_week_dict,
    'first_road_class': first_road_class_dict,
    'road_type': road_type_dict,
    'junction_detail' : junction_detail_dict,
    'junction_control' : junction_control_dict,
    'second_road_class' : second_road_class_dict,
    'light_conditions' : light_conditions_dict,
    'weather_conditions' : weather_conditions_dict,
    'road_surface_conditions' : road_surface_conditions_dict,
    'special_conditions_at_site' : special_conditions_at_site_dict,
    'carriageway_hazards' : carriageway_hazards_dict,
    'urban_or_rural_area' : urban_or_rural_area_dict
}

# Loop through the dictionary to rename values
for column, mapping_dict in rename_dict.items():
    if column in acc_data_ML.columns: 
        acc_data_ML[column] = acc_data_ML[column].replace(mapping_dict)
        if column != 'accident_severity':
            acc_data_ML[column] = acc_data_ML[column].astype('category')


In [11]:
acc_data_ML.shape

(3691651, 15)

#### 5. Binning 

##### Speed limits

In [12]:
print(np.sort(acc_data_ML['speed_limit'].unique()))

[-1.  0.  5. 10. 15. 20. 25. 30. 40. 50. 60. 70.]


In [13]:
# Bin the speed limits into 'unknown', 'under_20', '20-29', '30-39', '40-49', '50-59', '60-69', '70_or_more'
bin_edges = [-np.inf, -1, 19, 29, 39, 49, 59, 69, np.inf]
bin_labels = ['unknown', 'under_20', '20-29', '30-39', '40-49', '50-59', '60-69', '70_or_more']
acc_data_ML['speed_limit_bins'] = pd.cut(acc_data_ML['speed_limit'], bins = bin_edges, labels = bin_labels, include_lowest = True)

In [14]:
# Check that binning worked as intended
print(acc_data_ML[['speed_limit', 'speed_limit_bins']].head())

         speed_limit speed_limit_bins
5118264         70.0       70_or_more
5118265         70.0       70_or_more
5118266         60.0            60-69
5118267         70.0       70_or_more
5118268         70.0       70_or_more


##### Time

In [15]:
print(np.sort(acc_data_ML['time'].unique()))

['00:00' '00:01' '00:02' ... '23:57' '23:58' '23:59']


In [16]:
# Convert 'time' column to datetime to extract the hour
acc_data_ML['hour'] = pd.to_datetime(acc_data_ML['time'], format='%H:%M').dt.hour

# Define bin edges and labels
# 00.00 - 05.59: 'night', 06.00 - 09.59: 'morning_rush', 10.00 - 15.59: 'day', 16.00 - 19.59: 'evening_rush',
# 20.00 - 23.59: 'late_evening'
bin_edges = [0, 5, 9, 15, 19, 24]
bin_labels = ['night', 'morning_rush', 'day', 'evening_rush', 'late_evening']

# Bin the time data
acc_data_ML['time_of_day'] = pd.cut(acc_data_ML['hour'], 
                                    bins = bin_edges, labels = bin_labels, include_lowest = True, right = False)


In [17]:
# Check that binning worked as intended
print(acc_data_ML[['time', 'hour', 'time_of_day']].tail(10))

          time  hour   time_of_day
8809905  00:10     0         night
8809906  11:41    11           day
8809907  09:25     9           day
8809908  01:40     1         night
8809909  15:00    15  evening_rush
8809910  15:00    15  evening_rush
8809911  21:35    21  late_evening
8809912  11:44    11           day
8809913  16:45    16  evening_rush
8809914  19:05    19  late_evening


In [18]:
# Remove original speed limit, time and hour columns from the dataset
cols_remove = ['speed_limit', 'time', 'hour']
acc_data_ML = acc_data_ML.drop(cols_remove, axis = 1)

In [19]:
acc_data_ML.shape

(3691651, 15)

#### 6. Remove unknowns

In [20]:
# Remove rows where any column has 'unknown' as value
acc_data_ML_filtered = acc_data_ML.loc[~(acc_data_ML.isin(['unknown']).any(axis=1))]

In [21]:
# Check how many rows were affected
percentage_removed = (len(acc_data_ML) - len(acc_data_ML_filtered)) / len(acc_data_ML) * 100

print(f"Percentage of rows removed: {percentage_removed:.2f}%")
print(acc_data_ML_filtered.shape)

Percentage of rows removed: 82.65%
(640359, 15)


In [22]:
# Which features contain most unknown values

unknown_counts = acc_data_ML.apply(lambda x: (x == 'unknown').sum())
print(unknown_counts)


accident_severity                   0
day_of_week                         0
first_road_class              1110250
road_type                       32598
junction_detail                  8852
junction_control              1431745
second_road_class             1516396
light_conditions                   33
weather_conditions              73388
road_surface_conditions         13882
special_conditions_at_site      13387
carriageway_hazards             11946
urban_or_rural_area              3901
speed_limit_bins                  129
time_of_day                         0
dtype: int64


Quite a major chunk of data removed, but still have quite many records. Most unknowns from first_road_class, junction_control and second_road_class.

#### 7. Split the data into train, validation, and test sets

Use ratio of 70:15:15 for train, validation and test set, respectively.

In [23]:
# Split into train (70%) and temp (30%) datasets
train_data, temp_data = train_test_split(acc_data_ML_filtered, test_size = 0.3, random_state = 33)

# Split the temp dataset into validation and test datasets (50% each of temp_data)
val_data, test_data = train_test_split(temp_data, test_size = 0.5, random_state = 33)

In [24]:
# Splitting features and labels
X_train = train_data.drop('accident_severity', axis=1)
y_train = train_data['accident_severity']

X_val = val_data.drop('accident_severity', axis=1)
y_val = val_data['accident_severity']

X_test = test_data.drop('accident_severity', axis=1)
y_test = test_data['accident_severity']

In [25]:
# Check the sizes of datasets
print(X_train.shape) #(448251, 14)
print(y_train.shape) #(448251,)

print(X_val.shape) #(96054, 14)
print(y_val.shape) #(96054,)

print(X_test.shape) #(96054, 14)
print(y_test.shape) #(96054,)

(448251, 14)
(448251,)
(96054, 14)
(96054,)
(96054, 14)
(96054,)


#### 8. One-hot encoding

In [26]:
# Confirm all features are categorical
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 448251 entries, 6998066 to 5735441
Data columns (total 14 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   day_of_week                 448251 non-null  category
 1   first_road_class            448251 non-null  category
 2   road_type                   448251 non-null  category
 3   junction_detail             448251 non-null  category
 4   junction_control            448251 non-null  category
 5   second_road_class           448251 non-null  category
 6   light_conditions            448251 non-null  category
 7   weather_conditions          448251 non-null  category
 8   road_surface_conditions     448251 non-null  category
 9   special_conditions_at_site  448251 non-null  category
 10  carriageway_hazards         448251 non-null  category
 11  urban_or_rural_area         448251 non-null  category
 12  speed_limit_bins            448251 non-null  catego

In [27]:
encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')

# Fit and transform the train data
X_train_encoded = pd.DataFrame(encoder.fit_transform(X_train), columns = encoder.get_feature_names_out())

# Transform the validation and test data
X_val_encoded = pd.DataFrame(encoder.transform(X_val), columns = encoder.get_feature_names_out())
X_test_encoded = pd.DataFrame(encoder.transform(X_test), columns = encoder.get_feature_names_out())

# Saving the encoder
dump(encoder, '../00_data/encoder.joblib')

['../00_data/encoder.joblib']

In [28]:
# Check dimensions of encoded datasets (confirm that number or records are the same and number of features increased)
print(X_train_encoded.shape) #(448251, 79)

print(X_val_encoded.shape) #(96054, 79)

print(X_test_encoded.shape) #(96054, 79)

(448251, 79)
(96054, 79)
(96054, 79)


#### 9. Balance the train data

In [29]:
# Oversampling using SMOTE on the encoded train data
smote = SMOTE(random_state = 33)
X_train_smote, y_train_smote = smote.fit_resample(X_train_encoded, y_train)

# Undersampling using RandomUnderSampler on the encoded train data
rus = RandomUnderSampler(random_state = 33)
X_train_rus, y_train_rus = rus.fit_resample(X_train_encoded, y_train)

In [30]:
# Combined under- and oversampling (ensemble sampling)

# Define undersampling strategy
under = RandomUnderSampler(sampling_strategy = 0.5)  # This will reduce the majority class to be double the minority

# Define oversampling strategy
over = SMOTE(sampling_strategy = 1.0)  # This will balance the two classes

# Combine the sampling strategies into a pipeline
pipeline = Pipeline(steps = [('under', under), ('over', over)])

# Apply the pipeline to your data
X_train_ensemble, y_train_ensemble = pipeline.fit_resample(X_train_encoded, y_train)


In [31]:
# Check the dimensions of the original, oversampled and undersampled train data
print(X_train_encoded.shape) # (448251, 79)
print(y_train.shape) # (448251,)

print(X_train_smote.shape) # (888216, 79)
print(y_train_smote.shape) # (888216,)

print(X_train_rus.shape) # (8286, 79)
print(y_train_rus.shape) # (8286,)

print(X_train_ensemble.shape) # (33144, 79)
print(y_train_ensemble.shape) # (33144,)

(448251, 79)
(448251,)
(888216, 79)
(888216,)
(8286, 79)
(8286,)
(16572, 79)
(16572,)


#### 10. Save the train, validation, and test sets as .csv files

In [32]:
'''
# Training data
# Original
X_train_encoded.to_csv('../00_data/X_train_orig_road_acc.csv', index = False)
y_train.to_csv('../00_data/y_train_orig_road_acc.csv', index = False)

# Oversampled
X_train_smote.to_csv('../00_data/X_train_oversamp_road_acc.csv', index = False)
y_train_smote.to_csv('../00_data/y_train_oversamp_road_acc.csv', index = False)

# Undersampled
X_train_rus.to_csv('../00_data/X_train_undersamp_road_acc.csv', index = False)
y_train_rus.to_csv('../00_data/y_train_undersamp_road_acc.csv', index = False)

# Ensemble resampled
X_train_ensemble.to_csv('../00_data/X_train_ensemble_road_acc.csv', index = False)
y_train_ensemble.to_csv('../00_data/y_train_ensemble_road_acc.csv', index = False)

# Validation data
X_val_encoded.to_csv('../00_data/X_val_road_acc.csv', index = False)
y_val.to_csv('../00_data/y_val_road_acc.csv', index = False)

# Test data
X_test_encoded.to_csv('../00_data/X_test_road_acc.csv', index = False)
y_test.to_csv('../00_data/y_test_road_acc.csv', index = False)
'''