# Import the libraries


In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [33]:
df=pd.read_csv("RTA Dataset.csv")
df.head()

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,...,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,17:02:00,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,...,Going straight,na,na,na,na,,,Not a Pedestrian,Moving Backward,Slight Injury
1,17:02:00,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury
2,17:02:00,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,,...,Going straight,Driver or rider,Male,31-50,3,Driver,,Not a Pedestrian,Changing lane to the left,Serious Injury
3,1:06:00,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,,...,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
4,1:06:00,Sunday,18-30,Male,Junior high school,Employee,2-5yr,,Owner,5-10yrs,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury


In [34]:
## print the header
print(df.columns)
print(len(df.columns))
print(df.info()
)

Index(['Time', 'Day_of_week', 'Age_band_of_driver', 'Sex_of_driver',
       'Educational_level', 'Vehicle_driver_relation', 'Driving_experience',
       'Type_of_vehicle', 'Owner_of_vehicle', 'Service_year_of_vehicle',
       'Defect_of_vehicle', 'Area_accident_occured', 'Lanes_or_Medians',
       'Road_allignment', 'Types_of_Junction', 'Road_surface_type',
       'Road_surface_conditions', 'Light_conditions', 'Weather_conditions',
       'Type_of_collision', 'Number_of_vehicles_involved',
       'Number_of_casualties', 'Vehicle_movement', 'Casualty_class',
       'Sex_of_casualty', 'Age_band_of_casualty', 'Casualty_severity',
       'Work_of_casuality', 'Fitness_of_casuality', 'Pedestrian_movement',
       'Cause_of_accident', 'Accident_severity'],
      dtype='object')
32
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12316 entries, 0 to 12315
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       -------------- 

# Remove the unrelated features
- removing columns
    - Number_of_casualties :- This is only known after the accident.
    - Duplicate of target variable Accident_severity.
    - Fitness_of_casuality :- Measured after the accident (whether casualty was fit/unfit at hospital).
    - Work_of_casuality :- Doesn’t logically affect accident severity → weak correlation.
    - Sex_of_casualty :- Gender of the injured person doesn’t influence accident severity as strongly as environmental or driver-related factors.
    - Owner_of_vehicle :- Already captured indirectly by Vehicle_driver_relation (e.g., owner vs hired driver).

In [35]:
print(df.isnull().sum()/df.shape[0]*100
)

Time                            0.000000
Day_of_week                     0.000000
Age_band_of_driver              0.000000
Sex_of_driver                   0.000000
Educational_level               6.016564
Vehicle_driver_relation         4.701202
Driving_experience              6.731082
Type_of_vehicle                 7.713543
Owner_of_vehicle                3.913608
Service_year_of_vehicle        31.893472
Defect_of_vehicle              35.945112
Area_accident_occured           1.940565
Lanes_or_Medians                3.126015
Road_allignment                 1.152972
Types_of_Junction               7.202014
Road_surface_type               1.396557
Road_surface_conditions         0.000000
Light_conditions                0.000000
Weather_conditions              0.000000
Type_of_collision               1.258525
Number_of_vehicles_involved     0.000000
Number_of_casualties            0.000000
Vehicle_movement                2.500812
Casualty_class                  0.000000
Sex_of_casualty 

### Remobve the
- Time column
- Day_of_week
- Service_year_of_vehicle
- Defect_of_vehicle       
- Work_of_casuality              
- Fitness_of_casuality   do to large amount of missing values of data set

In [36]:
df.drop(columns=['Time','Day_of_week','Service_year_of_vehicle','Defect_of_vehicle','Work_of_casuality','Fitness_of_casuality'],inplace=True)
print(df.columns)
print(df.isnull().sum())

Index(['Age_band_of_driver', 'Sex_of_driver', 'Educational_level',
       'Vehicle_driver_relation', 'Driving_experience', 'Type_of_vehicle',
       'Owner_of_vehicle', 'Area_accident_occured', 'Lanes_or_Medians',
       'Road_allignment', 'Types_of_Junction', 'Road_surface_type',
       'Road_surface_conditions', 'Light_conditions', 'Weather_conditions',
       'Type_of_collision', 'Number_of_vehicles_involved',
       'Number_of_casualties', 'Vehicle_movement', 'Casualty_class',
       'Sex_of_casualty', 'Age_band_of_casualty', 'Casualty_severity',
       'Pedestrian_movement', 'Cause_of_accident', 'Accident_severity'],
      dtype='object')
Age_band_of_driver               0
Sex_of_driver                    0
Educational_level              741
Vehicle_driver_relation        579
Driving_experience             829
Type_of_vehicle                950
Owner_of_vehicle               482
Area_accident_occured          239
Lanes_or_Medians               385
Road_allignment                14

In [None]:
print(df['Owner_of_vehicle'].unique())
df.drop(df[df['Owner_of_vehicle']=='Other'].index, inplace=True)
df.dropna(inplace=True)



['Owner' 'Governmental' 'Organization']
['Owner' 'Governmental' 'Organization']
['Owner' 'Governmental' 'Organization']


In [43]:
print(df['Road_allignment'].unique())
print(df['Road_allignment'].value_counts())

['Tangent road with flat terrain'
 'Tangent road with mild grade and flat terrain' 'Escarpments'
 'Tangent road with rolling terrain' 'Gentle horizontal curve'
 'Tangent road with mountainous terrain and'
 'Steep grade downward with mountainous terrain' 'Sharp reverse curve'
 'Steep grade upward with mountainous terrain']
Road_allignment
Tangent road with flat terrain                   7212
Tangent road with mild grade and flat terrain     355
Steep grade downward with mountainous terrain     308
Tangent road with mountainous terrain and         258
Gentle horizontal curve                           122
Escarpments                                        92
Sharp reverse curve                                39
Tangent road with rolling terrain                  25
Steep grade upward with mountainous terrain        12
Name: count, dtype: int64


In [None]:
#handle the Age band od driver missing values
print(df['Age_band_of_driver'].unique())
df.drop(df[df['Age_band_of_driver']=='Unknown'].index, inplace=True)

# handle the education level missing values by replacing with age group
print(df['Educational_level'].unique())
df['Educational_level'].fillna('Unknown', inplace=True)


# remove the missing values
df=df.dropna(subset=['Vehicle_driver_relation'])
df.drop(df[df['Vehicle_driver_relation']=='Unknown'].index, inplace=True)
# handle the vehicle driver relational missing values
print(df['Vehicle_driver_relation'].unique())

# validate the driving experiance column
print(df['Driving_experience'].unique())
# fill by Unknown 
df['Driving_experience'].fillna('Unknown', inplace=True)


# handle the Type_of_vehicle missing values
print(df['Type_of_vehicle'].unique())
df['Type_of_vehicle']=df['Type_of_vehicle'].fillna('Other')


#map to unique value for the column
map_vehicle={
    'Automobile':'Car',
    'Public (> 45 seats)':'Bus',
    'Lorry (41?100Q)':'Lorry',
    'Public (13?45 seats)':'Bus',
    'Lorry (11?40Q)':'Lorry',
    'Long lorry':'Lorry',
    'Public (12 seats)':'Lorry',
    'Taxi':'Car',
    'Pick up upto 10Q':'Lorry',
    'Stationwagen':'Car',
    'Ridden horse':'Other',
    'Bajaj':'Three_wheeler',
    'Turbo':'Three_wheeler',
    'Motorcycle':'Mootorbike',
    'Special vehicle':'Other',
    'Bicycle':'Bicycle'
}
df['Type_of_vehicle']=df['Type_of_vehicle'].replace(map_vehicle)
print(df['Type_of_vehicle'].unique())


# area Accient occur 
print(df['Area_accident_occured'].unique())
df.dropna(subset=['Area_accident_occured'], inplace=True)
df['Area_accident_occured']=df['Area_accident_occured'].str.strip()
df.drop(df[df['Area_accident_occured']=='Unknown'].index, inplace=True)


# handle the Lanes_or_Medians
print(df['Lanes_or_Medians'].unique())
df['Lanes_or_Medians'].fillna('Unknown', inplace=True)


# Handle the Types_of_Junction  missing values
print(df['Types_of_Junction'].unique())
df.dropna(subset=['Types_of_Junction'], inplace=True)
df.drop(df[df['Types_of_Junction']=='Unknown'].index, inplace=True)


# Handle the Road_surface_type missing values
print(df['Road_surface_type'].unique())
df.dropna(subset=['Road_surface_type'], inplace=True)


print(df['Weather_conditions'].unique())
df.drop(df[df['Weather_conditions']=='Unknown'].index, inplace=True)


# handle the type of colision
print(df['Type_of_collision'].unique())
df.dropna(subset=['Type_of_collision'], inplace=True)
df.drop(df[df['Type_of_collision']=='Unknown'].index, inplace=True)

# Handle the Light_conditions 
print(df['Light_conditions'].unique())

# handle the Vehicle_movement
print(df['Vehicle_movement'].unique())
df.dropna(subset=['Vehicle_movement'], inplace=True)
df.drop(df[df['Vehicle_movement']=='Unknown'].index, inplace=True)


# Handle the Pedestrian_movement  column
print(df['Pedestrian_movement'].unique())
print(df['Pedestrian_movement'].value_counts())

# Handle the Cause_of_accident 
print(df['Cause_of_accident'].unique())

print(df['Light_conditions'].unique())

In [None]:
le=LabelEncoder()
for col in df.columns:
    if df[col].dtype!='number':
        df[col]=le.fit_transform(df[col].astype(str))



# Step 2: Compute correlation matrix
corr_matrix = df.corr().abs()

# Step 3: Visualize correlation heatmap (optional, for understanding)
plt.figure(figsize=(30, 30))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Feature Correlation Heatmap")
plt.show()

# Step 4: Select upper triangle (avoid duplicate correlations)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Step 5: Identify columns with correlation above a threshold (e.g. 0.8)
threshold = 0.8
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

print("Highly correlated columns to drop:", to_drop)

# Step 6: Drop those columns
df_reduced = df.drop(columns=to_drop)

print("Shape before:", df.shape)
print("Shape after removing correlated columns:", df_reduced.shape)


In [25]:
df.shape


(12316, 25)

In [15]:
# preprocessing steps 
# 1. Handle missing values
# 2. Remove the unrelated features
# 2. Encode categorical variables
# 3. Normalize/Standardize numerical features
# 4. Feature engineering (if necessary)
# 5. Split the data into training and testing sets


df.drop(columns=['Time','Day_of_week','Owner_of_vehicle','Service_year_of_vehicle',
                 'Road_allignment','Road_surface_conditions','Number_of_casualties',
                 'Casualty_class','Sex_of_casualty','Age_band_of_casualty','Casualty_severity',
                 'Work_of_casuality','Fitness_of_casuality','Defect_of_vehicle'], inplace=True)

In [16]:
print(df.isnull().sum())
print(df.shape)

Age_band_of_driver               0
Sex_of_driver                    0
Educational_level              741
Vehicle_driver_relation        579
Driving_experience             829
Type_of_vehicle                950
Area_accident_occured          239
Lanes_or_Medians               385
Types_of_Junction              887
Road_surface_type              172
Light_conditions                 0
Weather_conditions               0
Type_of_collision              155
Number_of_vehicles_involved      0
Vehicle_movement               308
Pedestrian_movement              0
Cause_of_accident                0
Accident_severity                0
dtype: int64
(12316, 18)


In [17]:
#handle the Age band od driver missing values
print(df['Age_band_of_driver'].unique())
df.drop(df[df['Age_band_of_driver']=='Unknown'].index, inplace=True)

# handle the education level missing values by replacing with age group
print(df['Educational_level'].unique())
df['Educational_level'].fillna('Unknown', inplace=True)


# remove the missing values
df=df.dropna(subset=['Vehicle_driver_relation'])
df.drop(df[df['Vehicle_driver_relation']=='Unknown'].index, inplace=True)
# handle the vehicle driver relational missing values
print(df['Vehicle_driver_relation'].unique())

# validate the driving experiance column
print(df['Driving_experience'].unique())
# fill by Unknown 
df['Driving_experience'].fillna('Unknown', inplace=True)


# handle the Type_of_vehicle missing values
print(df['Type_of_vehicle'].unique())
df['Type_of_vehicle']=df['Type_of_vehicle'].fillna('Other')


#map to unique value for the column
map_vehicle={
    'Automobile':'Car',
    'Public (> 45 seats)':'Bus',
    'Lorry (41?100Q)':'Lorry',
    'Public (13?45 seats)':'Bus',
    'Lorry (11?40Q)':'Lorry',
    'Long lorry':'Lorry',
    'Public (12 seats)':'Lorry',
    'Taxi':'Car',
    'Pick up upto 10Q':'Lorry',
    'Stationwagen':'Car',
    'Ridden horse':'Other',
    'Bajaj':'Three_wheeler',
    'Turbo':'Three_wheeler',
    'Motorcycle':'Mootorbike',
    'Special vehicle':'Other',
    'Bicycle':'Bicycle'
}
df['Type_of_vehicle']=df['Type_of_vehicle'].replace(map_vehicle)
print(df['Type_of_vehicle'].unique())


# area Accient occur 
print(df['Area_accident_occured'].unique())
df.dropna(subset=['Area_accident_occured'], inplace=True)
df['Area_accident_occured']=df['Area_accident_occured'].str.strip()
df.drop(df[df['Area_accident_occured']=='Unknown'].index, inplace=True)


# handle the Lanes_or_Medians
print(df['Lanes_or_Medians'].unique())
df['Lanes_or_Medians'].fillna('Unknown', inplace=True)


# Handle the Types_of_Junction  missing values
print(df['Types_of_Junction'].unique())
df.dropna(subset=['Types_of_Junction'], inplace=True)
df.drop(df[df['Types_of_Junction']=='Unknown'].index, inplace=True)


# Handle the Road_surface_type missing values
print(df['Road_surface_type'].unique())
df.dropna(subset=['Road_surface_type'], inplace=True)


print(df['Weather_conditions'].unique())
df.drop(df[df['Weather_conditions']=='Unknown'].index, inplace=True)


# handle the type of colision
print(df['Type_of_collision'].unique())
df.dropna(subset=['Type_of_collision'], inplace=True)
df.drop(df[df['Type_of_collision']=='Unknown'].index, inplace=True)

# Handle the Light_conditions 
print(df['Light_conditions'].unique())

# handle the Vehicle_movement
print(df['Vehicle_movement'].unique())
df.dropna(subset=['Vehicle_movement'], inplace=True)
df.drop(df[df['Vehicle_movement']=='Unknown'].index, inplace=True)


# Handle the Pedestrian_movement  column
print(df['Pedestrian_movement'].unique())
print(df['Pedestrian_movement'].value_counts())

# Handle the Cause_of_accident 
print(df['Cause_of_accident'].unique())

print(df['Light_conditions'].unique())

['18-30' '31-50' 'Under 18' 'Over 51' 'Unknown']
['Above high school' 'Junior high school' nan 'Elementary school'
 'High school' 'Unknown' 'Illiterate' 'Writing & reading']
['Employee' 'Owner' 'Other']
['1-2yr' 'Above 10yr' '5-10yr' '2-5yr' 'No Licence' 'Below 1yr' nan
 'unknown']
['Automobile' 'Public (> 45 seats)' 'Lorry (41?100Q)' nan
 'Public (13?45 seats)' 'Lorry (11?40Q)' 'Long lorry' 'Public (12 seats)'
 'Taxi' 'Pick up upto 10Q' 'Stationwagen' 'Ridden horse' 'Other' 'Bajaj'
 'Turbo' 'Motorcycle' 'Special vehicle' 'Bicycle']
['Car' 'Bus' 'Lorry' 'Other' 'Three_wheeler' 'Mootorbike' 'Bicycle']
['Residential areas' 'Office areas' '  Recreational areas'
 ' Industrial areas' 'Other' ' Church areas' '  Market areas'
 'Rural village areas' ' Outside rural areas' ' Hospital areas' nan
 'School areas' 'Rural village areasOffice areas' 'Recreational areas'
 'Unknown']
[nan 'Undivided Two way' 'other' 'Double carriageway (median)' 'One way'
 'Two-way (divided with solid lines road markin

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Educational_level'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Driving_experience'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we

['Not a Pedestrian' "Crossing from driver's nearside"
 'Crossing from nearside - masked by parked or statioNot a Pedestrianry vehicle'
 'Unknown or other'
 'Crossing from offside - masked by  parked or statioNot a Pedestrianry vehicle'
 'In carriageway, statioNot a Pedestrianry - not crossing  (standing or playing)'
 'Walking along in carriageway, back to traffic'
 'Walking along in carriageway, facing traffic'
 'In carriageway, statioNot a Pedestrianry - not crossing  (standing or playing) - masked by parked or statioNot a Pedestrianry vehicle']


# Change to lowercase

In [18]:
df.head()
df['Age_band_of_driver']=df['Age_band_of_driver'].str.lower()
df['Sex_of_driver']=df['Sex_of_driver'].str.lower()
df['Educational_level']=df['Educational_level'].str.lower()
df['Vehicle_driver_relation']=df['Vehicle_driver_relation'].str.lower()
df['Driving_experience']=df['Driving_experience'].str.lower()
df['Type_of_vehicle']=df['Type_of_vehicle'].str.lower()
df['Area_accident_occured']=df['Area_accident_occured'].str.lower()
df['Lanes_or_Medians']=df['Lanes_or_Medians'].str.lower()
df['Types_of_Junction']=df['Types_of_Junction'].str.lower()
df['Road_surface_type']=df['Road_surface_type'].str.lower()
df['Light_conditions']=df['Light_conditions'].str.lower()
df['Weather_conditions']=df['Weather_conditions'].str.lower()
df['Type_of_collision']=df['Type_of_collision'].str.lower()
df['Vehicle_movement']=df['Vehicle_movement'].str.lower()
df['Pedestrian_movement']=df['Pedestrian_movement'].str.lower()
df['Cause_of_accident']=df['Cause_of_accident'].str.lower()
df['Number_of_vehicles_involved']=df['Number_of_vehicles_involved'].astype(int)

# Remove the duplicated raws

In [19]:
print(df.duplicated().sum())
# drop the duplicates
df.drop_duplicates(inplace=True)

17


In [20]:
print(df.head())


  Age_band_of_driver Sex_of_driver   Educational_level  \
0              18-30          male   above high school   
1              31-50          male  junior high school   
2              18-30          male  junior high school   
3              18-30          male  junior high school   
4              18-30          male  junior high school   

  Vehicle_driver_relation Driving_experience Type_of_vehicle  \
0                employee              1-2yr             car   
1                employee         above 10yr             bus   
2                employee              1-2yr           lorry   
3                employee             5-10yr             bus   
4                employee              2-5yr           other   

  Area_accident_occured   Lanes_or_Medians Types_of_Junction  \
0     residential areas            unknown       no junction   
1          office areas  undivided two way       no junction   
2    recreational areas              other       no junction   
3         

In [21]:
print(df['Pedestrian_movement'].unique())

['not a pedestrian' "crossing from driver's nearside"
 'crossing from nearside - masked by parked or stationot a pedestrianry vehicle'
 'unknown or other'
 'crossing from offside - masked by  parked or stationot a pedestrianry vehicle'
 'in carriageway, stationot a pedestrianry - not crossing  (standing or playing)'
 'walking along in carriageway, back to traffic'
 'walking along in carriageway, facing traffic'
 'in carriageway, stationot a pedestrianry - not crossing  (standing or playing) - masked by parked or stationot a pedestrianry vehicle']


In [None]:
# Store the data set
df.to_csv("RTA_preprocessed.csv", index=False)

# Transformation

In [None]:

#label encode for the target variable
le=LabelEncoder()
categorical_cols=['Age_band_of_driver','Sex_of_driver','Educational_level',
                  'Vehicle_driver_relation','Driving_experience','Type_of_vehicle',
                  'Area_accident_occured','Lanes_or_Medians','Types_of_Junction','Road_surface_type',
                  'Light_conditions','Weather_conditions','Type_of_collision','Vehicle_movement',
                  'Pedestrian_movement','Cause_of_accident','Accident_severity']

for category in categorical_cols:
    df[category]=le.fit_transform(df[category])


df.head()
print(df.shape)
 

In [None]:
df.to_csv("RTA_preprocessed_encoded.csv", index=False)
