# Import the libraries


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder



In [None]:
# Load the raw accident records from the CSV file into a DataFrame.
raw_csv_path = "RTA Dataset.csv"
df = pd.read_csv(raw_csv_path)


In [None]:
# Show the column labels of the raw data, followed by how many there are.
column_labels = df.columns
print(column_labels)
print(len(column_labels))

# Remove the unrelated features
- removing columns
    - Number_of_casualties :- This is only known after the accident.
    - Casualty_severity :- Near-duplicate of the target variable Accident_severity.
    - Fitness_of_casuality :- Measured after the accident (whether casualty was fit/unfit at hospital).
    - Work_of_casuality :- Doesn’t logically affect accident severity → weak correlation.
    - Sex_of_casualty :- Gender of the injured person doesn’t influence accident severity as strongly as environmental or driver-related factors.
    - Owner_of_vehicle :- Already captured indirectly by Vehicle_driver_relation (e.g., owner vs hired driver).

In [None]:
# Preprocessing plan:
#   1. Handle missing values
#   2. Remove the unrelated features
#   3. Encode categorical variables
#   4. Normalize/Standardize numerical features
#   5. Feature engineering (if necessary)
#   6. Split the data into training and testing sets

# Columns removed from the working DataFrame before any cleaning is done.
columns_to_drop = [
    'Time',
    'Day_of_week',
    'Owner_of_vehicle',
    'Service_year_of_vehicle',
    'Road_allignment',
    'Road_surface_conditions',
    'Number_of_casualties',
    'Casualty_class',
    'Sex_of_casualty',
    'Age_band_of_casualty',
    'Casualty_severity',
    'Work_of_casuality',
    'Fitness_of_casuality',
    'Defect_of_vehicle',
]
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Count the missing values in each column, then show (rows, columns).
missing_per_column = df.isnull().sum()
print(missing_per_column)
print(df.shape)

In [None]:
# Column-by-column cleaning. For each column the distinct values are printed
# for inspection, then rows that carry no usable value (NaN or the literal
# 'Unknown') are dropped, or the gaps are filled with a placeholder category.
#
# Missing values are filled with `df[col] = df[col].fillna(...)` rather than
# `df[col].fillna(..., inplace=True)`: the in-place call on a selected column
# is chained assignment, which recent pandas versions warn about and which
# does not update `df` when copy-on-write is enabled.

# Age_band_of_driver: drop rows whose band is the literal 'Unknown'.
print(df['Age_band_of_driver'].unique())
df.drop(df[df['Age_band_of_driver'] == 'Unknown'].index, inplace=True)

# Educational_level: keep the rows and label missing values as 'Unknown'.
print(df['Educational_level'].unique())
df['Educational_level'] = df['Educational_level'].fillna('Unknown')

# Vehicle_driver_relation: drop rows that are missing or 'Unknown', then
# print what is left.
df.dropna(subset=['Vehicle_driver_relation'], inplace=True)
df.drop(df[df['Vehicle_driver_relation'] == 'Unknown'].index, inplace=True)
print(df['Vehicle_driver_relation'].unique())

# Driving_experience: keep the rows and label missing values as 'Unknown'.
print(df['Driving_experience'].unique())
df['Driving_experience'] = df['Driving_experience'].fillna('Unknown')

# Type_of_vehicle: missing values go into the existing 'Other' category.
print(df['Type_of_vehicle'].unique())
df['Type_of_vehicle'] = df['Type_of_vehicle'].fillna('Other')

# Collapse the detailed vehicle types into a handful of coarse groups.
# Values not listed here are left unchanged by `replace`.
map_vehicle = {
    'Automobile': 'Car',
    'Public (> 45 seats)': 'Bus',
    'Lorry (41?100Q)': 'Lorry',
    'Public (13?45 seats)': 'Bus',
    'Lorry (11?40Q)': 'Lorry',
    'Long lorry': 'Lorry',
    # NOTE(review): a 12-seat public vehicle is grouped with lorries while the
    # other 'Public' entries map to 'Bus' - confirm this is intended.
    'Public (12 seats)': 'Lorry',
    'Taxi': 'Car',
    'Pick up upto 10Q': 'Lorry',
    'Stationwagen': 'Car',
    'Ridden horse': 'Other',
    'Bajaj': 'Three_wheeler',
    'Turbo': 'Three_wheeler',
    'Motorcycle': 'Motorbike',  # was misspelled 'Mootorbike'
    'Special vehicle': 'Other',
    'Bicycle': 'Bicycle',
}
df['Type_of_vehicle'] = df['Type_of_vehicle'].replace(map_vehicle)
print(df['Type_of_vehicle'].unique())

# Area_accident_occured: drop missing rows, strip surrounding whitespace so
# padded values compare equal, then drop the 'Unknown' rows.
print(df['Area_accident_occured'].unique())
df.dropna(subset=['Area_accident_occured'], inplace=True)
df['Area_accident_occured'] = df['Area_accident_occured'].str.strip()
df.drop(df[df['Area_accident_occured'] == 'Unknown'].index, inplace=True)

# Lanes_or_Medians: keep the rows and label missing values as 'Unknown'.
print(df['Lanes_or_Medians'].unique())
df['Lanes_or_Medians'] = df['Lanes_or_Medians'].fillna('Unknown')

# Types_of_Junction: drop rows that are missing or 'Unknown'.
print(df['Types_of_Junction'].unique())
df.dropna(subset=['Types_of_Junction'], inplace=True)
df.drop(df[df['Types_of_Junction'] == 'Unknown'].index, inplace=True)

# Road_surface_type: drop rows with a missing value.
print(df['Road_surface_type'].unique())
df.dropna(subset=['Road_surface_type'], inplace=True)

# Weather_conditions: drop rows whose value is the literal 'Unknown'.
print(df['Weather_conditions'].unique())
df.drop(df[df['Weather_conditions'] == 'Unknown'].index, inplace=True)

# Type_of_collision: drop rows that are missing or 'Unknown'.
print(df['Type_of_collision'].unique())
df.dropna(subset=['Type_of_collision'], inplace=True)
df.drop(df[df['Type_of_collision'] == 'Unknown'].index, inplace=True)

# Light_conditions: inspected only, no rows are changed.
print(df['Light_conditions'].unique())

# Vehicle_movement: drop rows that are missing or 'Unknown'.
print(df['Vehicle_movement'].unique())
df.dropna(subset=['Vehicle_movement'], inplace=True)
df.drop(df[df['Vehicle_movement'] == 'Unknown'].index, inplace=True)

# Pedestrian_movement: inspected only (distinct values and their counts).
print(df['Pedestrian_movement'].unique())
print(df['Pedestrian_movement'].value_counts())

# Cause_of_accident: inspected only, no rows are changed.
print(df['Cause_of_accident'].unique())

# Change to lowercase

In [None]:
# Lower-case the text of every categorical feature so that values differing
# only in capitalisation end up as the same category.
text_columns = [
    'Age_band_of_driver',
    'Sex_of_driver',
    'Educational_level',
    'Vehicle_driver_relation',
    'Driving_experience',
    'Type_of_vehicle',
    'Area_accident_occured',
    'Lanes_or_Medians',
    'Types_of_Junction',
    'Road_surface_type',
    'Light_conditions',
    'Weather_conditions',
    'Type_of_collision',
    'Vehicle_movement',
    'Pedestrian_movement',
    'Cause_of_accident',
]
for text_column in text_columns:
    df[text_column] = df[text_column].str.lower()

# The vehicle count is numeric: store it with an integer dtype.
df['Number_of_vehicles_involved'] = df['Number_of_vehicles_involved'].astype(int)

# Remove the duplicated rows

In [None]:
# Report how many rows are exact repeats of an earlier row, then remove
# those repeats from the DataFrame in place.
duplicate_count = df.duplicated().sum()
print(duplicate_count)
df.drop_duplicates(inplace=True)

In [None]:
# Preview the first five rows of the cleaned data.
print(df.head(5))


In [None]:
# Write the cleaned (not yet encoded) data to disk without the index column.
df.to_csv(path_or_buf="RTA_preprocessed.csv", index=False)

# Transformation

In [None]:

# Label-encode every categorical feature and the target column
# (Accident_severity): each distinct value is replaced by an integer code.
categorical_cols = ['Age_band_of_driver', 'Sex_of_driver', 'Educational_level',
                    'Vehicle_driver_relation', 'Driving_experience', 'Type_of_vehicle',
                    'Area_accident_occured', 'Lanes_or_Medians', 'Types_of_Junction', 'Road_surface_type',
                    'Light_conditions', 'Weather_conditions', 'Type_of_collision', 'Vehicle_movement',
                    'Pedestrian_movement', 'Cause_of_accident', 'Accident_severity']

# One fitted encoder per column is kept in `label_encoders`, so the integer
# codes can later be translated back (via `classes_` / `inverse_transform`).
# Refitting a single shared encoder would discard every mapping but the last.
label_encoders = {}
for category in categorical_cols:
    le = LabelEncoder()
    df[category] = le.fit_transform(df[category])
    label_encoders[category] = le
# After the loop `le` is the encoder fitted on the last column listed above
# (Accident_severity), the same state a single reused encoder would end in.

print(df.shape)
 

In [None]:
# Write the label-encoded data to disk without the index column.
df.to_csv(path_or_buf="RTA_preprocessed_encoded.csv", index=False)
