In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib


In [5]:
file_path = r"C:\Users\Admin\Downloads\RTA Dataset.csv"
df = pd.read_csv(file_path)
print(df.shape)
print(df.info())
print(df.head())

(12316, 32)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12316 entries, 0 to 12315
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Time                         12316 non-null  object
 1   Day_of_week                  12316 non-null  object
 2   Age_band_of_driver           12316 non-null  object
 3   Sex_of_driver                12316 non-null  object
 4   Educational_level            11575 non-null  object
 5   Vehicle_driver_relation      11737 non-null  object
 6   Driving_experience           11487 non-null  object
 7   Type_of_vehicle              11366 non-null  object
 8   Owner_of_vehicle             11834 non-null  object
 9   Service_year_of_vehicle      8388 non-null   object
 10  Defect_of_vehicle            7889 non-null   object
 11  Area_accident_occured        12077 non-null  object
 12  Lanes_or_Medians             11931 non-null  object
 13  Road_allignment    

In [6]:
y = df['Accident_severity']

# Define the independent variables X
X = df[[
    'Number_of_vehicles_involved',
    'Number_of_casualties',
    'Light_conditions',
    'Weather_conditions',
    'Road_surface_conditions',
    'Type_of_collision',
    'Type_of_vehicle',
    'Driving_experience',
    'Age_band_of_driver',
    'Area_accident_occured'
]]


In [7]:
# Drop duplicates
df = df.drop_duplicates()

# Handle missing values
# Drop columns with >30% missing:
threshold = 0.30
missing_frac = df.isnull().mean()
cols_to_drop = missing_frac[missing_frac > threshold].index.tolist()
df = df.drop(columns=cols_to_drop)

# Drop rows missing the target
df = df.dropna(subset=['Accident_severity'])

# Fill numeric missing with median
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for c in num_cols:
    if df[c].isnull().any():
        df[c].fillna(df[c].median(), inplace=True)

# Fill categorical missing with mode
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()
for c in cat_cols:
    if df[c].isnull().any():
        df[c].fillna(df[c].mode()[0], inplace=True)

# Encode categorical variables
X = pd.get_dummies(df.drop(columns=['Accident_severity']), drop_first=True)
y = df['Accident_severity']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(df[c].mode()[0], inplace=True)


In [8]:
# Map the target to numeric codes
mapping = {
    'Slight Injury': 1,
    'Serious Injury': 2,
    'Fatal Injury': 3
}
df['Accident_severity_code'] = df['Accident_severity'].map(mapping)

# Define y
y = df['Accident_severity_code']

# Define X (with your selected independents)
X = df[[
    'Number_of_vehicles_involved',
    'Number_of_casualties',
    'Light_conditions',
    'Weather_conditions',
    'Road_surface_conditions',
    'Type_of_collision',
    'Type_of_vehicle',
    'Driving_experience',
    'Age_band_of_driver',
    'Area_accident_occured'
]]

# Convert categorical X columns to dummy variables
X = pd.get_dummies(X, drop_first=True)

# Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit model
X_train_sm = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_sm).fit()
print(model.summary())


In [12]:

# 7. Model Evaluation on Test Set
X_test_sm = sm.add_constant(X_test)
y_pred = model.predict(X_test_sm)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Test RMSE: {rmse:.3f}")
print(f"Test R²: {r2:.3f}")

# 8. Prediction Example
new_case = {
    'const': 1,
    'Number_of_vehicles_involved': 3,
    'Number_of_casualties': 2,
    # include dummy columns you created via get_dummies
    'Light_conditions_Night': 1,
    'Weather_conditions_Rain': 1,
    # … add remaining dummy variables as needed …
}
new_df = pd.DataFrame([new_case])
predicted_severity = model.predict(new_df)[0]
print("Predicted severity score:", predicted_severity)

# 9. Save Model and Cleaned Data
df.to_csv('cleaned_RTA_dataset.csv', index=False)
print("Cleaned dataset saved: cleaned_RTA_dataset.csv")

joblib.dump(model, 'accident_severity_model.pkl')
print("Model saved: accident_severity_model.pkl")


NameError: name 'model' is not defined