In [5]:
import pandas as pd 

accident_data = pd.read_csv('dataset/accident_data.csv')
accident_data.shape

(660679, 14)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

accident_data_copy = accident_data.copy()
accident_data_copy.isnull().sum()

Index                          0
Accident_Severity              0
Accident Date                  0
Latitude                      25
Light_Conditions               0
District Area                  0
Longitude                     26
Number_of_Casualties           0
Number_of_Vehicles             0
Road_Surface_Conditions      726
Road_Type                   4520
Urban_or_Rural_Area           15
Weather_Conditions         14128
Vehicle_Type                   0
dtype: int64

In [6]:
# fill numeric with median
def impute_na(df, variable, fillval):
    df[variable] = df[variable].fillna(fillval)
impute_na(accident_data_copy, 'Latitude', accident_data_copy['Latitude'].median())
impute_na(accident_data_copy, 'Longitude', accident_data_copy['Longitude'].median())

In [7]:
for column in accident_data_copy.columns:
    if accident_data_copy[column].dtype == type(object):
        le = LabelEncoder()
        accident_data_copy[column] = le.fit_transform(accident_data_copy[column])

features = accident_data_copy.drop('Accident_Severity', axis=1)
labels = accident_data_copy['Accident_Severity']
# X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [10]:
features.isnull().sum()

Index                      0
Accident Date              0
Latitude                   0
Light_Conditions           0
District Area              0
Longitude                  0
Number_of_Casualties       0
Number_of_Vehicles         0
Road_Surface_Conditions    0
Road_Type                  0
Urban_or_Rural_Area        0
Weather_Conditions         0
Vehicle_Type               0
dtype: int64

In [11]:
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Handle class imbalance
smote = SMOTE(random_state=42)
features_res, labels_res = smote.fit_resample(features, labels)

# Feature selection
selector = SelectKBest(mutual_info_classif, k=10)
features_res = selector.fit_transform(features_res, labels_res)

# Split data
X_train, X_test, y_train, y_test = train_test_split(features_res, labels_res, test_size=0.2, random_state=42)

# Hyperparameter tuning 
# A Lot less cuz I don't have a lot of time :)
param_grid = {
    'n_estimators': [50, 100, 200]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

# Train model
model = grid_search.best_estimator_
model.fit(X_train, y_train)

# Evaluate model
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85    113182
           1       0.72      0.68      0.70    112826
           2       0.77      0.77      0.77    112273

    accuracy                           0.78    338281
   macro avg       0.77      0.78      0.77    338281
weighted avg       0.77      0.78      0.77    338281



In [None]:
features.columns