In [1]:
import pandas as pd
from pycaret.classification import *
import os
import matplotlib.pyplot as plt

# Load the dataset
file_path = 'road_acc.csv'
data = pd.read_csv(file_path, encoding='latin1')

# Data Preprocessing
data.columns = data.columns.str.strip()  # Strip leading and trailing spaces from column names
data.replace('-', pd.NA, inplace=True)  # Replace placeholder '-' with NaN
data.fillna('Unknown', inplace=True)  # Fill NaN values with 'Unknown'

# Drop the 'Serial No.' column
if 'Serial No.' in data.columns:
    data.drop(columns=['Serial No.'], inplace=True)

# Standardize the date format in 'Date of accident' column
if 'Date of accident' in data.columns:
    data['Date of accident'] = pd.to_datetime(data['Date of accident'], errors='coerce')

# Remove classes with less than 2 samples in the target column
if 'Cause of accident' in data.columns:
    class_counts = data['Cause of accident'].value_counts()
    valid_classes = class_counts[class_counts >= 2].index
    data = data[data['Cause of accident'].isin(valid_classes)]

In [2]:
data

Unnamed: 0,Date of accident,Time of accident,Type of accident,Vehicular involvement,Death,Injury,Cause of accident,Location of Accidents
0,2023-03-21,9:00 AM,Head-on collision,CNG- covered van,1,1,Recklessly Driving,"Santhia, Pabna"
1,2022-11-22,6:30 AM,Head-on collision,Votvoti-pickup van,2,0,Recklessly Driving,"Vitapara,Santhia Pabna"
2,2022-04-01,5:45 AM,Rear-end collision,Truck- mini covered van,2,0,Recklessly Driving,"Santhia, Pabna"
3,2022-05-26,1:40 PM,Rear-end collision,Unknown- rickshaw van,1,0,Recklessly Driving,"Santhia, Pabna"
4,2022-06-03,12:00 PM,Rear-end collision,Easy bike- motorcycle,1,0,Recklessly Driving,"Santhia, Pabna"
...,...,...,...,...,...,...,...,...
67,2021-12-20,7:40 AM,Rear-end collision,Nosimon- korimon,1,0,Recklessly Driving,"Ullahpara, Pabna"
68,2020-02-07,9:00 AM,Overturning,Pickup- CNG,1,0,Recklessly Driving,"Kashinathpur, Pabna"
69,2020-08-13,3:45 PM,Head on collision,Pickup-CNG,1,0,Recklessly Driving,"Bera, Pabna"
70,2020-12-26,8:55 PM,Side swipe,Bus- votvoti,0,3,Overtaking,"Bera, Pabna"


In [3]:
if 'Cause of accident' in data.columns:
    print(data['Cause of accident'].value_counts())
else:
    print("The 'Cause of accident' column is not present in the dataset.")

Cause of accident
Recklessly Driving    55
Overtaking             9
Overspeed              8
Name: count, dtype: int64


In [4]:
if 'Date of accident' in data.columns:
    data.drop(columns=['Date of accident'], inplace=True)
    print("Date of accident column dropped.")
else:
    print("The 'Date of accident' column is not present in the dataset.")

# Display the first few rows to confirm the column is dropped
print(data.head())

Date of accident column dropped.
  Time of accident    Type of accident    Vehicular involvement  Death  \
0          9:00 AM   Head-on collision         CNG- covered van      1   
1          6:30 AM   Head-on collision       Votvoti-pickup van      2   
2          5:45 AM  Rear-end collision  Truck- mini covered van      2   
3          1:40 PM  Rear-end collision    Unknown- rickshaw van      1   
4         12:00 PM  Rear-end collision    Easy bike- motorcycle      1   

   Injury   Cause of accident   Location of Accidents  
0       1  Recklessly Driving          Santhia, Pabna  
1       0  Recklessly Driving  Vitapara,Santhia Pabna  
2       0  Recklessly Driving          Santhia, Pabna  
3       0  Recklessly Driving          Santhia, Pabna  
4       0  Recklessly Driving          Santhia, Pabna  


In [5]:
data

Unnamed: 0,Time of accident,Type of accident,Vehicular involvement,Death,Injury,Cause of accident,Location of Accidents
0,9:00 AM,Head-on collision,CNG- covered van,1,1,Recklessly Driving,"Santhia, Pabna"
1,6:30 AM,Head-on collision,Votvoti-pickup van,2,0,Recklessly Driving,"Vitapara,Santhia Pabna"
2,5:45 AM,Rear-end collision,Truck- mini covered van,2,0,Recklessly Driving,"Santhia, Pabna"
3,1:40 PM,Rear-end collision,Unknown- rickshaw van,1,0,Recklessly Driving,"Santhia, Pabna"
4,12:00 PM,Rear-end collision,Easy bike- motorcycle,1,0,Recklessly Driving,"Santhia, Pabna"
...,...,...,...,...,...,...,...
67,7:40 AM,Rear-end collision,Nosimon- korimon,1,0,Recklessly Driving,"Ullahpara, Pabna"
68,9:00 AM,Overturning,Pickup- CNG,1,0,Recklessly Driving,"Kashinathpur, Pabna"
69,3:45 PM,Head on collision,Pickup-CNG,1,0,Recklessly Driving,"Bera, Pabna"
70,8:55 PM,Side swipe,Bus- votvoti,0,3,Overtaking,"Bera, Pabna"


In [6]:
output_file_path = 'road_acc_cleaned.csv'  # Specify the desired output file path
data.to_csv(output_file_path, index=False)  # Save without the index column

print(f"Updated dataset saved to {output_file_path}")

Updated dataset saved to road_acc_cleaned.csv


In [7]:
from pycaret.classification import *

In [12]:
clf_setup = setup(
    data=data,
    target='Cause of accident',
    session_id=42,
    log_experiment=False,
    use_gpu=False
)

print("Comparing models...")
best_model = compare_models(sort='AUC', n_select=1)  # Sort models by AUC and select the best one

# Print the best model
print(f"Best model: {best_model}")

# Evaluate the best model
print("Evaluating the best model...")
evaluate_model(best_model)

# Finalize the best model for future use (optional)
final_best_model = finalize_model(best_model)

# Save the best model for future use
model_save_path = 'best_model'
save_model(final_best_model, model_save_path)

# Print model performance metrics
print(f"Best model saved at {model_save_path}.")


# Generate plots using pycaret and save them using plt.save
# Assuming pycaret generates plots using plt, you can save them here
plt.save('model_comparison.png')  # Save model comparison plot
plt.save('classification_report.png')

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Cause of accident
2,Target type,Multiclass
3,Target mapping,"Overspeed: 0, Overtaking: 1, Recklessly Driving: 2"
4,Original data shape,"(72, 7)"
5,Transformed data shape,"(72, 20)"
6,Transformed train set shape,"(50, 20)"
7,Transformed test set shape,"(22, 20)"
8,Numeric features,2
9,Categorical features,4


Comparing models...


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.24,0.155,0.24,0.5273,0.2797,-0.0639,-0.111,0.16
knn,K Neighbors Classifier,0.7,0.105,0.7,0.596,0.6367,0.0073,0.0079,0.029
dt,Decision Tree Classifier,0.44,0.1,0.44,0.543,0.4385,0.072,0.0875,0.028
lightgbm,Light Gradient Boosting Machine,0.76,0.1,0.76,0.584,0.6589,0.0,0.0,0.045
catboost,CatBoost Classifier,0.46,0.1,0.46,0.629,0.4883,0.059,0.0748,0.424
dummy,Dummy Classifier,0.76,0.1,0.76,0.584,0.6589,0.0,0.0,0.028
rf,Random Forest Classifier,0.74,0.095,0.74,0.613,0.6654,0.0462,0.0518,0.053
nb,Naive Bayes,0.4,0.085,0.4,0.6867,0.4571,0.0539,0.1018,0.02
et,Extra Trees Classifier,0.72,0.085,0.72,0.593,0.6492,0.0183,0.0226,0.058
lr,Logistic Regression,0.76,0.0,0.76,0.584,0.6589,0.0,0.0,0.408


Best model: XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device='cpu', early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=-1,
              num_parallel_tree=None, objective='multi:softprob', ...)
Evaluating the best model...


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Transformation Pipeline and Model Successfully Saved
Best model saved at best_model.


AttributeError: module 'matplotlib.pyplot' has no attribute 'save'