In [11]:
import pandas as pd
from pycaret.classification import *
import os
import matplotlib.pyplot as plt

# Load the dataset
file_path = 'road_acc.csv'
data = pd.read_csv(file_path, encoding='latin1')

# Data Preprocessing
data.columns = data.columns.str.strip()  # Strip leading and trailing spaces from column names
data.replace('-', pd.NA, inplace=True)  # Replace placeholder '-' with NaN
data.fillna('Unknown', inplace=True)  # Fill NaN values with 'Unknown'

# Drop the 'Serial No.' column
if 'Serial No.' in data.columns:
    data.drop(columns=['Serial No.'], inplace=True)

# Standardize the date format in 'Date of accident' column
if 'Date of accident' in data.columns:
    data['Date of accident'] = pd.to_datetime(data['Date of accident'], errors='coerce')

# Remove classes with less than 2 samples in the target column
if 'Cause of accident' in data.columns:
    class_counts = data['Cause of accident'].value_counts()
    valid_classes = class_counts[class_counts >= 2].index
    data = data[data['Cause of accident'].isin(valid_classes)]

In [12]:
# if 'Date of accident' in data.columns:
#     data.drop(columns=['Date of accident'], inplace=True)
#     print("Date of accident column dropped.")
# else:
#     print("The 'Date of accident' column is not present in the dataset.")

# Display the first few rows to confirm the column is dropped
print(data.head())

  Date of accident Time of accident    Type of accident  \
0       2023-03-21          9:00 AM   Head-on collision   
1       2022-11-22          6:30 AM   Head-on collision   
2       2022-04-01          5:45 AM  Rear-end collision   
3       2022-05-26          1:40 PM  Rear-end collision   
4       2022-06-03         12:00 PM  Rear-end collision   

     Vehicular involvement  Death  Injury   Cause of accident  \
0         CNG- covered van      1       1  Recklessly Driving   
1       Votvoti-pickup van      2       0  Recklessly Driving   
2  Truck- mini covered van      2       0  Recklessly Driving   
3    Unknown- rickshaw van      1       0  Recklessly Driving   
4    Easy bike- motorcycle      1       0  Recklessly Driving   

    Location of Accidents  
0          Santhia, Pabna  
1  Vitapara,Santhia Pabna  
2          Santhia, Pabna  
3          Santhia, Pabna  
4          Santhia, Pabna  


In [13]:
print(data.info()) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Date of accident       64 non-null     datetime64[ns]
 1   Time of accident       72 non-null     object        
 2   Type of accident       72 non-null     object        
 3   Vehicular involvement  72 non-null     object        
 4   Death                  72 non-null     int64         
 5   Injury                 72 non-null     int64         
 6   Cause of accident      72 non-null     object        
 7   Location of Accidents  72 non-null     object        
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 4.6+ KB
None


In [14]:
print(data.columns)


Index(['Date of accident', 'Time of accident', 'Type of accident',
       'Vehicular involvement', 'Death', 'Injury', 'Cause of accident',
       'Location of Accidents'],
      dtype='object')


In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Ensure 'Date of accident' exists and is in datetime format
if 'Date of accident' in data.columns:
    data['Date of accident'] = pd.to_datetime(data['Date of accident'], errors='coerce')

    # Extract features from 'Date of accident'
    data['Year'] = data['Date of accident'].dt.year
    data['Month'] = data['Date of accident'].dt.month
    data['Day'] = data['Date of accident'].dt.day
    data['DayOfWeek'] = data['Date of accident'].dt.dayofweek
    data['Hour'] = data['Date of accident'].dt.hour

    # Drop the original 'Date of accident' column
    data.drop(columns=['Date of accident'], inplace=True)
else:
    print("Error: 'Date of accident' column is missing.")

# One-hot encode 'Vehicular involvement'
if 'Vehicular involvement' in data.columns:
    data = pd.get_dummies(data, columns=['Vehicular involvement'], drop_first=True)
else:
    print("Warning: 'Vehicular involvement' column is missing.")

# Label encode 'Cause of accident'
if 'Cause of accident' in data.columns:
    data['Cause of accident'] = LabelEncoder().fit_transform(data['Cause of accident'])
else:
    print("Error: 'Cause of accident' column is missing.")

# Ensure 'Time of accident' is treated as a categorical variable
if 'Time of accident' in data.columns:
    data['Time of accident'] = data['Time of accident'].astype(str)
else:
    print("Warning: 'Time of accident' column is missing.")

# Print processed data
print(data.head())


Error: 'Date of accident' column is missing.
  Time of accident    Type of accident  Death  Injury  Cause of accident  \
0          9:00 AM   Head-on collision      1       1                  2   
1          6:30 AM   Head-on collision      2       0                  2   
2          5:45 AM  Rear-end collision      2       0                  2   
3          1:40 PM  Rear-end collision      1       0                  2   
4         12:00 PM  Rear-end collision      1       0                  2   

    Location of Accidents    Year  Month   Day  DayOfWeek  ...  \
0          Santhia, Pabna  2023.0    3.0  21.0        1.0  ...   
1  Vitapara,Santhia Pabna  2022.0   11.0  22.0        1.0  ...   
2          Santhia, Pabna  2022.0    4.0   1.0        4.0  ...   
3          Santhia, Pabna  2022.0    5.0  26.0        3.0  ...   
4          Santhia, Pabna  2022.0    6.0   3.0        4.0  ...   

   Vehicular involvement_Truck- Motorcycle  \
0                                    False   
1        

In [21]:
print("Data shape:", data.shape)



Data shape: (72, 72)


In [22]:
data

Unnamed: 0,Time of accident,Type of accident,Death,Injury,Cause of accident,Location of Accidents,Year,Month,Day,DayOfWeek,...,Vehicular involvement_Truck- Motorcycle,Vehicular involvement_Truck- mini covered van,Vehicular involvement_Truck-Auto rickshaw,Vehicular involvement_Unknown- rickshaw van,Vehicular involvement_Van- Auto rickshaw,Vehicular involvement_Van- CNG,Vehicular involvement_Votvoti- Covered van,Vehicular involvement_Votvoti- Motorcycle,Vehicular involvement_Votvoti- motorcycle,Vehicular involvement_Votvoti-pickup van
0,9:00 AM,Head-on collision,1,1,2,"Santhia, Pabna",2023.0,3.0,21.0,1.0,...,False,False,False,False,False,False,False,False,False,False
1,6:30 AM,Head-on collision,2,0,2,"Vitapara,Santhia Pabna",2022.0,11.0,22.0,1.0,...,False,False,False,False,False,False,False,False,False,True
2,5:45 AM,Rear-end collision,2,0,2,"Santhia, Pabna",2022.0,4.0,1.0,4.0,...,False,True,False,False,False,False,False,False,False,False
3,1:40 PM,Rear-end collision,1,0,2,"Santhia, Pabna",2022.0,5.0,26.0,3.0,...,False,False,False,True,False,False,False,False,False,False
4,12:00 PM,Rear-end collision,1,0,2,"Santhia, Pabna",2022.0,6.0,3.0,4.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,7:40 AM,Rear-end collision,1,0,2,"Ullahpara, Pabna",2021.0,12.0,20.0,0.0,...,False,False,False,False,False,False,False,False,False,False
68,9:00 AM,Overturning,1,0,2,"Kashinathpur, Pabna",2020.0,2.0,7.0,4.0,...,False,False,False,False,False,False,False,False,False,False
69,3:45 PM,Head on collision,1,0,2,"Bera, Pabna",2020.0,8.0,13.0,3.0,...,False,False,False,False,False,False,False,False,False,False
70,8:55 PM,Side swipe,0,3,1,"Bera, Pabna",2020.0,12.0,26.0,5.0,...,False,False,False,False,False,False,False,False,False,False


In [23]:
# Save the DataFrame to the specified path
file_path = r'C:\Users\MASUM\OneDrive\Desktop\Machine Learning\Accident\data.csv'  # Add the file name with .csv extension
data.to_csv(file_path, index=False)

print(f"Data saved successfully to {file_path}")


Data saved successfully to C:\Users\MASUM\OneDrive\Desktop\Machine Learning\Accident\data.csv
