1. Chargement des données


In [1]:
import pandas as pd

# Charger les données
data = pd.read_csv('data_cleaned.csv')

# Afficher les premières lignes
data.head()

Unnamed: 0,Accident_Index,Date,Day_of_Week,Junction_Control,Junction_Detail,Accident_Severity,Latitude,Light_Conditions,Local_Authority_(District),Carriageway_Hazards,...,Number_of_Casualties,Number_of_Vehicles,Police_Force,Road_Surface_Conditions,Road_Type,Speed_limit,Time,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,200901BS70001,1/1/2021,Thursday,Give way or uncontrolled,T or staggered junction,Serious,51.512273,Daylight,Kensington and Chelsea,Other object on road,...,1,2,Metropolitan Police,Dry,One way street,30,15:11,Urban,Fine no high winds,Car
1,200901BS70002,1/5/2021,Monday,Give way or uncontrolled,Crossroads,Serious,51.514399,Daylight,Kensington and Chelsea,Other object on road,...,11,2,Metropolitan Police,Wet or damp,Single carriageway,30,10:59,Urban,Fine no high winds,Taxi/Private hire car
2,200901BS70003,1/4/2021,Sunday,Give way or uncontrolled,T or staggered junction,Slight,51.486668,Daylight,Kensington and Chelsea,Other object on road,...,1,2,Metropolitan Police,Dry,Single carriageway,30,14:19,Urban,Fine no high winds,Taxi/Private hire car
3,200901BS70004,1/5/2021,Monday,Auto traffic signal,T or staggered junction,Serious,51.507804,Daylight,Kensington and Chelsea,Other object on road,...,1,2,Metropolitan Police,Frost or ice,Single carriageway,30,8:10,Urban,Other,Motorcycle over 500cc
4,200901BS70005,1/6/2021,Tuesday,Auto traffic signal,Crossroads,Serious,51.482076,Darkness - lights lit,Kensington and Chelsea,Other object on road,...,1,2,Metropolitan Police,Dry,Single carriageway,30,17:25,Urban,Fine no high winds,Car


2. Exploration des données


Informations générales sur le dataset


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307973 entries, 0 to 307972
Data columns (total 21 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Accident_Index              307973 non-null  object 
 1   Date                        307973 non-null  object 
 2   Day_of_Week                 307973 non-null  object 
 3   Junction_Control            307973 non-null  object 
 4   Junction_Detail             307973 non-null  object 
 5   Accident_Severity           307973 non-null  object 
 6   Latitude                    307973 non-null  float64
 7   Light_Conditions            307973 non-null  object 
 8   Local_Authority_(District)  307973 non-null  object 
 9   Carriageway_Hazards         307973 non-null  object 
 10  Longitude                   307973 non-null  float64
 11  Number_of_Casualties        307973 non-null  int64  
 12  Number_of_Vehicles          307973 non-null  int64  
 13  Police_Force  

Vérification des valeurs manquantes


In [3]:
data.isnull().sum()


Accident_Index                0
Date                          0
Day_of_Week                   0
Junction_Control              0
Junction_Detail               0
Accident_Severity             0
Latitude                      0
Light_Conditions              0
Local_Authority_(District)    0
Carriageway_Hazards           0
Longitude                     0
Number_of_Casualties          0
Number_of_Vehicles            0
Police_Force                  0
Road_Surface_Conditions       0
Road_Type                     0
Speed_limit                   0
Time                          0
Urban_or_Rural_Area           0
Weather_Conditions            0
Vehicle_Type                  0
dtype: int64

Vérification des doublons


In [4]:
data.duplicated().sum()

1

Statistiques descriptives pour les variables numériques


In [5]:
data[['Latitude', 'Longitude', 'Number_of_Casualties', 'Number_of_Vehicles', 'Speed_limit']].describe()

Unnamed: 0,Latitude,Longitude,Number_of_Casualties,Number_of_Vehicles,Speed_limit
count,307973.0,307973.0,307973.0,307973.0,307973.0
mean,52.487005,-1.368884,1.356882,1.829063,38.866037
std,1.339011,1.356092,0.815857,0.710477,14.032933
min,49.914488,-7.516225,1.0,1.0,10.0
25%,51.485248,-2.247937,1.0,1.0,30.0
50%,52.225943,-1.349258,1.0,2.0,30.0
75%,53.415517,-0.20681,1.0,2.0,50.0
max,60.598055,1.759398,48.0,32.0,70.0


Distribution des catégories pour quelques colonnes


In [6]:
print(data['Accident_Severity'].value_counts())
print(data['Road_Type'].value_counts())
print(data['Weather_Conditions'].value_counts())

Accident_Severity
Slight     263280
Serious     40740
Fatal        3904
Fetal          49
Name: count, dtype: int64
Road_Type
Single carriageway    232146
Dual carriageway       45467
Roundabout             20929
One way street          6197
Slip road               3234
Name: count, dtype: int64
Weather_Conditions
Fine no high winds       250553
Raining no high winds     34877
Other                      8802
Snowing no high winds      4839
Raining + high winds       3526
Fine + high winds          3148
Fog or mist                1690
Snowing + high winds        538
Name: count, dtype: int64


3. Prétraitement des données


Suppression des colonnes inutiles


In [7]:
columns_to_drop = ['Accident_Index', 'Police_Force', 'Carriageway_Hazards']
data = data.drop(columns=[col for col in columns_to_drop if col in data.columns], errors='ignore')

Encodage des colonnes catégorielles


In [8]:
from sklearn.preprocessing import LabelEncoder

categorical_columns = [
    'Day_of_Week', 'Junction_Control', 'Junction_Detail', 'Accident_Severity',
    'Light_Conditions', 'Local_Authority_(District)', 'Road_Surface_Conditions',
    'Road_Type', 'Urban_or_Rural_Area', 'Weather_Conditions', 'Vehicle_Type'
]

# Remplir les valeurs manquantes dans les colonnes catégorielles
data[categorical_columns] = data[categorical_columns].fillna('Unknown')

# Encodage avec LabelEncoder
label_encoder = LabelEncoder()
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

Traitement de la colonne 'Date'


In [9]:
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%Y', errors='coerce')
    data['Year'] = data['Date'].dt.year
    data['Month'] = data['Date'].dt.month
    data['Day'] = data['Date'].dt.day
    data = data.drop('Date', axis=1)

Traitement de la colonne 'Time'


In [10]:
if 'Time' in data.columns:
    data['Time'] = pd.to_datetime(data['Time'], format='%H:%M', errors='coerce').dt.hour * 60 + \
                   pd.to_datetime(data['Time'], format='%H:%M', errors='coerce').dt.minute

Normalisation des colonnes numériques


In [11]:
from sklearn.preprocessing import StandardScaler

numerical_columns = ['Latitude', 'Longitude', 'Speed_limit', 'Number_of_Casualties', 'Number_of_Vehicles']
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

4. Équilibrage des classes avec SMOTE


In [12]:
from imblearn.over_sampling import SMOTE
from collections import Counter

features = [
    'Junction_Control', 'Junction_Detail', 'Light_Conditions', 'Road_Surface_Conditions',
    'Road_Type', 'Speed_limit', 'Urban_or_Rural_Area', 'Weather_Conditions', 'Vehicle_Type',
    'Number_of_Casualties', 'Number_of_Vehicles', 'Latitude', 'Longitude'
]

X = data[features]
y = data['Accident_Severity']

# Appliquer SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("Distribution des classes après SMOTE :", Counter(y_resampled))

Distribution des classes après SMOTE : Counter({2: 263280, 3: 263280, 1: 263280, 0: 263280})


5. Division des données en ensembles d'entraînement et de test


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

print("Taille de X_train :", X_train.shape)
print("Taille de X_test :", X_test.shape)
print("Taille de y_train :", y_train.shape)
print("Taille de y_test :", y_test.shape)

Taille de X_train : (842496, 13)
Taille de X_test : (210624, 13)
Taille de y_train : (842496,)
Taille de y_test : (210624,)


6. Modélisation et évaluation


Random Forest Classifier


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("Rapport de classification pour Random Forest :")
print(classification_report(y_test, y_pred_rf))
print("Matrice de confusion pour Random Forest :")
print(confusion_matrix(y_test, y_pred_rf))

Rapport de classification pour Random Forest :
              precision    recall  f1-score   support

           0       0.93      0.96      0.94     52634
           1       1.00      1.00      1.00     52770
           2       0.80      0.80      0.80     52623
           3       0.81      0.78      0.80     52597

    accuracy                           0.89    210624
   macro avg       0.88      0.89      0.88    210624
weighted avg       0.88      0.89      0.88    210624

Matrice de confusion pour Random Forest :
[[50578    32   961  1063]
 [    0 52767     0     3]
 [ 1971    16 41945  8691]
 [ 1984    17  9356 41240]]


K-Nearest Neighbors (KNN)


In [15]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

print("Rapport de classification pour KNN :")
print(classification_report(y_test, y_pred_knn))
print("Matrice de confusion pour KNN :")
print(confusion_matrix(y_test, y_pred_knn))

Rapport de classification pour KNN :
              precision    recall  f1-score   support

           0       0.88      0.96      0.92     52634
           1       1.00      1.00      1.00     52770
           2       0.72      0.81      0.76     52623
           3       0.80      0.62      0.70     52597

    accuracy                           0.85    210624
   macro avg       0.85      0.85      0.84    210624
weighted avg       0.85      0.85      0.84    210624

Matrice de confusion pour KNN :
[[50653    75  1132   774]
 [    1 52767     0     2]
 [ 2834    60 42506  7223]
 [ 4358    96 15371 32772]]


Decision Tree Classifier


In [16]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(max_depth=10, min_samples_split=20, min_samples_leaf=10, class_weight='balanced', random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

print("Rapport de classification pour Decision Tree :")
print(classification_report(y_test, y_pred_dt))
print("Matrice de confusion pour Decision Tree :")
print(confusion_matrix(y_test, y_pred_dt))

Rapport de classification pour Decision Tree :
              precision    recall  f1-score   support

           0       0.62      0.73      0.67     52634
           1       0.95      1.00      0.97     52770
           2       0.50      0.33      0.40     52623
           3       0.58      0.64      0.61     52597

    accuracy                           0.67    210624
   macro avg       0.66      0.67      0.66    210624
weighted avg       0.66      0.67      0.66    210624

Matrice de confusion pour Decision Tree :
[[38321  1514  7090  5709]
 [    1 52768     1     0]
 [15775   688 17503 18657]
 [ 7919   720 10427 33531]]


Régression Logistique


In [17]:
from sklearn.linear_model import LogisticRegression

log_reg_model = LogisticRegression(max_iter=1000, random_state=42)
log_reg_model.fit(X_train, y_train)
y_pred_log_reg = log_reg_model.predict(X_test)

print("Rapport de classification pour la régression logistique :")
print(classification_report(y_test, y_pred_log_reg))
print("Matrice de confusion pour la régression logistique :")
print(confusion_matrix(y_test, y_pred_log_reg))

Rapport de classification pour la régression logistique :
              precision    recall  f1-score   support

           0       0.50      0.59      0.54     52634
           1       0.66      1.00      0.80     52770
           2       0.42      0.22      0.29     52623
           3       0.50      0.39      0.44     52597

    accuracy                           0.55    210624
   macro avg       0.52      0.55      0.52    210624
weighted avg       0.52      0.55      0.52    210624

Matrice de confusion pour la régression logistique :
[[31199  7106  7299  7030]
 [    0 52770     0     0]
 [18486  9205 11549 13383]
 [12578 10655  8609 20755]]


K-Means Clustering


In [18]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X_train)
y_pred_kmeans = kmeans.predict(X_test)

from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# Évaluation du modèle avec le score de silhouette
silhouette_avg = silhouette_score(X_test, y_pred_kmeans)
print(f"Silhouette Score: {silhouette_avg:.2f}")


Silhouette Score: 0.29


Support Vector Machine (SVM)


In [36]:
from sklearn.svm import SVC

svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

print("Rapport de classification pour SVM :")
print(classification_report(y_test, y_pred_svm))
print("Matrice de confusion pour SVM :")
print(confusion_matrix(y_test, y_pred_svm))