In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report
import joblib
from datetime import datetime


# Load the raw crash export and immediately discard rows that lack a
# coordinate or a crash date, since every later step needs all three.
df = pd.read_csv("/content/Traffic_Crashes_-_Crashes_.csv").dropna(
    subset=["LATITUDE", "LONGITUDE", "CRASH_DATE"]
)


def parse_date(date_str):
    """Parse a crash timestamp string into a ``datetime``.

    The following layouts are tried in order and the first match wins:

    * ``%m/%d/%Y %I:%M:%S %p`` (12-hour clock with AM/PM)
    * ``%m/%d/%Y %H:%M``       (24-hour clock, no seconds)
    * ``%m/%d/%Y``             (date only)

    Args:
        date_str: The raw value to parse. Leading/trailing whitespace is
            ignored. Values that are already ``datetime`` instances are
            returned unchanged.

    Returns:
        A ``datetime`` on success, or ``pd.NaT`` when the value is missing,
        is not a string, or matches none of the known layouts.
    """
    # Already parsed (datetime / pandas Timestamp): nothing to do.
    if isinstance(date_str, datetime):
        return date_str

    # None, NaN and other non-string cells would make strptime raise
    # TypeError; treat them as "not a date" like any unparseable string.
    if not isinstance(date_str, str):
        return pd.NaT

    # strptime does not tolerate stray surrounding whitespace.
    candidate = date_str.strip()

    formats = (
        "%m/%d/%Y %I:%M:%S %p",
        "%m/%d/%Y %H:%M",
        "%m/%d/%Y",
    )
    for fmt in formats:
        try:
            return datetime.strptime(candidate, fmt)
        except ValueError:
            # Layout did not match; fall through to the next one.
            continue
    return pd.NaT

# Convert the raw CRASH_DATE text with parse_date, then drop the rows for
# which no timestamp could be produced.
df = df.assign(CRASH_DATETIME=df["CRASH_DATE"].map(parse_date)).dropna(
    subset=["CRASH_DATETIME"]
)

# Expose the hour of day and the day of week of each crash as their own
# columns so they can be used as model inputs.
df = df.assign(
    CRASH_HOUR=df["CRASH_DATETIME"].dt.hour,
    CRASH_DAY_OF_WEEK=df["CRASH_DATETIME"].dt.dayofweek,
)


# Columns fed to the model: road/environment conditions, the derived time
# fields, and the crash coordinates.
features = [
    "POSTED_SPEED_LIMIT",
    "WEATHER_CONDITION", "LIGHTING_CONDITION", "ROADWAY_SURFACE_COND",
    "CRASH_HOUR", "CRASH_DAY_OF_WEEK",
    "LATITUDE", "LONGITUDE",
]

# Build the "no accident" class synthetically: take a reproducible 80% sample
# of the real crash rows, scale the speed limit down to 70% and force the
# lighting condition to daylight.
# NOTE(review): these rows differ from the real crashes only through the two
# edits below, so a classifier may simply learn to detect the edits - confirm
# this is the intended way to obtain negatives.
non_accident_data = (
    df[features]
    .sample(frac=0.8, random_state=42)
    .assign(
        POSTED_SPEED_LIMIT=lambda sampled: sampled["POSTED_SPEED_LIMIT"] * 0.7,
        LIGHTING_CONDITION="DAYLIGHT",
    )
)


# Labels follow the stacking order used for X below: every real crash row is
# a positive (1), every synthetic row is a negative (0).
y = [1] * df.shape[0] + [0] * non_accident_data.shape[0]

# Stack real rows on top of synthetic rows, then one-hot encode the
# categorical columns (day of week is treated as a category, not a number).
X = pd.get_dummies(
    pd.concat([df[features], non_accident_data]),
    columns=[
        "WEATHER_CONDITION",
        "LIGHTING_CONDITION",
        "ROADWAY_SURFACE_COND",
        "CRASH_DAY_OF_WEEK",
    ],
)

# Hyper-parameter combinations explored by the grid search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5]
}

# Hold out a stratified 20% of the rows before any fitting. Reporting metrics
# on the rows the model was trained on overstates its quality, so the final
# report below uses only this unseen portion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 3-fold cross-validated grid search, selecting the combination with the best
# ROC AUC. GridSearchCV refits the best combination on the whole training
# portion, which is what `model.best_estimator_` exposes afterwards.
model = GridSearchCV(
    RandomForestClassifier(class_weight="balanced", random_state=42),
    param_grid,
    cv=3,
    scoring='roc_auc'
)
model.fit(X_train, y_train)

print(f"Meilleurs paramètres: {model.best_params_}")
# Mean cross-validated ROC AUC of the best combination (training portion).
print(f"AUC-ROC: {model.best_score_:.2f}")

# ROC AUC on the held-out rows, using the predicted probability of class 1.
test_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print(f"AUC-ROC (test): {test_auc:.2f}")

# Per-class precision/recall measured on the held-out rows only.
print(classification_report(y_test, model.predict(X_test)))


# Persist the refitted best estimator found by the grid search.
joblib.dump(model.best_estimator_, "optimized_model.joblib")
# Persist the exact one-hot column names, in order, so that inputs built at
# prediction time can be aligned with what the model was trained on.
joblib.dump(list(X.columns), "model_features.joblib")

  df = pd.read_csv("/content/Traffic_Crashes_-_Crashes_20250420.csv")


Meilleurs paramètres: {'max_depth': 7, 'min_samples_split': 5, 'n_estimators': 100}
AUC-ROC: 1.00
              precision    recall  f1-score   support

           0       0.91      0.98      0.94     60381
           1       0.99      0.92      0.95     75476

    accuracy                           0.95    135857
   macro avg       0.95      0.95      0.95    135857
weighted avg       0.95      0.95      0.95    135857



['model_features.joblib']