<a href="https://colab.research.google.com/github/mervefilizbaker1/DRIVER-RISK-PREDICTION/blob/main/DRP_supervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### DRIVER RISK PREDICTION - SUPERVISED LEARNING


In [23]:
#libraries
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import shap


In [2]:
# Read the accident dataset from the project folder.
# NOTE(review): the absolute path presumably requires Google Drive to be
# mounted at /content/drive before this cell runs - confirm.
accident_data_path = '/content/drive/MyDrive/DRP/df_accident'
df_accident = pd.read_csv(accident_data_path)

In [3]:
# Target: the risk class label.
y = df_accident['risk_class']

# Features: everything except the target and the severity / risk-score
# columns (presumably excluded because they would leak the label - confirm).
excluded_columns = [
    'risk_class',
    'Accident_severity',
    'driver_risk_score',
    'env_risk_score',
    'behaviour_risk_score',
    'total_risk_score',
]
X = df_accident.drop(columns=excluded_columns)

Train/Test Split

In [5]:
# 70/30 hold-out split, stratified on the label so both parts keep the
# original class proportions; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [6]:
# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to both splits.
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

In [7]:
# Oversample the training split only (the test split is left untouched) so
# that every class ends up with the same number of rows.
# NOTE(review): the grid searches below cross-validate on this resampled
# data, so their validation folds contain synthetic rows and the reported CV
# scores are likely optimistic.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X=X_train, y=y_train_encoded)

In [8]:
# Show how many rows each class has after resampling, keyed by the original
# (decoded) class label.
unique, counts = np.unique(y_train_res, return_counts=True)
class_distribution = dict(zip(le.inverse_transform(unique), counts))
print(class_distribution)

{'High': np.int64(4415), 'Low': np.int64(4415), 'Medium': np.int64(4415)}


RANDOM FOREST

In [10]:
# Baseline random forest with fixed hyper-parameters. The evaluation cells
# further down use the grid-searched `rf_best`, not this estimator.
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42,
)

In [11]:
# Fit the baseline forest on the SMOTE-resampled training data.
rf.fit(X=X_train_res, y=y_train_res)

In [12]:
# Hyper-parameter grid searched for the random forest (2 * 3 * 2 = 12 candidates).
rf_param = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
)

In [13]:
# 5-fold grid search over `rf_param`, scored by accuracy, using all cores.
rf_base_estimator = RandomForestClassifier(random_state=42, class_weight="balanced")
rf_grid = GridSearchCV(
    estimator=rf_base_estimator,
    param_grid=rf_param,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

In [14]:
# Run the random forest grid search on the resampled training data and
# report the winning parameter combination with its mean CV accuracy.
rf_grid.fit(X=X_train_res, y=y_train_res)
for label, value in (
    ("Best params:", rf_grid.best_params_),
    ("Best score:", rf_grid.best_score_),
):
    print(label, value)

Best params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Best score: 0.9563608909022273


In [15]:
# Keep the best random forest found by the grid search; this is the estimator
# used by the evaluation and SHAP cells further down.
rf_best = rf_grid.best_estimator_

LOGISTIC REGRESSION

In [17]:
# Baseline logistic regression with default regularisation. The evaluation
# cells further down use the grid-searched `log_best`, not this estimator.
log_reg = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
)
log_reg.fit(X=X_train_res, y=y_train_res)

In [18]:
# Hyper-parameter grid for logistic regression: four regularisation
# strengths crossed with two solvers, L2 penalty only.
# NOTE(review): "liblinear" handles a 3-class target one-vs-rest, unlike
# "lbfgs" - confirm that mixing the two in one search is intended.
log_param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=["l2"],
    solver=["lbfgs", "liblinear"],
)

In [19]:
# 3-fold grid search over `log_param_grid`, scored by accuracy, using all cores.
log_base_estimator = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    random_state=42,
)
log_grid = GridSearchCV(
    estimator=log_base_estimator,
    param_grid=log_param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)

In [20]:
# Run the logistic regression grid search on the resampled training data,
# keep the winning estimator, and report its parameters.
log_grid.fit(X=X_train_res, y=y_train_res)
log_best = log_grid.best_estimator_
print("\n Logistic Regression Best Params:", log_grid.best_params_)


 Logistic Regression Best Params: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}


XGBOOST

In [21]:
# Hyper-parameter grid for XGBoost (2 * 3 * 3 * 2 * 2 = 72 candidates).
xgb_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

In [24]:
# Multiclass XGBoost estimator for the three risk classes.
# - `multi:softprob` is used instead of `multi:softmax` so that
#   `predict_proba` stays usable; `predict` still returns class labels.
# - The deprecated `use_label_encoder` argument is dropped: the labels are
#   already integer-encoded by `le`, and current XGBoost releases ignore the
#   argument with a warning.
xgb = XGBClassifier(
    objective="multi:softprob",
    num_class=3,
    eval_metric="mlogloss",
    random_state=42,
)

In [25]:
# 3-fold grid search over `xgb_param_grid`, scored by accuracy.
# The search tunes the multiclass `xgb` estimator configured above.
# Previously it built a fresh XGBClassifier with the binary "logloss" metric
# and left the configured estimator unused. GridSearchCV clones the
# estimator, so `xgb` itself is not modified.
xgb_grid = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_param_grid,
    cv=3,
    scoring="accuracy",
    n_jobs=-1
)

In [None]:
# Run the XGBoost grid search on the resampled training data, keep the
# winning estimator, and report its parameters.
xgb_grid.fit(X=X_train_res, y=y_train_res)
xgb_best = xgb_grid.best_estimator_
print("\n XGBoost Best Params:", xgb_grid.best_params_)

RESULTS:

In [None]:
# Per-model evaluation results, filled in by the evaluation loop below
# (model name -> {"acc": ..., "cm": ...}).
results = dict()

In [None]:
# Tuned estimators to compare, keyed by display name. Insertion order sets
# the order of the reports and of the confusion-matrix panels.
model_names = ("Random Forest", "Logistic Regression", "XGBoost")
tuned_estimators = (rf_best, log_best, xgb_best)
models = dict(zip(model_names, tuned_estimators))

In [None]:
# Score every tuned model on the untouched test split: store accuracy and
# confusion matrix in `results`, and print a per-class report.
# The loop variables (`name`, `model`, ...) stay defined after the loop.
for name, model in models.items():
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test_encoded, y_pred)
    cm = confusion_matrix(y_test_encoded, y_pred)
    results[name] = dict(acc=acc, cm=cm)

    report = classification_report(y_test_encoded, y_pred, target_names=le.classes_)
    print(f"\n----- {name} -----")
    print("Accuracy:", round(acc, 4))
    print(report)

In [None]:
# One confusion-matrix panel per evaluated model, side by side, each titled
# with the model name and its test accuracy.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))
for ax, (name, res) in zip(axes, results.items()):
    disp = ConfusionMatrixDisplay(
        confusion_matrix=res["cm"],
        display_labels=le.classes_,
    )
    disp.plot(ax=ax, cmap="Blues", values_format="d", colorbar=False)
    panel_title = f"{name}\nAccuracy: {res['acc']:.3f}"
    ax.set_title(panel_title)
plt.tight_layout()
plt.show()

SHAP


In [None]:
import shap

In [None]:
# SHAP summary for the tuned logistic regression, using the resampled
# training data as the background distribution.
print("\n Logistic Regression SHAP Summary")
explainer_log = shap.LinearExplainer(log_best, X_train_res)
shap_values_log = explainer_log(X_test)

log_shap_array = shap_values_log.values
if log_shap_array.ndim == 3:
    # Multiclass model: values are (samples, features, classes). A 3-D array
    # passed to summary_plot is treated as interaction values, so draw one
    # plot per class instead, as the XGBoost SHAP cell does.
    for class_idx, class_name in enumerate(le.classes_):
        shap.summary_plot(log_shap_array[:, :, class_idx], X_test, show=False)
        plt.title(f"Logistic Regression SHAP Summary - {class_name}")
        plt.tight_layout()
        plt.show()
else:
    # Single-output explanation: one summary plot is enough.
    shap.summary_plot(shap_values_log, X_test, show=False)
    plt.title("Logistic Regression SHAP Summary")
    plt.show()

In [None]:
# SHAP summary for the tuned random forest.
# NOTE(review): the whole resampled training set is passed as background
# data, which can make the tree explainer very slow - consider a smaller
# background sample.
print("\n Random Forest SHAP Summary")
explainer_rf = shap.Explainer(rf_best, X_train_res)
shap_values_rf = explainer_rf(X_test)

rf_shap_array = shap_values_rf.values
if rf_shap_array.ndim == 3:
    # Multiclass model: values are (samples, features, classes). A 3-D array
    # passed to summary_plot is treated as interaction values, so draw one
    # plot per class instead, as the XGBoost SHAP cell does.
    for class_idx, class_name in enumerate(le.classes_):
        shap.summary_plot(rf_shap_array[:, :, class_idx], X_test, show=False)
        plt.title(f"Random Forest SHAP Summary - {class_name}")
        plt.tight_layout()
        plt.show()
else:
    # Single-output explanation: one summary plot is enough.
    shap.summary_plot(shap_values_rf, X_test, show=False)
    plt.title("Random Forest SHAP Summary")
    plt.show()

In [None]:
# SHAP analysis for the tuned XGBoost model: one summary plot per risk class.

# Work on a plain array and keep the column names separately for the plots.
X_test_array = X_test.values if hasattr(X_test, 'values') else np.array(X_test)
feature_names = list(X_test.columns) if hasattr(X_test, 'columns') else None

print(f"X_test_array shape: {X_test_array.shape}")

# Explain the tuned XGBoost estimator explicitly. The previous code used the
# `model` variable left over from the evaluation loop, which only pointed at
# XGBoost because it happened to be the last entry of `models`.
explainer = shap.TreeExplainer(xgb_best)

# The additivity check is disabled, as in the original analysis.
shap_values = explainer.shap_values(X_test_array, check_additivity=False)

print(f"  Tip: {type(shap_values)}")


def _plot_xgb_class_summary(class_shap, class_name):
    """Draw the SHAP summary plot for one class of the XGBoost model.

    Parameters
    ----------
    class_shap : array of SHAP values for a single class, one row per test
        sample and one column per feature.
    class_name : label of the class, used in the plot title.
    """
    shap.summary_plot(
        class_shap,
        X_test_array,
        feature_names=feature_names,
        show=False,
        max_display=20
    )
    plt.title(f"XGBoost SHAP Summary - {class_name}")
    plt.tight_layout()
    plt.show()


if isinstance(shap_values, np.ndarray):
    # One array holding all classes.
    print(f"  Shape: {shap_values.shape}")

    if shap_values.ndim == 3:
        print(f"  Format: (samples={shap_values.shape[0]}, features={shap_values.shape[1]}, classes={shap_values.shape[2]})")

        for i, class_name in enumerate(le.classes_):
            print(f"\n{'='*60}")
            print(f"SHAP Summary for class: {class_name}")
            print(f"{'='*60}")

            # Slice out the (samples, features) matrix of this class.
            class_shap = shap_values[:, :, i]
            print(f"  Class SHAP shape: {class_shap.shape}")

            _plot_xgb_class_summary(class_shap, class_name)

    else:
        print("⚠️ Unexpected SHAP shape!")
        print(f" shape: {shap_values.shape}")

elif isinstance(shap_values, list):
    # A list with one array per class.
    print(f"  Class length: {len(shap_values)}")

    for i, class_name in enumerate(le.classes_):
        print(f"\n{'='*60}")
        print(f"SHAP Summary for class: {class_name}")
        print(f"  SHAP shape: {shap_values[i].shape}")
        print(f"{'='*60}")

        _plot_xgb_class_summary(shap_values[i], class_name)

else:
    # Previously this case produced no output at all.
    print(f"⚠️ Unsupported SHAP output type: {type(shap_values)}")