### **Supervised Machine Learning**

We will split the steps in doing supervised machine learning into few parts

#### Import package

In [165]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap

from sklearn.metrics import (
    recall_score, precision_score, accuracy_score,
    confusion_matrix, classification_report, roc_auc_score
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier



#### Reading dataset and selecting features

In [166]:
dfAccident = pd.read_csv('datasets/accident_clean.csv')
dfAccident.columns

Index(['ACCIDENT_NO', 'ACCIDENT_DATE', 'ACCIDENT_TIME', 'LIGHT_CONDITION',
       'ROAD_GEOMETRY', 'ROAD_GEOMETRY_DESC', 'SEVERITY', 'SPEED_ZONE',
       'ATMOSPH_COND', 'ATMOSPH_COND_DESC', 'SURFACE_COND',
       'SURFACE_COND_DESC', 'ACCIDENT_YEAR', 'VEHICLE_AGE',
       'TOTAL_NO_OCCUPANTS', 'VEHICLE_TYPE', 'TRAFFIC_CONTROL',
       'ROAD_SURFACE_TYPE', 'AGE_GROUP', 'SEATING_POSITION',
       'HELMET_BELT_WORN', 'SEVERITY_ORD', 'AGE_ORD', 'SEAT_CATEGORY',
       'VEHICLE_TYPE_CAT', 'TIME_OF_DAY', 'SEVERITY_BINARY',
       'SURFACE_COND_BINARY', 'ATMOSPH_COND_BINARY', 'SEAT_BINARY',
       'HELMET_BELT_BINARY', 'TRAFFIC_CONTROL_BINARY', 'ROAD_GEOMETRY_BINARY',
       'LIGHT_CONDITION_BINARY', 'SPEED_ZONE_BINARY', 'VEHICLE_TYPE_BINARY',
       'SURFACE_COND_RISK', 'ATMOSPH_COND_RISK', 'WEATHER_RISK',
       'VEHICLE_AGE_NORM', 'TOTAL_OCCUPANTS_NORM'],
      dtype='object')

In [167]:
# Defining categorical features with some can be treated as binary
features_binary = [
    'LIGHT_CONDITION_BINARY', 'ROAD_GEOMETRY',
    'TRAFFIC_CONTROL_BINARY', 'HELMET_BELT_BINARY',
    'VEHICLE_TYPE_BINARY','SEAT_BINARY',
    'SPEED_ZONE_BINARY', 'TIME_OF_DAY','AGE_ORD',
    # 'ATMOSPH_COND_BINARY','SURFACE_COND_BINARY'
    ]
X_bin = dfAccident[features_binary].astype('category')

# Defining categorical features
features_category = [
    'LIGHT_CONDITION', 'ROAD_GEOMETRY',
    'TRAFFIC_CONTROL', 'HELMET_BELT_WORN',
    'VEHICLE_TYPE_CAT','SEAT_CATEGORY',
    'SPEED_ZONE', 'TIME_OF_DAY',
    # 'ATMOSPH_COND','SURFACE_COND'
    ]
X_cat = dfAccident[features_category].astype('category')

# Defining numerical features
X_num = ['TOTAL_OCCUPANTS_NORM',
         'WEATHER_RISK',
         'VEHICLE_AGE_NORM']

# Defining target features
y = dfAccident['SEVERITY_BINARY']

# One hot features encoding
X_encoded_bin = pd.get_dummies(X_bin, drop_first=True)
X_encoded_cat = pd.get_dummies(X_cat, drop_first=True)

# Merging dataset
X_bin_original = pd.concat([X_bin, dfAccident[X_num]], axis=1)
X_bin_one_hot = pd.concat([X_encoded_bin, dfAccident[X_num]], axis=1)
X_cat_original = pd.concat([X_cat, dfAccident[X_num]], axis=1)
X_cat_one_hot = pd.concat([X_encoded_cat, dfAccident[X_num]], axis=1)


#### Logistic Regression Model

In [168]:
def logistic_model (X_type, name_type):
    # Training test split
    X_train, X_test, y_train, y_test = train_test_split(X_type, y, test_size=0.3, random_state=1)

    # Train logistic regression
    model_logit = LogisticRegression(max_iter=1000, class_weight='balanced')
    model_logit.fit(X_train, y_train)

    # Making prediction of y
    y_pred = model_logit.predict(X_test)
    y_prob = model_logit.predict_proba(X_test)[:, 1]

    # Testing AUC and accuracy of model
    auc = round(roc_auc_score(y_test, y_prob),4)
    acc = round(accuracy_score(y_test, y_pred),4)

    print("Logit Model", name_type , "AUC is ",auc)
    print("Logit Model", name_type , "Accuracy is ",acc)
    print("Logit Model", name_type , "Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print()
    print(classification_report(y_test, y_pred, zero_division=0))
    print()
    return model_logit

In [169]:
model_cat_original = logistic_model(X_cat_original, "Categorical Original")
model_cat_one_hot = logistic_model(X_cat_one_hot, "Categorical One-Hot")
model_bin_original = logistic_model(X_bin_original, "Binary Original")
model_bin_one_hot = logistic_model(X_bin_one_hot, "Binary One-Hot")

Logit Model Categorical Original AUC is  0.7532
Logit Model Categorical Original Accuracy is  0.725
Logit Model Categorical Original Confusion Matrix:
 [[21866  8260]
 [  181   384]]

              precision    recall  f1-score   support

           0       0.99      0.73      0.84     30126
           1       0.04      0.68      0.08       565

    accuracy                           0.72     30691
   macro avg       0.52      0.70      0.46     30691
weighted avg       0.97      0.72      0.82     30691


Logit Model Categorical One-Hot AUC is  0.7656
Logit Model Categorical One-Hot Accuracy is  0.7474
Logit Model Categorical One-Hot Confusion Matrix:
 [[22559  7567]
 [  186   379]]

              precision    recall  f1-score   support

           0       0.99      0.75      0.85     30126
           1       0.05      0.67      0.09       565

    accuracy                           0.75     30691
   macro avg       0.52      0.71      0.47     30691
weighted avg       0.97      0.75 

In [170]:
# # Feature importance
# importance = pd.Series(model_logit.coef_[0], index=X_type.columns)
# importance = importance.abs().sort_values(ascending=False)

# print(f"\nTop 10 Feature Importances for {name_type}:\n{importance.head(5)}")
# print()

#     # SHAP explanation
# print(f"\nGenerating SHAP summary plot for {name_type}...")
# explainer = shap.Explainer(model_logit, X_train)
# shap_values = explainer(X_test)
# shap.summary_plot(shap_values, X_test)

#### Decision Tree

In [215]:
def decision_tree (X_type, name_type):
    
    # Training test split
    X_train, X_test, y_train, y_test = train_test_split(X_type, y, test_size=0.3, random_state=1)
    
    # Train decision tree modelling
    dtree = DecisionTreeClassifier(random_state=30, max_depth=5, min_samples_leaf=30, class_weight='balanced')
    dtree.fit(X_train, y_train)
    
    # Making prediction of y using decision tree
    y_pred = dtree.predict(X_test)
    y_prob = dtree.predict_proba(X_test)[:, 1]

    # Testing AUC and accuracy of model
    auc = round(roc_auc_score(y_test, y_prob),4)
    acc = round(accuracy_score(y_test, y_pred),4)

    print("Decision Tree", name_type , "AUC is ",auc)
    print("Decision Tree", name_type , "Accuracy is ",acc)
    print("Decision Tree", name_type , "Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print()
    print(classification_report(y_test, y_pred, zero_division=0))
    print()
    return dtree

In [216]:
dtree_cat_original = decision_tree(X_cat_original, "Categorical Original")
dtree_cat_one_hot = decision_tree(X_cat_one_hot, "Categorical One-Hot")
dtree_bin_original = decision_tree(X_bin_original, "Binary Original")
dtree_bin_one_hot = decision_tree(X_bin_one_hot, "Binary One-Hot")

Decision Tree Categorical Original AUC is  0.7399
Decision Tree Categorical Original Accuracy is  0.7707
Decision Tree Categorical Original Confusion Matrix:
 [[23307  6819]
 [  219   346]]

              precision    recall  f1-score   support

           0       0.99      0.77      0.87     30126
           1       0.05      0.61      0.09       565

    accuracy                           0.77     30691
   macro avg       0.52      0.69      0.48     30691
weighted avg       0.97      0.77      0.85     30691


Decision Tree Categorical One-Hot AUC is  0.701
Decision Tree Categorical One-Hot Accuracy is  0.7943
Decision Tree Categorical One-Hot Confusion Matrix:
 [[24059  6067]
 [  246   319]]

              precision    recall  f1-score   support

           0       0.99      0.80      0.88     30126
           1       0.05      0.56      0.09       565

    accuracy                           0.79     30691
   macro avg       0.52      0.68      0.49     30691
weighted avg       0.9

#### Light GBM

In [None]:
def light_gbm (X_type, name_type):
    
    # Training test split
    X_train, X_test, y_train, y_test = train_test_split(X_type, y, test_size=0.3, random_state=1)
    
    # Calculate scale_pos_weight for class imbalance
    scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)
    
    # Train LightGBM model
    lgb = LGBMClassifier(scale_pos_weight=scale_pos_weight,
                         max_depth=6, min_samples_leaf=15,
                         verbose=-1,
                         random_state=42)
    lgb.fit(X_train, y_train)
    
    # Making prediction of y using decision tree
    y_pred = lgb.predict(X_test)
    y_prob = lgb.predict_proba(X_test)[:, 1]

    # Testing AUC and accuracy of model
    auc = round(roc_auc_score(y_test, y_prob),4)
    acc = round(accuracy_score(y_test, y_pred),4)

    print("LightGBM", name_type , "AUC is ",auc)
    print("LightGBM", name_type , "Accuracy is ",acc)
    print("LightGBM", name_type , "Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print()
    print(classification_report(y_test, y_pred, zero_division=0))
    print()
    return lgb


In [187]:
lgb_cat_original = light_gbm(X_cat_original, "Categorical Original")
lgb_cat_one_hot = light_gbm(X_cat_one_hot, "Categorical One-Hot")
lgb_bin_original = light_gbm(X_bin_original, "Binary Original")
lgb_bin_one_hot = light_gbm(X_bin_one_hot, "Binary One-Hot")

LightGBM Categorical Original AUC is  0.7457
LightGBM Categorical Original Accuracy is  0.7733
LightGBM Categorical Original Confusion Matrix:
 [[23384  6742]
 [  217   348]]

              precision    recall  f1-score   support

           0       0.99      0.78      0.87     30126
           1       0.05      0.62      0.09       565

    accuracy                           0.77     30691
   macro avg       0.52      0.70      0.48     30691
weighted avg       0.97      0.77      0.86     30691


LightGBM Categorical One-Hot AUC is  0.7487
LightGBM Categorical One-Hot Accuracy is  0.7757
LightGBM Categorical One-Hot Confusion Matrix:
 [[23463  6663]
 [  221   344]]

              precision    recall  f1-score   support

           0       0.99      0.78      0.87     30126
           1       0.05      0.61      0.09       565

    accuracy                           0.78     30691
   macro avg       0.52      0.69      0.48     30691
weighted avg       0.97      0.78      0.86     306

#### XGBoost

In [203]:
def xgboost (X_type, name_type):
    
    # Training test split
    X_train, X_test, y_train, y_test = train_test_split(X_type, y, test_size=0.3, random_state=1)
    
    # Calculate scale_pos_weight for class imbalance
    scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)
    
    # Train LightGBM model
    xgb = XGBClassifier(scale_pos_weight=scale_pos_weight,
                        # n_estimators=200, learning_rate=0.05, max_depth=6,
                        random_state=42,
                        eval_metric='auc'
                        )
    
    xgb.fit(X_train, y_train)
    
    # Making prediction of y using decision tree
    y_pred = xgb.predict(X_test)
    y_prob = xgb.predict_proba(X_test)[:, 1]

    # Testing AUC and accuracy of model
    auc = round(roc_auc_score(y_test, y_prob),4)
    acc = round(accuracy_score(y_test, y_pred),4)

    print("XGBoost", name_type , "AUC is ",auc)
    print("XGBoost", name_type , "Accuracy is ",acc)
    print("XGBoost", name_type , "Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print()
    print(classification_report(y_test, y_pred, zero_division=0))
    print()
    return xgb


In [204]:
xgb_cat_one_hot = xgboost(X_cat_one_hot, "Categorical One-Hot")
xgb_bin_one_hot = xgboost(X_bin_one_hot, "Binary One-Hot")

XGBoost Categorical One-Hot AUC is  0.7186
XGBoost Categorical One-Hot Accuracy is  0.7949
XGBoost Categorical One-Hot Confusion Matrix:
 [[24089  6037]
 [  258   307]]

              precision    recall  f1-score   support

           0       0.99      0.80      0.88     30126
           1       0.05      0.54      0.09       565

    accuracy                           0.79     30691
   macro avg       0.52      0.67      0.49     30691
weighted avg       0.97      0.79      0.87     30691


XGBoost Binary One-Hot AUC is  0.7022
XGBoost Binary One-Hot Accuracy is  0.7976
XGBoost Binary One-Hot Confusion Matrix:
 [[24202  5924]
 [  288   277]]

              precision    recall  f1-score   support

           0       0.99      0.80      0.89     30126
           1       0.04      0.49      0.08       565

    accuracy                           0.80     30691
   macro avg       0.52      0.65      0.48     30691
weighted avg       0.97      0.80      0.87     30691


