# Classifiers for accident fatality

### Code for training a Random Forest model

#### First, train three basic RF models (using default settings) with the different training data (original, oversampled and undersampled).

##### Libraries

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from joblib import load
import xgboost as xgb
from xgboost import XGBClassifier
import numpy as np

##### Data

In [2]:
# Each split is stored in ../00_data as a feature file (X_*) and a label file (y_*).
# Both files of a pair are read in the same order as before: features first, labels second.

# Original training split, no resampling applied
X_train_orig, y_train_orig = map(pd.read_csv, (
    '../00_data/X_train_orig_road_acc.csv',
    '../00_data/y_train_orig_road_acc.csv',
))

# Oversampled training split
X_train_oversamp, y_train_oversamp = map(pd.read_csv, (
    '../00_data/X_train_oversamp_road_acc.csv',
    '../00_data/y_train_oversamp_road_acc.csv',
))

# Undersampled training split
X_train_undersamp, y_train_undersamp = map(pd.read_csv, (
    '../00_data/X_train_undersamp_road_acc.csv',
    '../00_data/y_train_undersamp_road_acc.csv',
))

# Ensemble-resampled training split
X_train_ensemble, y_train_ensemble = map(pd.read_csv, (
    '../00_data/X_train_ensemble_road_acc.csv',
    '../00_data/y_train_ensemble_road_acc.csv',
))

# Validation split, used to score every model in this notebook
X_val, y_val = map(pd.read_csv, (
    '../00_data/X_val_road_acc.csv',
    '../00_data/y_val_road_acc.csv',
))

#### RF model trained on original (unbalanced) data

In [8]:
# Baseline random forest with default hyperparameters; the fixed seed keeps runs repeatable
rf_clf_orig = RandomForestClassifier(random_state=33)

# Flatten the label frames to 1-D arrays once instead of at every call site
y_orig_1d = y_train_orig.values.ravel()
y_val_1d = y_val.values.ravel()

# 5-fold cross-validation on the original training split (estimator's default scoring)
cv_scores_orig = cross_val_score(rf_clf_orig, X_train_orig, y_orig_1d, cv=5)

# Report the per-fold scores and their mean
print("RF model (default values) trained on original (unbalanced) data")
print("Cross-validation scores for each fold:", cv_scores_orig)
print("Average cross-validation score:", cv_scores_orig.mean())

# Train on the complete original training split
rf_clf_orig.fit(X_train_orig, y_orig_1d)

# Column 1 of predict_proba is the positive-class probability, which AUC needs
prob_predictions = rf_clf_orig.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val_1d, prob_predictions)
print(f"AUC Score: {auc_score}")

# Hard class labels feed accuracy, the classification report and the confusion matrix
class_predictions = rf_clf_orig.predict(X_val)
val_accuracy = accuracy_score(y_val_1d, class_predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Per-class precision / recall / F1
print(classification_report(y_val_1d, class_predictions))

# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_val_1d, class_predictions)
print("Confusion Matrix:")
print(conf_matrix)

RF model (default values) trained on original (unbalanced) data
Cross-validation scores for each fold: [0.98997223 0.9899275  0.98999442 0.98987172 0.99022867]
Average cross-validation score: 0.989998906921973
AUC Score: 0.585031640849742
Validation Accuracy: 0.9900368542694734
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     95168
           1       0.00      0.00      0.00       886

    accuracy                           0.99     96054
   macro avg       0.50      0.50      0.50     96054
weighted avg       0.98      0.99      0.99     96054

Confusion Matrix:
[[95097    71]
 [  886     0]]


As expected, this model did not perform well. It basically predicts everything as non-fatal and has zero true positives, i.e., zero instances of correctly classified fatal accidents. The area under the curve is 58.5%, meaning the model is only slightly better than a random guesser.

#### RF model trained on oversampled data

In [5]:
# Default-parameter random forest for the oversampled training data (fixed seed)
rf_clf_oversamp = RandomForestClassifier(random_state=33)

# Flatten the label frames to 1-D arrays once instead of at every call site
y_oversamp_1d = y_train_oversamp.values.ravel()
y_val_1d = y_val.values.ravel()

# 5-fold cross-validation on the oversampled training split.
# NOTE(review): the folds are drawn from already-resampled data, so these scores
# may be optimistic compared with the untouched validation set - confirm.
cv_scores_oversamp = cross_val_score(rf_clf_oversamp, X_train_oversamp, y_oversamp_1d, cv=5)

# Report the per-fold scores and their mean
print("RF model (default values) trained on oversampled data")
print("Cross-validation scores for each fold:", cv_scores_oversamp)
print("Average cross-validation score:", cv_scores_oversamp.mean())

# Train on the complete oversampled training split
rf_clf_oversamp.fit(X_train_oversamp, y_oversamp_1d)

# Column 1 of predict_proba is the positive-class probability, which AUC needs
prob_predictions = rf_clf_oversamp.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val_1d, prob_predictions)
print(f"AUC Score: {auc_score}")

# Hard class labels feed accuracy, the classification report and the confusion matrix
class_predictions = rf_clf_oversamp.predict(X_val)
val_accuracy = accuracy_score(y_val_1d, class_predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Per-class precision / recall / F1
print(classification_report(y_val_1d, class_predictions))

# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_val_1d, class_predictions)
print("Confusion Matrix:")
print(conf_matrix)

RF model (default values) trained on oversampled data
Cross-validation scores for each fold: [0.95264124 0.97006918 0.9691685  0.96983838 0.97039568]
Average cross-validation score: 0.9664225974102694
AUC Score: 0.5879311408523987
Validation Accuracy: 0.9551502279967519
              precision    recall  f1-score   support

           0       0.99      0.96      0.98     95168
           1       0.03      0.13      0.05       886

    accuracy                           0.96     96054
   macro avg       0.51      0.55      0.51     96054
weighted avg       0.98      0.96      0.97     96054

Confusion Matrix:
[[91630  3538]
 [  770   116]]


Using oversampled training data slightly improved the model; however, it still performs quite badly. Now we have 116 true positives and almost the same area under the curve (58.8%) as the previous model.

#### RF model trained on undersampled data

In [6]:
# Default-parameter random forest for the undersampled training data (fixed seed)
rf_clf_undersamp = RandomForestClassifier(random_state = 33)

# 5-fold cross-validation on the undersampled training split (estimator's default scoring)
cv_scores_undersamp = cross_val_score(rf_clf_undersamp, X_train_undersamp, y_train_undersamp.values.ravel(), cv = 5)

# Print the cross-validation scores for each fold
print("RF model (default values) trained on undersampled data")
print("Cross-validation scores for each fold:", cv_scores_undersamp)

# Print the average cross-validation score
print("Average cross-validation score:", cv_scores_undersamp.mean())

# Fit the model to the full undersampled training data
rf_clf_undersamp.fit(X_train_undersamp, y_train_undersamp.values.ravel())

# Predicting probabilities on the validation set
prob_predictions = rf_clf_undersamp.predict_proba(X_val)[:, 1]  # probabilities for the positive class

# Calculate AUC
auc_score = roc_auc_score(y_val.values.ravel(), prob_predictions)
print(f"AUC Score: {auc_score}")

# Predicting class labels (for accuracy, confusion matrix, etc.)
class_predictions = rf_clf_undersamp.predict(X_val)

# Evaluating the model on the validation set
# (the comma between the two arguments was missing, which made the whole cell a SyntaxError)
val_accuracy = accuracy_score(y_val.values.ravel(), class_predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Detailed classification report
print(classification_report(y_val.values.ravel(), class_predictions))

# Confusion Matrix (rows are true classes, columns are predicted classes)
conf_matrix = confusion_matrix(y_val.values.ravel(), class_predictions)
print("Confusion Matrix:")
print(conf_matrix)

RF model (default values) trained on undersampled data
Cross-validation scores for each fold: [0.6562123  0.67410984 0.65117683 0.67229934 0.65238383]
Average cross-validation score: 0.6612364257931224
AUC Score: 0.6840373696756388
Validation Accuracy: 0.6697586774106232
              precision    recall  f1-score   support

           0       0.99      0.67      0.80     95168
           1       0.02      0.62      0.03       886

    accuracy                           0.67     96054
   macro avg       0.51      0.64      0.42     96054
weighted avg       0.99      0.67      0.79     96054

Confusion Matrix:
[[63784 31384]
 [  337   549]]


Using undersampled data improved the results further, increasing the number of true positives to 549 and the area under the curve to 68.4%. However, the number of false positives (non-fatal accidents predicted as fatal) also increased drastically. <br> <br>
None of the models yielded particularly good results, thus we will test some parameter tuning. We will focus on oversampled and undersampled training data only for the parameter tuning.

#### Hyperparameter tuning

#### Oversampling

In [9]:
# The hyperparameter search runs on a random 10% sample of the oversampled training set
subset_size = 0.1
X_train_oversamp_subset = X_train_oversamp.sample(frac=subset_size, random_state=33)

# Pick the labels by index label so they stay aligned with the sampled feature rows
sampled_rows = X_train_oversamp_subset.index
y_train_oversamp_subset = y_train_oversamp.loc[sampled_rows]

In [14]:
# Candidate values from which the randomized search draws parameter combinations
param_distributions = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
}

# Unfitted base estimator that the search clones for each candidate
rf_clf = RandomForestClassifier(random_state=33)

# 100 sampled candidates x 5 folds, run on 12 parallel workers
rf_random_search = RandomizedSearchCV(
    rf_clf,
    param_distributions,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=33,
    n_jobs=12,
)

# Run the search on the 10% oversampled subset
y_oversamp_subset_1d = y_train_oversamp_subset.values.ravel()
rf_random_search.fit(X_train_oversamp_subset, y_oversamp_subset_1d)

# Estimator configured with the best-scoring parameter combination
best_rf_clf = rf_random_search.best_estimator_

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [15]:
# Flatten the label frames to 1-D arrays once instead of at every call site
y_oversamp_1d = y_train_oversamp.values.ravel()
y_val_1d = y_val.values.ravel()

# The search only saw a subset, so train the winning configuration on the full oversampled data
best_rf_clf.fit(X_train_oversamp, y_oversamp_1d)

# Positive-class probabilities on the validation set, used for AUC
prob_predictions = best_rf_clf.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val_1d, prob_predictions)
print(f"AUC Score: {auc_score}")

# Hard class labels feed accuracy, the classification report and the confusion matrix
class_predictions = best_rf_clf.predict(X_val)
val_accuracy = accuracy_score(y_val_1d, class_predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Per-class precision / recall / F1
print("Classification Report:\n", classification_report(y_val_1d, class_predictions))

# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_val_1d, class_predictions)
print("Confusion Matrix:\n", conf_matrix)

# Parameter combination selected by the randomized search
print("Best Hyperparameters:\n", rf_random_search.best_params_)

AUC Score: 0.6049742045811632
Validation Accuracy: 0.9559726820330231
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.96      0.98     95168
           1       0.03      0.13      0.05       886

    accuracy                           0.96     96054
   macro avg       0.51      0.55      0.51     96054
weighted avg       0.98      0.96      0.97     96054

Confusion Matrix:
 [[91711  3457]
 [  772   114]]
Best Hyperparameters:
 {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'criterion': 'entropy', 'bootstrap': False}


#### Undersampling

In [16]:
# Candidate values from which the randomized search draws parameter combinations
param_distributions = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
}

# Unfitted base estimator that the search clones for each candidate
rf_clf = RandomForestClassifier(random_state=33)

# 100 sampled candidates x 5 folds, run on 12 parallel workers
rf_random_search = RandomizedSearchCV(
    rf_clf,
    param_distributions,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=33,
    n_jobs=12,
)

# Flatten the label frames to 1-D arrays once instead of at every call site
y_undersamp_1d = y_train_undersamp.values.ravel()
y_val_1d = y_val.values.ravel()

# The undersampled set is searched in full (no subsetting here)
rf_random_search.fit(X_train_undersamp, y_undersamp_1d)

# Estimator configured with the best-scoring parameter combination
best_rf_clf = rf_random_search.best_estimator_

# Fit the winning configuration on the full undersampled training data.
# NOTE(review): the search above already ran on this same data, so with the default
# refit behaviour this call presumably repeats work the search has done - confirm.
best_rf_clf.fit(X_train_undersamp, y_undersamp_1d)

# Positive-class probabilities on the validation set, used for AUC
prob_predictions = best_rf_clf.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val_1d, prob_predictions)
print(f"AUC Score: {auc_score}")

# Hard class labels feed accuracy, the classification report and the confusion matrix
class_predictions = best_rf_clf.predict(X_val)
val_accuracy = accuracy_score(y_val_1d, class_predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Per-class precision / recall / F1
print("Classification Report:\n", classification_report(y_val_1d, class_predictions))

# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_val_1d, class_predictions)
print("Confusion Matrix:\n", conf_matrix)

# Parameter combination selected by the randomized search
print("Best Hyperparameters:\n", rf_random_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
AUC Score: 0.736727463354338
Validation Accuracy: 0.7450288379453224
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.75      0.85     95168
           1       0.02      0.61      0.04       886

    accuracy                           0.75     96054
   macro avg       0.51      0.68      0.45     96054
weighted avg       0.99      0.75      0.85     96054

Confusion Matrix:
 [[71021 24147]
 [  344   542]]
Best Hyperparameters:
 {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 10, 'criterion': 'entropy', 'bootstrap': False}


Parameter tuning did not really improve the results. The AUC scores improved, especially for the undersampled data, but the number of true positives remained more or less the same.

#### Feature importance of tuned models

#### Oversampled

Below comes the feature importance analysis from the tuned random forest classifiers.

In [11]:
# Retraining the model (in case not done at the same time as parameter tuning)

# Rebuild the tuned oversampling model from the hyperparameters the search reported,
# so this section can run without repeating the search
best_rf_clf_over = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
    n_jobs=12,
    random_state=33,
)

# Flatten the label frames to 1-D arrays once instead of at every call site
y_oversamp_1d = y_train_oversamp.values.ravel()
y_val_1d = y_val.values.ravel()

# Train on the complete oversampled training split
best_rf_clf_over.fit(X_train_oversamp, y_oversamp_1d)

# Column 1 of predict_proba is the positive-class probability, which AUC needs
prob_predictions = best_rf_clf_over.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val_1d, prob_predictions)
print(f"AUC Score: {auc_score}")

# Hard class labels feed accuracy, the classification report and the confusion matrix
class_predictions = best_rf_clf_over.predict(X_val)
val_accuracy = accuracy_score(y_val_1d, class_predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Per-class precision / recall / F1
print(classification_report(y_val_1d, class_predictions))

# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_val_1d, class_predictions)
print("Confusion Matrix:")
print(conf_matrix)

AUC Score: 0.6049742342305247
Validation Accuracy: 0.9559726820330231
              precision    recall  f1-score   support

           0       0.99      0.96      0.98     95168
           1       0.03      0.13      0.05       886

    accuracy                           0.96     96054
   macro avg       0.51      0.55      0.51     96054
weighted avg       0.98      0.96      0.97     96054

Confusion Matrix:
[[91711  3457]
 [  772   114]]


In [12]:
# Encoder persisted with joblib; presumably the one fitted during preprocessing,
# so its output names line up with the training columns - TODO confirm
encoder = load('../00_data/encoder.joblib')

# One importance value per input feature of the tuned oversampling forest
importances = best_rf_clf_over.feature_importances_

# Encoded column names reported by the encoder
feature_names = encoder.get_feature_names_out()

# Pair names with importances and rank from most to least important
feature_importances_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Show the ranking and persist it for later use
print(feature_importances_df)
feature_importances_df.to_csv('../00_data/feature_importance_RF_oversampled.csv', index=False)


                                          feature  importance
69                         speed_limit_bins_30-39    0.042290
20                     junction_detail_roundabout    0.037156
75                       time_of_day_evening_rush    0.035745
0                              day_of_week_friday    0.034288
6                           day_of_week_wednesday    0.033062
..                                            ...         ...
63  carriageway_hazards_pedestrian_in_carriageway    0.000102
41          weather_conditions_snowing_high_winds    0.000094
47     road_surface_conditions_oil_or_diesel_road    0.000031
46               road_surface_conditions_mud_road    0.000008
59                carriageway_hazards_dog_on_road    0.000007

[79 rows x 2 columns]


#### Undersampled

In [14]:
# Retraining the model (in case not done at the same time as parameter tuning)

# Rebuild the tuned undersampling model from the hyperparameters the search reported,
# so this section can run without repeating the search
best_rf_clf_under = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='log2',
    bootstrap=False,
    n_jobs=12,
    random_state=33,
)

# Flatten the label frames to 1-D arrays once instead of at every call site
y_undersamp_1d = y_train_undersamp.values.ravel()
y_val_1d = y_val.values.ravel()

# Train on the complete undersampled training split
best_rf_clf_under.fit(X_train_undersamp, y_undersamp_1d)

# Column 1 of predict_proba is the positive-class probability, which AUC needs
prob_predictions = best_rf_clf_under.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val_1d, prob_predictions)
print(f"AUC Score: {auc_score}")

# Hard class labels feed accuracy, the classification report and the confusion matrix
class_predictions = best_rf_clf_under.predict(X_val)
val_accuracy = accuracy_score(y_val_1d, class_predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Per-class precision / recall / F1
print(classification_report(y_val_1d, class_predictions))

# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_val_1d, class_predictions)
print("Confusion Matrix:")
print(conf_matrix)

AUC Score: 0.736727463354338
Validation Accuracy: 0.7450288379453224
              precision    recall  f1-score   support

           0       1.00      0.75      0.85     95168
           1       0.02      0.61      0.04       886

    accuracy                           0.75     96054
   macro avg       0.51      0.68      0.45     96054
weighted avg       0.99      0.75      0.85     96054

Confusion Matrix:
[[71021 24147]
 [  344   542]]


In [15]:
# Encoder persisted with joblib; presumably the one fitted during preprocessing,
# so its output names line up with the training columns - TODO confirm
encoder = load('../00_data/encoder.joblib')

# One importance value per input feature of the tuned undersampling forest
importances = best_rf_clf_under.feature_importances_

# Encoded column names reported by the encoder
feature_names = encoder.get_feature_names_out()

# Pair names with importances and rank from most to least important
feature_importances_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Show the ranking and persist it for later use
print(feature_importances_df)
feature_importances_df.to_csv('../00_data/feature_importance_RF_undersampled.csv', index=False)

                                          feature  importance
66                      urban_or_rural_area_rural    0.102950
20                     junction_detail_roundabout    0.102151
69                         speed_limit_bins_30-39    0.074415
67                      urban_or_rural_area_urban    0.069410
72                         speed_limit_bins_60-69    0.060589
..                                            ...         ...
59                carriageway_hazards_dog_on_road    0.000000
41          weather_conditions_snowing_high_winds    0.000000
63  carriageway_hazards_pedestrian_in_carriageway    0.000000
47     road_surface_conditions_oil_or_diesel_road    0.000000
46               road_surface_conditions_mud_road    0.000000

[79 rows x 2 columns]


### Code for training an XGBoost model

#### First, train three basic XGB models (using default settings) with the different training data (original, oversampled and undersampled).

In [3]:
# XGBoost baseline on the unbalanced (original) training data

# Default XGBoost classifier with log-loss as evaluation metric and a fixed seed
xgb_clf_orig = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=33)

# Flatten the label frames to 1-D arrays once instead of at every call site
y_orig_1d = y_train_orig.values.ravel()
y_val_1d = y_val.values.ravel()

# Train on the original training split
xgb_clf_orig.fit(X_train_orig, y_orig_1d)

# Hard class labels for the validation set and their accuracy
y_val_pred = xgb_clf_orig.predict(X_val)
val_accuracy = accuracy_score(y_val_1d, y_val_pred)

# ROC AUC is computed from the positive-class probability (column 1)
y_val_pred_prob = xgb_clf_orig.predict_proba(X_val)[:, 1]
val_roc_auc = roc_auc_score(y_val_1d, y_val_pred_prob)

# Confusion matrix (rows true, columns predicted) and per-class report
val_conf_matrix = confusion_matrix(y_val_1d, y_val_pred)
val_class_report = classification_report(y_val_1d, y_val_pred)

# Summary of all validation metrics
print("Results from XGBoost trained on unbalanced (original) data")
print(f'Validation Accuracy: {val_accuracy}')
print(f'Validation ROC AUC Score: {val_roc_auc}')
print('Validation Classification Report:')
print(val_class_report)
print('Validation Confusion Matrix:')
print(val_conf_matrix)


Results from XGBoost trained on unbalanced (original) data
Validation Accuracy: 0.990776021821059
Validation ROC AUC Score: 0.7308531480411118
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     95168
           1       0.00      0.00      0.00       886

    accuracy                           0.99     96054
   macro avg       0.50      0.50      0.50     96054
weighted avg       0.98      0.99      0.99     96054

Validation Confusion Matrix:
[[95168     0]
 [  886     0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Predicted everything as non-fatal, so using the unbalanced data here is useless

In [4]:
# XGBoost baseline on the SMOTE-oversampled training data

# Default XGBoost classifier with log-loss as evaluation metric and a fixed seed
xgb_clf_over = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=33)

# Flatten the label frames to 1-D arrays once instead of at every call site
y_oversamp_1d = y_train_oversamp.values.ravel()
y_val_1d = y_val.values.ravel()

# Train on the oversampled training split
xgb_clf_over.fit(X_train_oversamp, y_oversamp_1d)

# Hard class labels for the validation set and their accuracy
y_val_pred = xgb_clf_over.predict(X_val)
val_accuracy = accuracy_score(y_val_1d, y_val_pred)

# ROC AUC is computed from the positive-class probability (column 1)
y_val_pred_prob = xgb_clf_over.predict_proba(X_val)[:, 1]
val_roc_auc = roc_auc_score(y_val_1d, y_val_pred_prob)

# Confusion matrix (rows true, columns predicted) and per-class report
val_conf_matrix = confusion_matrix(y_val_1d, y_val_pred)
val_class_report = classification_report(y_val_1d, y_val_pred)

# Summary of all validation metrics
print("Results from XGBoost trained on oversampled (SMOTE) data")
print(f'Validation Accuracy: {val_accuracy}')
print(f'Validation ROC AUC Score: {val_roc_auc}')
print('Validation Classification Report:')
print(val_class_report)
print('Validation Confusion Matrix:')
print(val_conf_matrix)


Results from XGBoost trained on oversampled (SMOTE) data
Validation Accuracy: 0.9583359360359798
Validation ROC AUC Score: 0.6752880625219168
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     95168
           1       0.04      0.15      0.06       886

    accuracy                           0.96     96054
   macro avg       0.52      0.56      0.52     96054
weighted avg       0.98      0.96      0.97     96054

Validation Confusion Matrix:
[[91915  3253]
 [  749   137]]


Better but still pretty bad... Only correctly predicts 137 fatal accidents. Better AUC score compared to random forest classifiers.

In [5]:
# XGBoost baseline on the randomly undersampled training data

# Default XGBoost classifier with log-loss as evaluation metric and a fixed seed
xgb_clf_under = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=33)

# Flatten the label frames to 1-D arrays once instead of at every call site
y_undersamp_1d = y_train_undersamp.values.ravel()
y_val_1d = y_val.values.ravel()

# Train on the undersampled training split
xgb_clf_under.fit(X_train_undersamp, y_undersamp_1d)

# Hard class labels for the validation set and their accuracy
y_val_pred = xgb_clf_under.predict(X_val)
val_accuracy = accuracy_score(y_val_1d, y_val_pred)

# ROC AUC is computed from the positive-class probability (column 1)
y_val_pred_prob = xgb_clf_under.predict_proba(X_val)[:, 1]
val_roc_auc = roc_auc_score(y_val_1d, y_val_pred_prob)

# Confusion matrix (rows true, columns predicted) and per-class report
val_conf_matrix = confusion_matrix(y_val_1d, y_val_pred)
val_class_report = classification_report(y_val_1d, y_val_pred)

# Summary of all validation metrics
print("Results from XGBoost trained on undersampled (randomised) data")
print(f'Validation Accuracy: {val_accuracy}')
print(f'Validation ROC AUC Score: {val_roc_auc}')
print('Validation Classification Report:')
print(val_class_report)
print('Validation Confusion Matrix:')
print(val_conf_matrix)

Results from XGBoost trained on undersampled (randomised) data
Validation Accuracy: 0.7140150332104858
Validation ROC AUC Score: 0.7191545477471419
Validation Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.71      0.83     95168
           1       0.02      0.63      0.04       886

    accuracy                           0.71     96054
   macro avg       0.51      0.67      0.44     96054
weighted avg       0.99      0.71      0.82     96054

Validation Confusion Matrix:
[[68024 27144]
 [  326   560]]


Even better but may be improved with hyperparameter tuning.

#### Perform randomised hyperparameter search to see if XGB models can be improved. Only train with over- and undersampled data.

#### Oversampled

In [3]:
# The hyperparameter search runs on a random 10% sample of the oversampled training set
subset_size = 0.1
X_train_oversamp_subset = X_train_oversamp.sample(frac=subset_size, random_state=33)

# Pick the labels by index label so they stay aligned with the sampled feature rows
sampled_rows = X_train_oversamp_subset.index
y_train_oversamp_subset = y_train_oversamp.loc[sampled_rows]

In [4]:
# Candidate values from which the randomized search draws parameter combinations
param_grid = {
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Base classifier that the search clones for each candidate.
# It must not be called `xgb`: that name is the imported xgboost module, and rebinding it
# makes every `xgb.XGBClassifier(...)` cell fail with AttributeError when re-run.
xgb_base_clf = XGBClassifier(use_label_encoder = False, eval_metric = 'logloss')

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator = xgb_base_clf,
    param_distributions = param_grid,
    n_iter = 100,  # Number of parameter settings sampled. Adjust according to your computational resources
    scoring = 'roc_auc',  # Can change to 'accuracy' or other metrics
    cv = 5,
    verbose = 2,
    random_state = 33,
    n_jobs = -1  # Use all available cores
)

# Run the search on the 10% oversampled subset
random_search.fit(X_train_oversamp_subset, y_train_oversamp_subset.values.ravel())

# Estimator configured with the best-scoring parameter combination
best_xgb_over = random_search.best_estimator_

# Print the best parameters
print("Best parameters found: ", random_search.best_params_)

# The search only saw a subset, so train the winning configuration on the full oversampled data
best_xgb_over.fit(X_train_oversamp, y_train_oversamp.values.ravel())

# Positive-class probabilities on the validation set (column 1 of predict_proba)
y_val_pred_prob = best_xgb_over.predict_proba(X_val)[:, 1]

# Compute the ROC AUC score
val_roc_auc = roc_auc_score(y_val.values.ravel(), y_val_pred_prob)
print(f"Validation ROC AUC Score: {val_roc_auc}")

# Hard class labels on the validation set
y_val_pred = best_xgb_over.predict(X_val)

# Compute accuracy
val_accuracy = accuracy_score(y_val.values.ravel(), y_val_pred)
print(f"Validation Accuracy: {val_accuracy}")

# Per-class precision / recall / F1
print(classification_report(y_val.values.ravel(), y_val_pred))

# Confusion matrix (rows are true classes, columns are predicted classes)
print(confusion_matrix(y_val.values.ravel(), y_val_pred))


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found:  {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
Validation ROC AUC Score: 0.6476119431802484
Validation Accuracy: 0.9580444333395798
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     95168
           1       0.03      0.13      0.05       886

    accuracy                           0.96     96054
   macro avg       0.51      0.55      0.52     96054
weighted avg       0.98      0.96      0.97     96054

[[91911  3257]
 [  773   113]]


#### Feature importance of oversampled XGB model

In [7]:
# Encoder persisted with joblib; presumably the one fitted during preprocessing,
# so its output names line up with the training columns - TODO confirm
encoder = load('../00_data/encoder.joblib')

# One importance value per input feature of the tuned oversampling XGBoost model
importances = best_xgb_over.feature_importances_

# Encoded column names reported by the encoder
feature_names = encoder.get_feature_names_out()

# Pair names with importances and rank from most to least important
feature_importances_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Show only the top rows; the full ranking goes to the CSV below
print(feature_importances_df.head())  # Adjust the number of rows to display as needed
feature_importances_df.to_csv('../00_data/feature_importance_XGB_oversampled.csv', index=False)

                            feature  importance
67        urban_or_rural_area_urban    0.111699
20       junction_detail_roundabout    0.078333
68           speed_limit_bins_20-29    0.046826
69           speed_limit_bins_30-39    0.037532
26  junction_control_not_a_junction    0.037153


#### Undersampling

In [8]:
# Candidate values from which the randomized search draws parameter combinations
param_grid = {
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Base classifier that the search clones for each candidate.
# It must not be called `xgb`: that name is the imported xgboost module, and rebinding it
# makes every `xgb.XGBClassifier(...)` cell fail with AttributeError when re-run.
xgb_base_clf = XGBClassifier(use_label_encoder = False, eval_metric = 'logloss')

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator = xgb_base_clf,
    param_distributions = param_grid,
    n_iter = 100,  # Number of parameter settings sampled. Adjust according to your computational resources
    scoring = 'roc_auc',  # Can change to 'accuracy' or other metrics
    cv = 5,
    verbose = 2,
    random_state = 33,
    n_jobs = -1  # Use all available cores
)

# The undersampled set is searched in full (no subsetting here)
random_search.fit(X_train_undersamp, y_train_undersamp.values.ravel())

# Estimator configured with the best-scoring parameter combination
best_xgb_under = random_search.best_estimator_

# Print the best parameters
print("Best parameters found: ", random_search.best_params_)

# Train the best estimator on the full undersampled training data.
# NOTE(review): the search above already ran on this same data, so with the default
# refit behaviour this call presumably repeats work the search has done - confirm.
best_xgb_under.fit(X_train_undersamp, y_train_undersamp.values.ravel())

# Positive-class probabilities on the validation set (column 1 of predict_proba)
y_val_pred_prob = best_xgb_under.predict_proba(X_val)[:, 1]

# Compute the ROC AUC score
val_roc_auc = roc_auc_score(y_val.values.ravel(), y_val_pred_prob)
print(f"Validation ROC AUC Score: {val_roc_auc}")

# Hard class labels on the validation set
y_val_pred = best_xgb_under.predict(X_val)

# Compute accuracy
val_accuracy = accuracy_score(y_val.values.ravel(), y_val_pred)
print(f"Validation Accuracy: {val_accuracy}")

# Per-class precision / recall / F1
print(classification_report(y_val.values.ravel(), y_val_pred))

# Confusion matrix (rows are true classes, columns are predicted classes)
print(confusion_matrix(y_val.values.ravel(), y_val_pred))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found:  {'subsample': 1.0, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.5}
Validation ROC AUC Score: 0.7379815661143757
Validation Accuracy: 0.7305786328523539
              precision    recall  f1-score   support

           0       1.00      0.73      0.84     95168
           1       0.02      0.64      0.04       886

    accuracy                           0.73     96054
   macro avg       0.51      0.69      0.44     96054
weighted avg       0.99      0.73      0.84     96054

[[69606 25562]
 [  317   569]]


Best model so far.

In [9]:
# Feature importances of the XGBoost model tuned on the undersampled data.
# The feature names come from the saved encoder.
# NOTE(review): assumes the encoder's output names are in the same order as the
# columns the model was trained on -- confirm against the preprocessing step.
encoder = load('../00_data/encoder.joblib')
feature_names = encoder.get_feature_names_out()
importances = best_xgb_under.feature_importances_

# Pair each feature name with its importance and rank from most to least important
feature_importances_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Show the top rows (pass a number to head() to see more)
print(feature_importances_df.head())

# Persist the full ranking
feature_importances_df.to_csv('../00_data/feature_importance_XGB_undersampled.csv', index = False)

                            feature  importance
67        urban_or_rural_area_urban    0.119632
20       junction_detail_roundabout    0.108156
26  junction_control_not_a_junction    0.072534
69           speed_limit_bins_30-39    0.071924
66        urban_or_rural_area_rural    0.069595


#### Ensemble resampled

Since the XGBoost model seems to perform better than Random Forest, we will also train it on a combination of under- and oversampled data.

In [16]:
# Randomized hyperparameter search for an XGBoost classifier on the ensemble
# resampled training data, followed by evaluation of the best model on the
# validation set.

# Candidate values per hyperparameter; the search samples combinations from these lists
param_grid = {
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Base estimator. Deliberately NOT named `xgb`: that name is the alias of the
# xgboost module imported at the top of the notebook (`import xgboost as xgb`)
# and rebinding it would break any later use of the module.
xgb_clf = XGBClassifier(use_label_encoder = False, eval_metric = 'logloss')

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator = xgb_clf,
    param_distributions = param_grid,
    n_iter = 100,  # Number of parameter settings sampled. Adjust according to your computational resources
    scoring = 'roc_auc',  # Model selection criterion used during cross-validation
    cv = 5,
    verbose = 2,
    random_state = 33,
    n_jobs = -1  # Use all available cores
)

# Run the search on the ensemble resampled training data
random_search.fit(X_train_ensemble, y_train_ensemble.values.ravel())

# RandomizedSearchCV uses refit=True by default, so best_estimator_ has already
# been refit on the whole training set with the best parameters; a second
# .fit() call on the same data would only repeat that work.
best_xgb_ensemble = random_search.best_estimator_

# Print the best parameters
print("Best parameters found: ", random_search.best_params_)

# Flatten the single-column label frame once and reuse it for every metric
y_val_true = y_val.values.ravel()

# Predicted probability of the positive class on the validation set
y_val_pred_prob = best_xgb_ensemble.predict_proba(X_val)[:, 1]

# Compute the ROC AUC score
val_roc_auc = roc_auc_score(y_val_true, y_val_pred_prob)
print(f"Validation ROC AUC Score: {val_roc_auc}")

# Hard class predictions on the validation set
y_val_pred = best_xgb_ensemble.predict(X_val)

# Compute accuracy
val_accuracy = accuracy_score(y_val_true, y_val_pred)
print(f"Validation Accuracy: {val_accuracy}")

# Print classification report
print(classification_report(y_val_true, y_val_pred))

# Print confusion matrix
print(confusion_matrix(y_val_true, y_val_pred))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found:  {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'colsample_bytree': 0.7}
Validation ROC AUC Score: 0.7357696525929766
Validation Accuracy: 0.8302621442105482
              precision    recall  f1-score   support

           0       0.99      0.83      0.91     95168
           1       0.03      0.51      0.05       886

    accuracy                           0.83     96054
   macro avg       0.51      0.67      0.48     96054
weighted avg       0.99      0.83      0.90     96054

[[79295 15873]
 [  431   455]]


Using combined under- and oversampled training data did not improve the model. <br><br>
Since we were more interested in correctly classifying fatal accidents, we decided that the true positives, that is, correctly identified fatal accidents, outweigh the high number of false positives (non-fatal accidents predicted as fatal). <br>
Thus, based on the number of true positives together with the AUC score, the XGBoost classifier trained on undersampled data was deemed the best performing model, and will be tested using the test data. <br>


In [3]:
# Test data (features and labels) for the final evaluation of the selected model
X_test = pd.read_csv('../00_data/X_test_road_acc.csv')
y_test = pd.read_csv('../00_data/y_test_road_acc.csv')

In [5]:
# Final evaluation on the test set.
# The hyperparameters below are the best parameters reported by the randomized
# search on the undersampled training data.
best_xgb = XGBClassifier(
    use_label_encoder = False,
    eval_metric = 'logloss',
    max_depth = 3,
    min_child_weight = 1,
    subsample = 1.0,
    colsample_bytree = 0.5,
    n_estimators = 200,
    learning_rate = 0.1,
)

# Fit on the complete undersampled training set
best_xgb.fit(X_train_undersamp, y_train_undersamp.values.ravel())

# Flattened test labels, shared by all metrics below
y_test_true = y_test.values.ravel()

# Positive-class probabilities and hard class predictions for the test set
y_test_pred_prob = best_xgb.predict_proba(X_test)[:, 1]
y_test_pred = best_xgb.predict(X_test)

# Threshold-independent performance
test_roc_auc = roc_auc_score(y_test_true, y_test_pred_prob)
print(f"Test ROC AUC Score: {test_roc_auc}")

# Share of correctly classified test samples
test_accuracy = accuracy_score(y_test_true, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

# Per-class precision / recall / F1
print(classification_report(y_test_true, y_test_pred))

# Rows are true classes, columns are predicted classes
print(confusion_matrix(y_test_true, y_test_pred))

Test ROC AUC Score: 0.7485831427594654
Test Accuracy: 0.7301309679971683
              precision    recall  f1-score   support

           0       1.00      0.73      0.84     95142
           1       0.02      0.65      0.04       912

    accuracy                           0.73     96054
   macro avg       0.51      0.69      0.44     96054
weighted avg       0.99      0.73      0.84     96054

[[69536 25606]
 [  316   596]]
