## Code for training a random forest model to classify accident fatality

#### First, train three basic RF models (using default settings) with the different training data (original, oversampled and undersampled).

##### Libraries

In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from joblib import load

##### Data

In [3]:
# Every split is read from a pair of CSV files: the model inputs (X_*) and
# the matching target column (y_*).

# Training split without any resampling
X_train_orig, y_train_orig = (
    pd.read_csv('../0_data/X_train_orig_road_acc.csv'),
    pd.read_csv('../0_data/y_train_orig_road_acc.csv'),
)

# Training split after oversampling
X_train_oversamp, y_train_oversamp = (
    pd.read_csv('../0_data/X_train_oversamp_road_acc.csv'),
    pd.read_csv('../0_data/y_train_oversamp_road_acc.csv'),
)

# Training split after undersampling
X_train_undersamp, y_train_undersamp = (
    pd.read_csv('../0_data/X_train_undersamp_road_acc.csv'),
    pd.read_csv('../0_data/y_train_undersamp_road_acc.csv'),
)

# Validation split, used to score every model in this notebook
X_val, y_val = (
    pd.read_csv('../0_data/X_val_road_acc.csv'),
    pd.read_csv('../0_data/y_val_road_acc.csv'),
)

#### RF model trained on original (unbalanced) data

In [8]:
# Baseline: random forest with default settings on the original (unbalanced)
# training data. The fixed seed makes the run repeatable.
rf_clf_orig = RandomForestClassifier(random_state = 33)

# The label frames are flattened to 1-D arrays once and reused below.
y_train_orig_flat = y_train_orig.values.ravel()
y_val_flat = y_val.values.ravel()

# 5-fold cross-validation on the training data. No `scoring` is passed, so
# each fold is scored with the classifier's default metric (mean accuracy).
cv_scores_orig = cross_val_score(rf_clf_orig, X_train_orig, y_train_orig_flat, cv = 5)

print("RF model (default values) trained on original (unbalanced) data")
print("Cross-validation scores for each fold:", cv_scores_orig)
print("Average cross-validation score:", cv_scores_orig.mean())

# Train on the whole original training set, then score the validation set.
rf_clf_orig.fit(X_train_orig, y_train_orig_flat)

# Column 1 of predict_proba holds the probability of the positive class;
# it feeds the AUC, while the hard labels feed the remaining metrics.
prob_predictions = rf_clf_orig.predict_proba(X_val)[:, 1]
class_predictions = rf_clf_orig.predict(X_val)

auc_score = roc_auc_score(y_val_flat, prob_predictions)
val_accuracy = accuracy_score(y_val_flat, class_predictions)
conf_matrix = confusion_matrix(y_val_flat, class_predictions)

# Report: AUC, accuracy, per-class precision/recall/F1, confusion matrix.
print(f"AUC Score: {auc_score}")
print(f"Validation Accuracy: {val_accuracy}")
print(classification_report(y_val_flat, class_predictions))
print("Confusion Matrix:")
print(conf_matrix)

RF model (default values) trained on original (unbalanced) data
Cross-validation scores for each fold: [0.98997223 0.9899275  0.98999442 0.98987172 0.99022867]
Average cross-validation score: 0.989998906921973
AUC Score: 0.585031640849742
Validation Accuracy: 0.9900368542694734
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     95168
           1       0.00      0.00      0.00       886

    accuracy                           0.99     96054
   macro avg       0.50      0.50      0.50     96054
weighted avg       0.98      0.99      0.99     96054

Confusion Matrix:
[[95097    71]
 [  886     0]]


#### RF model trained on oversampled data

In [5]:
# Baseline: random forest with default settings on the oversampled training
# data. The fixed seed makes the run repeatable.
rf_clf_oversamp = RandomForestClassifier(random_state = 33)

# The label frames are flattened to 1-D arrays once and reused below.
y_train_oversamp_flat = y_train_oversamp.values.ravel()
y_val_flat = y_val.values.ravel()

# 5-fold cross-validation on the oversampled training data (default scorer).
# NOTE(review): the resampling happened before this split; if it duplicated
# minority rows, folds may share copies and these scores would be optimistic
# -- confirm against the resampling step.
cv_scores_oversamp = cross_val_score(rf_clf_oversamp, X_train_oversamp, y_train_oversamp_flat, cv = 5)

print("RF model (default values) trained on oversampled data")
print("Cross-validation scores for each fold:", cv_scores_oversamp)
print("Average cross-validation score:", cv_scores_oversamp.mean())

# Train on the whole oversampled training set, then score the validation set.
rf_clf_oversamp.fit(X_train_oversamp, y_train_oversamp_flat)

# Column 1 of predict_proba holds the probability of the positive class;
# it feeds the AUC, while the hard labels feed the remaining metrics.
prob_predictions = rf_clf_oversamp.predict_proba(X_val)[:, 1]
class_predictions = rf_clf_oversamp.predict(X_val)

auc_score = roc_auc_score(y_val_flat, prob_predictions)
val_accuracy = accuracy_score(y_val_flat, class_predictions)
conf_matrix = confusion_matrix(y_val_flat, class_predictions)

# Report: AUC, accuracy, per-class precision/recall/F1, confusion matrix.
print(f"AUC Score: {auc_score}")
print(f"Validation Accuracy: {val_accuracy}")
print(classification_report(y_val_flat, class_predictions))
print("Confusion Matrix:")
print(conf_matrix)

RF model (default values) trained on oversampled data
Cross-validation scores for each fold: [0.95264124 0.97006918 0.9691685  0.96983838 0.97039568]
Average cross-validation score: 0.9664225974102694
AUC Score: 0.5879311408523987
Validation Accuracy: 0.9551502279967519
              precision    recall  f1-score   support

           0       0.99      0.96      0.98     95168
           1       0.03      0.13      0.05       886

    accuracy                           0.96     96054
   macro avg       0.51      0.55      0.51     96054
weighted avg       0.98      0.96      0.97     96054

Confusion Matrix:
[[91630  3538]
 [  770   116]]


#### RF model trained on undersampled data

In [6]:
# Baseline: random forest with default settings on the undersampled training
# data. The fixed seed makes the run repeatable.
rf_clf_undersamp = RandomForestClassifier(random_state = 33)

# The label frames are flattened to 1-D arrays once and reused below.
y_train_undersamp_flat = y_train_undersamp.values.ravel()
y_val_flat = y_val.values.ravel()

# 5-fold cross-validation on the undersampled training data. No `scoring` is
# passed, so each fold is scored with the classifier's default metric.
cv_scores_undersamp = cross_val_score(rf_clf_undersamp, X_train_undersamp, y_train_undersamp_flat, cv = 5)

# Print the cross-validation scores for each fold
print("RF model (default values) trained on undersampled data")
print("Cross-validation scores for each fold:", cv_scores_undersamp)

# Print the average cross-validation score
print("Average cross-validation score:", cv_scores_undersamp.mean())

# Fit the model to the full undersampled training data
rf_clf_undersamp.fit(X_train_undersamp, y_train_undersamp_flat)

# Predicting probabilities on the validation set
prob_predictions = rf_clf_undersamp.predict_proba(X_val)[:, 1]  # probabilities for the positive class

# Calculate AUC
auc_score = roc_auc_score(y_val_flat, prob_predictions)
print(f"AUC Score: {auc_score}")

# Predicting class labels (for accuracy, confusion matrix, etc.)
class_predictions = rf_clf_undersamp.predict(X_val)

# Evaluating the model on the validation set
# (the original call was missing the comma between its two arguments)
val_accuracy = accuracy_score(y_val_flat, class_predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Detailed classification report
print(classification_report(y_val_flat, class_predictions))

# Confusion Matrix
conf_matrix = confusion_matrix(y_val_flat, class_predictions)
print("Confusion Matrix:")
print(conf_matrix)

RF model (default values) trained on undersampled data
Cross-validation scores for each fold: [0.6562123  0.67410984 0.65117683 0.67229934 0.65238383]
Average cross-validation score: 0.6612364257931224
AUC Score: 0.6840373696756388
Validation Accuracy: 0.6697586774106232
              precision    recall  f1-score   support

           0       0.99      0.67      0.80     95168
           1       0.02      0.62      0.03       886

    accuracy                           0.67     96054
   macro avg       0.51      0.64      0.42     96054
weighted avg       0.99      0.67      0.79     96054

Confusion Matrix:
[[63784 31384]
 [  337   549]]


None of the models yielded particularly good results, so some hyperparameter tuning is needed. For the tuning, we focus only on the oversampled and undersampled training data.

#### Hyperparameter tuning

#### Oversampling

In [9]:
# The hyperparameter search runs on a random 10% sample of the oversampled
# training rows rather than on the whole set.
subset_size = 0.1
X_train_oversamp_subset = X_train_oversamp.sample(frac = subset_size, random_state = 33)

# Pick the label rows that carry the same index labels as the sampled
# feature rows, so features and labels stay aligned.
subset_index = X_train_oversamp_subset.index
y_train_oversamp_subset = y_train_oversamp.loc[subset_index]

In [14]:
# Candidate values per hyperparameter; the randomized search draws
# combinations from these lists.
param_distributions = dict(
    n_estimators = [100, 300, 500],
    max_depth = [None, 10, 30, 50],
    min_samples_split = [2, 5, 10],
    min_samples_leaf = [1, 2, 4],
    bootstrap = [True, False],
    max_features = ['sqrt', 'log2'],
    criterion = ['gini', 'entropy'],
)

# Base estimator whose hyperparameters are being searched
rf_clf = RandomForestClassifier(random_state = 33)

# 100 sampled candidates x 5 folds = 500 fits, spread over 12 workers.
search_settings = dict(n_iter = 100, cv = 5, verbose = 2, random_state = 33, n_jobs = 12)
rf_random_search = RandomizedSearchCV(rf_clf, param_distributions, **search_settings)

# Run the search on the 10% subset of the oversampled training data
rf_random_search.fit(X_train_oversamp_subset, y_train_oversamp_subset.values.ravel())

# Estimator configured with the best-scoring parameter combination
best_rf_clf = rf_random_search.best_estimator_

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [15]:
# The search only saw a subset, so the winning configuration is re-fitted
# on the complete oversampled training set before it is evaluated.
best_rf_clf.fit(X_train_oversamp, y_train_oversamp.values.ravel())

# Flattened validation labels, reused by every metric below
y_val_flat = y_val.values.ravel()

# Positive-class probabilities feed the AUC; hard labels feed the rest.
prob_predictions = best_rf_clf.predict_proba(X_val)[:, 1]
class_predictions = best_rf_clf.predict(X_val)

auc_score = roc_auc_score(y_val_flat, prob_predictions)
val_accuracy = accuracy_score(y_val_flat, class_predictions)
conf_matrix = confusion_matrix(y_val_flat, class_predictions)

# Report the validation metrics followed by the selected hyperparameters.
print(f"AUC Score: {auc_score}")
print(f"Validation Accuracy: {val_accuracy}")
print("Classification Report:\n", classification_report(y_val_flat, class_predictions))
print("Confusion Matrix:\n", conf_matrix)
print("Best Hyperparameters:\n", rf_random_search.best_params_)

AUC Score: 0.6049742045811632
Validation Accuracy: 0.9559726820330231
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.96      0.98     95168
           1       0.03      0.13      0.05       886

    accuracy                           0.96     96054
   macro avg       0.51      0.55      0.51     96054
weighted avg       0.98      0.96      0.97     96054

Confusion Matrix:
 [[91711  3457]
 [  772   114]]
Best Hyperparameters:
 {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'criterion': 'entropy', 'bootstrap': False}


#### Undersampling

In [16]:
# Candidate values per hyperparameter; the randomized search draws
# combinations from these lists.
param_distributions = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

# Initialize the base model
rf_clf = RandomForestClassifier(random_state = 33)

# Set up the RandomizedSearchCV object
# (100 sampled candidates x 5 folds = 500 fits, spread over 12 workers)
rf_random_search = RandomizedSearchCV(
    estimator = rf_clf,
    param_distributions = param_distributions,
    n_iter = 100,
    cv = 5,
    verbose = 2,
    random_state = 33,
    n_jobs = 12
)

# Fit the RandomizedSearchCV object to the full undersampled training data
rf_random_search.fit(X_train_undersamp, y_train_undersamp.values.ravel())

# Get the best estimator.
# The search ran on the complete undersampled set and `refit` is left at its
# default (True), so `best_estimator_` has already been re-fitted on all of
# X_train_undersamp with the winning parameters. A second `.fit` on the same
# data with the same seed would only repeat that work, so none is done here.
best_rf_clf = rf_random_search.best_estimator_

# Flattened validation labels, reused by every metric below
y_val_flat = y_val.values.ravel()

# Predicting probabilities on the validation set for AUC calculation
prob_predictions = best_rf_clf.predict_proba(X_val)[:, 1]

# Calculate AUC
auc_score = roc_auc_score(y_val_flat, prob_predictions)
print(f"AUC Score: {auc_score}")

# Predicting class labels (for accuracy, confusion matrix, etc.)
class_predictions = best_rf_clf.predict(X_val)

# Evaluating the model on the validation set
val_accuracy = accuracy_score(y_val_flat, class_predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Detailed classification report
print("Classification Report:\n", classification_report(y_val_flat, class_predictions))

# Confusion Matrix
conf_matrix = confusion_matrix(y_val_flat, class_predictions)
print("Confusion Matrix:\n", conf_matrix)

# Print the best hyperparameters
print("Best Hyperparameters:\n", rf_random_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
AUC Score: 0.736727463354338
Validation Accuracy: 0.7450288379453224
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.75      0.85     95168
           1       0.02      0.61      0.04       886

    accuracy                           0.75     96054
   macro avg       0.51      0.68      0.45     96054
weighted avg       0.99      0.75      0.85     96054

Confusion Matrix:
 [[71021 24147]
 [  344   542]]
Best Hyperparameters:
 {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 10, 'criterion': 'entropy', 'bootstrap': False}


#### Feature importance of tuned models

#### Oversampled

In [5]:
# Rebuild the tuned oversampling model from the hyperparameters reported by
# the search, so this section can run without repeating the search itself.
tuned_params_over = dict(
    n_estimators = 100,
    criterion = 'entropy',
    max_depth = None,
    min_samples_split = 10,
    min_samples_leaf = 1,
    max_features = 'sqrt',
    bootstrap = False,
)
best_rf_clf_over = RandomForestClassifier(n_jobs = 12, random_state = 33, **tuned_params_over)

# Train on the full oversampled training data
best_rf_clf_over.fit(X_train_oversamp, y_train_oversamp.values.ravel())

# Flattened validation labels, reused by every metric below
y_val_flat = y_val.values.ravel()

# Positive-class probabilities feed the AUC; hard labels feed the rest.
prob_predictions = best_rf_clf_over.predict_proba(X_val)[:, 1]
class_predictions = best_rf_clf_over.predict(X_val)

auc_score = roc_auc_score(y_val_flat, prob_predictions)
val_accuracy = accuracy_score(y_val_flat, class_predictions)
conf_matrix = confusion_matrix(y_val_flat, class_predictions)

# Report: AUC, accuracy, per-class precision/recall/F1, confusion matrix.
print(f"AUC Score: {auc_score}")
print(f"Validation Accuracy: {val_accuracy}")
print(classification_report(y_val_flat, class_predictions))
print("Confusion Matrix:")
print(conf_matrix)

AUC Score: 0.6049743231786089
Validation Accuracy: 0.9559726820330231
              precision    recall  f1-score   support

           0       0.99      0.96      0.98     95168
           1       0.03      0.13      0.05       886

    accuracy                           0.96     96054
   macro avg       0.51      0.55      0.51     96054
weighted avg       0.98      0.96      0.97     96054

Confusion Matrix:
[[91711  3457]
 [  772   114]]


In [6]:
# The persisted encoder supplies the feature names in this cell.
# NOTE(review): this assumes the encoder's output order matches the column
# order of the training matrix -- confirm against the preprocessing step.
encoder = load('../0_data/encoder.joblib')
feature_names = encoder.get_feature_names_out()

# Importance scores from the tuned model trained on oversampled data
importances = best_rf_clf_over.feature_importances_

# Pair each name with its score and rank from most to least important.
feature_importances_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Show the ranked table
print(feature_importances_df)


                                          feature  importance
69                         speed_limit_bins_30-39    0.042290
20                     junction_detail_roundabout    0.037156
75                       time_of_day_evening_rush    0.035745
0                              day_of_week_friday    0.034288
6                           day_of_week_wednesday    0.033062
..                                            ...         ...
63  carriageway_hazards_pedestrian_in_carriageway    0.000102
41          weather_conditions_snowing_high_winds    0.000094
47     road_surface_conditions_oil_or_diesel_road    0.000031
46               road_surface_conditions_mud_road    0.000008
59                carriageway_hazards_dog_on_road    0.000007

[79 rows x 2 columns]


#### Undersampled

In [8]:
# Rebuild the tuned undersampling model from the hyperparameters reported by
# the search, so this section can run without repeating the search itself.
tuned_params_under = dict(
    n_estimators = 100,
    criterion = 'entropy',
    max_depth = 10,
    min_samples_split = 10,
    min_samples_leaf = 4,
    max_features = 'log2',
    bootstrap = False,
)
best_rf_clf_under = RandomForestClassifier(n_jobs = 12, random_state = 33, **tuned_params_under)

# Train on the full undersampled training data
best_rf_clf_under.fit(X_train_undersamp, y_train_undersamp.values.ravel())

# Flattened validation labels, reused by every metric below
y_val_flat = y_val.values.ravel()

# Positive-class probabilities feed the AUC; hard labels feed the rest.
prob_predictions = best_rf_clf_under.predict_proba(X_val)[:, 1]
class_predictions = best_rf_clf_under.predict(X_val)

auc_score = roc_auc_score(y_val_flat, prob_predictions)
val_accuracy = accuracy_score(y_val_flat, class_predictions)
conf_matrix = confusion_matrix(y_val_flat, class_predictions)

# Report: AUC, accuracy, per-class precision/recall/F1, confusion matrix.
print(f"AUC Score: {auc_score}")
print(f"Validation Accuracy: {val_accuracy}")
print(classification_report(y_val_flat, class_predictions))
print("Confusion Matrix:")
print(conf_matrix)

AUC Score: 0.736727463354338
Validation Accuracy: 0.7450288379453224
              precision    recall  f1-score   support

           0       1.00      0.75      0.85     95168
           1       0.02      0.61      0.04       886

    accuracy                           0.75     96054
   macro avg       0.51      0.68      0.45     96054
weighted avg       0.99      0.75      0.85     96054

Confusion Matrix:
[[71021 24147]
 [  344   542]]


In [9]:
# Load the encoder; it supplies the feature names in this cell.
# NOTE(review): this assumes the encoder's output order matches the column
# order of the training matrix -- confirm against the preprocessing step.
encoder = load('../0_data/encoder.joblib')

# Get feature importances from the model trained on UNDERSAMPLED data.
# (The original read them from `best_rf_clf_over`, which would repeat the
# oversampled model's importances under this section's heading.)
importances = best_rf_clf_under.feature_importances_

# Get feature names
feature_names = encoder.get_feature_names_out()

# Create a DataFrame for feature importances
feature_importances_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
})

# Sort the DataFrame by importance, most important first
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)

# Display the feature importances
print(feature_importances_df)

                                       feature  importance
20                  junction_detail_roundabout    0.052845
67                   urban_or_rural_area_urban    0.041857
69                      speed_limit_bins_30-39    0.041367
66                   urban_or_rural_area_rural    0.038626
72                      speed_limit_bins_60-69    0.025161
..                                         ...         ...
65    carriageway_hazards_vehicle_load_on_road    0.000492
44      road_surface_conditions_flood_over_3cm    0.000389
41       weather_conditions_snowing_high_winds    0.000370
47  road_surface_conditions_oil_or_diesel_road    0.000084
59             carriageway_hazards_dog_on_road    0.000000

[79 rows x 2 columns]
