## Code for training a random forest model to classify accident fatality

#### First, train three baseline RF models (default settings), one on each version of the training data: original, oversampled, and undersampled.

##### Libraries

In [17]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

##### Data

In [7]:
# Every split lives in a pair of CSV files: features (X_*) and labels (y_*).
# Paths are relative, so they resolve against the notebook's working directory.

# Original training split (no resampling applied)
X_train_orig, y_train_orig = map(pd.read_csv, (
    '../0_data/X_train_orig_road_acc.csv',
    '../0_data/y_train_orig_road_acc.csv',
))

# Oversampled training split
X_train_oversamp, y_train_oversamp = map(pd.read_csv, (
    '../0_data/X_train_oversamp_road_acc.csv',
    '../0_data/y_train_oversamp_road_acc.csv',
))

# Undersampled training split
X_train_undersamp, y_train_undersamp = map(pd.read_csv, (
    '../0_data/X_train_undersamp_road_acc.csv',
    '../0_data/y_train_undersamp_road_acc.csv',
))

# Validation split, shared by all three models
X_val, y_val = map(pd.read_csv, (
    '../0_data/X_val_road_acc.csv',
    '../0_data/y_val_road_acc.csv',
))

In [13]:
# Transforming labels to a 1D numpy array.
# np.asarray(...).ravel() accepts both the DataFrame returned by read_csv and an
# array that has already been flattened, so this cell can be re-run safely
# (the previous `.values.ravel()` raised AttributeError on a second run, because
# the names had already been rebound to ndarrays, which have no `.values`).
y_train_orig = np.asarray(y_train_orig).ravel()
y_train_oversamp = np.asarray(y_train_oversamp).ravel()
y_train_undersamp = np.asarray(y_train_undersamp).ravel()
y_val = np.asarray(y_val).ravel()

#### RF model trained on original (unbalanced) data

In [18]:
# Baseline random forest with default hyperparameters; the seed is fixed for reproducibility
rf_clf_orig = RandomForestClassifier(random_state = 33)

# 5-fold cross-validation on the original training data.
# No `scoring` is passed, so each fold reports the classifier's default score (mean accuracy).
cv_scores_orig = cross_val_score(rf_clf_orig, X_train_orig, y_train_orig, cv = 5)

# Print the cross-validation scores for each fold
print("RF model (default values) trained on original (unbalanced) data")
print("Cross-validation scores for each fold:", cv_scores_orig)

# Print the average cross-validation score
print("Average cross-validation score:", cv_scores_orig.mean())

# Fit the model to original training data
# (cross_val_score works on clones, so rf_clf_orig itself is still unfitted here)
rf_clf_orig.fit(X_train_orig, y_train_orig)

# Predicting on validation set
predictions = rf_clf_orig.predict(X_val)

# Evaluating the model on the validation set.
# The label says "Validation" because X_val / y_val are the validation split, not a test set.
# NOTE(review): with unbalanced classes, accuracy alone says little about the minority
# class -- read it together with the per-class rows of the report below.
val_accuracy = accuracy_score(y_val, predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Classification report (per-class precision / recall / F1).
# zero_division = 0 reports 0 without a warning when a class is never predicted.
print(classification_report(y_val, predictions, zero_division = 0))

RF model (default values) trained on original (unbalanced) data
Cross-validation scores for each fold: [0.98997223 0.9899275  0.98999442 0.98987172 0.99022867]
Average cross-validation score: 0.989998906921973
Test Accuracy: 0.9900368542694734
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     95168
           1       0.00      0.00      0.00       886

    accuracy                           0.99     96054
   macro avg       0.50      0.50      0.50     96054
weighted avg       0.98      0.99      0.99     96054



#### RF model trained on oversampled data

In [19]:
# Baseline random forest with default hyperparameters; the seed is fixed for reproducibility
rf_clf_oversamp = RandomForestClassifier(random_state = 33)

# 5-fold cross-validation on the oversampled training data.
# No `scoring` is passed, so each fold reports the classifier's default score (mean accuracy).
# NOTE(review): the folds are drawn from data that was resampled before splitting, so
# these scores are measured on a different class balance than the validation set and
# may be optimistic if resampled minority rows land in both the training and held-out
# folds -- confirm how the oversampled set was built before comparing them.
cv_scores_oversamp = cross_val_score(rf_clf_oversamp, X_train_oversamp, y_train_oversamp, cv = 5)

# Print the cross-validation scores for each fold
print("RF model (default values) trained on oversampled data")
print("Cross-validation scores for each fold:", cv_scores_oversamp)

# Print the average cross-validation score
print("Average cross-validation score:", cv_scores_oversamp.mean())

# Fit the model to the oversampled training data
# (cross_val_score works on clones, so rf_clf_oversamp itself is still unfitted here)
rf_clf_oversamp.fit(X_train_oversamp, y_train_oversamp)

# Predicting on validation set
predictions = rf_clf_oversamp.predict(X_val)

# Evaluating the model on the validation set.
# The label says "Validation" because X_val / y_val are the validation split, not a test set.
val_accuracy = accuracy_score(y_val, predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Classification report (per-class precision / recall / F1).
# zero_division = 0 reports 0 without a warning when a class is never predicted.
print(classification_report(y_val, predictions, zero_division = 0))

RF model (default values) trained on oversampled data
Cross-validation scores for each fold: [0.95264124 0.97006918 0.9691685  0.96983838 0.97039568]
Average cross-validation score: 0.9664225974102694
Test Accuracy: 0.9551502279967519
              precision    recall  f1-score   support

           0       0.99      0.96      0.98     95168
           1       0.03      0.13      0.05       886

    accuracy                           0.96     96054
   macro avg       0.51      0.55      0.51     96054
weighted avg       0.98      0.96      0.97     96054



#### RF model trained on undersampled data

In [20]:
# Baseline random forest with default hyperparameters; the seed is fixed for reproducibility
rf_clf_undersamp = RandomForestClassifier(random_state = 33)

# 5-fold cross-validation on the undersampled training data.
# No `scoring` is passed, so each fold reports the classifier's default score (mean accuracy).
cv_scores_undersamp = cross_val_score(rf_clf_undersamp, X_train_undersamp, y_train_undersamp, cv = 5)

# Print the cross-validation scores for each fold
print("RF model (default values) trained on undersampled data")
print("Cross-validation scores for each fold:", cv_scores_undersamp)

# Print the average cross-validation score
print("Average cross-validation score:", cv_scores_undersamp.mean())

# Fit the model to the undersampled training data
# (cross_val_score works on clones, so rf_clf_undersamp itself is still unfitted here)
rf_clf_undersamp.fit(X_train_undersamp, y_train_undersamp)

# Predicting on validation set
predictions = rf_clf_undersamp.predict(X_val)

# Evaluating the model on the validation set.
# The label says "Validation" because X_val / y_val are the validation split, not a test set.
val_accuracy = accuracy_score(y_val, predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Classification report (per-class precision / recall / F1).
# zero_division = 0 reports 0 without a warning when a class is never predicted.
print(classification_report(y_val, predictions, zero_division = 0))

RF model (default values) trained on undersampled data
Cross-validation scores for each fold: [0.6562123  0.67410984 0.65117683 0.67229934 0.65238383]
Average cross-validation score: 0.6612364257931224
Test Accuracy: 0.6697586774106232
              precision    recall  f1-score   support

           0       0.99      0.67      0.80     95168
           1       0.02      0.62      0.03       886

    accuracy                           0.67     96054
   macro avg       0.51      0.64      0.42     96054
weighted avg       0.99      0.67      0.79     96054

