In [11]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight


### Reading in data

In [12]:
# Load the three pre-split datasets. The unpacking order on the left matches
# the order of the paths in the tuple (train, test, val).
train, test, val = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_final.csv',
        '../data/test_final.csv',
        '../data/val_final.csv',
    )
)

### Separating features and target for the training, validation and test sets

In [13]:
def _features_and_target(frame, target_col='ExtentOfInjuryCode'):
    """Return ``(X, y)`` for one split without modifying ``frame``.

    ``X`` is a copy of ``frame`` with ``target_col`` removed and ``y`` is the
    removed column as a Series. Raises ``KeyError`` if ``target_col`` is not a
    column of ``frame``.
    """
    features = frame.copy()
    # pop() drops the column from the copy and hands it back, so the original
    # frame keeps its target column.
    target = features.pop(target_col)
    return features, target


# Training
X_train, y_train = _features_and_target(train)

# Validation
X_val, y_val = _features_and_target(val)

# Test
X_test, y_test = _features_and_target(test)


### Encoding the Extent of Injury Code column, our target variable

In [14]:
# Learn the label -> integer mapping from the training labels only, then apply
# that one mapping to all three splits so the codes are consistent.
label_encoder = LabelEncoder().fit(y_train)
y_train, y_val, y_test = (
    label_encoder.transform(split_labels)
    for split_labels in (y_train, y_val, y_test)
)

### Standardizing

In [15]:
# Fit the scaler on the training features only; the validation and test
# features are transformed with the statistics learned from training.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split_features)
    for split_features in (X_train, X_val, X_test)
)


### Balancing Classes

In [16]:
# Addressing class imbalance: 'balanced' weights each sample inversely to the
# frequency of its class.
# The weights are computed from y_train -- the same label vector that is passed
# to fit() alongside them -- rather than from the raw `train` frame, so the
# weights and the labels cannot silently fall out of step if y_train is ever
# filtered or resampled. (Encoding the labels does not change class counts, so
# the resulting weights are the same.)
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# One weight per training row is required by fit(..., sample_weight=...).
assert sample_weights.shape[0] == X_train.shape[0], "sample_weights / X_train length mismatch"

### Evaluation Before Hyperparameter Tuning

In [17]:
# Baseline model: XGBoost with its default hyperparameters (only the seed is
# fixed), trained with the class-balancing sample weights.
xgb_classifier = xgb.XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train, sample_weight=sample_weights)

# Predictions from the untuned (baseline) model
train_pred_bhpt = xgb_classifier.predict(X_train)
val_pred_bhpt = xgb_classifier.predict(X_val)
test_pred_bhpt = xgb_classifier.predict(X_test)

# `y_pred` used to be a second, identical predict() call on X_val. It is kept
# only as an alias in case a later cell still reads it.
y_pred = val_pred_bhpt

# NOTE(review): the test set is scored here as well as after tuning; consider
# reserving it for the final model only.

# (accuracy label, report label, true labels, predicted labels) per split
baseline_splits = (
    ("Training", "Train", y_train, train_pred_bhpt),
    ("Validation", "Validation", y_val, val_pred_bhpt),
    ("Test", "Test", y_test, test_pred_bhpt),
)

# Print accuracy
for acc_label, _, split_true, split_pred in baseline_splits:
    print(f"{acc_label} Accuracy Before Hyperparameter Tuning: {accuracy_score(split_true, split_pred):.4f}")

# Classification report for training, validation and test sets
for _, report_label, split_true, split_pred in baseline_splits:
    print(f"\n {report_label} Classification Report Before Hyperparameter Tuning: ")
    print(classification_report(split_true, split_pred))

Training Accuracy Before Hyperparemeter Tuning: 0.6357
Validation Accuracy Before Hyperparameter Tuning: 0.6243
Test Accuracy Before Hyperparameter Tuning: 0.6202

 Train Classification Report Before Hyperparameter Tuning: 
              precision    recall  f1-score   support

           0       0.19      0.85      0.31      1652
           1       0.57      0.50      0.53     76242
           2       0.81      0.71      0.76    134059
           3       0.15      0.63      0.24      6385

    accuracy                           0.64    218338
   macro avg       0.43      0.67      0.46    218338
weighted avg       0.70      0.64      0.66    218338


 Validation Classification Report Before Hyperparameter Tuning: 
              precision    recall  f1-score   support

           0       0.14      0.65      0.23       400
           1       0.55      0.48      0.52     18986
           2       0.81      0.71      0.75     33637
           3       0.12      0.50      0.19      1562

   

### Hyperparameter Tuning

In [18]:
# Candidate values for each hyperparameter; the randomized search below draws
# n_iter combinations from this space instead of trying all of them.
param_grid = dict(
    n_estimators=[100, 200, 300, 500],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 6, 9],
    min_child_weight=[1, 3, 5],
    subsample=[0.7, 0.85, 1.0],
    colsample_bytree=[0.7, 0.85, 1.0],
    reg_alpha=[0, 0.01, 0.1, 1, 10, 100],
    reg_lambda=[0.5, 0.7, 1, 1.3],
)

xgb_model = xgb.XGBClassifier(random_state=42)

# Despite the name, `grid_search` is a RandomizedSearchCV: 25 sampled
# candidates, each scored with 10-fold cross-validation.
# NOTE(review): candidates are ranked by plain accuracy while training uses
# class-balancing weights -- confirm accuracy is the intended selection metric
# (a class-aware scorer such as "f1_macro" may match the goal better).
grid_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=25,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
grid_search.fit(X_train, y_train, sample_weight=sample_weights)

# Keep the best candidate found by the search for the evaluation cells.
best_xgb = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print('Best Accuracy:', grid_search.best_score_)

Fitting 10 folds for each of 25 candidates, totalling 250 fits
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=500, reg_alpha=0.1, reg_lambda=0.7, subsample=0.7; total time= 4.7min
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=500, reg_alpha=0.1, reg_lambda=0.7, subsample=0.7; total time= 4.7min
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=500, reg_alpha=0.1, reg_lambda=0.7, subsample=0.7; total time= 4.7min
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=500, reg_alpha=0.1, reg_lambda=0.7, subsample=0.7; total time= 4.8min
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=500, reg_alpha=0.1, reg_lambda=0.7, subsample=0.7; total time= 5.1min
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=500, 

In [21]:
# Re-display the search result (same two lines the search cell prints at its
# end), so the summary is visible without scrolling through the fit log.
best_params_found = grid_search.best_params_
best_cv_score = grid_search.best_score_
print("Best Parameters:", best_params_found)
print('Best Accuracy:', best_cv_score)

Best Parameters: {'subsample': 0.85, 'reg_lambda': 1.3, 'reg_alpha': 0.1, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 9, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
Best Accuracy: 0.6312643422756505


### Evaluating Best Model

In [20]:
# Predictions from the tuned model
train_pred = best_xgb.predict(X_train)
val_pred = best_xgb.predict(X_val)
test_pred = best_xgb.predict(X_test)

# Print accuracy
print(f"Training Accuracy: {accuracy_score(y_train, train_pred):.4f}")
print(f"Validation Accuracy: {accuracy_score(y_val, val_pred):.4f}")
print(f"Test Accuracy: {accuracy_score(y_test, test_pred):.4f}")

#classification report for training, validation and test sets
print("\n Classification Report: Training Set ")
print(classification_report(y_train, train_pred))

print("\n Classification Report: Validation Set ")
print(classification_report(y_val, val_pred))

print("\n Classification Report: Test Set ")
print(classification_report(y_test, test_pred))


Training Accuracy: 0.6547
Validation Accuracy: 0.6315
Test Accuracy: 0.6281

 Classification Report: Training Set 
              precision    recall  f1-score   support

           0       0.26      0.91      0.41      1652
           1       0.58      0.54      0.56     76242
           2       0.82      0.72      0.76    134059
           3       0.19      0.72      0.30      6385

    accuracy                           0.65    218338
   macro avg       0.46      0.72      0.51    218338
weighted avg       0.71      0.65      0.67    218338


 Classification Report: Validation Set 
              precision    recall  f1-score   support

           0       0.16      0.56      0.25       400
           1       0.55      0.51      0.53     18986
           2       0.81      0.71      0.75     33637
           3       0.12      0.48      0.20      1562

    accuracy                           0.63     54585
   macro avg       0.41      0.56      0.43     54585
weighted avg       0.69      