<a href="https://colab.research.google.com/github/pascalghanimi/Injury-Prediction-in-Runners/blob/main/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the pickled feature sets and labels used by the XGBoost experiments.
import pickle

# NOTE(review): pickle.load can run code stored in the file, so only load
# artifacts that were produced by this project.
_loaded = []
for _path in (
    "features.pkl",
    "features_days.pkl",
    "features_weeks.pkl",
    "features_objective.pkl",
    "features_subjective.pkl",
    "labels.pkl",
):
  with open(_path, "rb") as f:
    _loaded.append(pickle.load(f))

# Unpack in the same order as the file names above.
(features, features_days, features_weeks,
 features_objective, features_subjective, labels) = _loaded

# Quick look at the entry stored under key 0 of each container.
first_row = features[0][0]
print(first_row)  # first athlete, first row
print(features_days[0][0])
print(features_weeks[0])  # prints the whole entry for key 0, not only its first row
print(labels[0])

print(len(first_row))  # number of features

In [None]:
import numpy as np

# One shared athlete order for every container. Stacking each container in its
# own iteration order only lines the rows up with Y by coincidence; walking all
# of them in the order of `features` makes row i of every X_* matrix and Y[i]
# come from the same athlete.
athlete_ids = list(features)


def _stack_by_athlete(per_athlete, stack=np.vstack):
  """Concatenate the per-athlete arrays of one container in `athlete_ids` order.

  Raises ValueError when the container holds a different number of athletes
  than `features`, and KeyError when an athlete id is missing from it.
  """
  if len(per_athlete) != len(athlete_ids):
    raise ValueError(
        f"expected {len(athlete_ids)} athletes, got {len(per_athlete)}")
  return stack([per_athlete[athlete_id] for athlete_id in athlete_ids])


X = _stack_by_athlete(features)
X_days = _stack_by_athlete(features_days)
X_weeks = _stack_by_athlete(features_weeks)
X_objective = _stack_by_athlete(features_objective)
X_subjective = _stack_by_athlete(features_subjective)

# Labels are concatenated along their only axis, hence hstack.
Y = _stack_by_athlete(labels, stack=np.hstack)

# Every feature matrix needs exactly one row per label.
for _name, _matrix in (
    ("X", X),
    ("X_days", X_days),
    ("X_weeks", X_weeks),
    ("X_objective", X_objective),
    ("X_subjective", X_subjective),
):
  if _matrix.shape[0] != Y.shape[0]:
    raise ValueError(
        f"{_name} has {_matrix.shape[0]} rows but Y has {Y.shape[0]} labels")

# The saved output of this cell is a line of shapes; print them so that a
# fresh run reproduces it.
print(X.shape, X_days.shape, X_weeks.shape, X_objective.shape, X_subjective.shape, Y.shape)


(42680, 139) (42680, 70) (42680, 69) (42680, 91) (42680, 48) (42680,)


In [None]:
from sklearn.model_selection import train_test_split

def _split_70_15_15(X_all, Y_all):
  """Split rows into train, held-out, validation and test parts.

  30% of the rows are held out first and that held-out part is then cut in
  half, both times with random_state=42.

  Returns the eight arrays in the order
  (X_train, X_temp, Y_train, Y_temp, X_val, X_test, Y_val, Y_test).

  NOTE(review): rows are shuffled individually, not grouped by athlete, so
  rows of one athlete can end up in several parts -- confirm this is intended.
  """
  X_tr, X_tmp, Y_tr, Y_tmp = train_test_split(X_all, Y_all, test_size=0.3, random_state=42)
  X_va, X_te, Y_va, Y_te = train_test_split(X_tmp, Y_tmp, test_size=0.5, random_state=42)
  return X_tr, X_tmp, Y_tr, Y_tmp, X_va, X_te, Y_va, Y_te


# All features
(X_train, X_temp, Y_train, Y_temp,
 X_val, X_test, Y_val, Y_test) = _split_70_15_15(X, Y)

# Day-level features
(X_train_days, X_temp_days, Y_train_days, Y_temp_days,
 X_val_days, X_test_days, Y_val_days, Y_test_days) = _split_70_15_15(X_days, Y)

# Week-level features
(X_train_weeks, X_temp_weeks, Y_train_weeks, Y_temp_weeks,
 X_val_weeks, X_test_weeks, Y_val_weeks, Y_test_weeks) = _split_70_15_15(X_weeks, Y)

# Subjective features
(X_train_subjective, X_temp_subjective, Y_train_subjective, Y_temp_subjective,
 X_val_subjective, X_test_subjective, Y_val_subjective, Y_test_subjective) = _split_70_15_15(X_subjective, Y)

# Objective features
(X_train_objective, X_temp_objective, Y_train_objective, Y_temp_objective,
 X_val_objective, X_test_objective, Y_val_objective, Y_test_objective) = _split_70_15_15(X_objective, Y)

# One line of (train, val, test) shapes per feature set, labels last.
for _parts in (
    (X_train, X_val, X_test),
    (X_train_days, X_val_days, X_test_days),
    (X_train_weeks, X_val_weeks, X_test_weeks),
    (X_train_subjective, X_val_subjective, X_test_subjective),
    (X_train_objective, X_val_objective, X_test_objective),
    (Y_train, Y_val, Y_test),
):
  print(*(_part.shape for _part in _parts))


(29876, 139) (6402, 139) (6402, 139)
(29876, 70) (6402, 70) (6402, 70)
(29876, 69) (6402, 69) (6402, 69)
(29876, 48) (6402, 48) (6402, 48)
(29876, 91) (6402, 91) (6402, 91)
(29876,) (6402,) (6402,)


In [None]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV

def train(X_train, Y_train, X_val, Y_val, X_test, Y_test,
          num_models=9, batch_size=2048, params=None, random_state=None):
  """Train an ensemble of XGBoost classifiers on class-balanced batches.

  Every ensemble member is fitted on its own batch that is drawn with
  replacement from the training rows: half of the batch from rows labelled 1
  (injured) and half from rows labelled 0 (uninjured). Each fitted member is
  then wrapped in a sigmoid calibrator fitted on the validation data, and the
  ensemble prediction is the mean of the calibrated probabilities.

  The test AUC of every member and of the calibrated ensemble is printed.

  Args:
    X_train, Y_train: training features and 0/1 labels (NumPy arrays).
    X_val, Y_val: validation data; passed as eval_set while fitting and used
      to fit the calibrators.
    X_test, Y_test: test data used only for the printed AUC values.
    num_models: number of ensemble members (at least 1).
    batch_size: rows per training batch; batch_size // 2 rows per class.
    params: optional dict of XGBClassifier keyword arguments that overrides
      individual entries of the built-in defaults.
    random_state: optional seed for the batch sampling. With None the global
      NumPy random state is used, exactly as before. XGBoost's own seed is not
      touched; set it through `params` if needed.

  Returns:
    None. Results are only printed, so calling this as the last line of a
    notebook cell displays nothing extra.

  Raises:
    ValueError: if num_models < 1 or Y_train lacks one of the two classes.
  """
  if num_models < 1:
    raise ValueError("num_models must be at least 1")

  xgb_params = {
      "objective": "binary:logistic",
      "eta": 0.01,
      "max_depth": 3,
      "subsample": 0.8,
      "colsample_bytree": 0.8,
      "n_estimators": 512,
      "lambda": 1.0,
      "alpha": 0.5
  }
  if params:
    xgb_params.update(params)

  # The np.random module and a RandomState instance expose the same
  # choice/shuffle calls, so the unseeded path is unchanged.
  rng = np.random if random_state is None else np.random.RandomState(random_state)

  # The class membership of the training rows does not change between batches,
  # so the index arrays are computed once.
  injured_indices = np.where(Y_train == 1)[0]
  uninjured_indices = np.where(Y_train == 0)[0]
  if injured_indices.size == 0 or uninjured_indices.size == 0:
    raise ValueError("Y_train must contain both classes 0 and 1 to draw a balanced batch")

  def get_training_batch():
    """Draw one shuffled batch with batch_size // 2 rows of each class."""
    half = batch_size // 2
    injured_sample = rng.choice(injured_indices, size=half, replace=True)
    uninjured_sample = rng.choice(uninjured_indices, size=half, replace=True)

    selected_indices = np.concatenate([injured_sample, uninjured_sample])
    rng.shuffle(selected_indices)

    return X_train[selected_indices], Y_train[selected_indices]

  models = []
  for model_idx in range(num_models):
    X_batch, Y_batch = get_training_batch()

    model = xgb.XGBClassifier(**xgb_params)
    # No early stopping is configured here, so the eval_set is only evaluated
    # during fitting.
    model.fit(X_batch, Y_batch, eval_set=[(X_val, Y_val)], verbose=False)
    test_auc = roc_auc_score(Y_test, model.predict_proba(X_test)[:, 1])

    print(f"\nModell {model_idx + 1} Ergebnisse:")
    print(f"Test-AUC: {test_auc:.4f}")
    models.append(model)

  # cv="prefit" leaves the already fitted model as it is and fits only the
  # sigmoid calibrator on the validation data.
  # NOTE(review): newer scikit-learn releases deprecate cv="prefit" in favour
  # of FrozenEstimator -- check against the installed version.
  calibrated_models = [
      CalibratedClassifierCV(m, method='sigmoid', cv="prefit").fit(X_val, Y_val) for m in models
  ]

  # Ensemble AUC on the averaged calibrated probabilities.
  def ensemble_predict_proba(models, X):
    """Mean positive-class probability over all given models."""
    probas = np.array([m.predict_proba(X)[:, 1] for m in models])
    return probas.mean(axis=0)

  Y_proba_test = ensemble_predict_proba(calibrated_models, X_test)
  test_auc = roc_auc_score(Y_test, Y_proba_test)

  print(f"Test-AUC des Ensembles: {test_auc:.4f}")

In [None]:
# Train on the full feature set
train(
  X_train=X_train, Y_train=Y_train,
  X_val=X_val, Y_val=Y_val,
  X_test=X_test, Y_test=Y_test,
)


Modell 1 Ergebnisse:
Test-AUC: 0.6731

Modell 2 Ergebnisse:
Test-AUC: 0.6908

Modell 3 Ergebnisse:
Test-AUC: 0.7281

Modell 4 Ergebnisse:
Test-AUC: 0.6914

Modell 5 Ergebnisse:
Test-AUC: 0.6937

Modell 6 Ergebnisse:
Test-AUC: 0.6912

Modell 7 Ergebnisse:
Test-AUC: 0.6766

Modell 8 Ergebnisse:
Test-AUC: 0.6908

Modell 9 Ergebnisse:
Test-AUC: 0.6956




Test-AUC des Ensembles: 0.7170


In [None]:
# Train on the day-level features
train(
  X_train=X_train_days, Y_train=Y_train_days,
  X_val=X_val_days, Y_val=Y_val_days,
  X_test=X_test_days, Y_test=Y_test_days,
)


Modell 1 Ergebnisse:
Test-AUC: 0.6842

Modell 2 Ergebnisse:
Test-AUC: 0.6739

Modell 3 Ergebnisse:
Test-AUC: 0.6775

Modell 4 Ergebnisse:
Test-AUC: 0.6992

Modell 5 Ergebnisse:
Test-AUC: 0.6973

Modell 6 Ergebnisse:
Test-AUC: 0.6626

Modell 7 Ergebnisse:
Test-AUC: 0.7063

Modell 8 Ergebnisse:
Test-AUC: 0.7015

Modell 9 Ergebnisse:
Test-AUC: 0.6878




Test-AUC des Ensembles: 0.7108


In [None]:
# Train on the week-level features
train(
  X_train=X_train_weeks, Y_train=Y_train_weeks,
  X_val=X_val_weeks, Y_val=Y_val_weeks,
  X_test=X_test_weeks, Y_test=Y_test_weeks,
)


Modell 1 Ergebnisse:
Test-AUC: 0.6607

Modell 2 Ergebnisse:
Test-AUC: 0.6822

Modell 3 Ergebnisse:
Test-AUC: 0.6634

Modell 4 Ergebnisse:
Test-AUC: 0.6385

Modell 5 Ergebnisse:
Test-AUC: 0.6929

Modell 6 Ergebnisse:
Test-AUC: 0.6530

Modell 7 Ergebnisse:
Test-AUC: 0.6600

Modell 8 Ergebnisse:
Test-AUC: 0.6732

Modell 9 Ergebnisse:
Test-AUC: 0.6629




Test-AUC des Ensembles: 0.6861


In [None]:
# Train on the objective features
train(
  X_train=X_train_objective, Y_train=Y_train_objective,
  X_val=X_val_objective, Y_val=Y_val_objective,
  X_test=X_test_objective, Y_test=Y_test_objective,
)


Modell 1 Ergebnisse:
Test-AUC: 0.6613

Modell 2 Ergebnisse:
Test-AUC: 0.6677

Modell 3 Ergebnisse:
Test-AUC: 0.6592

Modell 4 Ergebnisse:
Test-AUC: 0.7036

Modell 5 Ergebnisse:
Test-AUC: 0.6404

Modell 6 Ergebnisse:
Test-AUC: 0.6778

Modell 7 Ergebnisse:
Test-AUC: 0.6779

Modell 8 Ergebnisse:
Test-AUC: 0.6783

Modell 9 Ergebnisse:
Test-AUC: 0.6712




Test-AUC des Ensembles: 0.6924


In [None]:
# Train on the subjective features
train(
  X_train=X_train_subjective, Y_train=Y_train_subjective,
  X_val=X_val_subjective, Y_val=Y_val_subjective,
  X_test=X_test_subjective, Y_test=Y_test_subjective,
)


Modell 1 Ergebnisse:
Test-AUC: 0.6451

Modell 2 Ergebnisse:
Test-AUC: 0.6270

Modell 3 Ergebnisse:
Test-AUC: 0.6373

Modell 4 Ergebnisse:
Test-AUC: 0.6472

Modell 5 Ergebnisse:
Test-AUC: 0.6306

Modell 6 Ergebnisse:
Test-AUC: 0.6313

Modell 7 Ergebnisse:
Test-AUC: 0.6393

Modell 8 Ergebnisse:
Test-AUC: 0.6265

Modell 9 Ergebnisse:
Test-AUC: 0.6596




Test-AUC des Ensembles: 0.6665
