<a href="https://colab.research.google.com/github/pascalghanimi/Injury-Prediction-in-Runners/blob/main/LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Data extraction for LightGBM
import pickle

def _read_pickle(path):
  """Deserialize and return the object stored in the pickle file at `path`."""
  # NOTE: unpickling can execute arbitrary code; only load files you trust.
  with open(path, "rb") as handle:
    return pickle.load(handle)

# One pickle per feature view, plus the labels.
features = _read_pickle("features.pkl")
features_days = _read_pickle("features_days.pkl")
features_weeks = _read_pickle("features_weeks.pkl")
features_objective = _read_pickle("features_objective.pkl")
features_subjective = _read_pickle("features_subjective.pkl")
labels = _read_pickle("labels.pkl")

# Quick look at the entries stored under key 0.
print(features[0][0])  # first athlete, first row
print(features_days[0][0])
print(features_weeks[0])
print(labels[0])

print(len(features[0][0]))  # total amount of features

In [None]:
import numpy as np

# Use ONE canonical athlete order for every array so that row i of each feature
# matrix and entry i of Y always describe the same sample. Iterating each dict
# in its own key order would silently mis-pair rows and labels if the pickles
# were written with different key orders.
athlete_ids = list(features)

# Every feature view and the labels must cover exactly the same athletes.
for _view_name, _view in (
    ("features_days", features_days),
    ("features_weeks", features_weeks),
    ("features_objective", features_objective),
    ("features_subjective", features_subjective),
    ("labels", labels),
):
    if set(_view) != set(athlete_ids):
        raise ValueError(f"{_view_name} does not contain the same athlete ids as features")

X = np.vstack([features[athlete_id] for athlete_id in athlete_ids])
X_days = np.vstack([features_days[athlete_id] for athlete_id in athlete_ids])
X_weeks = np.vstack([features_weeks[athlete_id] for athlete_id in athlete_ids])
X_objective = np.vstack([features_objective[athlete_id] for athlete_id in athlete_ids])
X_subjective = np.vstack([features_subjective[athlete_id] for athlete_id in athlete_ids])

Y = np.hstack([labels[athlete_id] for athlete_id in athlete_ids])

# Each stacked matrix must provide exactly one row per label.
for _matrix_name, _matrix in (
    ("X", X),
    ("X_days", X_days),
    ("X_weeks", X_weeks),
    ("X_objective", X_objective),
    ("X_subjective", X_subjective),
):
    if _matrix.shape[0] != Y.shape[0]:
        raise ValueError(
            f"{_matrix_name} has {_matrix.shape[0]} rows but Y has {Y.shape[0]} labels"
        )


In [None]:
from sklearn.model_selection import train_test_split

def _split_70_15_15(feature_matrix, targets):
    """Split into 70% train, 15% validation and 15% test with a fixed seed.

    Returns the same eight arrays the two chained train_test_split calls
    produce: (X_train, X_temp, Y_train, Y_temp, X_val, X_test, Y_val, Y_test).
    """
    # First cut: 70% train vs. 30% held out.
    x_tr, x_rest, y_tr, y_rest = train_test_split(
        feature_matrix, targets, test_size=0.3, random_state=42
    )
    # Second cut: halve the held-out part into validation and test.
    x_va, x_te, y_va, y_te = train_test_split(
        x_rest, y_rest, test_size=0.5, random_state=42
    )
    return x_tr, x_rest, y_tr, y_rest, x_va, x_te, y_va, y_te

# NOTE(review): the split is row-wise on the stacked matrix, so rows of one
# athlete can end up in train, validation and test at once - confirm intended.
(X_train, X_temp, Y_train, Y_temp,
 X_val, X_test, Y_val, Y_test) = _split_70_15_15(X, Y)

(X_train_days, X_temp_days, Y_train_days, Y_temp_days,
 X_val_days, X_test_days, Y_val_days, Y_test_days) = _split_70_15_15(X_days, Y)

(X_train_weeks, X_temp_weeks, Y_train_weeks, Y_temp_weeks,
 X_val_weeks, X_test_weeks, Y_val_weeks, Y_test_weeks) = _split_70_15_15(X_weeks, Y)

(X_train_subjective, X_temp_subjective, Y_train_subjective, Y_temp_subjective,
 X_val_subjective, X_test_subjective, Y_val_subjective, Y_test_subjective) = _split_70_15_15(X_subjective, Y)

(X_train_objective, X_temp_objective, Y_train_objective, Y_temp_objective,
 X_val_objective, X_test_objective, Y_val_objective, Y_test_objective) = _split_70_15_15(X_objective, Y)

# Shapes of train / validation / test for every feature view, then the labels.
for _train_part, _val_part, _test_part in (
    (X_train, X_val, X_test),
    (X_train_days, X_val_days, X_test_days),
    (X_train_weeks, X_val_weeks, X_test_weeks),
    (X_train_subjective, X_val_subjective, X_test_subjective),
    (X_train_objective, X_val_objective, X_test_objective),
    (Y_train, Y_val, Y_test),
):
    print(_train_part.shape, _val_part.shape, _test_part.shape)



In [None]:
!pip install optuna

In [None]:
def get_training_batch(X_train, Y_train, batch_size, rng=None):
    """Draw a class-balanced random batch, sampling with replacement.

    ``batch_size // 2`` rows are drawn from the rows labelled 1 (injured) and
    the same number from the rows labelled 0 (uninjured), then the combined
    selection is shuffled. An odd ``batch_size`` therefore yields
    ``batch_size - 1`` rows.

    Args:
        X_train: Feature array that can be indexed with an integer index array.
        Y_train: Label array aligned with ``X_train``; compared against 0 and 1.
        batch_size: Requested number of rows in the batch.
        rng: Optional random source exposing ``choice`` and ``shuffle``
            (e.g. ``np.random.default_rng(seed)``) for reproducible batches.
            When ``None`` the global ``np.random`` state is used, exactly as
            before this parameter existed.

    Returns:
        Tuple ``(X_batch, Y_batch)`` with the selected rows and their labels.

    Raises:
        ValueError: If samples are requested but one of the classes has no rows.
    """
    sampler = np.random if rng is None else rng
    per_class = batch_size // 2

    injured_indices = np.where(Y_train == 1)[0]
    uninjured_indices = np.where(Y_train == 0)[0]

    # Fail with a clear message instead of numpy's generic empty-array error.
    if per_class > 0 and (injured_indices.size == 0 or uninjured_indices.size == 0):
        raise ValueError(
            "get_training_batch needs at least one sample of each class "
            f"(found {injured_indices.size} with label 1 and "
            f"{uninjured_indices.size} with label 0)"
        )

    injured_sample = sampler.choice(injured_indices, size=per_class, replace=True)
    uninjured_sample = sampler.choice(uninjured_indices, size=per_class, replace=True)

    # Shuffle so the two classes are interleaved rather than concatenated.
    selected_indices = np.concatenate([injured_sample, uninjured_sample])
    sampler.shuffle(selected_indices)

    X_batch = X_train[selected_indices]
    Y_batch = Y_train[selected_indices]

    return X_batch, Y_batch

In [None]:
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import warnings

warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

def lightgbm_objective(trial):
    """Optuna objective: fit one LightGBM classifier and score it on validation.

    A model with trial-sampled hyperparameters is fitted on a class-balanced
    batch drawn from the module-level training split, with early stopping
    monitored on the module-level validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC of the fitted model on ``(X_val, Y_val)``.
    """
    # Settings that stay the same in every trial.
    fixed_settings = {"objective": "binary", "metric": "auc", "verbose": -1}

    # Search space. The L1/L2 penalties are registered with the study under
    # "lambda_l1"/"lambda_l2" but handed to the classifier as reg_alpha/reg_lambda.
    sampled_settings = dict(
        learning_rate=trial.suggest_float("learning_rate", 0.005, 0.2, log=True),
        max_depth=trial.suggest_int("max_depth", 3, 12),
        num_leaves=trial.suggest_int("num_leaves", 7, 255),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
        min_child_samples=trial.suggest_int("min_child_samples", 10, 200),
        reg_alpha=trial.suggest_float("lambda_l1", 0.0, 5.0),
        reg_lambda=trial.suggest_float("lambda_l2", 0.0, 5.0),
        boosting_type=trial.suggest_categorical("boosting_type", ["gbdt", "dart", "goss"]),
    )

    # A fresh balanced batch is drawn for every trial.
    batch_X, batch_Y = get_training_batch(X_train, Y_train, 4096)

    classifier = lgb.LGBMClassifier(**fixed_settings, **sampled_settings)
    stop_early = lgb.early_stopping(stopping_rounds=50, verbose=False)
    classifier.fit(
        batch_X, batch_Y,
        eval_set=[(X_val, Y_val)],
        callbacks=[stop_early],
    )

    # The trial's value is the validation AUC of column 1 of predict_proba.
    positive_proba = classifier.predict_proba(X_val)[:, 1]
    return roc_auc_score(Y_val, positive_proba)

# Start the Optuna study; the objective returns validation AUC, so maximize it.
N_OPTUNA_TRIALS = 100
lgb_study = optuna.create_study(direction="maximize")
lgb_study.optimize(lightgbm_objective, n_trials=N_OPTUNA_TRIALS)

# Hyperparameters of the best trial, keyed by the names used in suggest_*.
print("Best LightGBM-parameters:", lgb_study.best_params)

In [None]:
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
import warnings

def train(X_train, Y_train, X_val, Y_val, X_test, Y_test, num_models=9, batch_size=4096):
  """Train an ensemble of LightGBM classifiers, calibrate it and score it.

  Each ensemble member is fitted on its own class-balanced batch drawn with
  ``get_training_batch`` and uses the best hyperparameters found by the
  module-level ``lgb_study``. Every fitted member is then wrapped in a sigmoid
  ``CalibratedClassifierCV`` fitted on the validation split, and the ensemble
  prediction is the mean of the calibrated positive-class probabilities.

  Args:
    X_train, Y_train: Training split the balanced batches are drawn from.
    X_val, Y_val: Validation split, used as ``eval_set`` and for calibration.
    X_test, Y_test: Test split used for the printed and returned AUC values.
    num_models: Number of ensemble members to train (default 9).
    batch_size: Size requested from ``get_training_batch`` per member
      (default 4096).

  Returns:
    Tuple ``(calibrated_models, test_auc)``: the list of calibrated
    classifiers and the ROC AUC of their averaged probabilities on the test
    split.

  Raises:
    ValueError: If ``num_models`` is smaller than 1.
  """
  if num_models < 1:
    raise ValueError("num_models must be at least 1")

  warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

  # Copy the tuned hyperparameters into a plain dict first, then pin the
  # settings that are not part of the search.
  params = {**lgb_study.best_params}
  params.update({
      "objective": "binary",
      "metric": "auc",
      "verbose": -1
  })

  models = []

  for model_idx in range(num_models):
    # Every member sees a different random balanced batch.
    X_batch, Y_batch = get_training_batch(X_train, Y_train, batch_size)
    model = lgb.LGBMClassifier(**params)
    model.fit(X_batch, Y_batch, eval_set=[(X_val, Y_val)])

    # Per-member test AUC, measured before calibration.
    Y_pred_test = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(Y_test, Y_pred_test)

    print(f"\nLightGBM Modell {model_idx + 1} Results:")
    print(f"Test-AUC: {test_auc:.4f}")
    models.append(model)

  # cv="prefit": the members are already fitted, only the calibrator is
  # fitted here, on the validation split.
  calibrated_models = [
          CalibratedClassifierCV(m, method='sigmoid', cv="prefit").fit(X_val, Y_val) for m in models
  ]

  def ensemble_predict(models, X):
    """Average column 1 of predict_proba over all given models."""
    test_probas = np.stack([m.predict_proba(X)[:, 1] for m in models])
    return np.mean(test_probas, axis=0)

  Y_probab_test = ensemble_predict(calibrated_models, X_test)
  test_auc = roc_auc_score(Y_test, Y_probab_test)

  print(f"Test-AUC of LightGBM Ensembles: {test_auc:.4f}")

  return calibrated_models, test_auc

In [None]:
# Training with all features (weeks and days combined).
# Keeps the calibrated ensemble for export; the returned test AUC is discarded.
lgbm_models, _ = train(X_train, Y_train, X_val, Y_val, X_test, Y_test)

In [None]:
# Training with days data.
# Keeps the calibrated ensemble for export; the returned test AUC is discarded.
lgbm_models_days, _ = train(X_train_days, Y_train_days, X_val_days, Y_val_days, X_test_days, Y_test_days)

In [None]:
# Training with weeks data.
# Keeps the calibrated ensemble for export; the returned test AUC is discarded.
lgbm_models_weeks, _ = train(X_train_weeks, Y_train_weeks, X_val_weeks, Y_val_weeks, X_test_weeks, Y_test_weeks)

In [None]:
# Training with objective data.
# Keeps the calibrated ensemble for export; the returned test AUC is discarded.
lgbm_models_objective, _ = train(X_train_objective, Y_train_objective, X_val_objective, Y_val_objective, X_test_objective, Y_test_objective)

In [None]:
# Training with subjective data.
# Keeps the calibrated ensemble for export; the returned test AUC is discarded.
lgbm_models_subjective, _ = train(X_train_subjective, Y_train_subjective, X_val_subjective, Y_val_subjective, X_test_subjective, Y_test_subjective)

In [None]:
# How many times the full ensemble training is repeated per feature set.
amount_of_trainings = 10
# One list per feature set; each collects the ensemble test AUC of every repetition.
all_auc_scores = []
auc_scores_days = []
auc_scores_weeks = []
auc_scores_objective = []
auc_scores_subjective = []

def calculate_average_auc_over_10_rounds(train_fn, auc_scores_array, n_rounds=None):
  """Run ``train_fn`` repeatedly and summarise the AUC values it returns.

  Args:
    train_fn: Zero-argument callable returning a ``(models, auc)`` pair; only
      the AUC is used.
    auc_scores_array: List the AUC of every round is appended to (mutated in
      place). Values already in the list are included in the statistics.
    n_rounds: Number of rounds to run. When ``None`` the module-level
      ``amount_of_trainings`` is used.

  Returns:
    Tuple ``(mean_auc_score, std_auc_score)`` over ``auc_scores_array``, so
    callers no longer have to recompute what is printed here.
  """
  if n_rounds is None:
    n_rounds = amount_of_trainings

  for _ in range(n_rounds):
    _, auc = train_fn()
    auc_scores_array.append(auc)

  mean_auc_score = np.mean(auc_scores_array)
  std_auc_score = np.std(auc_scores_array)
  print(f"Mean AUC Score: {mean_auc_score}")
  print(f"Standard Deviation of AUC Scores: {std_auc_score}")
  return mean_auc_score, std_auc_score

# Repeat the ensemble training for each feature set, in the same order as
# before, and collect the test AUCs in the matching list.
for split_arrays, score_list in (
    ((X_train, Y_train, X_val, Y_val, X_test, Y_test), all_auc_scores),
    ((X_train_days, Y_train_days, X_val_days, Y_val_days, X_test_days, Y_test_days), auc_scores_days),
    ((X_train_weeks, Y_train_weeks, X_val_weeks, Y_val_weeks, X_test_weeks, Y_test_weeks), auc_scores_weeks),
    ((X_train_objective, Y_train_objective, X_val_objective, Y_val_objective, X_test_objective, Y_test_objective), auc_scores_objective),
    ((X_train_subjective, Y_train_subjective, X_val_subjective, Y_val_subjective, X_test_subjective, Y_test_subjective), auc_scores_subjective),
):
  # The lambda is invoked inside this iteration, so it sees the current split.
  calculate_average_auc_over_10_rounds(lambda: train(*split_arrays), score_list)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Feature sets and their collected AUC scores, in plotting order.
feature_set_names = ["All Features", "Days", "Weeks", "Objective", "Subjective"]
auc_score_lists = [all_auc_scores, auc_scores_days, auc_scores_weeks, auc_scores_objective, auc_scores_subjective]

# Boxplot: distribution of the AUC scores per feature set.
plt.figure(figsize=(10, 6))
plt.boxplot(auc_score_lists, patch_artist=True)
# Label the boxes through xticks instead of boxplot(labels=...): that keyword
# is deprecated since Matplotlib 3.9, while xticks works on old and new
# releases. Boxes are drawn at positions 1..N by default.
plt.xticks(range(1, len(feature_set_names) + 1), feature_set_names)
plt.ylabel("AUC Score")
plt.title("Vergleich der AUC Scores über verschiedene Feature-Sets")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# Mean AUC per feature set.
mean_auc_scores = {
    name: np.mean(scores) for name, scores in zip(feature_set_names, auc_score_lists)
}

# Bar chart of the means; the y-axis is zoomed to the observed range so the
# small differences between feature sets stay visible.
plt.figure(figsize=(10, 6))
plt.bar(list(mean_auc_scores.keys()), list(mean_auc_scores.values()), color=["blue", "green", "orange", "red", "purple"], alpha=0.7)
plt.ylabel("Mean AUC Score")
plt.title("Mittlere AUC Scores für verschiedene Feature-Sets")
plt.ylim(min(mean_auc_scores.values()) - 0.01, max(mean_auc_scores.values()) + 0.01)
plt.grid(axis="y", linestyle="--", alpha=0.7)

# Print each mean just above its bar.
for i, v in enumerate(mean_auc_scores.values()):
    plt.text(i, v + 0.002, f"{v:.4f}", ha="center", fontsize=10, fontweight="bold")

plt.show()

In [None]:
# Single model export (only all features since it has the best probabilities overall)
# joblib is otherwise only imported in a later cell, so import it here to keep
# this cell working on a fresh kernel (Restart & Run All).
import joblib

joblib.dump(lgbm_models, 'lgbm_models.pkl')

In [None]:
# Model export for Ensemble prediction later
import joblib

# Write each calibrated ensemble returned by train() to its own file
# (relative paths, so they land in the current working directory).
joblib.dump(lgbm_models, 'lgbm_models.pkl')
joblib.dump(lgbm_models_days, 'lgbm_models_days.pkl')
joblib.dump(lgbm_models_weeks, 'lgbm_models_weeks.pkl')
joblib.dump(lgbm_models_objective, 'lgbm_models_objective.pkl')
# Last expression of the cell: its return value is what the notebook displays.
joblib.dump(lgbm_models_subjective, 'lgbm_models_subjective.pkl')

['lgbm_models_subjective.pkl']