# Model Training - Injury Risk Predictor

This notebook trains and evaluates multiple models for injury risk prediction.

## Steps:
1. Load training logs and athlete metadata, then engineer and preprocess features
2. Split data by time (train/validation/test) and scale features
3. Train baseline rule-based model
4. Train ML models (Logistic Regression, Random Forest, XGBoost)
5. Evaluate and compare models
6. Save best model

In [None]:
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from src.ml.features import engineer_features_for_dataset
from src.ml.preprocessing import (
    handle_missing_values,
    encode_categorical_features,
    split_data_by_time,
    scale_features,
    prepare_features_for_training
)
from src.ml.train import train_all_models, save_model
from src.ml.evaluate import (
    evaluate_model,
    compare_models,
    create_confusion_matrix,
    plot_roc_curve,
    plot_precision_recall_curve
)
from src.ml.models import get_all_models

## 1. Load Data

In [None]:
# Read the two input tables: training logs first, then athlete metadata.
training_logs = pd.read_csv('../data/training_logs.csv')
athlete_metadata = pd.read_csv('../data/athlete_metadata.csv')

# Sanity summary: dimensions of each table, followed by the mean of the
# 'injured' column rendered as a percentage.
for label, frame in (
    ("Training logs", training_logs),
    ("Athlete metadata", athlete_metadata),
):
    print(f"{label} shape: {frame.shape}")

injury_rate = training_logs['injured'].mean()
print(f"\nInjury rate: {injury_rate:.2%}")

## 2. Engineer Features

In [None]:
# Build the modelling table from the training logs and the athlete metadata.
df = engineer_features_for_dataset(training_logs, athlete_metadata)

# Report the resulting shape and preview the first 20 column names only,
# to keep the cell output short.
preview_columns = df.columns[:20].tolist()
print(f"Data shape after feature engineering: {df.shape}")
print("\nFeature columns:")
print(preview_columns)

## 3. Preprocessing

In [None]:
# Step 1: fill gaps using the 'forward_fill' method of handle_missing_values.
df = handle_missing_values(df, method='forward_fill')

# Step 2: encode categorical columns; the returned encoders are kept in
# `encoders` so they stay available after this cell.
df, encoders = encode_categorical_features(df)

# Confirm the outcome: overall shape and the total count of remaining nulls
# across every column.
remaining_nulls = df.isnull().sum().sum()
print(f"Data shape after preprocessing: {df.shape}")
print(f"\nMissing values: {remaining_nulls}")

## 4. Time-Based Data Splitting

In [None]:
# Chronological split (rather than a random one) to avoid data leakage.
# Week ranges are 14 / 5 / 5 weeks long, i.e. roughly a 60/20/20 split
# by week count:
#   train:      weeks 1-14
#   validation: weeks 15-19
#   test:       weeks 20-24
week_ranges = {
    'train_weeks': (1, 14),
    'val_weeks': (15, 19),
    'test_weeks': (20, 24),
}
X_train, y_train, X_val, y_val, X_test, y_test = split_data_by_time(df, **week_ranges)

# Per-split summary: row count and mean of the target as a percentage.
for split_label, features, target in (
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(f"{split_label} set: {features.shape[0]} samples, {target.mean():.2%} injury rate")

## 5. Scale Features

In [None]:
# Fit the scaler on the training split only ...
X_train_scaled, scaler = scale_features(X_train, fit=True, scaler_type='standard')

# ... then apply that same fitted scaler to validation and test (in that
# order) without refitting. The second element of each returned pair is
# discarded.
scaled_holdouts = [
    scale_features(holdout, fit=False, scaler=scaler)
    for holdout in (X_val, X_test)
]
(X_val_scaled, _), (X_test_scaled, _) = scaled_holdouts

print("Features scaled successfully")
print(f"Training features shape: {X_train_scaled.shape}")

## 6. Train All Models

In [None]:
# Train every model (baseline + ML models) in a single call.
# Flip the flag below to True to enable hyperparameter tuning
# (slower, but intended to give better performance).
tune_hyperparameters = False

# Positional order expected by train_all_models:
# train features/target, validation features/target, test features/target.
data_splits = (
    X_train_scaled, y_train,
    X_val_scaled, y_val,
    X_test_scaled, y_test,
)
results = train_all_models(*data_splits, tune_hyperparameters=tune_hyperparameters)

## 7. Model Comparison

In [None]:
# Create comparison table.
#
# `results` holds one (model, metrics) pair per trained model, plus two
# bookkeeping entries: 'best_model' and 'best_model_name'. Those two must be
# excluded from the table.
#
# In a comprehension the loop target is bound BEFORE the `if` clause is
# evaluated, so destructuring the value in the `for` target would attempt to
# unpack results['best_model_name'] (a name, not a pair) and fail before the
# filter could skip it. Filter on the key first, then index into the pair.
bookkeeping_keys = {'best_model', 'best_model_name'}
comparison_metrics = {
    name: entry[1]  # entry is (model, metrics); keep only the metrics
    for name, entry in results.items()
    if name not in bookkeeping_keys
}
comparison_df = compare_models(comparison_metrics)

print("\nModel Comparison (Validation Set):")
print(comparison_df.to_string(index=False))

## 8. Visualizations

In [None]:
# Pull the winning model and its name out of the training results.
# results['best_model'] is indexed at 0 to obtain the model object.
best_model_name = results['best_model_name']
best_model = results['best_model'][0]

print(f"Best Model: {best_model_name}")

# Predictions on the scaled test split: hard labels for the confusion matrix,
# probabilities for the curve plots.
# NOTE(review): the predict_proba output is passed to the plot helpers
# unchanged; scikit-learn classifiers return one column per class, so confirm
# the helpers accept that shape.
y_test_pred = best_model.predict(X_test_scaled)
y_test_proba = best_model.predict_proba(X_test_scaled)

# Confusion matrix
create_confusion_matrix(
    y_test,
    y_test_pred,
    best_model_name,
    save_path='../models/confusion_matrix.png',
)
print("✓ Confusion matrix saved")

# ROC curve, then precision-recall curve: both helpers take the same
# arguments, so drive them from one table.
curve_plots = (
    (plot_roc_curve, '../models/roc_curve.png', "✓ ROC curve saved"),
    (plot_precision_recall_curve, '../models/pr_curve.png', "✓ Precision-Recall curve saved"),
)
for plot_curve, output_path, done_message in curve_plots:
    plot_curve(y_test, y_test_proba, best_model_name, save_path=output_path)
    print(done_message)

## 9. Feature Importance

In [None]:
# Rank features for the best model. Two cases are handled:
#   * the model exposes `feature_importances_`  -> table + bar chart
#   * the model exposes `coef_`                 -> table ranked by |coefficient|
# A model with neither attribute produces no output here.
if hasattr(best_model, 'feature_importances_'):
    importance_table = pd.DataFrame({
        'feature': X_train_scaled.columns,
        'importance': best_model.feature_importances_
    })
    feature_importance = importance_table.sort_values('importance', ascending=False)
    top_features = feature_importance.head(15)

    print("\nTop 15 Most Important Features:")
    print(top_features.to_string(index=False))

    # Horizontal bar chart of the top 15, drawn through the explicit
    # Figure/Axes interface.
    bar_positions = range(len(top_features))
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(bar_positions, top_features['importance'])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Importance')
    ax.set_title(f'Top 15 Feature Importance - {best_model_name}')
    # Flip the y-axis so the highest-ranked feature appears at the top.
    ax.invert_yaxis()
    fig.tight_layout()
    fig.savefig('../models/feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("✓ Feature importance plot saved")
elif hasattr(best_model, 'coef_'):
    # Linear-model case: use the first row of coef_ and order by absolute
    # value so large negative weights rank alongside large positive ones.
    coefficient_table = pd.DataFrame({
        'feature': X_train_scaled.columns,
        'coefficient': best_model.coef_[0]
    })
    feature_importance = coefficient_table.sort_values(
        'coefficient', key=abs, ascending=False
    )

    print("\nTop 15 Most Important Features (by coefficient magnitude):")
    print(feature_importance.head(15).to_string(index=False))

## 10. Save Best Model

In [None]:
# Hand the best model, the fitted scaler and the model's name to save_model,
# using '../models' as the output directory.
save_model(best_model, scaler, best_model_name, output_dir='../models')

# Final status report, one message per line.
completion_messages = (
    "\n✓ Model training complete!",
    f"✓ Best model ({best_model_name}) saved to models/",
    "✓ Scaler saved to models/",
)
print(*completion_messages, sep="\n")