# In [None]:
# # Notebook 1: Basic Cross-Validation Strategies
#
# ## Goals
# * Understand the limitations of a single validation split (Hold-Out).
# * Implement and understand K-Fold Cross-Validation.
# * Implement and understand Stratified K-Fold Cross-Validation (crucial for classification).
# * Implement and understand Leave-One-Out Cross-Validation (LOOCV).
# * Use `sklearn.model_selection.cross_val_score` for convenient CV evaluation.
#
# We will use the **Development Set** (`X_dev`, `y_dev`) created in Notebook 0. The Test Set remains untouched.

# ## 1. Setup and Loading Data
#
# Let's import libraries and load the development data we prepared.

# +
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    train_test_split,
    KFold,
    StratifiedKFold,
    LeaveOneOut,
    cross_val_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import time

# %matplotlib inline
sns.set(style='whitegrid')

# Assume X_dev, y_dev, groups_dev are loaded from Notebook 0
# If you didn't use %store or are running standalone, regenerate or load them here.
# Example using %store:
# %store -r X_dev y_dev groups_dev X_test y_test groups_test

# For standalone running, let's quickly regenerate similar data:
RANDOM_STATE = 42
try:
    X_dev.shape  # Probe only: raises NameError when X_dev was never defined in this session
    print("Using data loaded from previous notebook.")
except NameError:
    print("Generating synthetic data for standalone execution...")
    from sklearn.datasets import make_classification
    N_SAMPLES_DEV = 400 # Assuming 80% of 500
    N_FEATURES = 20
    N_CLASSES = 2
    N_PATIENTS_DEV = 80 # Rough estimate
    IMBALANCE = 0.8
    X_dev, y_dev = make_classification(
        n_samples=N_SAMPLES_DEV, n_features=N_FEATURES, n_informative=10, n_redundant=5, n_repeated=0,
        n_classes=N_CLASSES, n_clusters_per_class=2, weights=[IMBALANCE, 1.0 - IMBALANCE],
        flip_y=0.05, class_sep=0.8, random_state=RANDOM_STATE
    )

    # Seed NumPy's global RNG BEFORE the first random draw below. Seeding only
    # after np.random.choice would leave the leftover-sample group assignment
    # dependent on whatever state the RNG happened to be in.
    np.random.seed(RANDOM_STATE)

    # Give every synthetic patient the same number of samples ...
    samples_per_patient = N_SAMPLES_DEV // N_PATIENTS_DEV
    groups_dev = np.repeat(np.arange(N_PATIENTS_DEV), samples_per_patient)
    # ... and hand any samples left over by the integer division to random patients.
    remaining_samples = N_SAMPLES_DEV % N_PATIENTS_DEV
    if remaining_samples > 0:
        groups_dev = np.concatenate([groups_dev, np.random.choice(N_PATIENTS_DEV, remaining_samples)])
    # Shuffle so that group ids are not laid out in contiguous runs.
    np.random.shuffle(groups_dev)
    print(f"Generated X_dev shape: {X_dev.shape}, y_dev shape: {y_dev.shape}, groups_dev shape: {groups_dev.shape}")


# Define a simple baseline model
model = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
# -

# ## 2. Hold-Out Validation (Simple Train/Validation Split)
#
# This is the simplest approach but often unreliable due to high variance. We split the *development* data into a single training set and a single validation set. The performance heavily depends on which specific samples end up in the validation set.

# +
def _fit_and_score(estimator, X_fit, y_fit, X_eval, y_eval):
    """Fit `estimator` on one part of a split and score it on the other.

    Returns a 4-tuple: the hard class predictions, the probabilities taken
    from column 1 of `predict_proba`, the accuracy and the ROC AUC, all
    computed on (`X_eval`, `y_eval`).
    """
    estimator.fit(X_fit, y_fit)
    hard_labels = estimator.predict(X_eval)
    positive_proba = estimator.predict_proba(X_eval)[:, 1]
    return (
        hard_labels,
        positive_proba,
        accuracy_score(y_eval, hard_labels),
        roc_auc_score(y_eval, positive_proba),
    )


# --- Split A: a single stratified 75% / 25% train/validation split of the DEVELOPMENT data ---
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, test_size=0.25, random_state=RANDOM_STATE, stratify=y_dev
)
print(f"Train set size: {X_train.shape}")
print(f"Validation set size: {X_val.shape}")

# Train on the training part, evaluate on the validation part.
y_pred_val, y_proba_val, accuracy, auc = _fit_and_score(
    model, X_train, y_train, X_val, y_val
)
print(f"\nHold-Out Validation Performance (random_state={RANDOM_STATE}):")
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC:      {auc:.4f}")

# --- Split B: identical settings except for the seed, to expose split-to-split variability ---
X_train_2, X_val_2, y_train_2, y_val_2 = train_test_split(
    X_dev,
    y_dev,
    test_size=0.25,
    random_state=RANDOM_STATE + 1,
    stratify=y_dev,
)
y_pred_val_2, y_proba_val_2, accuracy_2, auc_2 = _fit_and_score(
    model, X_train_2, y_train_2, X_val_2, y_val_2
)
print(f"\nHold-Out Validation Performance (random_state={RANDOM_STATE + 1}):")
print(f"Accuracy: {accuracy_2:.4f}")
print(f"AUC:      {auc_2:.4f}")
print("\nNote how the performance can change just by changing the random split!")
# -

# ## 3. K-Fold Cross-Validation
#
# K-Fold CV addresses the variance issue of the hold-out method.
# 1. Shuffle the dataset randomly.
# 2. Split the dataset into K equal(ish) folds.
# 3. For each fold `k`:
#     * Use fold `k` as the validation set.
#     * Use the remaining K-1 folds as the training set.
#     * Train and evaluate the model.
# 4. Average the performance scores from the K evaluations.
#
# This uses the data more effectively and gives a more stable estimate. Common K values are 5 or 10.

# +
N_SPLITS_KFOLD = 5
kf = KFold(n_splits=N_SPLITS_KFOLD, shuffle=True, random_state=RANDOM_STATE)

# cross_val_score runs the split/fit/score loop for us.
# Without a 'scoring' argument it falls back to the estimator's .score()
# method (accuracy for classifiers); here the metric is named explicitly.

print(f"--- Running {N_SPLITS_KFOLD}-Fold Cross-Validation ---")

# Accuracy per fold (wall-clock timed).
start_time = time.time()
kfold_accuracies = cross_val_score(
    model,
    X_dev,
    y_dev,
    cv=kf,         # Use the KFold object
    scoring='accuracy',
    n_jobs=-1      # Use all available CPU cores
)
kfold_time = time.time() - start_time

print(f"\nIndividual fold accuracies (K-Fold): {kfold_accuracies}")
print(f"Mean accuracy: {kfold_accuracies.mean():.4f}")
print(f"Std deviation: {kfold_accuracies.std():.4f}")
print(f"Time taken:    {kfold_time:.2f} seconds")

# ROC AUC per fold, using the very same splitter.
start_time = time.time()
kfold_aucs = cross_val_score(
    model, X_dev, y_dev, cv=kf, scoring='roc_auc', n_jobs=-1
)
kfold_auc_time = time.time() - start_time

print(f"\nIndividual fold AUCs (K-Fold): {kfold_aucs}")
print(f"Mean AUC:      {kfold_aucs.mean():.4f}")
print(f"Std deviation: {kfold_aucs.std():.4f}")
print(f"Time taken:    {kfold_auc_time:.2f} seconds")


# Manual loop example (to see indices)
print("\n--- Manual K-Fold Loop Example (First 2 Folds) ---")
# Plain KFold ignores the labels, so a fold may miss the highest class label
# entirely. Fixing the number of bins keeps one entry per class in every
# printed distribution (a missing class shows up as 0.0 instead of vanishing).
n_label_bins = int(np.max(y_dev)) + 1
for fold_counter, (train_index, val_index) in enumerate(kf.split(X_dev), start=1):
    if fold_counter > 2:
        break
    X_train_fold, X_val_fold = X_dev[train_index], X_dev[val_index]
    y_train_fold, y_val_fold = y_dev[train_index], y_dev[val_index]

    print(f"\nFold {fold_counter}:")
    print(f"  Train indices: {train_index[:5]}...{train_index[-5:]} (Size: {len(train_index)})")
    print(f"  Validation indices: {val_index[:5]}...{val_index[-5:]} (Size: {len(val_index)})")

    # Check class distribution in this fold (can be uneven with standard KFold)
    train_dist = np.bincount(y_train_fold, minlength=n_label_bins) / len(y_train_fold)
    val_dist = np.bincount(y_val_fold, minlength=n_label_bins) / len(y_val_fold)
    print(f"  Train class distribution: {train_dist}")
    print(f"  Val class distribution:   {val_dist}")
# -

# **Note:** Standard K-Fold does not guarantee that the class proportions are maintained within each fold, which can be problematic for imbalanced datasets.

# ## 4. Stratified K-Fold Cross-Validation
#
# Stratified K-Fold is a variation designed specifically for classification tasks, particularly when class imbalance exists. It ensures that each fold has approximately the **same percentage of samples from each target class** as the complete dataset.

# +
def _class_shares(labels):
    """Return the fraction of entries in `labels` that fall into each integer class."""
    return np.bincount(labels) / len(labels)


N_SPLITS_SKFOLD = 5
skf = StratifiedKFold(
    n_splits=N_SPLITS_SKFOLD,
    shuffle=True,
    random_state=RANDOM_STATE,
)

print(f"\n--- Running {N_SPLITS_SKFOLD}-Fold Stratified Cross-Validation ---")

# Accuracy on every stratified fold (wall-clock timed).
start_time = time.time()
skfold_accuracies = cross_val_score(
    estimator=model,
    X=X_dev,
    y=y_dev,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,
)
skfold_time = time.time() - start_time

print(f"\nIndividual fold accuracies (Stratified K-Fold): {skfold_accuracies}")
print(f"Mean accuracy: {skfold_accuracies.mean():.4f}")
print(f"Std deviation: {skfold_accuracies.std():.4f}")
print(f"Time taken:    {skfold_time:.2f} seconds")


# ROC AUC on the same stratified splitter.
start_time = time.time()
skfold_aucs = cross_val_score(
    estimator=model,
    X=X_dev,
    y=y_dev,
    cv=skf,
    scoring='roc_auc',
    n_jobs=-1,
)
skfold_auc_time = time.time() - start_time

print(f"\nIndividual fold AUCs (Stratified K-Fold): {skfold_aucs}")
print(f"Mean AUC:      {skfold_aucs.mean():.4f}")
print(f"Std deviation: {skfold_aucs.std():.4f}")
print(f"Time taken:    {skfold_auc_time:.2f} seconds")


# Walk the first two folds by hand to make the stratification visible.
print("\n--- Manual Stratified K-Fold Loop Example (First 2 Folds) ---")
# The splitter needs y as well: the labels are what it stratifies on.
for fold_counter, (train_index, val_index) in enumerate(skf.split(X_dev, y_dev), start=1):
    if fold_counter > 2:
        break
    X_train_fold = X_dev[train_index]
    X_val_fold = X_dev[val_index]
    y_train_fold = y_dev[train_index]
    y_val_fold = y_dev[val_index]

    print(f"\nFold {fold_counter}:")
    print(f"  Train indices: {train_index[:5]}...{train_index[-5:]} (Size: {len(train_index)})")
    print(f"  Validation indices: {val_index[:5]}...{val_index[-5:]} (Size: {len(val_index)})")

    # With stratification the two distributions should be close to each
    # other and to the overall class balance.
    train_dist = _class_shares(y_train_fold)
    val_dist = _class_shares(y_val_fold)
    print(f"  Train class distribution: {train_dist}")
    print(f"  Val class distribution:   {val_dist}")
# -

# **Observation:** The mean scores might be similar between K-Fold and Stratified K-Fold if the dataset isn't heavily imbalanced or is large enough. However, the standard deviation might be lower with Stratified K-Fold, and it's generally the safer choice for classification.

# ## 5. Leave-One-Out Cross-Validation (LOOCV)
#
# LOOCV is an extreme case of K-Fold where K equals the number of samples (N).
# 1. For each sample `i`:
#     * Use sample `i` as the validation set (size 1).
#     * Use the remaining N-1 samples as the training set.
#     * Train and evaluate.
# 2. Average the N performance scores.
#
# **Pros:** Uses maximum data for training in each step (low bias estimate). Deterministic (no randomness in splits).
# **Cons:** Computationally **very expensive** (N model trainings). Can have high variance in the performance estimate (sensitive to individual points).

# +
loo = LeaveOneOut()
n_splits_loo = loo.get_n_splits(X_dev)
print(f"\n--- Running Leave-One-Out Cross-Validation ({n_splits_loo} Splits) ---")

# WARNING: LOOCV fits one model per split, so it gets slow quickly.
# The demo therefore only runs when there are at most 500 splits; shrink the
# development set (N_SAMPLES in Notebook 0) if you want it to run faster.

if n_splits_loo > 500:
    print(f"\nSkipping LOOCV execution as number of splits ({n_splits_loo}) is large.")
else:
    # Accuracy: each split is scored on a single held-out sample.
    start_time = time.time()
    loo_accuracies = cross_val_score(
        estimator=model,
        X=X_dev,
        y=y_dev,
        cv=loo,
        scoring='accuracy',
        n_jobs=-1,
    )
    loo_time = time.time() - start_time

    print(f"\nMean accuracy (LOOCV): {loo_accuracies.mean():.4f}")
    print(f"Std deviation: {loo_accuracies.std():.4f}") # Std dev might be high
    print(f"Time taken:    {loo_time:.2f} seconds")

    # ROC AUC is attempted here mainly as a demonstration: with a single
    # validation sample per split the metric is not well defined.
    # error_score='raise' makes cross_val_score re-raise a scorer failure
    # instead of substituting a placeholder score, so the ValueError branch
    # below can report it (exact behaviour may depend on the scikit-learn
    # version - TODO confirm).
    try:
        start_time = time.time()
        loo_aucs = cross_val_score(
            estimator=model,
            X=X_dev,
            y=y_dev,
            cv=loo,
            scoring='roc_auc',
            n_jobs=-1,
            error_score='raise',
        )
        loo_auc_time = time.time() - start_time
    except ValueError as e:
        print(f"\nCould not calculate AUC with LOOCV: {e}")
        print(" (This often happens as AUC is undefined for a single validation sample).")
    else:
        print(f"\nMean AUC (LOOCV):      {loo_aucs.mean():.4f}")
        print(f"Std deviation: {loo_aucs.std():.4f}")
        print(f"Time taken:    {loo_auc_time:.2f} seconds")

# -

# ## 6. Summary
#
# We have explored basic CV techniques:
# *   **Hold-Out:** Simple, but high variance. Not generally recommended for final evaluation.
# *   **K-Fold:** Good general standard, balances bias/variance well. Use `shuffle=True`.
# *   **Stratified K-Fold:** **Recommended for classification**, especially with imbalanced data. Ensures class proportions per fold.
# *   **LOOCV:** Low bias, very high computational cost. Potentially high variance in estimate. Use only for very small datasets.
#
# In the next notebooks, we will cover CV techniques crucial for specific data types common in medical AI, such as grouped/patient data and time series data.