# Analyze Per-Fold Performance Variance

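Investigate why some cross-validation folds perform better than others by comparing per-fold SVM and baseline metrics, test-set composition, and predictions from the best and worst folds.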

In [None]:
from core.config import initialize_notebook
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Set up the notebook environment with regenerate_run_id=False
# (presumably so paths resolve to the previous run -- confirm in core.config).
env = initialize_notebook(regenerate_run_id=False)

# Keep the run-level config handy and read the seed from it.
run_cfg = env.configs.run
seed = run_cfg['seed']

## Load Results from Previous Run

In [None]:
# Task whose saved SVM results are inspected (edit to analyze a different task)
task_name = "clinical_vs_control"

# Results live under outputs/<run_name>/<run_id>/seed_<seed>/svm/<task_name>/results.pkl
run_dir = env.repo_root / "outputs" / run_cfg['run_name'] / run_cfg['run_id']
svm_dir = run_dir / f"seed_{seed}" / "svm" / task_name
results_path = svm_dir / "results.pkl"

# NOTE: unpickling can execute arbitrary code; only load files written by this project.
with open(results_path, "rb") as fh:
    results = pickle.load(fh)

# Per-fold result records for the SVM and for the baseline model
svm_folds, baseline_folds = results['svm_folds'], results['baseline_folds']

print(f"Loaded results for: {task_name}")
print(f"Number of folds: {len(svm_folds)}")

## Per-Fold Performance Comparison

In [None]:
# Build one summary row per fold, pairing each SVM fold with its baseline fold.

# zip() stops at the shorter input, so a length mismatch would silently drop
# folds and misalign the comparison; fail loudly instead.
if len(svm_folds) != len(baseline_folds):
    raise ValueError(
        f"Fold count mismatch: {len(svm_folds)} SVM folds vs "
        f"{len(baseline_folds)} baseline folds"
    )

fold_data = []

for fold_idx, (svm_fold, baseline_fold) in enumerate(zip(svm_folds, baseline_folds)):
    svm_metrics = svm_fold['metrics']
    baseline_metrics = baseline_fold['metrics']

    # Compute the test-set counts once. The positive count is the label sum,
    # which assumes labels are encoded as 0/1 -- TODO confirm against the pipeline.
    y_test = np.asarray(svm_fold['y_test'])
    n_test = len(y_test)
    n_positive = int(y_test.sum())

    fold_info = {
        'Fold': fold_idx + 1,  # 1-based fold number for display
        # A missing ROC-AUC is recorded as NaN rather than 0: a 0 would look like
        # a real (worst possible) score and distort the mean/min/worst-fold
        # analysis, whereas pandas skips NaN in those reductions.
        'SVM_ROC_AUC': svm_metrics.get('roc_auc', np.nan),
        'SVM_Balanced_Acc': svm_metrics['balanced_accuracy'],
        'SVM_Accuracy': svm_metrics['accuracy'],
        'Baseline_ROC_AUC': baseline_metrics.get('roc_auc', np.nan),
        'Baseline_Balanced_Acc': baseline_metrics['balanced_accuracy'],
        'N_Test': n_test,
        'N_Positive_Test': n_positive,
        'N_Negative_Test': n_test - n_positive,
        # Stored as text so the hyperparameter dict displays cleanly in the table
        'Best_Params': str(svm_fold.get('best_params', {})),
    }
    fold_data.append(fold_info)

df_folds = pd.DataFrame(fold_data)
df_folds

## Visualize Performance Across Folds

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
x = df_folds['Fold']

# Top row: per-fold line plots (SVM vs. baseline with a chance line at 0.5).
# Both panels share the same styling, so they are drawn from one table.
line_panels = (
    (axes[0, 0], 'SVM_ROC_AUC', 'Baseline_ROC_AUC',
     'ROC-AUC', 'ROC-AUC by Fold'),
    (axes[0, 1], 'SVM_Balanced_Acc', 'Baseline_Balanced_Acc',
     'Balanced Accuracy', 'Balanced Accuracy by Fold'),
)
for ax, svm_col, baseline_col, y_label, title in line_panels:
    ax.plot(x, df_folds[svm_col], 'o-', label='SVM', linewidth=2, markersize=8)
    ax.plot(x, df_folds[baseline_col], 's-', label='Baseline', linewidth=2, markersize=8)
    ax.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5, label='Chance')
    ax.set_xlabel('Fold', fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)
    ax.set_xticks(x)

# Bottom-left: side-by-side bars of positive/negative test subjects per fold
composition_ax = axes[1, 0]
composition_ax.bar(x - 0.2, df_folds['N_Positive_Test'], width=0.4,
                   label='Positive (Clinical)', alpha=0.7)
composition_ax.bar(x + 0.2, df_folds['N_Negative_Test'], width=0.4,
                   label='Negative (Control)', alpha=0.7)
composition_ax.set_xlabel('Fold', fontsize=12)
composition_ax.set_ylabel('Number of Subjects', fontsize=12)
composition_ax.set_title('Test Set Composition by Fold', fontsize=14, fontweight='bold')
composition_ax.legend()
composition_ax.grid(alpha=0.3, axis='y')
composition_ax.set_xticks(x)

# Bottom-right: ROC-AUC against the negative-to-positive ratio of each test set
imbalance_ratio = df_folds['N_Negative_Test'] / df_folds['N_Positive_Test']
svm_auc = df_folds['SVM_ROC_AUC']
scatter_ax = axes[1, 1]
scatter_ax.scatter(imbalance_ratio, svm_auc, s=100, alpha=0.6, label='SVM')
scatter_ax.scatter(imbalance_ratio, df_folds['Baseline_ROC_AUC'], s=100, alpha=0.6,
                   marker='s', label='Baseline')
# Tag every SVM point with its fold number
for fold_label, ratio_value, auc_value in zip(df_folds['Fold'], imbalance_ratio, svm_auc):
    scatter_ax.annotate(f'F{fold_label}', (ratio_value, auc_value),
                        fontsize=9, ha='right')
scatter_ax.set_xlabel('Imbalance Ratio (Neg/Pos)', fontsize=12)
scatter_ax.set_ylabel('ROC-AUC', fontsize=12)
scatter_ax.set_title('Performance vs Imbalance Ratio', fontsize=14, fontweight='bold')
scatter_ax.legend()
scatter_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Text version of the ratios, printed as positives:negatives = 1:ratio
print("\nImbalance Ratio per Fold:")
for fold_number, ratio in enumerate(imbalance_ratio, start=1):
    print(f"  Fold {fold_number}: 1:{ratio:.1f}")

## Analyze Best Fold

In [None]:
# Identify the folds with the highest and lowest SVM ROC-AUC and report the gap.
svm_auc = df_folds['SVM_ROC_AUC']

# idxmax()/idxmin() return index *labels*. Convert them to integer positions so
# the same value is valid both for .iloc here and for indexing the svm_folds
# list in the confusion-matrix cell.
best_fold_idx = df_folds.index.get_loc(svm_auc.idxmax())
best_fold_num = df_folds.iloc[best_fold_idx]['Fold']
best_roc = df_folds.iloc[best_fold_idx]['SVM_ROC_AUC']

print(f"Best Performing Fold: {int(best_fold_num)}")
print(f"ROC-AUC: {best_roc:.3f}")
print("\nBest fold details:")
print(df_folds.iloc[best_fold_idx])

# Compare to worst fold
worst_fold_idx = df_folds.index.get_loc(svm_auc.idxmin())
worst_fold_num = df_folds.iloc[worst_fold_idx]['Fold']
worst_roc = df_folds.iloc[worst_fold_idx]['SVM_ROC_AUC']

print(f"\n\nWorst Performing Fold: {int(worst_fold_num)}")
print(f"ROC-AUC: {worst_roc:.3f}")
print("\nWorst fold details:")
print(df_folds.iloc[worst_fold_idx])

# The relative gap is undefined when the worst fold scores exactly 0; report
# "n/a" instead of dividing by zero (which would print inf/nan with a warning).
roc_gap = best_roc - worst_roc
if worst_roc != 0:
    relative_gap = f"{roc_gap / worst_roc * 100:.1f}%"
else:
    relative_gap = "n/a"

print(f"\n\nPerformance Gap: {roc_gap:.3f} ({relative_gap} relative difference)")

## Examine Predictions from Best vs Worst Fold

In [None]:
from sklearn.metrics import confusion_matrix


def _print_confusion_report(title, fold, labels):
    """Print a binary confusion matrix with sensitivity and specificity for one fold.

    Args:
        title: Heading line printed above the matrix.
        fold: Fold record providing 'y_test' (true labels) and 'y_pred'
            (predicted labels).
        labels: Class labels in row/column order; the first is reported as
            "Neg" and the second as "Pos".

    Returns:
        The confusion matrix computed for the fold.
    """
    # Passing labels explicitly keeps the matrix 2x2 even when this fold's
    # y_test/y_pred contain only one class; otherwise the matrix collapses to
    # 1x1 and the cm[0, 1] lookups below raise IndexError.
    cm = confusion_matrix(fold['y_test'], fold['y_pred'], labels=labels)

    print(title)
    print("                 Predicted")
    print("                 Neg   Pos")
    print(f"Actual Neg      {cm[0,0]:4d}  {cm[0,1]:4d}")
    print(f"Actual Pos      {cm[1,0]:4d}  {cm[1,1]:4d}")

    # A class with no true samples makes its rate undefined; report NaN
    # explicitly rather than dividing by zero.
    n_actual_pos = cm[1,0] + cm[1,1]
    n_actual_neg = cm[0,0] + cm[0,1]
    sensitivity = cm[1,1] / n_actual_pos if n_actual_pos else float('nan')
    specificity = cm[0,0] / n_actual_neg if n_actual_neg else float('nan')
    print(f"\nSensitivity (Recall): {sensitivity:.3f}")
    print(f"Specificity: {specificity:.3f}")
    return cm


# Sorted classes seen in the test labels of all folds. This matches the order
# confusion_matrix infers on its own, but does not depend on a single fold
# containing every class. The report assumes a binary task (two classes).
class_labels = np.unique(
    np.concatenate([np.asarray(fold['y_test']).ravel() for fold in svm_folds])
)

# Best fold
best_fold = svm_folds[best_fold_idx]
cm_best = _print_confusion_report(
    f"Confusion Matrix - Best Fold ({int(best_fold_num)}):", best_fold, class_labels
)

# Worst fold
worst_fold = svm_folds[worst_fold_idx]
cm_worst = _print_confusion_report(
    f"\n\nConfusion Matrix - Worst Fold ({int(worst_fold_num)}):", worst_fold, class_labels
)

## Summary Statistics

In [None]:
# Print mean/std/min/max ROC-AUC across folds for both models, then the
# hyperparameters selected in each fold.
print("="*60)
print("SUMMARY STATISTICS ACROSS FOLDS")
print("="*60)

# Same report for both models; only the heading and source column differ.
roc_auc_sections = (
    ("\nSVM ROC-AUC:", 'SVM_ROC_AUC'),
    ("\nBaseline ROC-AUC:", 'Baseline_ROC_AUC'),
)
for heading, column in roc_auc_sections:
    scores = df_folds[column]
    # idxmin()/idxmax() return index labels, so the fold number is looked up
    # with label-based .loc rather than positional .iloc.
    min_fold = int(df_folds.loc[scores.idxmin(), 'Fold'])
    max_fold = int(df_folds.loc[scores.idxmax(), 'Fold'])

    print(heading)
    print(f"  Mean: {scores.mean():.3f}")
    print(f"  Std:  {scores.std():.3f}")
    print(f"  Min:  {scores.min():.3f} (Fold {min_fold})")
    print(f"  Max:  {scores.max():.3f} (Fold {max_fold})")

print("\nHyperparameters Used:")
# Only two columns are needed, so walk them directly instead of using iterrows()
for fold_num, params in zip(df_folds['Fold'], df_folds['Best_Params']):
    print(f"  Fold {int(fold_num)}: {params}")