# Clinical vs. Control SVM Classification with Nested Cross-Validation

**Approach:**
- 5-fold outer CV: Each fold uses 80% train, 20% test
- **GridSearchCV for hyperparameter tuning**: Does its own 3-fold inner CV within the 80% training data
- Each subject tested exactly once across all folds
- Train on ALL data with class_weight='balanced' (no downsampling)
- Aggregate predictions from all 5 folds for final metrics

In [None]:
from core.config import initialize_notebook

# Build the notebook environment; regenerate_run_id=False is passed through as-is
# (presumably keeps the current run id - confirm in core.config).
env = initialize_notebook(regenerate_run_id=False)

# Pull the settings this notebook reports on out of the loaded configs.
run_settings = env.configs.run
research_question = run_settings['run_name']
seed = run_settings['seed']

svm_model_settings = env.configs.svm['model']
kernel = svm_model_settings['kernel']
class_weight = svm_model_settings['class_weight']

# Tuning is treated as disabled when the 'tuning' section or its flag is missing.
tuning_settings = env.configs.svm.get('tuning', {})
tuning_enabled = tuning_settings.get('enabled', False)
if tuning_enabled:
    tuning_label = 'ENABLED'
else:
    tuning_label = 'DISABLED'

print(f"Research Question: {research_question.upper()}")
print(f"Seed: {seed}")
print(f"SVM Kernel: {kernel}")
print(f"Class Weight: {class_weight}")
print(f"Hyperparameter Tuning: {tuning_label}")
print(f"Outer CV Folds: {env.configs.svm['cv']['n_outer_splits']}")

## Load Data

In [None]:
from core.svm.pipeline import load_full_dataset

# Nested CV consumes the whole dataset, so nothing is held out here.
full_df = load_full_dataset(env)

n_subjects = len(full_df)
print(f"Total samples for nested CV: {n_subjects:,} subjects")

# The research-group column name comes from the data config's column mapping.
column_mapping = env.configs.data['columns']['mapping']
group_col = column_mapping['research_group']
group_counts = full_df[group_col].value_counts()
print(f"\nGroup distribution:\n{group_counts}")

## Run Single Task with Nested CV



In [None]:
from core.svm.pipeline import run_task_with_nested_cv

# List every task declared in the SVM config together with its index.
tasks = env.configs.svm['tasks']
print("Available tasks:")
for position, candidate in enumerate(tasks):
    print(f"  {position}: {candidate['name']}")

# Select task to run (change index here): 0=any_vs_control, 1=clinical_vs_control, etc.
selected_index = 1
task_config = tasks[selected_index]
print(f"\nRunning task: {task_config['name']}")

# Run the selected task with nested CV; W&B logging and sweep mode are switched off.
results = run_task_with_nested_cv(
    env,
    full_df,
    task_config,
    use_wandb=False,
    sweep_mode=False,
)

## View Results

In [None]:
def _show_metric_block(heading, model_key, section):
    """Print a heading followed by every metric in results[model_key][section]."""
    print(heading)
    for name, score in results[model_key][section].items():
        print(f"  {name}: {score:.3f}")


divider = "=" * 60

# Metrics aggregated over the predictions of all outer folds.
print(divider)
print("OVERALL RESULTS (All 5 folds aggregated)")
print(divider)

_show_metric_block("\nBaseline (Logistic Regression):", 'baseline', 'overall')
_show_metric_block("\nSVM:", 'svm', 'overall')

print("\n" + divider)
print("PER-FOLD STATISTICS (Mean ± Std)")
print(divider)

_show_metric_block("\nSVM Per-Fold:", 'svm', 'per_fold')

svm_results = results['svm']
print(f"\nTotal samples tested: {svm_results['n_samples']}")
print(f"Number of folds: {svm_results['n_folds']}")

## View Confusion Matrices

In [None]:
from IPython.display import Image, display
from pathlib import Path

# Rebuild the path of the plots folder for the selected task from the run config.
run_cfg = env.configs.run
task_name = task_config['name']
seed_dir = env.repo_root / "outputs" / run_cfg['run_name'] / run_cfg['run_id'] / f"seed_{seed}"
plots_dir = seed_dir / "svm" / task_name / "plots"

print(f"Confusion matrices saved to: {plots_dir}")

# Display confusion matrices: baseline first, then the SVM.
for model_tag in ("baseline", "svm"):
    matrix_png = plots_dir / f"cm_{model_tag}_{task_name}.png"
    display(Image(str(matrix_png)))

## ROC and Precision-Recall Curves with Operating Point

Visualize where the tuned model sits on the ROC and precision-recall curves using the per-fold predictions.

In [None]:
import pickle
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_curve,
    precision_recall_curve,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
    matthews_corrcoef,
)
from sklearn.calibration import calibration_curve

# Locate the results written for this run / seed / task.
run_cfg = env.configs.run
task_name = task_config['name']
MODEL = "svm"
BASE_DIR = env.repo_root / "outputs" / run_cfg['run_name'] / run_cfg['run_id'] / f"seed_{seed}"

# NOTE(review): pickle.load can execute code embedded in the file - only open
# results.pkl files produced by this project.
results_file = BASE_DIR / MODEL / task_name / "results.pkl"
with open(results_file, "rb") as results_handle:
    saved_results = pickle.load(results_handle)

folds = saved_results[f"{MODEL}_folds"]


def _pool_folds(field):
    """Concatenate one per-fold array (looked up by key) across all folds."""
    return np.concatenate([fold_record[field] for fold_record in folds])


# Aggregate all fold predictions into single arrays.
y_true = _pool_folds("y_test")
scores = _pool_folds("y_score")

# Per-fold thresholds may be stored under either key; the first fold decides which.
threshold_key = next(
    (key for key in ("threshold", "best_threshold") if key in folds[0]),
    None,
)
if threshold_key is None:
    # No stored thresholds: fall back to 0.0.
    operating_thr = 0.0
else:
    thresholds = [fold_record[threshold_key] for fold_record in folds]
    # NOTE(review): a single mean threshold is applied to the pooled scores below,
    # which may differ from applying each fold's own threshold - confirm intent.
    operating_thr = np.mean(thresholds)

meets_threshold = scores >= operating_thr
y_pred = meets_threshold.astype(int)

# ============================================================
# COMPREHENSIVE METRICS
# ============================================================


def _ratio_or_zero(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


# Threshold-free metrics computed directly on the pooled scores.
roc_auc = roc_auc_score(y_true, scores)
pr_auc = average_precision_score(y_true, scores)

# Confusion matrix at the operating threshold, unpacked in ravel() order.
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()

actual_pos = tp + fn
actual_neg = tn + fp
flagged_pos = tp + fp
flagged_neg = tn + fn

# Rates derived from the four confusion-matrix counts.
sensitivity = _ratio_or_zero(tp, actual_pos)   # TPR, Recall
specificity = _ratio_or_zero(tn, actual_neg)   # TNR
precision = _ratio_or_zero(tp, flagged_pos)    # PPV
npv = _ratio_or_zero(tn, flagged_neg)          # Negative Predictive Value
fpr = _ratio_or_zero(fp, actual_neg)
fnr = _ratio_or_zero(fn, actual_pos)
fdr = _ratio_or_zero(fp, flagged_pos)          # False Discovery Rate

# Summary statistics built on top of the rates.
mcc = matthews_corrcoef(y_true, y_pred)
baseline_rate = np.sum(y_true) / len(y_true)
lift = _ratio_or_zero(precision, baseline_rate)
if precision > 0:
    nns = 1 / precision                        # Number Needed to Screen
else:
    nns = np.inf

# Verbal bucket shown next to the MCC value.
if mcc < 0.3:
    mcc_label = '[Weak]'
elif mcc < 0.5:
    mcc_label = '[Moderate]'
else:
    mcc_label = '[Strong]'

banner = "=" * 70
print(banner)
print(f"COMPREHENSIVE METRICS - {MODEL.upper()} - {task_name}")
print(banner)
print(f"\nOperating Threshold: {operating_thr:.3f}")
print(f"\n--- Discrimination Metrics ---")
print(f"ROC-AUC:              {roc_auc:.3f}")
print(f"PR-AUC:               {pr_auc:.3f}")
print(f"Matthews Corr Coef:   {mcc:.3f}  {mcc_label}")

print(f"\n--- Classification Performance ---")
print(f"Sensitivity (Recall): {sensitivity:.3f}  (catching {100*sensitivity:.1f}% of positives)")
print(f"Specificity:          {specificity:.3f}  (correctly rejecting {100*specificity:.1f}% of negatives)")
print(f"Precision (PPV):      {precision:.3f}  ({100*precision:.1f}% of predictions are correct)")
print(f"NPV:                  {npv:.3f}  ({100*npv:.1f}% of negative predictions are correct)")

print(f"\n--- Error Rates ---")
print(f"False Positive Rate:  {fpr:.3f}  ({100*fpr:.1f}% of controls misclassified)")
print(f"False Negative Rate:  {fnr:.3f}  (missing {100*fnr:.1f}% of positives)")
print(f"False Discovery Rate: {fdr:.3f}  ({100*fdr:.1f}% of predictions are false alarms)")

print(f"\n--- Utility ---")
print(f"Lift over Baseline:   {lift:.2f}x  (model is {lift:.1f}x better than random)")
print(f"Number Needed Screen: {nns:.1f}  (examine {nns:.0f} flagged cases to find 1 true positive)")

print(f"\n--- Prediction Distribution ---")
print(f"Baseline Positive Rate:  {100*baseline_rate:.1f}%  ({int(np.sum(y_true))}/{len(y_true)} actual positives)")
print(f"Predicted Positive Rate: {100*np.sum(y_pred)/len(y_pred):.1f}%  ({np.sum(y_pred)}/{len(y_pred)} predicted)")

print(f"\n--- Confusion Matrix ---")
print(f"                Predicted")
print(f"                Neg    Pos")
print(f"Actual  Neg    {tn:4d}   {fp:4d}")
print(f"        Pos    {fn:4d}   {tp:4d}")

# ============================================================
# TOP-K PRECISION CURVE
# ============================================================

# Rank samples from highest to lowest score.
sorted_idx = np.argsort(scores)[::-1]
sorted_y_true = y_true[sorted_idx]

# Precision among the K highest-scoring samples, for K = 10, 20, ... (below 500
# and below the number of samples).
k_values = list(range(10, min(500, len(scores)), 10))
precisions_at_k = [np.sum(sorted_y_true[:k]) / k for k in k_values]

# ============================================================
# CALIBRATION
# ============================================================

# The scores are squashed through a sigmoid so they lie in [0, 1] for the
# calibration plot; this only approximates probabilities.
from scipy.special import expit
probs = expit(scores)
prob_true, prob_pred = calibration_curve(
    y_true,
    probs,
    n_bins=10,
    strategy='quantile',
)

# ============================================================
# VISUALIZATION
# ============================================================


def _label_axes(ax, xlabel, ylabel, title, legend_loc=None):
    """Apply axis labels and title; add legend and grid when legend_loc is given."""
    ax.set_xlabel(xlabel, fontsize=11)
    ax.set_ylabel(ylabel, fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    if legend_loc is not None:
        ax.legend(loc=legend_loc, fontsize=9)
        ax.grid(alpha=0.3)


fig = plt.figure(figsize=(18, 10))
gs = fig.add_gridspec(2, 3, hspace=0.3, wspace=0.3)

model_tag = MODEL.upper()
operating_label = f"Operating @ {operating_thr:.2f}"

# 1. ROC curve with the operating point (FPR, sensitivity) marked in red
ax1 = fig.add_subplot(gs[0, 0])
fpr_curve, tpr_curve, _ = roc_curve(y_true, scores)
ax1.plot(fpr_curve, tpr_curve, label=f"ROC (AUC={roc_auc:.3f})", linewidth=2)
ax1.scatter(fpr, sensitivity, color="red", s=100, zorder=5, label=operating_label)
ax1.plot([0, 1], [0, 1], "k--", alpha=0.3, label="Random")
_label_axes(
    ax1,
    "False Positive Rate",
    "True Positive Rate (Recall)",
    f"ROC Curve - {model_tag}",
    legend_loc="lower right",
)

# 2. Precision-recall curve with the operating point and the positive base rate
ax2 = fig.add_subplot(gs[0, 1])
prec_curve, rec_curve, _ = precision_recall_curve(y_true, scores)
ax2.plot(rec_curve, prec_curve, label=f"PR (AUC={pr_auc:.3f})", linewidth=2)
ax2.scatter(sensitivity, precision, color="red", s=100, zorder=5, label=operating_label)
ax2.axhline(
    baseline_rate,
    color='k',
    linestyle='--',
    alpha=0.3,
    label=f"Baseline ({baseline_rate:.3f})",
)
ax2.set_ylim([0, 1])
_label_axes(
    ax2,
    "Recall (Sensitivity)",
    "Precision (PPV)",
    f"Precision-Recall Curve - {model_tag}",
    legend_loc="best",
)

# 3. Precision among the top-K highest-scoring samples
ax3 = fig.add_subplot(gs[0, 2])
ax3.plot(k_values, precisions_at_k, linewidth=2, color='#2ca02c')
ax3.axhline(baseline_rate, color='k', linestyle='--', alpha=0.3, label='Baseline')
ax3.axhline(
    precision,
    color='red',
    linestyle=':',
    alpha=0.5,
    label=f'Operating precision ({precision:.3f})',
)
_label_axes(
    ax3,
    "Top K Predictions",
    "Precision in Top K",
    "Top-K Precision Curve",
    legend_loc="best",
)

# 4. Calibration of the sigmoid-mapped scores against the diagonal
ax4 = fig.add_subplot(gs[1, 0])
ax4.plot(prob_pred, prob_true, marker='o', linewidth=2, label='Model')
ax4.plot([0, 1], [0, 1], "k--", alpha=0.3, label="Perfect calibration")
_label_axes(
    ax4,
    "Mean Predicted Probability",
    "Fraction of Positives",
    "Calibration Curve",
    legend_loc="best",
)
ax4.set_xlim([0, 1])
ax4.set_ylim([0, 1])

# 5. Score histograms per true class, with the operating threshold marked
ax5 = fig.add_subplot(gs[1, 1])
for class_value, class_name, class_color in ((0, 'Negative', 'blue'), (1, 'Positive', 'red')):
    ax5.hist(
        scores[y_true == class_value],
        bins=50,
        alpha=0.5,
        label=class_name,
        color=class_color,
        density=True,
    )
ax5.axvline(
    operating_thr,
    color='green',
    linestyle='--',
    linewidth=2,
    label=f'Threshold ({operating_thr:.2f})',
)
_label_axes(
    ax5,
    "Decision Score",
    "Density",
    "Score Distribution by Class",
    legend_loc="best",
)

# 6. Confusion matrix heatmap (no legend or grid on this panel)
ax6 = fig.add_subplot(gs[1, 2])
im = ax6.imshow(cm, cmap='Blues', aspect='auto')
class_ticks = ['Negative', 'Positive']
ax6.set_xticks([0, 1])
ax6.set_yticks([0, 1])
ax6.set_xticklabels(class_ticks)
ax6.set_yticklabels(class_ticks)
_label_axes(ax6, 'Predicted', 'Actual', 'Confusion Matrix')

# Annotate every cell with its count and its share of all samples.
total_count = np.sum(cm)
for (row, col), cell_count in np.ndenumerate(cm):
    ax6.text(
        col,
        row,
        f'{cell_count}\n({100*cell_count/total_count:.1f}%)',
        ha="center",
        va="center",
        color="black",
        fontsize=10,
    )

plt.colorbar(im, ax=ax6, fraction=0.046, pad=0.04)

plt.suptitle(
    f'{model_tag} - {task_name} - Comprehensive Metrics Dashboard',
    fontsize=14,
    fontweight='bold',
    y=0.995,
)

# Keep a handle on the figure so the save cell can write it to disk later.
comprehensive_dashboard_fig = fig

plt.show()

print("\n✓ Comprehensive metrics dashboard generated")

## Threshold Analysis

Explore how performance metrics change across different threshold values.

In [None]:
# ============================================================
# THRESHOLD ANALYSIS TABLE
# ============================================================

print("\n" + "="*90)
print("THRESHOLD SENSITIVITY ANALYSIS")
print("="*90)

# Grid used when no config supplies one.
_DEFAULT_THRESHOLDS = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.40, 0.50]


def _threshold_grid_from_config(configs):
    """Return the threshold grid to tabulate.

    This is the SVM notebook, so the SVM config is consulted first. The
    random-forest config (the only source used previously) is kept as a
    fallback, and a missing config section no longer raises.
    """
    for model_key in ("svm", "randomforest"):
        model_cfg = getattr(configs, model_key, None)
        if model_cfg is None:
            continue
        search_cfg = model_cfg.get('evaluation', {}).get('threshold_search', {})
        grid = search_cfg.get('thresholds')
        if grid is not None:
            # list() so that "+ [operating_thr]" below works for any sequence type.
            return list(grid)
    return list(_DEFAULT_THRESHOLDS)


config_thresholds = _threshold_grid_from_config(env.configs)

# NOTE(review): the default grid lies in [0.05, 0.50]; if `scores` are raw SVM
# decision values rather than probabilities, configure an SVM-specific grid.

# Add the actual operating threshold to the list (the set removes duplicates)
test_thresholds = sorted(set(config_thresholds + [operating_thr]))

print(f"\n{'Threshold':^10} | {'Sens':^6} | {'Spec':^6} | {'PPV':^6} | {'NPV':^6} | {'F2':^6} | {'MCC':^6} | {'# Flag':^7} | {'NNS':^6}")
print("-" * 90)

for thr in test_thresholds:
    y_pred_thr = (scores >= thr).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 even if a class is absent at this threshold.
    cm_thr = confusion_matrix(y_true, y_pred_thr, labels=[0, 1])
    tn_t, fp_t, fn_t, tp_t = cm_thr.ravel()

    sens_t = tp_t / (tp_t + fn_t) if (tp_t + fn_t) > 0 else 0
    spec_t = tn_t / (tn_t + fp_t) if (tn_t + fp_t) > 0 else 0
    ppv_t = tp_t / (tp_t + fp_t) if (tp_t + fp_t) > 0 else 0
    npv_t = tn_t / (tn_t + fn_t) if (tn_t + fn_t) > 0 else 0

    # F2 score (recall weighted 2x more than precision)
    if ppv_t > 0 and sens_t > 0:
        f2_t = 5 * ppv_t * sens_t / (4 * ppv_t + sens_t)
    else:
        f2_t = 0

    mcc_t = matthews_corrcoef(y_true, y_pred_thr)
    nns_t = 1 / ppv_t if ppv_t > 0 else np.inf
    n_flagged = y_pred_thr.sum()

    # Highlight rows within 0.001 of the operating threshold (a tolerance, so a
    # configured threshold that close to it is marked as well).
    marker = " ← CURRENT" if abs(thr - operating_thr) < 0.001 else ""

    print(f"{thr:^10.3f} | {sens_t:^6.3f} | {spec_t:^6.3f} | {ppv_t:^6.3f} | {npv_t:^6.3f} | {f2_t:^6.3f} | {mcc_t:^6.3f} | {n_flagged:^7d} | {nns_t:^6.1f}{marker}")

print(f"\nNote: Operating threshold {operating_thr:.3f} is the mean of per-fold thresholds")
print("✓ Threshold analysis complete")

In [None]:
# ============================================================
# PRECISION-RECALL VS THRESHOLD VISUALIZATION
# ============================================================

# Sweep range: always covers [0, 1] (the previous fixed range) and widens to the
# observed scores, so decision-function scores outside [0, 1] - and an operating
# threshold outside it - are still swept and visible on the x-axis.
sweep_lo = min(0.0, float(np.min(scores)))
sweep_hi = max(1.0, float(np.max(scores)))
threshold_range = np.linspace(sweep_lo, sweep_hi, 100)

precisions_list = []
recalls_list = []
f1_scores = []
f2_scores = []
specificities = []

for thr in threshold_range:
    y_pred_t = (scores >= thr).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 even if a class is absent at this threshold.
    cm_t = confusion_matrix(y_true, y_pred_t, labels=[0, 1])
    tn_t, fp_t, fn_t, tp_t = cm_t.ravel()

    recall_t = tp_t / (tp_t + fn_t) if (tp_t + fn_t) > 0 else 0
    prec_t = tp_t / (tp_t + fp_t) if (tp_t + fp_t) > 0 else 0
    spec_t = tn_t / (tn_t + fp_t) if (tn_t + fp_t) > 0 else 0

    # F-scores are defined as 0 when either precision or recall is 0.
    if prec_t > 0 and recall_t > 0:
        f1_t = 2 * prec_t * recall_t / (prec_t + recall_t)
        f2_t = 5 * prec_t * recall_t / (4 * prec_t + recall_t)
    else:
        f1_t = 0
        f2_t = 0

    precisions_list.append(prec_t)
    recalls_list.append(recall_t)
    f1_scores.append(f1_t)
    f2_scores.append(f2_t)
    specificities.append(spec_t)

# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left plot: Precision, Recall, Specificity vs Threshold
ax1.plot(threshold_range, recalls_list, label='Recall (Sensitivity)', linewidth=2)
ax1.plot(threshold_range, precisions_list, label='Precision (PPV)', linewidth=2)
ax1.plot(threshold_range, specificities, label='Specificity', linewidth=2, linestyle='--')
ax1.axvline(operating_thr, color='red', linestyle=':', linewidth=2, label=f'Operating ({operating_thr:.2f})')
ax1.set_xlabel('Threshold', fontsize=12)
ax1.set_ylabel('Score', fontsize=12)
ax1.set_title('Precision, Recall, Specificity vs Threshold', fontsize=13, fontweight='bold')
ax1.legend(loc='best')
ax1.grid(alpha=0.3)
ax1.set_xlim([sweep_lo, sweep_hi])
ax1.set_ylim([0, 1])

# Right plot: F1 and F2 scores vs Threshold
ax2.plot(threshold_range, f1_scores, label='F1 Score', linewidth=2, color='green')
ax2.plot(threshold_range, f2_scores, label='F2 Score (prioritizes recall)', linewidth=2, color='purple')
ax2.axvline(operating_thr, color='red', linestyle=':', linewidth=2, label=f'Operating ({operating_thr:.2f})')

# Mark best F2 threshold (first maximum on the sweep grid)
best_f2_idx = np.argmax(f2_scores)
best_f2_thr = threshold_range[best_f2_idx]
best_f2_val = f2_scores[best_f2_idx]
ax2.scatter(best_f2_thr, best_f2_val, color='purple', s=150, zorder=5, marker='*',
           label=f'Best F2 @ {best_f2_thr:.2f}')

ax2.set_xlabel('Threshold', fontsize=12)
ax2.set_ylabel('Score', fontsize=12)
ax2.set_title('F-Scores vs Threshold', fontsize=13, fontweight='bold')
ax2.legend(loc='best')
ax2.grid(alpha=0.3)
ax2.set_xlim([sweep_lo, sweep_hi])
ax2.set_ylim([0, 1])

plt.tight_layout()

# Store figure reference for saving later
threshold_analysis_fig = fig

plt.show()

print(f"\n✓ Best F2 threshold: {best_f2_thr:.3f} (F2={best_f2_val:.3f})")
print(f"  Current threshold: {operating_thr:.3f}")
print(f"  Difference: {abs(best_f2_thr - operating_thr):.3f}")
print("\n✓ Threshold analysis plots complete")

## High Confidence Predictions

Examine the most confident predictions and how reliability varies by probability bin.

In [None]:
# ============================================================
# HIGH CONFIDENCE PREDICTIONS
# ============================================================

section_rule = "=" * 70
print("\n" + section_rule)
print("HIGH CONFIDENCE PREDICTIONS")
print(section_rule)

# Order every array from the highest score to the lowest.
sorted_idx = np.argsort(scores)[::-1]
sorted_scores = scores[sorted_idx]
sorted_y_true = y_true[sorted_idx]
sorted_y_pred = y_pred[sorted_idx]

# Table of the n_top highest-scoring samples.
n_top = 20
print(f"\n--- Top {n_top} Most Confident Predictions ---")
print(f"{'Rank':^6} | {'Prob':^6} | {'Pred':^6} | {'True':^6} | {'Correct':^8}")
print("-" * 45)

top_rows = zip(sorted_scores[:n_top], sorted_y_pred[:n_top], sorted_y_true[:n_top])
for rank, (row_score, row_pred, row_true) in enumerate(top_rows, start=1):
    pred_text = "POS" if row_pred == 1 else "NEG"
    true_text = "POS" if row_true == 1 else "NEG"
    verdict = "✓" if row_pred == row_true else "✗"

    print(f"{rank:^6d} | {row_score:^6.3f} | {pred_text:^6s} | {true_text:^6s} | {verdict:^8s}")

# Break the samples down by half-open score intervals.
# NOTE(review): the bins span [0, 1); scores outside that range fall in no bin -
# confirm the scores are probabilities rather than raw decision values.
print(f"\n--- Predictions by Confidence Bin ---")
confidence_bins = [(0.0, 0.05), (0.05, 0.10), (0.10, 0.15), (0.15, 0.20), (0.20, 0.50), (0.50, 1.0)]

print(f"{'Prob Range':^15} | {'Count':^7} | {'% True Pos':^11} | {'Actual PPV':^11}")
print("-" * 60)

predicted_positive = y_pred == 1
for bin_low, bin_high in confidence_bins:
    in_bin = (scores >= bin_low) & (scores < bin_high)
    bin_count = in_bin.sum()
    if not bin_count > 0:
        # Empty bins are left out of the table.
        continue

    share_actual_pos = 100 * y_true[in_bin].sum() / bin_count

    # PPV in this bin (among those predicted positive); 0.0 when none are flagged
    flagged_in_bin = in_bin & predicted_positive
    n_flagged_in_bin = flagged_in_bin.sum()
    if n_flagged_in_bin > 0:
        bin_ppv = y_true[flagged_in_bin].sum() / n_flagged_in_bin
    else:
        bin_ppv = 0.0

    print(f"[{bin_low:.2f}, {bin_high:.2f})   | {bin_count:^7d} | {share_actual_pos:^11.1f} | {bin_ppv:^11.3f}")

# Summary for predicted positives whose score is at least 0.15.
high_conf_mask = (scores >= 0.15) & predicted_positive
n_high_conf = high_conf_mask.sum()
if n_high_conf > 0:
    caught = y_true[high_conf_mask].sum()
    all_positives = y_true.sum()
    high_conf_ppv = caught / n_high_conf
    print(f"\n✓ High confidence (≥0.15) predictions: {n_high_conf}")
    print(f"  Precision among high-confidence: {high_conf_ppv:.3f}")
    print(f"  Catching {caught} of {all_positives} true positives ({100*caught/all_positives:.1f}%)")

print("\n✓ High confidence analysis complete")

## Error Analysis

In [None]:
# ============================================================
# ERROR ANALYSIS
# ============================================================


def _print_score_summary(values):
    """Print the range, mean and median of an array of scores."""
    print(f"  Score range: [{values.min():.3f}, {values.max():.3f}]")
    print(f"  Mean score: {values.mean():.3f}")
    print(f"  Median score: {np.median(values):.3f}")


section_rule = "=" * 70
print("\n" + section_rule)
print("ERROR ANALYSIS")
print(section_rule)

# Assign every sample to one of the four prediction outcomes.
flagged = y_pred == 1
cleared = y_pred == 0
is_positive = y_true == 1
is_negative = y_true == 0

fp_mask = flagged & is_negative     # predicted positive, actually negative
fn_mask = cleared & is_positive     # predicted negative, actually positive
tp_mask = flagged & is_positive
tn_mask = cleared & is_negative

fp_scores = scores[fp_mask]
fn_scores = scores[fn_mask]
tp_scores = scores[tp_mask]
tn_scores = scores[tn_mask]

print(f"\n--- Error Distribution ---")
n_false_pos = fp_mask.sum()
print(f"False Positives: {n_false_pos} (most confident wrong alarms)")
if n_false_pos > 0:
    _print_score_summary(fp_scores)

    # Positions of the false positives, highest score first
    fp_positions = np.nonzero(fp_mask)[0]
    fp_ranked = fp_positions[np.argsort(fp_scores)[::-1]]

    print(f"\n  Top 10 Most Confident False Positives:")
    print(f"  {'Index':^8} | {'Score':^8}")
    print("  " + "-" * 20)
    for sample_idx in fp_ranked[:10]:
        print(f"  {sample_idx:^8d} | {scores[sample_idx]:^8.3f}")

n_false_neg = fn_mask.sum()
print(f"\nFalse Negatives: {n_false_neg} (missed cases)")
if n_false_neg > 0:
    _print_score_summary(fn_scores)

    # Highest-scoring false negatives are the ones nearest the threshold
    fn_positions = np.nonzero(fn_mask)[0]
    fn_ranked = fn_positions[np.argsort(fn_scores)[::-1]]

    print(f"\n  Top 10 False Negatives (closest to threshold):")
    print(f"  {'Index':^8} | {'Score':^8} | {'Gap to Thr':^12}")
    print("  " + "-" * 32)
    for sample_idx in fn_ranked[:10]:
        shortfall = operating_thr - scores[sample_idx]
        print(f"  {sample_idx:^8d} | {scores[sample_idx]:^8.3f} | {shortfall:^12.3f}")

print(f"\n--- Success Distribution ---")
n_true_pos = tp_mask.sum()
print(f"True Positives: {n_true_pos}")
if n_true_pos > 0:
    print(f"  Mean score: {tp_scores.mean():.3f} (avg confidence when correct)")
    print(f"  Min score: {tp_scores.min():.3f} (barely caught)")
    print(f"  Max score: {tp_scores.max():.3f} (highest confidence)")

n_true_neg = tn_mask.sum()
print(f"\nTrue Negatives: {n_true_neg}")
if n_true_neg > 0:
    print(f"  Mean score: {tn_scores.mean():.3f}")
    print(f"  Max score: {tn_scores.max():.3f} (closest call to threshold)")

# Compare the score distributions of the two true classes.
print(f"\n--- Class Separation ---")
pos_scores = scores[is_positive]
neg_scores = scores[is_negative]
pos_mean = pos_scores.mean()
neg_mean = neg_scores.mean()
print(f"Positive class mean: {pos_mean:.3f} ± {pos_scores.std():.3f}")
print(f"Negative class mean: {neg_mean:.3f} ± {neg_scores.std():.3f}")
print(f"Mean difference: {pos_mean - neg_mean:.3f}")

print("\n✓ Error analysis complete")

## Save Comprehensive Run Results

Save all results, configs, and figures for reproducibility.

In [None]:
import pickle
import json
import shutil
from pathlib import Path
from datetime import datetime

# Model and task whose outputs are being archived
MODEL_NAME = "svm"
TASK_NAME = task_config['name']

# Folder that already holds this task's pipeline outputs
task_root = (env.repo_root / "outputs" / run_cfg['run_name'] / run_cfg['run_id'] /
             f"seed_{seed}" / MODEL_NAME / TASK_NAME)

# Each save goes to its own timestamped sub-folder of the task folder.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
save_dir = task_root / f"analysis_{timestamp}"
save_dir.mkdir(parents=True, exist_ok=True)

section_rule = "=" * 70
print(section_rule)
print("SAVING COMPREHENSIVE RUN RESULTS")
print(section_rule)
print(f"\nSave directory: {save_dir}")

# 1. Copy results pickle (skipped silently when the pipeline has not written one)
results_path = task_root / "results.pkl"
if results_path.exists():
    shutil.copy(results_path, save_dir / "results.pkl")
    print(f"✓ Copied results.pkl")

# 2. Save configurations
configs_dir = save_dir / "configs"
configs_dir.mkdir(exist_ok=True)

import yaml

# Each config section is fetched only after its target file has been opened.
yaml_sources = (
    (f"{MODEL_NAME}.yaml", lambda: getattr(env.configs, MODEL_NAME)),
    ("run.yaml", lambda: env.configs.run),
)
for yaml_name, fetch_section in yaml_sources:
    with open(configs_dir / yaml_name, "w") as config_handle:
        yaml.dump(dict(fetch_section()), config_handle, default_flow_style=False)

with open(configs_dir / "task.json", "w") as task_handle:
    json.dump(task_config, task_handle, indent=2)
print(f"✓ Saved config files")

# 3. Save all matplotlib figures
figures_dir = save_dir / "figures"
figures_dir.mkdir(exist_ok=True)

saved_figs = []
named_figures = []  # figure objects already written under a descriptive name


def _save_named_figure(var_name, file_name, source_cell):
    """Save the notebook-level figure bound to var_name, or warn if it is missing.

    var_name is looked up in the notebook namespace; source_cell is only used in
    the warning text that tells the user which cell creates the figure.
    """
    figure = globals().get(var_name)
    if figure is None:
        print(f"  ⚠ Warning: {var_name} not found - run cell {source_cell} first")
        return
    figure.savefig(figures_dir / file_name, dpi=300, bbox_inches='tight')
    saved_figs.append(file_name)
    named_figures.append(figure)


# Save the comprehensive dashboard figure (from cell 11)
_save_named_figure("comprehensive_dashboard_fig", "comprehensive_metrics_dashboard.png", 11)

# Save the threshold analysis figure (from cell 14)
_save_named_figure("threshold_analysis_fig", "threshold_analysis.png", 14)

# Also save any OTHER active figures. The two dashboards saved above are skipped
# so they are not written a second time as figure_NN.png when still open.
for fig_num in plt.get_fignums():
    open_fig = plt.figure(fig_num)
    if any(open_fig is done for done in named_figures):
        continue
    generic_name = f"figure_{fig_num:02d}.png"
    open_fig.savefig(figures_dir / generic_name, dpi=300, bbox_inches='tight')
    saved_figs.append(generic_name)

# Copy confusion matrices from pipeline output
plots_dir = (env.repo_root / "outputs" / run_cfg['run_name'] / run_cfg['run_id'] /
             f"seed_{seed}" / MODEL_NAME / TASK_NAME / "plots")
if plots_dir.exists():
    for plot_file in plots_dir.glob("*.png"):
        shutil.copy(plot_file, figures_dir / plot_file.name)
        saved_figs.append(plot_file.name)

if saved_figs:
    print(f"✓ Saved {len(saved_figs)} figures:")
    for fig_name in saved_figs:
        print(f"  - {fig_name}")
else:
    print("⚠ No figures saved - make sure to run the analysis cells first")

def _json_ready(metrics):
    """Copy a metrics mapping, turning int/float/NumPy numeric values into floats."""
    return {
        name: float(value) if isinstance(value, (int, float, np.number)) else value
        for name, value in metrics.items()
    }


# 4. Save metrics summary (pipeline metrics for the baseline and the model)
metrics_summary = {
    "timestamp": timestamp,
    "model": MODEL_NAME,
    "task": TASK_NAME,
    "run_id": run_cfg['run_id'],
    "seed": seed,
    "overall_metrics": {
        "baseline": _json_ready(results['baseline']['overall']),
        MODEL_NAME: _json_ready(results[MODEL_NAME]['overall']),
    },
    "per_fold_stats": _json_ready(results[MODEL_NAME]['per_fold']),
}

with open(save_dir / "metrics_summary.json", "w") as summary_handle:
    json.dump(metrics_summary, summary_handle, indent=2)

# 5. Save comprehensive metrics - only when the dashboard cell has defined them
if 'roc_auc' in locals():
    discrimination = {
        "roc_auc": float(roc_auc),
        "pr_auc": float(pr_auc),
        "mcc": float(mcc),
    }
    performance = {
        "sensitivity": float(sensitivity),
        "specificity": float(specificity),
        "precision": float(precision),
        "npv": float(npv),
    }
    error_rates = {
        "fpr": float(fpr),
        "fnr": float(fnr),
        "fdr": float(fdr),
    }
    comprehensive = {
        "discrimination": discrimination,
        "performance": performance,
        "errors": error_rates,
        "confusion_matrix": {"tn": int(tn), "fp": int(fp), "fn": int(fn), "tp": int(tp)},
        "threshold": float(operating_thr),
    }
    with open(save_dir / "comprehensive_metrics.json", "w") as detail_handle:
        json.dump(comprehensive, detail_handle, indent=2)
    print(f"✓ Saved comprehensive metrics")

# 6. Create README describing what was saved and how to reproduce the run
readme = f"""# {MODEL_NAME.upper()} Results: {TASK_NAME}

**Timestamp**: {timestamp}
**Run ID**: {run_cfg['run_id']}
**Seed**: {seed}

## Performance
- ROC-AUC: {results[MODEL_NAME]['overall']['roc_auc']:.3f}
- Balanced Accuracy: {results[MODEL_NAME]['overall']['balanced_accuracy']:.3f}

## Files
- `results.pkl`: Complete results
- `metrics_summary.json`: Key metrics
- `comprehensive_metrics.json`: Detailed metrics
- `configs/`: Configuration files
- `figures/`: All visualizations

## Reproducibility
Use configs in `configs/` with seed {seed}
"""

# Explicit encoding so a non-ASCII task name is written the same on every platform.
with open(save_dir / "README.md", "w", encoding="utf-8") as f:
    f.write(readme)

# Count regular files only: rglob('*') also yields the configs/ and figures/
# directories, which previously inflated the reported total.
n_saved_files = sum(1 for entry in save_dir.rglob('*') if entry.is_file())

print(f"\n✓ All results saved to: {save_dir.name}")
print(f"✓ Total files: {n_saved_files}")
print("="*70)

## Run All Tasks (Optional)

Once the single-task run works, you can run all tasks at once.

In [None]:
from core.svm.pipeline import run_svm_pipeline

# Run complete pipeline for all tasks
# Called with use_wandb=False and sweep_mode=False, matching the single-task call above.
# NOTE(review): presumably iterates over every task in the SVM config and may take
# much longer than the single-task run - confirm in core.svm.pipeline.
all_results = run_svm_pipeline(env, use_wandb=False, sweep_mode=False)