# SVM Classification with Nested Cross-Validation

Rigorous evaluation using:
- Nested preprocessing (Harmonize→Scale→PCA per fold)
- 5-fold stratified CV on 90% dev set
- Held-out 10% test set for final evaluation
- Baseline (Logistic Regression) comparison
- Feature importance → brain region mapping

In [None]:
from core.config import initialize_notebook
import pandas as pd
import pickle
from pathlib import Path

# Build the notebook environment object that every later cell reads from.
# NOTE(review): regenerate_run_id=False presumably reuses the current run id
# instead of minting a new one - confirm in core.config.
env = initialize_notebook(regenerate_run_id=False)

# Values from the run-level config.
run_settings = env.configs.run
research_question = run_settings['run_name']
seed = run_settings['seed']

# Values from the SVM config.
svm_settings = env.configs.svm
kernel = svm_settings['model']['kernel']

# Echo the key settings so the notebook output records what was run.
print(f"Research Question: {research_question.upper()}")
print(f"Seed: {seed}")
print(f"SVM Kernel: {kernel}")
print(f"CV Folds: {svm_settings['cv']['n_splits']}")

## Load Data

Combine train + val → 90% development set for CV.
Keep test set (10%) completely held out.

In [None]:
from core.svm.pipeline import load_development_data

# The loader returns the development frame together with the directory it
# was read from; the held-out test split is read from that same directory.
dev_df, data_dir = load_development_data(env)
test_path = data_dir / "test.parquet"
test_df = pd.read_parquet(test_path)

# Report the size of each split (row count, thousands-separated).
for split_label, split_df in (("Development set", dev_df), ("Test set", test_df)):
    print(f"{split_label}: {len(split_df):,} subjects")

# Column holding the research-group label, as named in the data config.
group_col = env.configs.data['columns']['mapping']['research_group']
group_counts = dev_df[group_col].value_counts()
print(f"\nDev group distribution:\n{group_counts}")

## Select Classification Task

Test with a single task first before running all tasks.

In [None]:
# List every classification task defined in the SVM config, with its index.
tasks = env.configs.svm['tasks']
print("Available tasks:")
for i, task in enumerate(tasks):
    print(f"  {i}: {task['name']}")

# Pick ONE task to exercise the pipeline with. The index refers to the list
# printed above and is zero-based, so 1 selects the SECOND configured task.
# NOTE(review): an earlier comment here said "first task"; if the first task
# was intended, set task_index = 0.
task_index = 1
if task_index >= len(tasks):
    # Same exception type a bare tasks[task_index] would raise, but with a
    # message that says what to fix.
    raise IndexError(
        f"task_index={task_index} is out of range: "
        f"only {len(tasks)} task(s) are configured"
    )
task_config = tasks[task_index]
print(f"\nTesting with: {task_config['name']}")

## Baseline: Logistic Regression with Nested CV

In [None]:
from core.svm.pipeline import filter_task_data, run_nested_cv
from core.svm.models import create_baseline

# Restrict both splits to the subjects relevant to the selected task; each
# call returns the filtered frame together with its label vector.
dev_filtered, y_dev = filter_task_data(dev_df, task_config, group_col)
test_filtered, y_test = filter_task_data(test_df, task_config, group_col)

print(f"Task: {task_config['name']}")
print(f"Dev: {len(y_dev)} | Test: {len(y_test)}")
# NOTE(review): the mean equals the positive-class share only if labels are
# encoded 0/1 - confirm in filter_task_data.
print(f"Positive class ratio: {y_dev.mean():.2%}")

# Cross-validate the baseline model on the development split only.
baseline = create_baseline(env.configs.svm, seed)
print("\nRunning baseline with nested CV")
baseline_cv = run_nested_cv(dev_filtered, y_dev, baseline, env, seed, use_wandb=False)

# Summarise each metric as "mean ± std" from the aggregated CV results.
print("\nBaseline CV Results:")
baseline_agg = baseline_cv['aggregated']
for metric_label, metric_key in (
    ("Accuracy", "accuracy"),
    ("Balanced Accuracy", "balanced_accuracy"),
    ("F1", "f1"),
    ("ROC-AUC", "roc_auc"),
):
    mean_value = baseline_agg[f"{metric_key}_mean"]
    std_value = baseline_agg[f"{metric_key}_std"]
    print(f"  {metric_label}: {mean_value:.3f} ± {std_value:.3f}")

## SVM with Nested CV

In [None]:
from core.svm.models import create_svm

# Cross-validate the SVM on the same filtered development data and seed
# that the baseline cell used.
svm = create_svm(env.configs.svm, seed)
print(f"Running {kernel} SVM with nested CV...")
svm_cv = run_nested_cv(dev_filtered, y_dev, svm, env, seed, use_wandb=False)

# Summarise each metric as "mean ± std" from the aggregated CV results.
print("\nSVM CV Results:")
svm_scores = svm_cv['aggregated']
for metric_label, metric_key in (
    ("Accuracy", "accuracy"),
    ("Balanced Accuracy", "balanced_accuracy"),
    ("F1", "f1"),
    ("ROC-AUC", "roc_auc"),
):
    mean_value = svm_scores[f"{metric_key}_mean"]
    std_value = svm_scores[f"{metric_key}_std"]
    print(f"  {metric_label}: {mean_value:.3f} ± {std_value:.3f}")

# Difference in mean CV score, computed as SVM minus baseline, so a positive
# number means the SVM scored higher.
print("\nBaseline vs SVM (CV):")
baseline_scores = baseline_cv['aggregated']
for metric_label, metric_key in (
    ("Accuracy", "accuracy"),
    ("Balanced Accuracy", "balanced_accuracy"),
    ("ROC-AUC", "roc_auc"),
):
    delta = svm_scores[f"{metric_key}_mean"] - baseline_scores[f"{metric_key}_mean"]
    print(f"  {metric_label}: {delta:.3f}")

## Final Models on Test Set

Train on the full dev set, then evaluate once on the held-out test set.

In [None]:
from core.svm.pipeline import run_final_model

# Output directory for this task:
#   <repo_root>/outputs/<run_name>/<run_id>/seed_<seed>/svm/<task_name>
run_cfg = env.configs.run
task_name = task_config['name']
svm_dir = (
    env.repo_root
    / "outputs"
    / run_cfg['run_name']
    / run_cfg['run_id']
    / f"seed_{seed}"
    / "svm"
    / task_name
)
svm_dir.mkdir(parents=True, exist_ok=True)

# Fit each model via run_final_model, passing the dev and test splits, a
# model-specific name, and the output directory.
print("Training final baseline on full dev set...")
baseline_final = run_final_model(
    dev_filtered, test_filtered, y_dev, y_test,
    baseline, env, seed, f"baseline_{task_name}", svm_dir,
)

print("Training final SVM on full dev set...")
svm_final = run_final_model(
    dev_filtered, test_filtered, y_dev, y_test,
    svm, env, seed, f"svm_{task_name}", svm_dir,
)

banner = "=" * 60
print("\n" + banner)
print("TEST SET RESULTS (FINAL)")
print(banner)

# Print every test metric for the baseline first, then for the SVM.
for heading, final_result in (("\nBaseline:", baseline_final), ("\nSVM:", svm_final)):
    print(heading)
    for metric, value in final_result['test_metrics'].items():
        print(f"  {metric}: {value:.3f}")

## Confusion Matrices

In [None]:
from core.svm.preprocessing import preprocess_fold
from core.svm.evaluation import compute_confusion_matrix
from core.svm.visualization import plot_confusion_matrix
from IPython.display import Image, display

# Plots live in a "plots" subfolder of the task output directory.
plots_dir = svm_dir / "plots"
plots_dir.mkdir(exist_ok=True)

cm_baseline_png = plots_dir / f"cm_baseline_{task_name}.png"
cm_svm_png = plots_dir / f"cm_svm_{task_name}.png"
class_labels = ("Negative", "Positive")

# Rebuild the transformed test features; only the second element of the
# returned triple is used here.
# NOTE(review): a later cell reads svm_final['X_test_pca']; reusing that
# stored matrix here would avoid re-running the preprocessing - confirm it
# matches what preprocess_fold returns before switching.
_, X_test_pca, _ = preprocess_fold(dev_filtered, test_filtered, env, seed)

# Both final models predict on the same transformed test matrix.
y_pred_baseline = baseline_final['model'].predict(X_test_pca)
y_pred_svm = svm_final['model'].predict(X_test_pca)

cm_baseline = compute_confusion_matrix(y_test, y_pred_baseline)
cm_svm = compute_confusion_matrix(y_test, y_pred_svm)

# Each plot call gets its own fresh label list and the PNG path to write.
plot_confusion_matrix(
    cm_baseline, list(class_labels), f"Baseline - {task_name}", cm_baseline_png
)
plot_confusion_matrix(
    cm_svm, list(class_labels), f"SVM - {task_name}", cm_svm_png
)

# Show the saved images inline, baseline first.
for png_path in (cm_baseline_png, cm_svm_png):
    display(Image(str(png_path)))

## Feature Importance → Brain Regions

In [None]:
from core.svm.interpretation import get_feature_importance_permutation, map_pca_to_brain_regions
from core.svm.feature_mapping import enrich_brain_regions
from core.tsne.embeddings import get_imaging_columns
from core.svm.visualization import plot_feature_importance

# Name the principal components PC1..PCn, where n is the component count
# recorded in the fitted pipeline.
n_components = svm_final['pipeline']['n_components']
pca_features = [f"PC{i+1}" for i in range(n_components)]

# Get test set PCA features (from svm_final)
X_test_pca = svm_final['X_test_pca']

# Get feature importance (using permutation for all kernels for consistency)
svm_importance = get_feature_importance_permutation(
    svm_final['model'], X_test_pca, y_test, pca_features, seed
)

print("Top 10 Principal Components (by importance):\n")
print(svm_importance.head(10).to_string(index=False))

# Map to brain regions: keep only the imaging columns whose positional entry
# in valid_features is truthy.
# NOTE(review): valid_features is presumably the feature mask produced during
# preprocessing - confirm it is aligned with get_imaging_columns' ordering.
all_imaging_cols = get_imaging_columns(dev_filtered, env.configs.svm['imaging_prefixes'])
valid_features = svm_final['pipeline']['valid_features']
imaging_cols = [col for i, col in enumerate(all_imaging_cols) if valid_features[i]]

brain_regions = map_pca_to_brain_regions(
    svm_importance, svm_final['pipeline']['pca'], imaging_cols,
    top_n_components=10, top_n_features=20
)

# Add human-readable labels and persist the full table next to the models.
brain_regions_enriched = enrich_brain_regions(brain_regions, env)
brain_regions_enriched.to_csv(svm_dir / "brain_regions.csv", index=False)

# Display formatted table (no truncation)
print("\n\n" + "="*120)
print(f"TOP 20 BRAIN REGIONS - {task_name.replace('_', ' ').title()}")
print("="*120)

display_df = brain_regions_enriched.head(20).copy()
# head(20) returns fewer rows when the table is shorter than 20, and
# DataFrame.insert requires the new column to match the frame's length, so
# the rank column is sized from the rows actually present.
display_df.insert(0, 'Rank', range(1, len(display_df) + 1))
display_df['importance'] = display_df['importance'].apply(lambda x: f"{x:.4f}")

# Set display options to show full content. These are global pandas options
# and stay in effect for the rest of the session.
pd.set_option('display.max_colwidth', None)  # No truncation
pd.set_option('display.width', None)
pd.set_option('display.max_rows', None)

print(display_df.to_string(index=False))
print("="*120)

## Visualize Brain Region Importance

In [None]:
# Render the top-20 brain-region importance chart to a PNG in the plots
# directory, then show that file inline.
brain_regions_png = plots_dir / f"brain_regions_{task_name}.png"
plot_feature_importance(
    brain_regions_enriched,
    f"Top Brain Regions - {task_name}",
    brain_regions_png,
    top_n=20,
)

display(Image(str(brain_regions_png)))

## Run All Tasks (Optional)

Once the single-task run works, run the complete pipeline across all tasks.

In [None]:
# from core.svm.pipeline import run_svm_pipeline
# all_results = run_svm_pipeline(env, use_wandb=False)