# CBCL Subscale Regression with Nested Cross-Validation

**Goal**: Predict continuous CBCL subscale scores from brain imaging features.

**Approach**:
- 5-fold nested CV for each target subscale
- Test multiple models: Ridge, Elastic Net, Random Forest
- Compare against baseline (Ridge regression)
- Proper data hygiene: PCA fitted only on training folds

**CBCL Subscales**:
1. **Anxious/Depressed** (13 items) - anxiety and depressive symptoms
2. **Somatic** (11 items) - physiological symptoms
3. **Internalizing** (33 items) - combined anxious/depressed, somatic, withdrawn
4. **Anxiety Problems** (6 items) - DSM-5 oriented scale (GAD, SAD, phobia)

In [None]:
from core.config import initialize_notebook

# Build the notebook environment without regenerating the run id.
env = initialize_notebook(regenerate_run_id=False)

# Run-level settings reused by later cells.
research_question = env.configs.run['run_name']
seed = env.configs.run['seed']

# PCA defaults to enabled when the regression config does not mention it.
use_pca = env.configs.regression.get('use_pca', True)

print(f"Research Question: {research_question.upper()}")
print(f"Seed: {seed}")
print(f"Use PCA: {use_pca}")
print(f"Outer CV Folds: {env.configs.regression['cv']['n_outer_splits']}")

# A model counts as enabled unless its config explicitly sets enabled=False.
models_config = env.configs.regression['models']
enabled_models = [
    model_key
    for model_key, model_cfg in models_config.items()
    if model_cfg.get('enabled', True)
]
print(f"\nEnabled Models: {enabled_models}")

## Load Data

In [None]:
from core.regression.pipeline import load_full_dataset

# Nested CV uses every subject: there is no fixed holdout split here.
full_df = load_full_dataset(env)

print(f"Total samples for nested CV: {len(full_df):,} subjects")

# One summary line per configured target; a target whose column is absent
# from the dataframe is reported instead of raising.
targets = env.configs.regression['targets']
print("\nTarget distributions:")
for target in targets:
    col = target['column']
    if col not in full_df.columns:
        print(f"  {target['name']:20s}: MISSING COLUMN '{col}'")
        continue

    values = full_df[col].dropna()
    stats = (
        f"n={len(values):4d}, "
        f"mean={values.mean():5.2f}, std={values.std():5.2f}, "
        f"range=[{values.min():.0f}, {values.max():.0f}]"
    )
    print(f"  {target['name']:20s}: {stats}")

## Histogram & Sample Weighting Analysis

Analyze target distribution and determine optimal weighting strategy.

In [None]:
# HISTOGRAM & SAMPLE WEIGHTING ANALYSIS
#
# Compares, for one subscale, the sample counts obtained with 10 equal-width
# bins against the custom bins configured under
# regression.sample_weighting.custom_bins (raw score on the first row,
# T-score on the second).
#
# The work is done inside helper functions so that this cell does not
# overwrite notebook-level names such as `target_config`, `target_name` or
# `y`, which the single-target cells further down rely on.
import numpy as np
import matplotlib.pyplot as plt

# SELECT TASK TO ANALYZE
# 0 = anxious_depressed
# 1 = somatic
# 2 = internalizing
# 3 = anxiety_problems

task_idx = 3  # Change this number (0-3) to analyze different tasks

# Get both raw and t-score targets for this task
raw_target_config = targets[task_idx]  # Raw score
t_target_config = targets[task_idx + 4]  # T-score (4 positions later)

n_bins_equal = 10  # number of equal-width bins used for the comparison


def _bin_counts(values, edges):
    """Count how many values fall into each bin delimited by `edges`.

    Values below the first edge or at/above the last edge are clipped into
    the first/last bin, so every value is counted exactly once.
    """
    n_bins = len(edges) - 1
    indices = np.clip(np.digitize(values, edges) - 1, 0, n_bins - 1)
    return np.bincount(indices, minlength=n_bins)


def _summarize_target(cfg, frame, custom_bins):
    """Collect the non-missing values and bin counts for one target config.

    `custom_edges` / `custom_counts` are None when `custom_bins` has no
    entry for the target's name.
    """
    name = cfg['name']
    values = frame[cfg['column']].dropna().values
    equal_edges = np.linspace(values.min(), values.max(), n_bins_equal + 1)
    info = {
        'is_raw': "_raw" in name,
        'values': values,
        'equal_edges': equal_edges,
        'equal_counts': _bin_counts(values, equal_edges),
        'custom_edges': None,
        'custom_counts': None,
    }
    if name in custom_bins:
        info['custom_edges'] = custom_bins[name]
        info['custom_counts'] = _bin_counts(values, custom_bins[name])
    return info


def _draw_bin_bars(ax, edges, counts, base_color, title, y_limit):
    """Bar chart of per-bin counts; the smallest non-empty bin is drawn in red."""
    edges_arr = np.asarray(edges, dtype=float)
    centers = (edges_arr[:-1] + edges_arr[1:]) / 2
    min_count = counts[counts > 0].min()
    colors = ['red' if count == min_count else base_color for count in counts]
    ax.bar(centers, counts, width=np.diff(edges_arr),
           color=colors, alpha=0.7, edgecolor='black')
    ax.axhline(min_count, color='red', linestyle='--', linewidth=2, label=f'Min={min_count}')
    ax.set_xlabel('Bin Center', fontweight='bold', fontsize=11)
    ax.set_ylabel('Sample Count', fontweight='bold', fontsize=11)
    ax.set_title(title, fontweight='bold', fontsize=12)
    ax.set_ylim([0, y_limit])
    ax.legend(fontsize=9)
    ax.grid(alpha=0.3)


def _plot_target_row(axes_row, info):
    """Print basic statistics and fill one row of the 2x2 grid."""
    values = info['values']
    score_type = "Raw Score" if info['is_raw'] else "T-Score"

    print(f"\n{score_type}:")
    print(f"  n = {len(values):,}")
    print(f"  Mean = {values.mean():.2f}, SD = {values.std():.2f}")
    print(f"  Range = [{values.min():.0f}, {values.max():.0f}]")

    # Both panels of a row share one y-axis limit so bar heights are comparable.
    max_count = info['equal_counts'].max()
    if info['custom_edges'] is not None:
        max_count = max(max_count, info['custom_counts'].max())
    y_limit = max_count * 1.1

    # Left column: Equal-width bins
    _draw_bin_bars(axes_row[0], info['equal_edges'], info['equal_counts'], 'steelblue',
                   f'{score_type} - Equal-Width Bins (n={n_bins_equal})', y_limit)

    # Right column: Custom bins
    ax = axes_row[1]
    if info['custom_edges'] is not None:
        n_bins_custom = len(info['custom_edges']) - 1
        _draw_bin_bars(ax, info['custom_edges'], info['custom_counts'], 'orange',
                       f'{score_type} - Custom Bins (n={n_bins_custom})', y_limit)
    else:
        ax.text(0.5, 0.5, 'No custom bins configured',
                ha='center', va='center', fontsize=12, transform=ax.transAxes)
        ax.set_xticks([])
        ax.set_yticks([])


def _print_equal_table(info):
    """Print count and percentage for every equal-width bin."""
    edges = info['equal_edges']
    counts = info['equal_counts']
    n_values = len(info['values'])
    score_type = "Raw" if info['is_raw'] else "T-Score"

    print(f"\n{score_type}:")
    print(f"{'Bin':>5} {'Range':>15} {'Count':>8} {'%':>7}")
    print("-"*40)
    for i, count in enumerate(counts):
        pct = 100 * count / n_values
        print(f"{i:>5} {edges[i]:6.1f}-{edges[i+1]:6.1f} {count:8,} {pct:6.1f}%")


def _print_custom_table(info):
    """Print count, percentage and inverse-frequency weight per custom bin."""
    score_type = "Raw" if info['is_raw'] else "T-Score"
    edges = info['custom_edges']
    if edges is None:
        print(f"\n{score_type}: No custom bins configured")
        return

    counts = info['custom_counts']
    n_values = len(info['values'])
    n_bins_custom = len(edges) - 1
    # Empty bins are treated as holding one sample to avoid dividing by zero.
    weights_per_bin = n_values / (n_bins_custom * np.maximum(counts, 1))

    print(f"\n{score_type}:")
    print(f"{'Bin':>5} {'Range':>15} {'Count':>8} {'%':>7} {'Inv Weight':>12}")
    print("-"*55)
    for i, count in enumerate(counts):
        pct = 100 * count / n_values
        print(f"{i:>5} {edges[i]:6.1f}-{edges[i+1]:6.1f} {count:8,} {pct:6.1f}% {weights_per_bin[i]:11.2f}x")

    imbalance = counts.max() / max(counts.min(), 1)
    print(f"  Imbalance ratio: {imbalance:.1f}:1")


print("="*70)
print(f"TARGET DISTRIBUTION ANALYSIS: {raw_target_config['name'].replace('_raw', '')}")
print("="*70)

# Get configured custom bins from config
weighting_cfg = env.configs.regression.get('sample_weighting', {})
custom_bins_cfg = weighting_cfg.get('custom_bins', {})

# Visualization - 2x2 grid (row 0 = raw score, row 1 = T-score)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

raw_summary = _summarize_target(raw_target_config, full_df, custom_bins_cfg)
_plot_target_row(axes[0], raw_summary)
t_summary = _summarize_target(t_target_config, full_df, custom_bins_cfg)
_plot_target_row(axes[1], t_summary)

fig.suptitle(f'{raw_target_config["name"].replace("_raw", "").replace("_", " ").title()} - Distribution Comparison',
             fontweight='bold', fontsize=14, y=0.995)
plt.tight_layout()
plt.show()

# Print detailed tables
print(f"\n{'='*70}")
print("EQUAL-WIDTH BINS:")
print(f"{'='*70}")
_print_equal_table(raw_summary)
_print_equal_table(t_summary)

print(f"\n{'='*70}")
print("CUSTOM BINS:")
print(f"{'='*70}")
_print_custom_table(raw_summary)
_print_custom_table(t_summary)

print("="*70)

## Run Regression for Single Target

Start with one target and one model to test the pipeline.

In [None]:
from core.regression.pipeline import run_target_with_nested_cv

# Model to fit. Options: 'ridge', 'random_forest', 'elastic_net', 'linear'
model_name = 'linear'

# Index into `targets` of the subscale to predict (0-7):
#
#   RAW SCORES (discrete 0-26, zero-inflated)
#     0 = anxious_depressed_raw
#     1 = somatic_raw
#     2 = internalizing_raw
#     3 = anxiety_problems_raw
#
#   T-SCORES (normalized, mean=50 SD=10, more continuous - RECOMMENDED)
#     4 = anxious_depressed_t
#     5 = somatic_t
#     6 = internalizing_t
#     7 = anxiety_problems_t
target_idx = 7
target_config = targets[target_idx]

print(f"Running: {target_config['name']} with {model_name.upper()}\n")

# Nested CV for this single target/model pair; later cells read `results`.
results = run_target_with_nested_cv(
    env,
    full_df,
    target_config,
    model_name,
)

In [None]:
# DIAGNOSTIC: Check Sample Weights
#
# Rebuilds the first outer training fold for the currently selected
# `target_config` and reports what apply_sample_weighting() does to it:
# how many samples are kept, the weight range, and the weight per custom bin.
from core.regression.pipeline import apply_sample_weighting, filter_target_data
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np

print("="*70)
print("SAMPLE WEIGHTING DIAGNOSTIC")
print("="*70)

# Get target
df_filtered, y = filter_target_data(full_df, target_config)
target_name = target_config['name']

# Create CV splitter, stratified on target quintiles. The number of folds is
# read from the config so the diagnostic fold has the same size as the folds
# reported in the first cell.
# NOTE(review): assumes the pipeline's outer split is also a shuffled
# StratifiedKFold seeded with `seed` — confirm against core.regression.pipeline.
n_outer_splits = env.configs.regression['cv']['n_outer_splits']
y_binned = pd.qcut(y, q=5, labels=False, duplicates='drop')
outer_cv = StratifiedKFold(n_splits=n_outer_splits, shuffle=True, random_state=seed)

# Get first fold. The split yields positional indices, so index a plain array:
# if `y` were a pandas Series with a non-contiguous index, y[train_idx] would
# be a label-based lookup.
train_idx, test_idx = next(iter(outer_cv.split(df_filtered, y_binned)))
y_train = np.asarray(y)[train_idx]

print(f"\nTarget: {target_name}")
print(f"Total training samples: {len(y_train)}")
print(f"Training target stats: mean={y_train.mean():.2f}, std={y_train.std():.2f}")

# Check if weighting is enabled
weighting_cfg = env.configs.regression.get('sample_weighting', {})
if weighting_cfg.get('enabled', False):
    # Apply the method that is actually configured, so the printed method and
    # the diagnosed weights always agree.
    weighting_method = weighting_cfg.get('method', 'inverse_freq')
    print(f"\nSample weighting: ENABLED")
    print(f"Method: {weighting_method}")

    # Apply weighting
    try:
        valid_mask, weights = apply_sample_weighting(y_train, target_name, env, method=weighting_method)

        print(f"\nFiltering results:")
        print(f"  Valid samples (within bins): {valid_mask.sum()}")
        print(f"  Excluded samples: {(~valid_mask).sum()}")

        if (~valid_mask).sum() > 0:
            excluded_values = y_train[~valid_mask]
            print(f"  Excluded values range: [{excluded_values.min():.1f}, {excluded_values.max():.1f}]")

        print(f"\nWeight statistics:")
        print(f"  Min weight: {weights.min():.3f}")
        print(f"  Max weight: {weights.max():.3f}")
        print(f"  Mean weight: {weights.mean():.3f}")
        print(f"  Weight ratio (max/min): {weights.max()/weights.min():.1f}x")

        # Get bins from config
        custom_bins_cfg = weighting_cfg.get('custom_bins', {})
        if target_name in custom_bins_cfg:
            bin_edges = custom_bins_cfg[target_name]
            # NOTE(review): the masks below index `weights` with masks built
            # from y_valid, i.e. they assume `weights` has one entry per valid
            # sample — confirm against apply_sample_weighting().
            y_valid = y_train[valid_mask]

            print(f"\nPer-bin breakdown:")
            print(f"{'Bin':>5} {'Range':>15} {'Count':>8} {'%':>7} {'Weight':>10}")
            print("-"*55)

            for i in range(len(bin_edges)-1):
                bin_min = bin_edges[i]
                bin_max = bin_edges[i+1]
                mask = (y_valid >= bin_min) & (y_valid < bin_max)
                if mask.sum() > 0:
                    # The first sample's weight is shown as the bin's weight.
                    weight = weights[mask][0]
                    pct = 100 * mask.sum() / len(y_valid)
                    print(f"{i:>5} {bin_min:6.1f}-{bin_max:6.1f} {mask.sum():8,} {pct:6.1f}% {weight:9.3f}")

            # Check if high-score bins actually have high weights
            high_bin_idx = len(bin_edges) - 2  # Last bin
            high_bin_min = bin_edges[high_bin_idx]
            high_mask = y_valid >= high_bin_min
            if high_mask.sum() > 0:
                high_weight = weights[high_mask][0]
                print(f"  High-score bin ({high_bin_min}+): n={high_mask.sum()}, weight={high_weight:.3f}")
                if high_weight < 1.0:
                    print(f"  PROBLEM: High scores have LOWER weight than average!")
                else:
                    print(f"  High scores have higher weight (good)")

    except Exception as e:
        # Best-effort diagnostic: report the failure and let the notebook continue.
        print(f"\nERROR applying weights: {e}")
else:
    print(f"\nSample weighting: DISABLED")

print("="*70)

## View Results

In [None]:
# Overall metrics (aggregated across all outer folds) for the model that was
# run in the single-target cell.
single_model_results = results[model_name]

# Report the real fold count rather than assuming 5, so the header stays
# correct when cv.n_outer_splits is changed in the config.
n_folds = single_model_results['n_folds']

print("="*70)
print(f"OVERALL RESULTS (All {n_folds} folds aggregated)")
print("="*70)


print(f"\n{model_name.upper()}:")
for metric, value in single_model_results['overall'].items():
    print(f"  {metric:15s}: {value:.4f}")

print("\n" + "="*70)
print("PER-FOLD STATISTICS (Mean ± Std)")
print("="*70)

print(f"\n{model_name.upper()} Per-Fold:")
for metric, value in single_model_results['per_fold'].items():
    print(f"  {metric:20s}: {value:.4f}")

print(f"\nTotal samples tested: {single_model_results['n_samples']}")
print(f"Number of folds: {n_folds}")



## Visualizations



In [None]:
# Show the PNG plots saved for the selected target/model pair.
from IPython.display import Image, display
from pathlib import Path

run_cfg = env.configs.run
target_name = target_config['name']
plots_dir = (env.repo_root / "outputs" / run_cfg['run_name'] / run_cfg['run_id'] /
             f"seed_{seed}" / "regression" / target_name / model_name / "plots")

# Models for which a coefficient plot is shown. The same tuple drives the
# panel (C) caption so the two can no longer disagree.
# NOTE(review): assumes the summary figure's panel (C) shows coefficients for
# 'linear' as well — confirm against the plotting code.
coefficient_models = ('linear', 'ridge', 'elastic_net')


def _show_plot(filename):
    """Display one saved plot inline, or report that the file is missing.

    A missing file is reported instead of raising so the remaining plots in
    this cell are still shown.
    """
    plot_path = Path(plots_dir) / filename
    if plot_path.is_file():
        display(Image(str(plot_path)))
    else:
        print(f"  [missing plot] {plot_path}")


print(f"Plots directory: {plots_dir}\n")

# Prediction scatterplot
print("="*70)
print("BRAIN-BEHAVIOR SCATTERPLOT")
print("="*70)

_show_plot(f"predictions_{model_name}_{target_name}.png")

# Coefficient plot (models with linear coefficients only)
if model_name in coefficient_models:
    print("\n" + "="*70)
    print("COEFFICIENT PLOT (Top 30 Features)")
    print("="*70)
    print("Red = Positive (predicts higher symptoms)")
    print("Blue = Negative (protective)\n")
    _show_plot(f"coefficients_{model_name}_{target_name}.png")

# Comprehensive summary figure
print("\n" + "="*70)
print("COMPREHENSIVE SUMMARY (4-Panel Figure)")
print("="*70)
print("(A) Brain-behavior prediction")
print("(B) Residual analysis")
print("(C) Top feature coefficients" if model_name in coefficient_models else "(C) Distribution comparison")
print("(D) Score distributions\n")
_show_plot(f"summary_{model_name}_{target_name}.png")

# Display residuals
print("\n" + "="*70)
print("RESIDUAL ANALYSIS")
print("="*70)
_show_plot(f"residuals_{model_name}_{target_name}.png")

print("\n" + "="*70)
print("All plots saved to:")
print(f"  {plots_dir}")
print("="*70)

## Run All Targets and Models (Optional)

Once a single target works, run the complete pipeline for all targets and models.

In [None]:
# HISTOGRAM & SAMPLE WEIGHTING ANALYSIS
#
# For the currently selected `target_config`: bins the target into equal-width
# bins, prints per-bin counts with inverse-frequency weights, compares three
# re-balancing strategies, plots the distribution, and prints a recommendation
# based on the max/min bin imbalance.
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

print("="*70)
print(f"TARGET DISTRIBUTION ANALYSIS: {target_config['name']}")
print("="*70)

# Get target values (NumPy array of the non-missing entries)
target_col = target_config['column']
y = full_df[target_col].dropna().values
# ndarray has no .median() method, so use the NumPy function.
y_median = np.median(y)

print(f"\nBasic Statistics:")
print(f"  n = {len(y):,}")
print(f"  Mean = {y.mean():.2f}, SD = {y.std():.2f}")
print(f"  Range = [{y.min():.0f}, {y.max():.0f}]")
print(f"  Unique values: {len(np.unique(y))}")

# Create bins for weighting
n_bins = 10  # Change this to adjust granularity
bins = np.linspace(y.min(), y.max(), n_bins + 1)
bin_indices = np.digitize(y, bins) - 1
bin_indices = np.clip(bin_indices, 0, n_bins - 1)  # y.max() lands one past the last bin

# Count samples per bin
bin_counts = np.bincount(bin_indices, minlength=n_bins)

# Inverse frequency weight per bin: n_total / (n_bins * bin_count), matching
# the formula printed under "WEIGHTING STRATEGIES" below. Empty bins hold no
# samples, so their weight is reported as 0; np.maximum only guards the
# division.
inv_weights = np.where(
    bin_counts > 0,
    len(y) / (n_bins * np.maximum(bin_counts, 1)),
    0.0,
)

print(f"\nDistribution across {n_bins} bins:")
print(f"{'Bin':>5} {'Range':>15} {'Count':>8} {'%':>7} {'Inv Freq Weight':>17}")
print("-"*60)

for i in range(n_bins):
    count = bin_counts[i]
    pct = 100 * count / len(y)
    print(f"{i:>5} {bins[i]:6.1f}-{bins[i+1]:6.1f} {count:8,} {pct:6.1f}% {inv_weights[i]:16.2f}")

# Find smallest (non-empty) and largest bin
min_bin_count = bin_counts[bin_counts > 0].min()
max_bin_count = bin_counts.max()
print(f"\nSmallest bin has: {min_bin_count:,} samples")

# Strategy comparison
print("\n" + "="*70)
print("WEIGHTING STRATEGIES:")
print("="*70)

print("\n1. INVERSE FREQUENCY (recommended):")
print("   Weight = n_total / (n_bins × bin_count)")
print("   → Rare bins get higher weight")
print("   → All bins contribute equally to loss")
print(f"   → Weight range: {len(y) / (n_bins * max_bin_count):.2f} to {len(y) / (n_bins * min_bin_count):.2f}")

print("\n2. DOWNSAMPLE TO SMALLEST BIN:")
print(f"   Use only {min_bin_count} samples per bin")
print(f"   → Total samples: {n_bins * min_bin_count:,} (from {len(y):,})")
print(f"   → Discards: {len(y) - n_bins * min_bin_count:,} samples ({100*(len(y) - n_bins * min_bin_count)/len(y):.1f}%)")
print("   → Equal representation but loses data")

print("\n3. UPSAMPLE RARE BINS (duplicate):")
print("   Duplicate samples in rare bins to match largest bin")
total_upsampled = n_bins * max_bin_count
print(f"   → Total samples: {total_upsampled:,} (from {len(y):,})")
print(f"   → Creates: {total_upsampled - len(y):,} duplicates")
print("   → Balanced but overfits to duplicates")

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Histogram of target values
ax = axes[0, 0]
ax.hist(y, bins=30, color='steelblue', alpha=0.7, edgecolor='black')
ax.axvline(y.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean={y.mean():.1f}')
ax.axvline(y_median, color='orange', linestyle='--', linewidth=2, label=f'Median={y_median:.1f}')
ax.set_xlabel('Target Value', fontweight='bold', fontsize=12)
ax.set_ylabel('Frequency', fontweight='bold', fontsize=12)
ax.set_title('(A) Target Distribution', fontweight='bold', fontsize=13)
ax.legend()
ax.grid(alpha=0.3)

# 2. Bin counts (smallest non-empty bin highlighted in red)
ax = axes[0, 1]
bin_centers = [(bins[i] + bins[i+1])/2 for i in range(n_bins)]
colors = ['red' if count == min_bin_count else 'steelblue' for count in bin_counts]
ax.bar(bin_centers, bin_counts, width=(bins[1]-bins[0])*0.8, color=colors, alpha=0.7, edgecolor='black')
ax.axhline(min_bin_count, color='red', linestyle='--', linewidth=2, label=f'Min bin={min_bin_count}')
ax.set_xlabel('Bin Center', fontweight='bold', fontsize=12)
ax.set_ylabel('Sample Count', fontweight='bold', fontsize=12)
ax.set_title(f'(B) Samples per Bin (n={n_bins})', fontweight='bold', fontsize=13)
ax.legend()
ax.grid(alpha=0.3)

# 3. Inverse frequency weights (same values as the table above)
ax = axes[1, 0]
ax.bar(bin_centers, inv_weights, width=(bins[1]-bins[0])*0.8, color='orange', alpha=0.7, edgecolor='black')
ax.axhline(1.0, color='black', linestyle='--', linewidth=1, alpha=0.5, label='Weight=1.0')
ax.set_xlabel('Bin Center', fontweight='bold', fontsize=12)
ax.set_ylabel('Inverse Frequency Weight', fontweight='bold', fontsize=12)
ax.set_title('(C) Inverse Frequency Weights', fontweight='bold', fontsize=13)
ax.legend()
ax.grid(alpha=0.3)

# 4. Cumulative distribution
ax = axes[1, 1]
sorted_y = np.sort(y)
cumulative = np.arange(1, len(y)+1) / len(y) * 100
ax.plot(sorted_y, cumulative, linewidth=2, color='steelblue')
ax.axhline(50, color='red', linestyle='--', alpha=0.5, label='50th percentile')
ax.axhline(75, color='orange', linestyle='--', alpha=0.5, label='75th percentile')
ax.axhline(90, color='green', linestyle='--', alpha=0.5, label='90th percentile')
ax.set_xlabel('Target Value', fontweight='bold', fontsize=12)
ax.set_ylabel('Cumulative %', fontweight='bold', fontsize=12)
ax.set_title('(D) Cumulative Distribution', fontweight='bold', fontsize=13)
ax.legend()
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Recommendation
print("\n" + "="*70)
print("RECOMMENDATION:")
print("="*70)

imbalance_ratio = max_bin_count / min_bin_count
print(f"\nImbalance ratio: {imbalance_ratio:.1f}:1 (max bin / min bin)")

if imbalance_ratio > 20:
    print("\n✓ USE INVERSE FREQUENCY WEIGHTING")
    print("  → High imbalance (>20:1) makes downsampling wasteful")
    print("  → Keeps all data while balancing contribution")
    print("  → Equivalent to upsampling but computationally efficient")
elif imbalance_ratio > 5:
    print("\n✓ EITHER INVERSE FREQUENCY OR DOWNSAMPLE")
    print("  → Moderate imbalance (5-20:1)")
    print("  → Inverse freq: keeps all data")
    print("  → Downsample: simpler, forces balance")
else:
    print("\n⚠️  MINOR IMBALANCE")
    print("  → May not need weighting")
    print("  → Try standard regression first")

print(f"\nTo use inverse frequency weighting, add to regression.yaml:")
print("""
sample_weighting:
  enabled: true
  method: inverse_freq
  n_bins: 10
""")

print("="*70)


In [None]:
from core.regression.pipeline import run_regression_pipeline

# Run complete pipeline for all targets and enabled models.
# The comparison cell below iterates `all_results` as a nested mapping:
# target name -> model name -> results.
all_results = run_regression_pipeline(env)

## Compare Models Across All Targets

In [None]:
# Tabulate the overall metrics of every (target, model) pair in `all_results`
# and report the best model per target.
#
# The iteration lives inside helper functions so that this cell does not
# overwrite the notebook-level `target_name` / `model_name` used by the
# single-target cells above.
import pandas as pd

COMPARISON_COLUMNS = ['Target', 'Model', 'R²', 'MAE', 'RMSE', 'Pearson r']


def _collect_comparison_rows(results_by_target):
    """Return one row of overall metrics per non-baseline (target, model) pair."""
    rows = []
    for tgt_name, tgt_results in results_by_target.items():
        for mdl_name, mdl_results in tgt_results.items():
            if mdl_name == 'baseline':
                continue
            overall = mdl_results[mdl_name]['overall']
            rows.append({
                'Target': tgt_name,
                'Model': mdl_name,
                'R²': overall['r2'],
                'MAE': overall['mae'],
                'RMSE': overall['rmse'],
                'Pearson r': overall['pearson_r'],
            })
    return rows


def _print_best_models(table):
    """Print, for each target, the row with the highest R²."""
    for tgt_name in table['Target'].unique():
        tgt_rows = table[table['Target'] == tgt_name]
        best_row = tgt_rows.loc[tgt_rows['R²'].idxmax()]
        print(f"  {tgt_name:20s}: {best_row['Model']:15s} (R²={best_row['R²']:.3f}, MAE={best_row['MAE']:.2f})")


# Create comparison table. Passing the column list keeps the 'Target' column
# present even when there are no rows, so the lookups below cannot raise
# KeyError on an empty result set.
comparison_data = _collect_comparison_rows(all_results)
comparison_df = pd.DataFrame(comparison_data, columns=COMPARISON_COLUMNS)

print("\n" + "="*80)
print("MODEL COMPARISON ACROSS ALL TARGETS")
print("="*80)
if comparison_df.empty:
    print("No non-baseline model results to compare.")
else:
    print(comparison_df.to_string(index=False))

# Best model for each target
print("\n" + "="*80)
print("BEST MODEL FOR EACH TARGET (by R²)")
print("="*80)
_print_best_models(comparison_df)

## Feature Importance (Linear Models Only)

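For Ridge and Elastic Net, examine coefficients to see which features predict CBCL scores. When PCA is enabled, the coefficients refer to principal components (PC1, PC2, ...) rather than individual brain regions.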

In [None]:
# Rank the features of one saved linear model by absolute coefficient.
import pickle
import numpy as np
import pandas as pd

# Load results for a specific target and model
target_name = targets[0]['name']  # Change to analyze different target
model_name = 'ridge'  # Must be 'ridge' or 'elastic_net'

# Defined here as well so this cell does not depend on the visualization cell
# having been run first.
run_cfg = env.configs.run

results_path = (env.repo_root / "outputs" / run_cfg['run_name'] / run_cfg['run_id'] /
                f"seed_{seed}" / "regression" / target_name / model_name / "results.pkl")

# NOTE: pickle.load executes arbitrary code from the file; only open results
# files that this project wrote itself.
with open(results_path, "rb") as f:
    saved_results = pickle.load(f)

# Get coefficients from last fold as representative
last_fold = saved_results[f"{model_name}_folds"][-1]
model = last_fold['model']
# Flatten: coef_ can be 2-D (e.g. when the model was fitted on a column-vector
# target), which would break len() and the DataFrame construction below.
coefficients = np.ravel(model.coef_)

# Create feature names
if use_pca:
    # With PCA enabled each coefficient belongs to a principal component.
    n_components = len(coefficients)
    feature_names = [f"PC{i+1}" for i in range(n_components)]
else:
    # TODO: Get actual feature names from preprocessing
    feature_names = [f"Feature{i+1}" for i in range(len(coefficients))]

# Create importance dataframe, strongest absolute effect first
importance_df = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefficients,
    'abs_coefficient': np.abs(coefficients)
})

importance_df = importance_df.sort_values('abs_coefficient', ascending=False)

print(f"\nTop 20 Features for {target_name} ({model_name.upper()}):")
print("="*60)
print(importance_df.head(20).to_string(index=False))