# Module 4: Feature Selection & Validation - Consolidated

**One notebook to rule them all.**

This runs the complete pipeline:
1. Feature extraction & selection
2. Model training (LASSO or Random Forest)
3. LOO cross-validation
4. Validation testing
5. Results visualization

**Runtime:** ~2 minutes per model

## Setup

In [None]:
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Image, display

# Project root. Set the WGBS_CLASSIFIER_ROOT environment variable to point at
# your checkout; when it is unset, the original hard-coded location is used so
# existing setups keep working unchanged.
PROJECT_ROOT = Path(
    os.environ.get(
        'WGBS_CLASSIFIER_ROOT',
        '/Users/maggiebrown/Desktop/PrimaMente/wgbs_classifier',
    )
).expanduser()

# Put <PROJECT_ROOT>/src at the front of the import path so the pipeline module
# below can be imported. This must happen before the local import.
sys.path.insert(0, str(PROJECT_ROOT / 'src'))

# Import the pipeline entry point (project-local module found via the path above)
from feature_selection_consolidated import run_pipeline

print("✓ Ready to go!")

## Option 1: Run Random Forest

Random Forest may handle batch effects better than LASSO.

In [None]:
# Execute the pipeline with the Random Forest model type and keep the returned
# results object; a later cell reads rf_results['model_data'] from it.
rf_results = run_pipeline(model_type="rf")

### Random Forest Results

In [None]:
# Load the Random Forest summary table from the results directory.
# Only the first row of the table is reported below.
rf_summary = pd.read_csv(PROJECT_ROOT / 'results' / 'rf' / 'summary.csv')


def _rf_metric(column):
    """Return the first-row value of `column` from `rf_summary`."""
    return rf_summary[column].values[0]


rule = "=" * 70

print(rule)
print("RANDOM FOREST SUMMARY")
print(rule)
print(f"\nFeatures selected: {_rf_metric('n_features')}")
print(f"Discovery LOO-CV AUC: {_rf_metric('discovery_loo_auc'):.3f}")
print(f"Validation AUC: {_rf_metric('validation_auc'):.3f}")
print(f"Validation Accuracy: {_rf_metric('validation_accuracy'):.3f}")
print(f"Performance Drop: {_rf_metric('performance_drop'):.3f}")

val_auc = _rf_metric('validation_auc')

# Verdict tiers on validation AUC: >= 0.75 success, >= 0.60 moderate, else poor.
if val_auc >= 0.75:
    verdict = "🎉 SUCCESS! Model generalizes well!"
    next_steps = [
        "  - Document selected features",
        "  - Write up results",
        "  - Create final presentation",
    ]
elif val_auc >= 0.60:
    verdict = "⚠️  MODERATE: Some generalization"
    next_steps = [
        "  - Try LASSO to compare",
        "  - Discuss batch effects in write-up",
        "  - Still a valid result to report",
    ]
else:
    verdict = "❌ POOR: Limited generalization"
    next_steps = [
        "  - Try LASSO to compare",
        "  - Consider reporting honestly about overfitting",
        "  - Discuss why this happens with small n",
    ]

print("\n" + rule)
print(verdict)
print("\nNext steps:")
for step in next_steps:
    print(step)
print(rule)

### View Random Forest Visualizations

In [None]:
# Show the two saved Random Forest figures, each preceded by its caption.
rf_figure_dir = PROJECT_ROOT / 'results' / 'figures' / 'rf_results'

for caption, figure_name in (
    ("ROC Curves (Discovery vs Validation):", 'roc_curves.png'),
    ("\nPerformance Comparison:", 'performance_comparison.png'),
):
    print(caption)
    display(Image(filename=str(rf_figure_dir / figure_name)))

### Random Forest: Sample Predictions

In [None]:
# Read the per-sample validation predictions saved for the Random Forest run.
predictions_path = PROJECT_ROOT / 'results' / 'rf' / 'validation_predictions.csv'
rf_predictions = pd.read_csv(predictions_path)

print("Validation Predictions (sorted by probability):\n")
display_cols = ['sample_id', 'disease_status', 'age', 'pred_proba', 'pred_label', 'true_label']
ranked = rf_predictions.sort_values('pred_proba', ascending=False)
print(ranked[display_cols].to_string(index=False))

# Mark each sample as correct when predicted and true labels agree, then
# pull out the rows where they disagree.
rf_predictions['correct'] = rf_predictions['pred_label'].eq(rf_predictions['true_label'])
incorrect = rf_predictions.loc[~rf_predictions['correct']]

n_wrong, n_total = len(incorrect), len(rf_predictions)
print(f"\n\nMisclassified: {n_wrong} / {n_total}")
if n_wrong > 0:
    print("\nMisclassified samples:")
    print(incorrect[display_cols].to_string(index=False))

### Random Forest: Selected Features

In [None]:
# List the features kept by the Random Forest run, grouped by feature family.
selected_features = rf_results['model_data']['selected_features']

# Name patterns used to group features.
FRAG_MARKERS = ('frag_', '_pct', '_ratio')
METH_PREFIX = 'meth_agg_'
# Width used to turn a methylation bin index into genomic coordinates, and the
# chromosome label printed with it.
# NOTE(review): assumes 500 kb bins on chr21 — confirm against feature extraction.
METH_BIN_SIZE = 500_000
METH_CHROM = 'chr21'

print(f"\n{len(selected_features)} Features Selected by Random Forest:\n")

frag_features = [f for f in selected_features if any(x in f for x in FRAG_MARKERS)]
meth_features = [f for f in selected_features if f.startswith(METH_PREFIX)]
# Features matching neither family; listed below so the printed groups account
# for every selected feature instead of silently dropping some.
other_features = [
    f for f in selected_features
    if not f.startswith(METH_PREFIX) and not any(x in f for x in FRAG_MARKERS)
]

print(f"Fragmentomics ({len(frag_features)}):")
for f in frag_features:
    print(f"  - {f}")

print(f"\nMethylation ({len(meth_features)}):")
for f in meth_features:
    try:
        bin_num = int(f.replace(METH_PREFIX, ''))
    except ValueError:
        # Suffix is not a plain bin index; show the name without coordinates
        # rather than aborting the whole listing.
        print(f"  - {f}")
        continue
    start = bin_num * METH_BIN_SIZE
    end = start + METH_BIN_SIZE
    print(f"  - {f:20s} → {METH_CHROM}:{start:,}-{end:,}")

if other_features:
    print(f"\nOther ({len(other_features)}):")
    for f in other_features:
        print(f"  - {f}")

---

## Option 2: Run LASSO (for comparison)

In [None]:
# Execute the same pipeline with the LASSO model type and keep the returned
# results object for comparison with the Random Forest run.
lasso_results = run_pipeline(model_type="lasso")

### LASSO Results

In [None]:
# Read the LASSO summary table; only its first row is reported.
lasso_summary_path = PROJECT_ROOT / 'results' / 'lasso' / 'summary.csv'
lasso_summary = pd.read_csv(lasso_summary_path)


def _lasso_metric(column):
    """Return the first-row value of `column` from `lasso_summary`."""
    return lasso_summary[column].values[0]


lasso_rule = "=" * 70
print(lasso_rule)
print("LASSO SUMMARY")
print(lasso_rule)
print(f"\nFeatures selected: {_lasso_metric('n_features')}")
# Remaining metrics share the same three-decimal formatting.
for metric_label, metric_column in (
    ("Discovery LOO-CV AUC", 'discovery_loo_auc'),
    ("Validation AUC", 'validation_auc'),
    ("Validation Accuracy", 'validation_accuracy'),
    ("Performance Drop", 'performance_drop'),
):
    print(f"{metric_label}: {_lasso_metric(metric_column):.3f}")
print(lasso_rule)

### View LASSO Visualizations

In [None]:
# Show the two saved LASSO figures, each preceded by its caption.
lasso_figure_dir = PROJECT_ROOT / 'results' / 'figures' / 'lasso_results'

for caption, figure_name in (
    ("ROC Curves (Discovery vs Validation):", 'roc_curves.png'),
    ("\nPerformance Comparison:", 'performance_comparison.png'),
):
    print(caption)
    display(Image(filename=str(lasso_figure_dir / figure_name)))

---

## Compare Random Forest vs LASSO

In [None]:
# Compare both models side by side.
# Requires rf_summary and lasso_summary, i.e. both summary cells above must
# have been run first.


def _comparison_row(model_name, summary):
    """Build one comparison-table row from the first row of a summary table."""
    return {
        'Model': model_name,
        'N Features': summary['n_features'].values[0],
        'Discovery LOO-CV': summary['discovery_loo_auc'].values[0],
        'Validation AUC': summary['validation_auc'].values[0],
        'Drop': summary['performance_drop'].values[0],
    }


comparison = pd.DataFrame([
    _comparison_row('Random Forest', rf_summary),
    _comparison_row('LASSO', lasso_summary),
])

rule = "=" * 70

print("\n" + rule)
print("MODEL COMPARISON")
print(rule)
print("\n" + comparison.to_string(index=False))

# Determine winner: a model wins only if its validation AUC beats the other
# by more than this margin; anything closer is reported as a tie.
AUC_TIE_MARGIN = 0.05
rf_val = rf_summary['validation_auc'].values[0]
lasso_val = lasso_summary['validation_auc'].values[0]

print("\n" + rule)
if rf_val > lasso_val + AUC_TIE_MARGIN:
    print("🏆 WINNER: Random Forest")
    print(f"   RF generalizes better (+{rf_val - lasso_val:.3f} AUC)")
elif lasso_val > rf_val + AUC_TIE_MARGIN:
    print("🏆 WINNER: LASSO")
    print(f"   LASSO generalizes better (+{lasso_val - rf_val:.3f} AUC)")
else:
    print("🤝 TIE: Similar performance")
    print("   Consider using the simpler model (fewer features)")
print(rule)

---

## Final Recommendations

In [None]:
# Pick the model with the higher validation AUC (Random Forest wins ties).
# Requires rf_val / lasso_val from the comparison cell and both summary tables.
best_model = 'rf' if rf_val >= lasso_val else 'lasso'
best_auc = max(rf_val, lasso_val)

rule = "=" * 70

print(rule)
print("FINAL RECOMMENDATIONS FOR YOUR ASSIGNMENT")
print(rule)

print(f"\nBest Model: {best_model.upper()}")
print(f"Validation AUC: {best_auc:.3f}")

# Same AUC tiers as the Random Forest verdict: >= 0.75, >= 0.60, otherwise.
if best_auc >= 0.75:
    # Feature count of whichever model was chosen above.
    best_summary = rf_summary if best_model == 'rf' else lasso_summary
    best_n_features = best_summary['n_features'].values[0]

    print("\n✅ REPORT THIS RESULT WITH CONFIDENCE")
    print("\nWhat to include in your write-up:")
    print("  1. Feature extraction approach (fragmentomics + methylation)")
    print("  2. Feature selection method")
    print(f"  3. Selected features ({best_n_features})")
    print("  4. LOO-CV results on discovery")
    print("  5. Validation results (this is the key result!)")
    print("  6. ROC curves and performance plots")
    print("  7. Biological interpretation of selected regions")

elif best_auc >= 0.60:
    print("\n⚠️  REPORT HONESTLY ABOUT MODERATE PERFORMANCE")
    print("\nWhat to include in your write-up:")
    print("  1. Your approach and methods (same as above)")
    print("  2. Discovery results")
    print("  3. Validation results")
    print("  4. DISCUSSION of why validation is lower:")
    print("     - Batch effects between discovery/validation")
    print("     - Small sample size (n=8 discovery)")
    print("     - Disease severity differences (ALSFRS scores)")
    print("  5. Future work: batch correction, more samples")

else:
    print("\n❌ REPORT THOUGHTFULLY ABOUT OVERFITTING")
    print("\nWhat to include in your write-up:")
    print("  1. Your approach and methods")
    print("  2. Discovery results (high LOO-CV)")
    print("  3. Validation failure")
    print("  4. ANALYSIS of why this happened:")
    print("     - Fundamental limits of n=8 training")
    print("     - Batch effects dominate signal")
    print("     - LOO-CV limitations (can't detect batch effects)")
    print("  5. What you learned about small-sample ML")
    print("  6. Alternative approaches you would try")
    print("\n  This is still a VALUABLE result - shows scientific rigor!")

print("\n" + rule)
print("Good luck with your assignment! 🚀")
print(rule)
print("="*70)