# Experiment 5: Ablation Studies

This notebook runs systematic ablation studies to understand the contribution of each feature group and configuration choice.

**Ablations:**
1. **Feature Set Size**: small vs medium vs large forecast feature sets (only small and medium are configured below)
2. **Context Symbols**: With vs without SPY/QQQ context (only SPY is configured as a context symbol below)
3. **Error Features**: With vs without forecast error monitoring
4. **Model Presets**: medium_quality vs best_quality
5. **Cross-Learning**: Per-symbol vs pooled TS models (if applicable; no cross-learning ablation is configured below)

**Output:**
- Results table comparing all configurations
- Statistical significance tests where applicable
- Feature importance analysis

## Configuration

In [None]:
# ============================================================
# ABLATION CONFIGURATION
# ============================================================

# Settings inherited by every ablation run; each ablation entry is merged on
# top of this dict, so an ablation key wins when both define it.
BASE_CONFIG = dict(
    symbols_to_train=["SPY"],
    max_rows_per_symbol=6500,
    label_col="label",
    vertical_barrier_bars=26,
    embargo_bars=26,
    tune_window=260,
    test_window=520,
    min_train_size=2000,
    time_limit_sec=600,  # Shorter for ablations
    ts_prediction_length=26,
    ts_presets="chronos_small",
    ts_train_lookback_years=5.0,
    random_seed=42,
)


def _ablation(name, description, *, forecast=True, context=True, error=True,
              feature_set="small", presets="best_quality"):
    """Build one ablation entry.

    The defaults describe the "full" configuration (forecast + context + error
    features, small feature set, best_quality preset); each ablation only
    states what it switches off or changes.
    """
    entry = {
        "name": name,
        "description": description,
        "include_forecast": forecast,
        "include_context": context,
        "include_error": error,
        "feature_set": feature_set,
    }
    if context:
        # NOTE(review): the context symbol equals the only trained symbol
        # (BASE_CONFIG["symbols_to_train"] is ["SPY"]), so the context features
        # are built from the target's own forecasts — confirm this is intended.
        entry["context_symbols"] = ["SPY"]
    entry["presets"] = presets
    return entry


# Ablation configurations
ABLATIONS = [
    # 1. Baseline only (Exp 1 equivalent)
    _ablation(
        "baseline_only",
        "Baseline features only (no forecast)",
        forecast=False,
        context=False,
        error=False,
    ),
    # 2. Small feature set
    _ablation("small_features", "Small feature set (2 horizons)"),
    # 3. Medium feature set
    _ablation(
        "medium_features",
        "Medium feature set (4 horizons)",
        feature_set="medium",
    ),
    # 4. No context features
    _ablation("no_context", "Full features without context", context=False),
    # 5. No error features
    _ablation(
        "no_error_features",
        "Full features without error monitoring",
        error=False,
    ),
    # 6. Medium quality preset
    _ablation(
        "medium_quality_preset",
        "Full features with medium_quality preset",
        presets="medium_quality",
    ),
]

print("Base configuration:")
for key, value in BASE_CONFIG.items():
    print(f"  {key}: {value}")

print(f"\nAblations to run: {len(ABLATIONS)}")
for entry in ABLATIONS:
    print(f"  - {entry['name']}: {entry['description']}")

## Setup

In [None]:
# Mount Google Drive at /content/drive. The `google.colab` import means this
# cell is expected to run inside a Colab runtime.
# NOTE(review): presumably the pipeline's data/model directories live under the
# mounted drive — confirm against etf_pipeline.utils.paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install dependencies
print("Installing dependencies...")
# Try the `tabarena` extra of autogluon.tabular first; if that command fails,
# the shell `||` falls back to installing the `all` extra instead.
!pip install -q autogluon.tabular[tabarena] || pip install -q autogluon.tabular[all]
# Time-series package with the `chronos-openvino` extra.
!pip install -q autogluon.timeseries[chronos-openvino]
# Remaining packages used by the notebook and the etf_pipeline package.
!pip install -q pandas numpy pyarrow scikit-learn pytz alpaca-py
# This message is printed unconditionally; it does not check pip's exit status.
print("\nInstallation complete!")

In [None]:
# Clone/update repository
import os

# Remote, local checkout directory, and the branch this notebook runs against.
REPO_URL = "https://github.com/mh122333/ETF-Dual-Foundation-Project-CC-Version.git"
REPO_DIR = "/content/ETF-Dual-Foundation-Project-CC-Version"
BRANCH = "claude/build-pipeline-sanity-exp-iVs65"

if os.path.exists(REPO_DIR):
    # Existing checkout: switch the notebook's working directory into it, then
    # fetch, check out the branch, and pull. The `&&` chain stops at the first
    # git command that fails.
    %cd {REPO_DIR}
    !git fetch origin && git checkout {BRANCH} && git pull origin {BRANCH}
else:
    # No checkout yet: clone, enter the directory, and check out the branch.
    !git clone {REPO_URL} {REPO_DIR}
    %cd {REPO_DIR}
    !git checkout {BRANCH}

# Prints the configured branch name; it does not verify that the checkout
# above actually succeeded.
print(f"\nOn branch: {BRANCH}")

In [None]:
# Setup
import sys
import random
import numpy as np

# Put the cloned repository's src/ directory at the front of the import path.
sys.path.insert(0, '/content/ETF-Dual-Foundation-Project-CC-Version/src')

# Seed the stdlib and NumPy random generators with the shared configured seed.
_seed = BASE_CONFIG["random_seed"]
random.seed(_seed)
np.random.seed(_seed)

In [None]:
# Imports
from datetime import datetime
from pathlib import Path
import json
import pandas as pd
import numpy as np

from etf_pipeline.utils.paths import ensure_dirs, get_labeled_dataset_path
from etf_pipeline.splits.purged_walkforward import (
    create_single_split,
    apply_split_to_dataframe,
    validate_split_no_leakage,
)
from etf_pipeline.models.tabular_baseline import (
    train_tabular_baseline,
    predict_tabular,
)
from etf_pipeline.metrics.classification import (
    compute_all_metrics,
    save_metrics,
)

from etf_pipeline.timeseries.dataset import prepare_ts_training_data
from etf_pipeline.timeseries.train import load_or_train_timeseries_predictor
from etf_pipeline.timeseries.rolling_predict import load_or_generate_forecasts

from etf_pipeline.features.forecast_features import (
    merge_forecast_features,
    get_forecast_feature_names,
    FEATURE_SET_CONFIGS,
)
from etf_pipeline.features.context_features import add_context_features
from etf_pipeline.features.forecast_error_features import (
    compute_forecast_errors,
    compute_rolling_error_features,
    get_error_feature_names,
)
from etf_pipeline.features.baseline import get_feature_columns

print("Imports successful!")

In [None]:
# Create output directories
paths = ensure_dirs()

# Tag the run with the current local wall-clock time (YYYYMMDD_HHMMSS) so the
# artifacts written under this RUN_ID get distinct names per run.
run_timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
RUN_ID = "exp5_ablations_" + run_timestamp
print(f"Run ID: {RUN_ID}")

## 1. Load Data

In [None]:
# Load labeled dataset
labeled_dataset_path = get_labeled_dataset_path()

# The labeled parquet is produced by an earlier experiment notebook; fail with
# an actionable message when it is missing rather than a raw read error.
if labeled_dataset_path.exists():
    full_df = pd.read_parquet(labeled_dataset_path)
else:
    raise FileNotFoundError("Labeled dataset not found. Run Experiment 0/1 first.")

print(f"Loaded {len(full_df)} rows")

In [None]:
# Load raw bars
bars_path = paths["raw"] / "bars_30min.parquet"

if not bars_path.exists():
    # No cached parquet yet: download the bars through Alpaca and write the
    # cache file so later runs take the fast path below.
    from google.colab import userdata
    from alpaca.data.historical import StockHistoricalDataClient
    import pytz
    from etf_pipeline.data.alpaca import load_all_symbols

    # Credentials are read from Colab's user secrets.
    client = StockHistoricalDataClient(
        userdata.get("PAPER_KEY"),
        userdata.get("PAPER_SEC"),
    )

    # Download window, expressed as US/Eastern-localized datetimes.
    eastern = pytz.timezone("US/Eastern")
    start = eastern.localize(datetime(2019, 1, 1))
    end = eastern.localize(datetime(2025, 12, 31))

    all_symbols = ["SPY", "QQQ", "IWM", "AAPL", "MSFT"]
    bars_df = load_all_symbols(client, all_symbols, start, end, cache=True)
    bars_df.to_parquet(bars_path)
else:
    bars_df = pd.read_parquet(bars_path)

print(f"Bars shape: {bars_df.shape}")

## 2. Helper Functions for Ablations

In [None]:
def _rows_for_symbol(df, symbol):
    """Return an index-sorted copy of the rows of ``df`` that belong to ``symbol``."""
    if isinstance(df.index, pd.MultiIndex):
        subset = df.loc[symbol].copy()
    else:
        subset = df[df["symbol"] == symbol].copy()
    return subset.sort_index()


def _forecasts_for_symbol(sym, sym_bars, decision_timestamps, train_end,
                          horizons, feature_set, config, paths, run_id):
    """Train or load the TS predictor for ``sym`` and return its forecasts at ``decision_timestamps``."""
    model_dir = paths["models"] / "ts" / sym / f"pred_len_{config['ts_prediction_length']}"
    model_dir.mkdir(parents=True, exist_ok=True)

    train_data = prepare_ts_training_data(
        bars_df=sym_bars,
        symbols=[sym],
        train_end_timestamp=train_end,
        lookback_years=config["ts_train_lookback_years"],
    )

    ts_predictor = load_or_train_timeseries_predictor(
        train_data=train_data,
        model_path=model_dir,
        prediction_length=config["ts_prediction_length"],
        presets=config["ts_presets"],
        force_retrain=False,
    )

    # NOTE(review): the cache key is (symbol, feature_set, run_id) only. If
    # several target symbols share a context symbol, or a context symbol is
    # also a target, the same cache file is reused even though the decision
    # timestamps may differ — confirm load_or_generate_forecasts handles that.
    cache_path = paths["processed"] / "forecasts" / sym / f"fc_{feature_set}_{run_id}.parquet"
    cache_path.parent.mkdir(parents=True, exist_ok=True)

    return load_or_generate_forecasts(
        predictor=ts_predictor,
        bars_df=sym_bars,
        symbol=sym,
        decision_timestamps=decision_timestamps,
        cache_path=cache_path,
        horizons=horizons,
        force_regenerate=False,
    )


def _next_bar_log_returns(sym_bars, timestamps):
    """Return log(close[t+1] / close[t]) for each timestamp, NaN where it cannot be computed."""
    closes = sym_bars["close"].to_numpy()
    n_bars = len(sym_bars)
    values = []
    for ts in timestamps:
        ret = np.nan
        try:
            loc = sym_bars.index.get_loc(ts)
        except (KeyError, IndexError):
            loc = None
        # get_loc returns a slice or boolean mask when the index holds
        # duplicate labels; only a plain integer position is usable here.
        if isinstance(loc, (int, np.integer)) and loc + 1 < n_bars:
            ret = np.log(closes[loc + 1] / closes[loc])
        values.append(ret)
    return pd.Series(values, index=timestamps)


def run_single_ablation(ablation_config, base_config, full_df, bars_df, paths, run_id):
    """
    Run a single ablation configuration.

    For every symbol in ``symbols_to_train`` this builds the feature set the
    ablation asks for (baseline, plus optional forecast / context / error
    features), splits the rows into train / tune / test, trains a tabular
    model and evaluates it on the test rows.

    Args:
        ablation_config: Ablation-specific settings; must contain "name" and
            "description". Its keys override those of ``base_config``.
        base_config: Settings shared by all ablations.
        full_df: Labeled dataset. Either MultiIndexed (first level selects the
            symbol) or carrying a "symbol" column.
        bars_df: Raw bars in one of the same two layouts. A "close" column is
            read when error features are enabled.
        paths: Mapping providing the "models" and "processed" directories.
        run_id: Identifier embedded in model directories and forecast cache
            file names.

    Returns:
        Dictionary with results for this ablation:
        ``{"name", "config", "symbols": {symbol: {...}}}`` where each symbol
        entry holds either ``metrics`` / ``feature_count`` / ``train_size`` /
        ``test_size`` or a single ``error`` message.
    """
    name = ablation_config["name"]
    print(f"\n{'=' * 60}")
    print(f"ABLATION: {name}")
    print(f"Description: {ablation_config['description']}")
    print(f"{'=' * 60}")

    # Merge configs (ablation keys win)
    config = {**base_config, **ablation_config}

    results = {"name": name, "config": ablation_config, "symbols": {}}

    for symbol in config["symbols_to_train"]:
        print(f"\nProcessing {symbol}...")

        # Get data for symbol, keeping only the most recent rows if capped
        symbol_df = _rows_for_symbol(full_df, symbol)

        max_rows = config["max_rows_per_symbol"]
        if max_rows and len(symbol_df) > max_rows:
            symbol_df = symbol_df.iloc[-max_rows:]

        # Without rows there is no first decision timestamp to anchor the TS
        # training window; record a per-symbol error instead of letting an
        # IndexError abort the remaining symbols of this ablation.
        if symbol_df.empty:
            message = f"No labeled rows available for {symbol}"
            print(f"  ERROR: {message}")
            results["symbols"][symbol] = {"error": message}
            continue

        # Get decision timestamps
        decision_timestamps = symbol_df.index.tolist()

        # Build feature set based on config. Copy the list so extending it
        # below can never mutate a list object owned by get_feature_columns.
        feature_cols = list(get_feature_columns(True))  # Always include baseline

        if config.get("include_forecast", False):
            feature_set = config.get("feature_set", "small")
            horizons = FEATURE_SET_CONFIGS[feature_set]["horizons"]

            symbol_bars = _rows_for_symbol(bars_df, symbol)

            # The TS model is trained on data up to the first decision timestamp.
            first_decision = decision_timestamps[0]
            forecasts = _forecasts_for_symbol(
                symbol, symbol_bars, decision_timestamps, first_decision,
                horizons, feature_set, config, paths, run_id,
            )

            # Merge forecast features
            symbol_df = merge_forecast_features(
                tabular_df=symbol_df,
                forecasts_df=forecasts,
                feature_set=feature_set,
                prefix="fc_",
            )
            feature_cols.extend(get_forecast_feature_names(feature_set, "fc_"))

            # Context features
            if config.get("include_context", False):
                context_symbols = config.get("context_symbols", ["SPY"])
                context_forecasts = {}

                for ctx_sym in context_symbols:
                    if ctx_sym == symbol:
                        # The target's own forecasts double as its context.
                        context_forecasts[ctx_sym] = forecasts
                    else:
                        # Forecast the context symbol at the target's decision timestamps
                        ctx_bars = _rows_for_symbol(bars_df, ctx_sym)
                        context_forecasts[ctx_sym] = _forecasts_for_symbol(
                            ctx_sym, ctx_bars, decision_timestamps, first_decision,
                            horizons, feature_set, config, paths, run_id,
                        )

                symbol_df = add_context_features(
                    df=symbol_df,
                    context_forecasts=context_forecasts,
                    target_symbol=symbol,
                    context_symbols=context_symbols,
                    feature_set=feature_set,
                    include_relative=True,
                )

                # Add context feature names. Names that add_context_features
                # did not actually create are filtered out (and reported) below.
                for ctx_sym in context_symbols:
                    ctx_key = ctx_sym.lower()
                    for feat in FEATURE_SET_CONFIGS[feature_set]["features"]:
                        for h in horizons:
                            feature_cols.append(f"ctx_{ctx_key}_{feat}_{h}")
                    for h in horizons:
                        feature_cols.append(f"rel_{ctx_key}_mu_{h}")
                        feature_cols.append(f"rel_{ctx_key}_unc_{h}")

            # Error features (need the 1-step-ahead mean forecast)
            if config.get("include_error", False) and "mu_1" in forecasts.columns:
                # NOTE(review): the realized return stored at timestamp t uses
                # the close of the bar AFTER t, which is not known at decision
                # time. Confirm compute_rolling_error_features lags its input;
                # otherwise these features leak future information.
                realized_returns = _next_bar_log_returns(symbol_bars, symbol_df.index)

                error_df = compute_forecast_errors(
                    realized_returns=realized_returns,
                    forecast_mu=forecasts["mu_1"],
                    forecast_q10=forecasts.get("q10_1", pd.Series(np.nan, index=forecasts.index)),
                    forecast_q90=forecasts.get("q90_1", pd.Series(np.nan, index=forecasts.index)),
                )

                rolling_errors = compute_rolling_error_features(error_df)
                rolling_errors.columns = [f"err_{c}" for c in rolling_errors.columns]
                symbol_df = symbol_df.join(rolling_errors, how="left")
                feature_cols.extend(get_error_feature_names("err_", 20, 20, 50))

        # Filter to available features, dropping duplicate names (order kept)
        # so the same column is never passed to the model twice.
        requested_features = list(dict.fromkeys(feature_cols))
        available_features = [f for f in requested_features if f in symbol_df.columns]
        missing_features = [f for f in requested_features if f not in symbol_df.columns]
        print(f"  Features: {len(available_features)}")
        if missing_features:
            # Surface silently dropped features: an ablation whose feature
            # group is missing entirely would otherwise look like a valid run.
            print(
                f"  WARNING: {len(missing_features)} requested features not found "
                f"(e.g. {missing_features[:5]})"
            )

        # Clean and prepare data
        required_cols = available_features + [config["label_col"]]
        symbol_df_clean = symbol_df.dropna(subset=required_cols).copy()
        print(f"  Rows: {len(symbol_df_clean)}")

        symbol_df_clean = symbol_df_clean.reset_index(drop=False)
        if "timestamp" not in symbol_df_clean.columns and "index" in symbol_df_clean.columns:
            symbol_df_clean = symbol_df_clean.rename(columns={"index": "timestamp"})

        # Create split
        try:
            split = create_single_split(
                n_samples=len(symbol_df_clean),
                vertical_barrier_bars=config["vertical_barrier_bars"],
                embargo_bars=config["embargo_bars"],
                tune_window=config["tune_window"],
                test_window=config["test_window"],
                min_train_size=config["min_train_size"],
            )
        except ValueError as e:
            print(f"  ERROR: {e}")
            results["symbols"][symbol] = {"error": str(e)}
            continue

        train_df, tune_df, test_df = apply_split_to_dataframe(symbol_df_clean, split)
        print(f"  Split: Train={len(train_df)}, Tune={len(tune_df)}, Test={len(test_df)}")

        # Train model
        model_path = paths["models"] / "exp5" / name / symbol / run_id
        model_path.mkdir(parents=True, exist_ok=True)

        presets = config.get("presets", "best_quality")
        print(f"  Training ({presets})...")

        predictor = train_tabular_baseline(
            train_df=train_df,
            tune_df=tune_df,
            feature_cols=available_features,
            label_col=config["label_col"],
            model_path=model_path,
            time_limit=config["time_limit_sec"],
            presets=presets,
            random_seed=config["random_seed"],
            verbosity=1,
        )

        # Predictions
        predictions_df = predict_tabular(predictor, test_df, available_features)
        predictions_df["actual_label"] = test_df[config["label_col"]].values

        # Metrics
        metrics = compute_all_metrics(
            y_true=test_df[config["label_col"]],
            y_pred=predictions_df["predicted_label"],
            y_train=train_df[config["label_col"]],
            y_tune=tune_df[config["label_col"]],
        )

        print(f"  Balanced Accuracy: {metrics['classification']['balanced_accuracy']:.4f}")

        results["symbols"][symbol] = {
            "metrics": metrics,
            "feature_count": len(available_features),
            "train_size": len(train_df),
            "test_size": len(test_df),
        }

    return results

## 3. Run All Ablations

In [None]:
# Run all ablations
import traceback

all_ablation_results = []

for ablation in ABLATIONS:
    try:
        result = run_single_ablation(
            ablation_config=ablation,
            base_config=BASE_CONFIG,
            full_df=full_df,
            bars_df=bars_df,
            paths=paths,
            run_id=RUN_ID,
        )
    except Exception as e:
        # Top-level boundary: one failing ablation must not stop the others,
        # but print the traceback so the failure can actually be diagnosed.
        print(f"ERROR in ablation {ablation['name']}: {e}")
        traceback.print_exc()
        result = {
            "name": ablation["name"],
            "error": str(e),
        }
    all_ablation_results.append(result)

# Ablations that failed as a whole carry a top-level "error" key.
failed_ablations = [r["name"] for r in all_ablation_results if "error" in r]

print(f"\n{'=' * 60}")
print("ALL ABLATIONS COMPLETE!")
if failed_ablations:
    print(
        f"Failed ablations ({len(failed_ablations)}/{len(all_ablation_results)}): "
        f"{', '.join(failed_ablations)}"
    )
print(f"{'=' * 60}")

## 4. Results Summary

In [None]:
# Build results table
# Fixed column schema: keeps the table (and the CSV written from it) well-formed
# with a header row even when no ablation produced a successful result.
_RESULT_COLUMNS = [
    "ablation",
    "symbol",
    "feature_count",
    "accuracy",
    "balanced_accuracy",
    "macro_f1",
    "cohen_kappa",
    "include_forecast",
    "include_context",
    "include_error",
    "feature_set",
    "presets",
]

results_rows = []
skipped_entries = []  # (ablation, symbol or None, error message)

for result in all_ablation_results:
    name = result["name"]

    # Ablations that failed as a whole have an "error" but no "symbols" entry.
    if "error" in result and "symbols" not in result:
        skipped_entries.append((name, None, result["error"]))
        continue

    config = result.get("config", {})

    for symbol, symbol_result in result.get("symbols", {}).items():
        if "error" in symbol_result:
            skipped_entries.append((name, symbol, symbol_result["error"]))
            continue

        metrics = symbol_result["metrics"]["classification"]

        results_rows.append({
            "ablation": name,
            "symbol": symbol,
            "feature_count": symbol_result["feature_count"],
            "accuracy": metrics["accuracy"],
            "balanced_accuracy": metrics["balanced_accuracy"],
            "macro_f1": metrics["macro_f1"],
            "cohen_kappa": metrics.get("cohen_kappa", np.nan),
            "include_forecast": config.get("include_forecast", False),
            "include_context": config.get("include_context", False),
            "include_error": config.get("include_error", False),
            "feature_set": config.get("feature_set", "small"),
            "presets": config.get("presets", "best_quality"),
        })

results_df = pd.DataFrame(results_rows, columns=_RESULT_COLUMNS)
print("\nAblation Results:")
print(results_df.to_string(index=False))

# Make omissions explicit so a missing row is not mistaken for "not configured".
if skipped_entries:
    print(f"\nExcluded from the table ({len(skipped_entries)} failed):")
    for skipped_name, skipped_symbol, skipped_error in skipped_entries:
        scope = skipped_name if skipped_symbol is None else f"{skipped_name}/{skipped_symbol}"
        print(f"  - {scope}: {skipped_error}")

In [None]:
# Summary statistics
print("\n" + "=" * 60)
print("ABLATION SUMMARY")
print("=" * 60)

if not results_df.empty:
    # Per-ablation mean/std across symbols, named directly instead of
    # flattening a two-level column index afterwards.
    summary = (
        results_df.groupby("ablation")
        .agg(
            balanced_accuracy_mean=("balanced_accuracy", "mean"),
            balanced_accuracy_std=("balanced_accuracy", "std"),
            macro_f1_mean=("macro_f1", "mean"),
            macro_f1_std=("macro_f1", "std"),
            feature_count_mean=("feature_count", "mean"),
        )
        .round(4)
        .sort_values("balanced_accuracy_mean", ascending=False)
    )

    print("\nRanked by Balanced Accuracy:")
    print(summary.to_string())

    # Best configuration: the single (ablation, symbol) row with the highest
    # balanced accuracy, not the best per-ablation mean.
    best = results_df.loc[results_df["balanced_accuracy"].idxmax()]
    print("\nBest Configuration:")
    print(f"  Ablation: {best['ablation']}")
    print(f"  Symbol: {best['symbol']}")
    print(f"  Balanced Accuracy: {best['balanced_accuracy']:.4f}")
    print(f"  Feature Count: {best['feature_count']}")

In [None]:
# Save results
# NOTE(review): RUN_ID already starts with "exp5_", so this directory is named
# "exp5_exp5_ablations_<timestamp>". Left unchanged in case other notebooks
# look for that name — confirm before simplifying.
run_dir = paths["runs"] / f"exp5_{RUN_ID}"
run_dir.mkdir(parents=True, exist_ok=True)

# Save results table
results_path = run_dir / "ablation_results.csv"
results_df.to_csv(results_path, index=False)
print(f"Results saved to: {results_path}")

# Save full results as JSON
full_results_path = run_dir / "ablation_full_results.json"


def _json_default(obj):
    """Fallback encoder for values the json module cannot serialize itself.

    NumPy scalars and arrays are converted to native numbers / nested lists so
    they stay numeric in the output; any other type is stringified.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return str(obj)


# Keep only the fields meant for persistence from each ablation result.
serializable_results = []
for result in all_ablation_results:
    r = {"name": result["name"]}
    if "config" in result:
        r["config"] = result["config"]
    if "error" in result:
        r["error"] = result["error"]
    if "symbols" in result:
        r["symbols"] = {}
        for sym, sym_result in result["symbols"].items():
            if "error" in sym_result:
                r["symbols"][sym] = {"error": sym_result["error"]}
            else:
                r["symbols"][sym] = {
                    key: sym_result[key]
                    for key in ("metrics", "feature_count", "train_size", "test_size")
                }
    serializable_results.append(r)

with open(full_results_path, "w", encoding="utf-8") as f:
    json.dump(serializable_results, f, indent=2, default=_json_default)
print(f"Full results saved to: {full_results_path}")

## 5. Feature Ablation Analysis

In [None]:
# Analyze impact of each feature group
print("\nFeature Group Impact Analysis:")
print("=" * 60)


def _mean_balanced_accuracy(ablation_name):
    """Mean balanced accuracy over symbols for one ablation, or None if it has no rows."""
    rows = results_df[results_df["ablation"] == ablation_name]
    if rows.empty:
        return None
    return rows["balanced_accuracy"].mean()


if not results_df.empty:
    # Baseline comparison: what each forecast-enabled variant gains over the
    # baseline-only model.
    baseline_ba = _mean_balanced_accuracy("baseline_only")
    if baseline_ba is not None:
        print(f"\nBaseline (no forecast features): {baseline_ba:.4f}")

        print("\nImpact of adding feature groups:")
        for ablation in ["small_features", "no_context", "no_error_features"]:
            abl_ba = _mean_balanced_accuracy(ablation)
            if abl_ba is not None:
                diff = abl_ba - baseline_ba
                print(f"  {ablation}: {abl_ba:.4f} (diff: {diff:+.4f})")

    # Leave-one-out comparison: "small_features" is the full configuration, and
    # each ablation below differs from it in exactly one setting, so
    # (full - ablated) is that setting's contribution.
    full_ba = _mean_balanced_accuracy("small_features")
    if full_ba is not None:
        print("\nContribution relative to full configuration (small_features):")
        leave_one_out = [
            ("context features", "no_context"),
            ("error features", "no_error_features"),
            ("best_quality preset", "medium_quality_preset"),
        ]
        for component, ablated_name in leave_one_out:
            ablated_ba = _mean_balanced_accuracy(ablated_name)
            if ablated_ba is not None:
                contribution = full_ba - ablated_ba
                print(
                    f"  {component}: {contribution:+.4f} "
                    f"(full={full_ba:.4f}, {ablated_name}={ablated_ba:.4f})"
                )

    # Feature set comparison
    print("\nFeature Set Comparison:")
    for fs in ["small_features", "medium_features"]:
        fs_results = results_df[results_df["ablation"] == fs]
        if not fs_results.empty:
            print(f"  {fs}: BA={fs_results['balanced_accuracy'].mean():.4f}, Features={fs_results['feature_count'].mean():.0f}")

---

**Experiment 5 Complete!**

Next: Run notebook 07 for a comprehensive summary across all experiments.