# Experiment 4: Full Feature Set with Forecast Error Features

This notebook extends Experiment 3 by adding forecast error/coverage features that measure model health over time.

**New Features Added:**
- Rolling MAE: How accurate have recent forecasts been?
- Rolling Bias: Is the model systematically over/under-predicting?
- Rolling Coverage: Are realized returns falling within prediction intervals?

**Hypothesis:**
When the forecast model is performing poorly (high error, low coverage), we should be less confident in predictions. These features let the tabular model learn when to trust the forecasts.

**Prerequisites:**
- Run Experiment 0/1 first to generate the labeled dataset that this notebook loads

## Configuration

In [None]:
# ============================================================
# EXPERIMENT SETTINGS
# ============================================================
# Every tunable value for this notebook lives in CONFIG; later cells
# look settings up by key (e.g. CONFIG["feature_set"]).

CONFIG = dict(
    # --- Universe: symbols that get a tabular model / symbols used as context ---
    symbols_to_train=["SPY"],
    context_symbols=["SPY"],

    # --- Cap on labeled rows kept per symbol (most recent rows are kept) ---
    max_rows_per_symbol=6500,

    # --- Labeling ---
    label_col="label",
    vertical_barrier_bars=26,

    # --- Train / tune / test split ---
    embargo_bars=26,
    tune_window=260,
    test_window=520,
    min_train_size=2000,

    # --- AutoGluon tabular training ---
    time_limit_sec=1200,
    presets="best_quality",

    # --- Time-series forecaster ---
    ts_prediction_length=26,
    ts_presets="chronos_small",
    ts_train_lookback_years=5.0,

    # --- Forecast-derived features ---
    feature_set="small",
    forecast_prefix="fc_",
    include_relative=True,

    # --- Rolling forecast-error features (window lengths in rows) ---
    error_prefix="err_",
    mae_window=20,
    bias_window=20,
    coverage_window=50,

    # --- Reproducibility ---
    random_seed=42,

    # --- Cache overrides ---
    force_data_refresh=False,
    force_ts_retrain=False,
    force_forecast_regenerate=False,
)

# Echo the settings so the run log records exactly what was used.
print("Configuration:")
for key in CONFIG:
    print(f"  {key}: {CONFIG[key]}")

## Setup

In [None]:
# Mount Google Drive
# Colab-only step: the `google.colab` package is not available elsewhere.
# The Drive is mounted at /content/drive.
# NOTE(review): presumably the pipeline's output directories live under this
# mount — confirm against etf_pipeline.utils.paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install dependencies
# Each `!` line runs in a shell; `-q` keeps pip's output short.
# NOTE(review): no versions are pinned, so two runs may resolve different
# package versions — consider pinning for reproducibility.
print("Installing dependencies...")
# Try the `tabarena` extra first; the shell `||` falls back to the `all`
# extra only if the first install command exits with an error.
!pip install -q autogluon.tabular[tabarena] || pip install -q autogluon.tabular[all]
# Time-series package with the chronos-openvino extra.
!pip install -q autogluon.timeseries[chronos-openvino]
# Remaining libraries imported by this notebook and the alpaca data client.
!pip install -q pandas numpy pyarrow scikit-learn pytz alpaca-py
print("\nInstallation complete!")

In [None]:
# Clone/update repository
# Puts the project checkout at REPO_DIR on branch BRANCH and makes it the
# working directory, so the `src` package can be imported by later cells.
import os

REPO_URL = "https://github.com/mh122333/ETF-Dual-Foundation-Project-CC-Version.git"
REPO_DIR = "/content/ETF-Dual-Foundation-Project-CC-Version"
BRANCH = "claude/build-pipeline-sanity-exp-iVs65"

if os.path.exists(REPO_DIR):
    # Existing checkout: switch into it, then fetch, check out and pull BRANCH.
    %cd {REPO_DIR}
    !git fetch origin && git checkout {BRANCH} && git pull origin {BRANCH}
else:
    # No checkout yet: clone, switch into it, then check out BRANCH.
    !git clone {REPO_URL} {REPO_DIR}
    %cd {REPO_DIR}
    !git checkout {BRANCH}

# NOTE(review): this message is printed unconditionally; the `!git` lines do
# not raise on failure, so it does not prove the checkout succeeded.
print(f"\nOn branch: {BRANCH}")

In [None]:
# Add src to path and set random seeds
import sys
import random
import numpy as np

# Put the repository's `src` directory first on the import path so the
# `etf_pipeline` imports in the next cell resolve. The membership check keeps
# the cell idempotent: re-running it no longer stacks duplicate entries onto
# sys.path.
SRC_DIR = '/content/ETF-Dual-Foundation-Project-CC-Version/src'
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

# Seed both the stdlib and the NumPy global generators from the shared config.
random.seed(CONFIG["random_seed"])
np.random.seed(CONFIG["random_seed"])

print(f"Random seed set to: {CONFIG['random_seed']}")

In [None]:
# Imports
from datetime import datetime
from pathlib import Path
import json

import pandas as pd
import numpy as np

from etf_pipeline.utils.paths import ensure_dirs, get_labeled_dataset_path
from etf_pipeline.splits.purged_walkforward import (
    create_single_split,
    apply_split_to_dataframe,
    validate_split_no_leakage,
)
from etf_pipeline.models.tabular_baseline import (
    train_tabular_baseline,
    predict_tabular,
)
from etf_pipeline.metrics.classification import (
    compute_all_metrics,
    save_metrics,
    print_metrics_summary,
)

from etf_pipeline.timeseries.dataset import prepare_ts_training_data
from etf_pipeline.timeseries.train import load_or_train_timeseries_predictor
from etf_pipeline.timeseries.rolling_predict import load_or_generate_forecasts

from etf_pipeline.features.forecast_features import (
    merge_forecast_features,
    get_forecast_feature_names,
    FEATURE_SET_CONFIGS,
)
from etf_pipeline.features.context_features import add_context_features
from etf_pipeline.features.forecast_error_features import (
    compute_forecast_errors,
    compute_rolling_error_features,
    get_error_feature_names,
)
from etf_pipeline.features.baseline import get_feature_columns

print("Imports successful!")

In [None]:
# Resolve the pipeline's output locations. `paths` is indexed by name in later
# cells (e.g. paths["raw"], paths["models"], paths["runs"]).
# NOTE(review): presumably ensure_dirs() also creates the folders — confirm.
paths = ensure_dirs()

# Identify this run by experiment tag, trained symbols and start time.
symbols_str = "_".join(CONFIG["symbols_to_train"])
run_timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
RUN_ID = "_".join(["exp4", symbols_str, run_timestamp])
print(f"Run ID: {RUN_ID}")

## 1. Load Data

In [None]:
# Load labeled dataset
# The labeled parquet is produced by an earlier experiment; fail fast with the
# exact path that was checked so a missing file is easy to diagnose.
labeled_dataset_path = get_labeled_dataset_path()
if not labeled_dataset_path.exists():
    raise FileNotFoundError(
        f"Labeled dataset not found at {labeled_dataset_path}. Run Experiment 0/1 first."
    )

full_df = pd.read_parquet(labeled_dataset_path)
print(f"Loaded {len(full_df)} rows")

In [None]:
# Load raw bars
# Read the cached 30-minute bars when the parquet exists; otherwise download
# them through the Alpaca client and write the cache for next time.
bars_path = paths["raw"] / "bars_30min.parquet"

if not bars_path.exists():
    # Cache miss. These imports are only needed on this branch.
    from google.colab import userdata
    from alpaca.data.historical import StockHistoricalDataClient
    import pytz
    from etf_pipeline.data.alpaca import load_all_symbols

    # Credentials come from Colab's secret store, never from the notebook.
    client = StockHistoricalDataClient(
        userdata.get("PAPER_KEY"),
        userdata.get("PAPER_SEC"),
    )

    # Download window, expressed in US/Eastern time.
    eastern = pytz.timezone("US/Eastern")
    start = eastern.localize(datetime(2019, 1, 1))
    end = eastern.localize(datetime(2025, 12, 31))

    # Trained symbols, context symbols, and SPY/QQQ, de-duplicated.
    requested = CONFIG["symbols_to_train"] + CONFIG["context_symbols"] + ["SPY", "QQQ"]
    all_symbols = list(set(requested))

    bars_df = load_all_symbols(client, all_symbols, start, end, cache=True)
    bars_df.to_parquet(bars_path)
else:
    bars_df = pd.read_parquet(bars_path)

print(f"Bars shape: {bars_df.shape}")

## 2. Generate Forecasts

In [None]:
# Look up the configured feature set and the forecast horizons it uses.
feature_set_config = FEATURE_SET_CONFIGS[CONFIG["feature_set"]]
horizons = feature_set_config["horizons"]
print(f"Feature set: {CONFIG['feature_set']}, Horizons: {horizons}")

# Forecasts are needed for every trained symbol and every context symbol;
# the set removes symbols that appear in both lists.
all_forecast_symbols = list({*CONFIG["symbols_to_train"], *CONFIG["context_symbols"]})
print(f"Symbols needing forecasts: {all_forecast_symbols}")

In [None]:
# Get decision timestamps
# The index of the first trained symbol's labeled rows defines the timestamps
# at which forecasts are generated for every symbol.
first_target = CONFIG["symbols_to_train"][0]

# The labeled frame is either indexed by (symbol, ...) or has a `symbol` column.
target_labeled = (
    full_df.loc[first_target]
    if isinstance(full_df.index, pd.MultiIndex)
    else full_df[full_df["symbol"] == first_target]
).copy().sort_index()

# Keep only the most recent rows when a cap is configured and exceeded.
max_rows = CONFIG["max_rows_per_symbol"]
if max_rows:
    if len(target_labeled) > max_rows:
        target_labeled = target_labeled.iloc[-max_rows:]

decision_timestamps = target_labeled.index.tolist()
print(f"Decision timestamps: {len(decision_timestamps)}")

In [None]:
# Generate forecasts for all symbols
# For each symbol: load or train a time-series predictor on data up to the
# first decision timestamp, then load or generate forecasts at every decision
# timestamp. Results are collected in `all_forecasts`, keyed by symbol.
all_forecasts = {}

if not decision_timestamps:
    raise ValueError("No decision timestamps available; cannot generate forecasts.")

# Same for every symbol, so computed once outside the loop.
first_decision = decision_timestamps[0]
last_decision = decision_timestamps[-1]

# Cache key built from the inputs that determine the forecasts (feature set,
# forecaster preset, prediction length, and the span/count of decision
# timestamps). The previous name embedded the time-stamped RUN_ID, so a later
# run could never find the file and the cache was regenerated every time.
# Set CONFIG["force_forecast_regenerate"] to bypass a cached file.
forecast_cache_name = (
    f"fc_{CONFIG['feature_set']}"
    f"_{CONFIG['ts_presets']}"
    f"_pl{CONFIG['ts_prediction_length']}"
    f"_n{len(decision_timestamps)}"
    f"_{pd.Timestamp(first_decision):%Y%m%dT%H%M%S}"
    f"_{pd.Timestamp(last_decision):%Y%m%dT%H%M%S}"
    ".parquet"
)

for symbol in all_forecast_symbols:
    print(f"\n{'=' * 40}")
    print(f"FORECASTS FOR: {symbol}")
    print(f"{'=' * 40}")

    # The bars frame is either indexed by (symbol, ...) or has a `symbol` column.
    if isinstance(bars_df.index, pd.MultiIndex):
        symbol_bars = bars_df.loc[symbol].copy()
    else:
        symbol_bars = bars_df[bars_df["symbol"] == symbol].copy()
    symbol_bars = symbol_bars.sort_index()

    # One model directory per symbol and prediction length.
    ts_model_path = paths["models"] / "ts" / symbol / f"pred_len_{CONFIG['ts_prediction_length']}"
    ts_model_path.mkdir(parents=True, exist_ok=True)

    # Training data ends at the first decision timestamp.
    train_data = prepare_ts_training_data(
        bars_df=symbol_bars,
        symbols=[symbol],
        train_end_timestamp=first_decision,
        lookback_years=CONFIG["ts_train_lookback_years"],
    )

    ts_predictor = load_or_train_timeseries_predictor(
        train_data=train_data,
        model_path=ts_model_path,
        prediction_length=CONFIG["ts_prediction_length"],
        presets=CONFIG["ts_presets"],
        force_retrain=CONFIG["force_ts_retrain"],
    )

    forecast_cache_path = paths["processed"] / "forecasts" / symbol / forecast_cache_name
    forecast_cache_path.parent.mkdir(parents=True, exist_ok=True)

    forecasts = load_or_generate_forecasts(
        predictor=ts_predictor,
        bars_df=symbol_bars,
        symbol=symbol,
        decision_timestamps=decision_timestamps,
        cache_path=forecast_cache_path,
        horizons=horizons,
        force_regenerate=CONFIG["force_forecast_regenerate"],
    )

    print(f"Forecasts shape: {forecasts.shape}")
    all_forecasts[symbol] = forecasts

print("\nForecast generation complete!")

## 3. Compute Forecast Error Features

In [None]:
# Compute error features for each target symbol
# For every trained symbol: compare the forecast at `error_horizon` with the
# realized return, then turn the per-timestamp errors into rolling features.
# Results are collected in `error_features_dfs`, keyed by symbol.
error_features_dfs = {}

# Use horizon 1 for error computation (1-step ahead accuracy)
error_horizon = 1

for symbol in CONFIG["symbols_to_train"]:
    print(f"\nComputing error features for {symbol}...")

    # Get bars for this symbol
    if isinstance(bars_df.index, pd.MultiIndex):
        symbol_bars = bars_df.loc[symbol].copy()
    else:
        symbol_bars = bars_df[bars_df["symbol"] == symbol].copy()
    symbol_bars = symbol_bars.sort_index()

    # Get forecasts
    forecasts = all_forecasts[symbol]

    # Realized LOG return at each forecast timestamp:
    #   r[t] = log(close[t + error_horizon] / close[t])
    # where "t + error_horizon" counts bars (rows), not clock time.
    # Computed in one vectorized pass over the bars, then aligned to the
    # forecast index. Timestamps missing from the bars, and timestamps whose
    # future bar lies past the end of the data, come out as NaN.
    close = symbol_bars["close"]
    forward_log_return = np.log(close.shift(-error_horizon) / close)
    realized_returns = forward_log_return.reindex(forecasts.index).rename("realized_return")

    # NOTE(review): the value at t uses close[t + error_horizon], which is not
    # yet known at decision time t. Confirm that compute_rolling_error_features
    # lags its output by at least `error_horizon` rows; otherwise the rolling
    # features leak future information into the tabular model.

    # Get forecast values at this horizon
    mu_col = f"mu_{error_horizon}"
    q10_col = f"q10_{error_horizon}"
    q90_col = f"q90_{error_horizon}"

    if mu_col not in forecasts.columns:
        print(f"  Warning: {mu_col} not in forecasts, skipping error features")
        continue

    forecast_mu = forecasts[mu_col]
    # Missing quantile columns are replaced by all-NaN series on the same index.
    forecast_q10 = forecasts.get(q10_col, pd.Series(np.nan, index=forecasts.index))
    forecast_q90 = forecasts.get(q90_col, pd.Series(np.nan, index=forecasts.index))

    # Compute errors
    error_df = compute_forecast_errors(
        realized_returns=realized_returns,
        forecast_mu=forecast_mu,
        forecast_q10=forecast_q10,
        forecast_q90=forecast_q90,
    )

    # Compute rolling error features
    rolling_errors = compute_rolling_error_features(
        error_df=error_df,
        mae_window=CONFIG["mae_window"],
        bias_window=CONFIG["bias_window"],
        coverage_window=CONFIG["coverage_window"],
    )

    # Add prefix so the columns are recognizable as error features downstream
    rolling_errors.columns = [f"{CONFIG['error_prefix']}{c}" for c in rolling_errors.columns]

    print(f"  Error feature columns: {list(rolling_errors.columns)}")
    error_features_dfs[symbol] = rolling_errors

print("\nError feature computation complete!")

## 4. Merge All Features

In [None]:
# Merge all features
# Per trained symbol, start from its labeled rows and attach, in order:
# its own forecast features, the context-symbol features, and (when they were
# computed) the rolling forecast-error features. Output: `merged_dfs[symbol]`.
merged_dfs = {}

for symbol in CONFIG["symbols_to_train"]:
    print(f"\nMerging all features for {symbol}...")

    # Labeled rows for this symbol, oldest first.
    has_symbol_level = isinstance(full_df.index, pd.MultiIndex)
    symbol_rows = full_df.loc[symbol] if has_symbol_level else full_df[full_df["symbol"] == symbol]
    symbol_df = symbol_rows.copy().sort_index()

    # Same row cap that was applied when the decision timestamps were chosen.
    max_rows = CONFIG["max_rows_per_symbol"]
    if max_rows and len(symbol_df) > max_rows:
        symbol_df = symbol_df.iloc[-max_rows:]

    # Step 1: the symbol's own forecast features.
    merged = merge_forecast_features(
        tabular_df=symbol_df,
        forecasts_df=all_forecasts[symbol],
        feature_set=CONFIG["feature_set"],
        prefix=CONFIG["forecast_prefix"],
    )
    print(f"  After forecast features: {len(merged.columns)} cols")

    # Step 2: features derived from the context symbols' forecasts.
    context_forecasts = dict(
        (ctx, all_forecasts[ctx]) for ctx in CONFIG["context_symbols"]
    )
    merged = add_context_features(
        df=merged,
        context_forecasts=context_forecasts,
        target_symbol=symbol,
        context_symbols=CONFIG["context_symbols"],
        feature_set=CONFIG["feature_set"],
        include_relative=CONFIG["include_relative"],
    )
    print(f"  After context features: {len(merged.columns)} cols")

    # Step 3: rolling error features, left-joined on the index. Symbols that
    # were skipped in the error-feature cell simply have none to add.
    if symbol in error_features_dfs:
        error_features = error_features_dfs[symbol]
        merged = merged.join(error_features, how="left")
        print(f"  After error features: {len(merged.columns)} cols")

    merged_dfs[symbol] = merged

print("\nFeature merging complete!")

In [None]:
# Define all feature columns
# Build the expected column names group by group; the training cell later
# keeps only the names that actually exist in the merged frame.
baseline_features = get_feature_columns(True)
forecast_features = get_forecast_feature_names(CONFIG["feature_set"], CONFIG["forecast_prefix"])

# Context features: ctx_<symbol>_<feature>_<horizon>
context_features = [
    f"ctx_{ctx_sym.lower()}_{feat}_{h}"
    for ctx_sym in CONFIG["context_symbols"]
    for feat in feature_set_config["features"]
    for h in horizons
]

# Relative features: rel_<symbol>_mu_<horizon> followed by rel_<symbol>_unc_<horizon>
relative_features = (
    [
        f"rel_{ctx_sym.lower()}_{stat}_{h}"
        for ctx_sym in CONFIG["context_symbols"]
        for h in horizons
        for stat in ("mu", "unc")
    ]
    if CONFIG["include_relative"]
    else []
)

# Error features, named from the prefix and the three rolling windows
error_features = get_error_feature_names(
    CONFIG["error_prefix"],
    CONFIG["mae_window"],
    CONFIG["bias_window"],
    CONFIG["coverage_window"],
)

all_feature_cols = baseline_features + forecast_features + context_features + relative_features + error_features

# Report the size of each group and the overall total.
print("Feature breakdown:")
for group_label, group_cols in (
    ("Baseline", baseline_features),
    ("Forecast", forecast_features),
    ("Context", context_features),
    ("Relative", relative_features),
    ("Error", error_features),
):
    print(f"  {group_label}: {len(group_cols)}")
print(f"  TOTAL: {len(all_feature_cols)}")

## 5. Train Tabular Model

In [None]:
# Store results
# Per trained symbol: clean the merged frame, split it into train/tune/test,
# train the tabular model, predict on the test rows, and save predictions and
# metrics. `all_results[symbol]` holds the metrics, predictor and model path.
all_results = {}

for symbol in CONFIG["symbols_to_train"]:
    print(f"\n{'=' * 60}")
    print(f"TRAINING MODEL FOR: {symbol}")
    print(f"{'=' * 60}")

    symbol_df = merged_dfs[symbol].copy()

    # Filter to available features (expected names that exist as columns)
    available_features = [f for f in all_feature_cols if f in symbol_df.columns]
    print(f"Available features: {len(available_features)}/{len(all_feature_cols)}")

    # Clean data: drop rows missing any used feature or the label
    required_cols = available_features + [CONFIG["label_col"]]
    symbol_df_clean = symbol_df.dropna(subset=required_cols).copy()

    print(f"Data: {len(symbol_df_clean)} rows (dropped {len(symbol_df) - len(symbol_df_clean)})")

    # Move the index into a column, and call it `timestamp` when the index was
    # unnamed, so it can be carried along with the predictions.
    symbol_df_clean = symbol_df_clean.reset_index(drop=False)
    if "timestamp" not in symbol_df_clean.columns and "index" in symbol_df_clean.columns:
        symbol_df_clean = symbol_df_clean.rename(columns={"index": "timestamp"})

    # Create split. A ValueError skips this symbol rather than aborting the
    # whole run.
    try:
        split = create_single_split(
            n_samples=len(symbol_df_clean),
            vertical_barrier_bars=CONFIG["vertical_barrier_bars"],
            embargo_bars=CONFIG["embargo_bars"],
            tune_window=CONFIG["tune_window"],
            test_window=CONFIG["test_window"],
            min_train_size=CONFIG["min_train_size"],
        )
    except ValueError as e:
        print(f"ERROR: {e}")
        continue

    is_valid = validate_split_no_leakage(split, CONFIG["vertical_barrier_bars"])
    print(f"Split valid: {is_valid}")
    # Act on the check instead of only printing it: a split that fails the
    # leakage validation would produce misleading metrics. Only an explicit
    # falsy result skips the symbol; None is treated as "no verdict".
    if is_valid is not None and not is_valid:
        print(f"ERROR: split for {symbol} failed the leakage check; skipping training.")
        continue

    train_df, tune_df, test_df = apply_split_to_dataframe(symbol_df_clean, split)
    print(f"Split: Train={len(train_df)}, Tune={len(tune_df)}, Test={len(test_df)}")

    # Train
    model_path = paths["models"] / "exp4" / symbol / RUN_ID
    model_path.mkdir(parents=True, exist_ok=True)

    print(f"\nTraining with {len(available_features)} features...")

    predictor = train_tabular_baseline(
        train_df=train_df,
        tune_df=tune_df,
        feature_cols=available_features,
        label_col=CONFIG["label_col"],
        model_path=model_path,
        time_limit=CONFIG["time_limit_sec"],
        presets=CONFIG["presets"],
        random_seed=CONFIG["random_seed"],
        verbosity=2,
    )
    print("Training complete!")

    # Predictions on the test rows, with the true label (and timestamp when
    # present) attached positionally via `.values`.
    predictions_df = predict_tabular(predictor, test_df, available_features)
    predictions_df["actual_label"] = test_df[CONFIG["label_col"]].values
    if "timestamp" in test_df.columns:
        predictions_df["timestamp"] = test_df["timestamp"].values

    # The comparison cell locates runs by the "<experiment>_*" directory pattern.
    run_dir = paths["runs"] / f"exp4_{RUN_ID}"
    run_dir.mkdir(parents=True, exist_ok=True)
    predictions_path = run_dir / f"predictions_{symbol}.parquet"
    predictions_df.to_parquet(predictions_path)

    # Metrics
    metrics = compute_all_metrics(
        y_true=test_df[CONFIG["label_col"]],
        y_pred=predictions_df["predicted_label"],
        y_train=train_df[CONFIG["label_col"]],
        y_tune=tune_df[CONFIG["label_col"]],
    )

    # Record what produced these metrics alongside the numbers themselves.
    metrics["run_info"] = {
        "run_id": RUN_ID,
        "experiment": "exp4",
        "symbol": symbol,
        "timestamp": datetime.now().isoformat(),
        "config": CONFIG,
        "feature_count": len(available_features),
    }

    metrics_path = run_dir / f"metrics_{symbol}.json"
    save_metrics(metrics, metrics_path)

    print_metrics_summary(metrics)

    all_results[symbol] = {
        "metrics": metrics,
        "predictor": predictor,
        "model_path": model_path,
    }

print(f"\n{'=' * 60}")
print("ALL SYMBOLS COMPLETE!")
print(f"{'=' * 60}")

## 6. Summary

In [None]:
# Headline summary: run identity, then one row of test metrics per symbol
# that finished training.
banner = "=" * 60
print("\n" + banner)
print("EXPERIMENT 4 SUMMARY")
print(banner)

print("\nRun ID: " + str(RUN_ID))
print("Symbols: {}".format(list(all_results.keys())))
print("Total features: {}".format(len(all_feature_cols)))

rule = "-" * 50
print("\nPerformance Summary:")
print(rule)
print("{:<10} {:>10} {:>10} {:>10}".format("Symbol", "Accuracy", "Bal Acc", "Macro F1"))
print(rule)

for symbol in all_results:
    m = all_results[symbol]["metrics"]["classification"]
    print(
        "{:<10} {:>10.4f} {:>10.4f} {:>10.4f}".format(
            symbol, m["accuracy"], m["balanced_accuracy"], m["macro_f1"]
        )
    )

In [None]:
# Compare with all previous experiments
# For each symbol trained here, look up the balanced accuracy saved by
# Experiments 1-3 and print it next to the difference from the current run
# (Diff = current - previous).
print("\nComparison with Previous Experiments:")
print("-" * 70)
print(f"{'Experiment':<12} {'Symbol':<10} {'Bal Acc':>10} {'Diff':>10}")
print("-" * 70)

for symbol in all_results.keys():
    curr_ba = all_results[symbol]["metrics"]["classification"]["balanced_accuracy"]
    metrics_filename = f"metrics_{symbol}.json"

    for exp_name in ["exp1", "exp2", "exp3"]:
        # Consider only runs that actually saved metrics for this symbol, and
        # take the last of those by directory name. Picking the last directory
        # first and checking for the file afterwards would drop the row
        # whenever that directory belonged to a different symbol set.
        runs_with_metrics = sorted(
            candidate
            for candidate in paths["runs"].glob(f"{exp_name}_*")
            if (candidate / metrics_filename).exists()
        )
        if not runs_with_metrics:
            # Make the gap visible instead of silently omitting the row.
            print(f"{exp_name:<12} {symbol:<10} {'n/a':>10} {'':>10}")
            continue

        prev_metrics_path = runs_with_metrics[-1] / metrics_filename
        with open(prev_metrics_path) as f:
            prev_metrics = json.load(f)
        prev_ba = prev_metrics["classification"]["balanced_accuracy"]
        diff = curr_ba - prev_ba
        print(f"{exp_name:<12} {symbol:<10} {prev_ba:>10.4f} {diff:>+10.4f}")

    print(f"{'exp4':<12} {symbol:<10} {curr_ba:>10.4f} {'(current)':>10}")
    print("-" * 70)

---

**Experiment 4 Complete!**

Next: Experiment 5 runs ablation studies across feature sets and model configurations.