# Experiment 2: Baseline + Time-Series Forecast Features

This notebook extends Experiment 1 by adding time-series forecast-derived features from AutoGluon TimeSeriesPredictor with Chronos models.

**New Features Added:**
- Mean forecast (mu) at multiple horizons
- Forecast uncertainty (q90-q10 spread)
- Forecast trend (direction indicator)
- Position in interval (where current price sits in forecast range)

**Key Design:**
- Rolling forecasts generated causally (no future data leakage)
- Cached to Drive for reproducibility
- Uses same purged + embargoed splits as Exp 1

**Prerequisites:**
- Run Experiment 0 or Experiment 1 first to generate the labeled dataset that this notebook loads

## Configuration

In [None]:
# ============================================================
# CONFIGURATION - Modify these parameters as needed
# ============================================================
# Single source of truth for the run: every later cell reads its settings from
# this dict, and the whole dict is stored alongside the metrics of each run.

CONFIG = {
    # Symbols to train on
    "symbols_to_train": ["SPY"],

    # Data limits (most recent N labeled rows kept per symbol)
    "max_rows_per_symbol": 6500,

    # Label parameters (must match Experiment 0)
    "label_col": "label",
    "vertical_barrier_bars": 26,

    # Split parameters
    "embargo_bars": 26,
    "tune_window": 260,
    "test_window": 520,
    "min_train_size": 2000,

    # AutoGluon Tabular parameters
    "time_limit_sec": 1200,
    "presets": "best_quality",

    # Time Series parameters
    "ts_prediction_length": 26,
    "ts_presets": "chronos_small",  # chronos_small, chronos_base, chronos_large
    "ts_train_lookback_years": 5.0,

    # Baseline feature parameters
    # Read by the feature-column cell via CONFIG.get("include_volume_zscore", True);
    # declared here explicitly so the value is visible, printed and saved with the run.
    "include_volume_zscore": True,

    # Forecast feature parameters
    "feature_set": "small",  # small, medium, large
    "forecast_prefix": "fc_",

    # Reproducibility
    "random_seed": 42,

    # Force options
    "force_data_refresh": False,
    "force_ts_retrain": False,
    "force_forecast_regenerate": False,
}

print("Configuration:")
for k, v in CONFIG.items():
    print(f"  {k}: {v}")

## Setup

In [None]:
# Mount Google Drive into the Colab filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Install dependencies
# Each `!` line is an IPython shell escape that runs pip in the notebook's shell;
# `-q` keeps pip's output quiet.
print("Installing dependencies...")

# AutoGluon tabular with tabarena for best quality
# The shell `||` runs the second install (the [all] extra) only if the
# [tabarena] install exits with an error.
!pip install -q autogluon.tabular[tabarena] || pip install -q autogluon.tabular[all]

# AutoGluon timeseries with Chronos
!pip install -q autogluon.timeseries[chronos-openvino]

# Other dependencies
!pip install -q pandas numpy pyarrow scikit-learn pytz alpaca-py

print("\nInstallation complete!")

In [None]:
# Clone/update repository
# Brings the project checkout at REPO_DIR onto BRANCH: updates an existing
# checkout in place, otherwise clones it fresh.
import os

REPO_URL = "https://github.com/mh122333/ETF-Dual-Foundation-Project-CC-Version.git"
REPO_DIR = "/content/ETF-Dual-Foundation-Project-CC-Version"
BRANCH = "claude/build-pipeline-sanity-exp-iVs65"

if os.path.exists(REPO_DIR):
    # Existing checkout: fetch, switch to the branch, then pull its latest commits.
    print("Repository exists, updating...")
    %cd {REPO_DIR}
    !git fetch origin
    !git checkout {BRANCH}
    !git pull origin {BRANCH}
else:
    # No checkout yet: clone, enter the directory, then switch to the branch.
    print("Cloning repository...")
    !git clone {REPO_URL} {REPO_DIR}
    %cd {REPO_DIR}
    !git checkout {BRANCH}

# NOTE(review): this prints the configured branch name unconditionally; the exit
# status of the git commands above is not checked.
print(f"\nOn branch: {BRANCH}")

In [None]:
# Make the repository's src/ directory importable and seed the global RNGs.
import random
import sys

import numpy as np

# Insert at position 0 so this checkout's src/ is searched before anything
# else already on the import path.
sys.path.insert(0, '/content/ETF-Dual-Foundation-Project-CC-Version/src')

# Seed Python's and NumPy's global random generators with the configured seed.
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(CONFIG["random_seed"])

print(f"Random seed set to: {CONFIG['random_seed']}")

In [None]:
# Imports
from datetime import datetime
from pathlib import Path
import json

import pandas as pd
import numpy as np

# Project imports
from etf_pipeline.utils.paths import get_drive_paths, ensure_dirs, get_labeled_dataset_path
from etf_pipeline.splits.purged_walkforward import (
    create_single_split,
    apply_split_to_dataframe,
    validate_split_no_leakage,
)
from etf_pipeline.models.tabular_baseline import (
    get_feature_columns_for_training,
    train_tabular_baseline,
    predict_tabular,
    LABEL_LEAK_COLUMNS,
)
from etf_pipeline.metrics.classification import (
    compute_all_metrics,
    save_metrics,
    print_metrics_summary,
)

# Time series imports
from etf_pipeline.timeseries.dataset import (
    build_returns_series,
    prepare_ts_training_data,
)
from etf_pipeline.timeseries.train import load_or_train_timeseries_predictor
from etf_pipeline.timeseries.rolling_predict import load_or_generate_forecasts

# Forecast feature imports
from etf_pipeline.features.forecast_features import (
    merge_forecast_features,
    get_forecast_feature_names,
    FEATURE_SET_CONFIGS,
)
from etf_pipeline.features.baseline import get_feature_columns

print("Imports successful!")

In [None]:
# Resolve the output directories; ensure_dirs() returns a name -> path mapping.
paths = ensure_dirs()
print("Output directories:")
for dir_key, dir_path in paths.items():
    print(f"  {dir_key}: {dir_path}")

# Build the run identifier: exp2_<SYMBOLS joined by "_">_<YYYYMMDD_HHMMSS>.
run_timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
symbols_str = "_".join(CONFIG["symbols_to_train"])
RUN_ID = "_".join(["exp2", symbols_str, run_timestamp])
print(f"\nRun ID: {RUN_ID}")

## 1. Load Data

In [None]:
# Read the labeled dataset (a parquet file) that Experiment 0/1 must have written.
labeled_dataset_path = get_labeled_dataset_path()
print(f"Loading labeled dataset from: {labeled_dataset_path}")

# Stop early with an actionable message when the prerequisite file is absent.
dataset_missing = not labeled_dataset_path.exists()
if dataset_missing:
    missing_msg = (
        f"Labeled dataset not found at {labeled_dataset_path}. "
        "Please run Experiment 0 or 1 first."
    )
    raise FileNotFoundError(missing_msg)

full_df = pd.read_parquet(labeled_dataset_path)
n_rows, column_names = len(full_df), list(full_df.columns)
print(f"Loaded {n_rows} rows")
print(f"Columns: {column_names}")

In [None]:
# Load raw bars for time series training.
# We need the full price history, not just the labeled rows.
bars_path = paths["raw"] / "bars_30min.parquet"

# Honour CONFIG["force_data_refresh"]: when it is set, skip the parquet cache
# and download again even if a cached file exists.
use_cached_bars = bars_path.exists() and not CONFIG["force_data_refresh"]

if use_cached_bars:
    print(f"Loading cached bars from: {bars_path}")
    bars_df = pd.read_parquet(bars_path)
else:
    print("Fetching bars from Alpaca...")
    from google.colab import userdata
    from alpaca.data.historical import StockHistoricalDataClient
    import pytz
    from etf_pipeline.data.alpaca import load_all_symbols

    # API credentials come from Colab's secret store.
    api_key = userdata.get("PAPER_KEY")
    api_secret = userdata.get("PAPER_SEC")
    client = StockHistoricalDataClient(api_key, api_secret)

    eastern = pytz.timezone("US/Eastern")
    start = eastern.localize(datetime(2019, 1, 1))  # 5+ years for TS training
    end = eastern.localize(datetime(2025, 12, 31))

    # Default universe plus every configured training symbol, de-duplicated while
    # keeping order, so a symbol added to CONFIG is never missing from the bars.
    default_symbols = ["SPY", "QQQ", "IWM", "AAPL", "MSFT"]
    all_symbols = list(dict.fromkeys(default_symbols + list(CONFIG["symbols_to_train"])))

    # NOTE(review): the meaning of `cache=True` is defined inside load_all_symbols;
    # confirm it does not serve stale data when force_data_refresh is set.
    bars_df = load_all_symbols(client, all_symbols, start, end, cache=True)

    # Save for future use (create the directory first in case it does not exist yet)
    bars_path.parent.mkdir(parents=True, exist_ok=True)
    bars_df.to_parquet(bars_path)
    print(f"Saved bars to: {bars_path}")

print(f"Bars shape: {bars_df.shape}")

## 2. Train Time Series Model and Generate Forecasts

In [None]:
# Look up the configured feature set and the forecast horizons it requires.
feature_set_name = CONFIG["feature_set"]
feature_set_config = FEATURE_SET_CONFIGS[feature_set_name]
horizons = feature_set_config["horizons"]

print(f"Feature set: {feature_set_name}")
print(f"Horizons: {horizons}")
print(f"Features per horizon: {feature_set_config['features']}")

In [None]:
# Train/load a TimeSeriesPredictor per symbol and generate (or load cached)
# rolling forecasts at every decision timestamp of the labeled data.
import hashlib

# Store forecasts for all symbols (symbol -> forecasts DataFrame)
all_forecasts = {}

for symbol in CONFIG["symbols_to_train"]:
    print(f"\n{'=' * 60}")
    print(f"GENERATING FORECASTS FOR: {symbol}")
    print(f"{'=' * 60}")

    # Get symbol bars (supports a symbol-level MultiIndex or a "symbol" column)
    if isinstance(bars_df.index, pd.MultiIndex):
        symbol_bars = bars_df.loc[symbol].copy()
    else:
        symbol_bars = bars_df[bars_df["symbol"] == symbol].copy()
    symbol_bars = symbol_bars.sort_index()

    # Get decision timestamps from labeled data
    if isinstance(full_df.index, pd.MultiIndex):
        symbol_labeled = full_df.loc[symbol].copy()
    else:
        symbol_labeled = full_df[full_df["symbol"] == symbol].copy()
    symbol_labeled = symbol_labeled.sort_index()

    # Apply row limit (keep the most recent rows)
    max_rows = CONFIG["max_rows_per_symbol"]
    if max_rows and len(symbol_labeled) > max_rows:
        symbol_labeled = symbol_labeled.iloc[-max_rows:]

    decision_timestamps = symbol_labeled.index.tolist()
    # Fail with a clear message instead of an IndexError on decision_timestamps[0].
    if not decision_timestamps:
        raise ValueError(
            f"No labeled rows found for symbol {symbol!r}; "
            "cannot generate forecasts. Check the labeled dataset from Experiment 0/1."
        )

    print(f"Decision timestamps: {len(decision_timestamps)}")
    print(f"  First: {decision_timestamps[0]}")
    print(f"  Last: {decision_timestamps[-1]}")

    # Model path for TS predictor
    # NOTE(review): this path is keyed by prediction length only; after changing
    # CONFIG["ts_presets"], set force_ts_retrain if a saved predictor would
    # otherwise be reused — confirm against load_or_train_timeseries_predictor.
    ts_model_path = paths["models"] / "ts" / symbol / f"pred_len_{CONFIG['ts_prediction_length']}"
    ts_model_path.mkdir(parents=True, exist_ok=True)

    # Prepare TS training data (up to first decision timestamp)
    first_decision = decision_timestamps[0]
    train_data = prepare_ts_training_data(
        bars_df=symbol_bars,
        symbols=[symbol],
        train_end_timestamp=first_decision,
        lookback_years=CONFIG["ts_train_lookback_years"],
    )
    print(f"\nTS training data shape: {train_data.shape}")

    # Train or load TS predictor
    print(f"\nTraining/loading TimeSeriesPredictor...")
    ts_predictor = load_or_train_timeseries_predictor(
        train_data=train_data,
        model_path=ts_model_path,
        prediction_length=CONFIG["ts_prediction_length"],
        presets=CONFIG["ts_presets"],
        force_retrain=CONFIG["force_ts_retrain"],
    )

    # Generate rolling forecasts
    print(f"\nGenerating rolling forecasts...")

    # The cache file name must be stable across runs, otherwise the cache is never
    # hit and force_forecast_regenerate has no effect. It is derived from the
    # inputs that determine the forecasts rather than from the timestamped RUN_ID,
    # so it is reused for identical inputs and changes when any of them changes.
    cache_key_parts = (
        CONFIG["ts_prediction_length"],
        CONFIG["ts_presets"],
        list(horizons),
        len(decision_timestamps),
        str(decision_timestamps[0]),
        str(decision_timestamps[-1]),
    )
    cache_key = hashlib.sha1(repr(cache_key_parts).encode("utf-8")).hexdigest()[:12]
    forecast_cache_path = (
        paths["processed"] / "forecasts" / symbol / f"fc_{CONFIG['feature_set']}_{cache_key}.parquet"
    )
    forecast_cache_path.parent.mkdir(parents=True, exist_ok=True)

    forecasts = load_or_generate_forecasts(
        predictor=ts_predictor,
        bars_df=symbol_bars,
        symbol=symbol,
        decision_timestamps=decision_timestamps,
        cache_path=forecast_cache_path,
        horizons=horizons,
        force_regenerate=CONFIG["force_forecast_regenerate"],
    )

    print(f"Forecasts shape: {forecasts.shape}")
    print(f"Forecast columns: {list(forecasts.columns)}")

    all_forecasts[symbol] = forecasts

print("\nForecast generation complete!")

## 3. Merge Forecast Features with Baseline Features

In [None]:
# Join each symbol's forecast features onto its labeled rows.
merged_dfs = {}
row_cap = CONFIG["max_rows_per_symbol"]
fc_prefix = CONFIG["forecast_prefix"]

for symbol in CONFIG["symbols_to_train"]:
    print(f"\nMerging features for {symbol}...")

    # Select this symbol's labeled rows from either index layout, oldest first.
    indexed_by_symbol = isinstance(full_df.index, pd.MultiIndex)
    labeled_rows = full_df.loc[symbol] if indexed_by_symbol else full_df[full_df["symbol"] == symbol]
    labeled_rows = labeled_rows.copy().sort_index()

    # Keep only the most recent rows when a row cap is configured.
    if row_cap and len(labeled_rows) > row_cap:
        labeled_rows = labeled_rows.iloc[-row_cap:]

    merged = merge_forecast_features(
        tabular_df=labeled_rows,
        forecasts_df=all_forecasts[symbol],
        feature_set=CONFIG["feature_set"],
        prefix=fc_prefix,
    )

    print(f"  Original columns: {len(labeled_rows.columns)}")
    print(f"  Merged columns: {len(merged.columns)}")

    # Columns carrying the forecast prefix are the newly added features.
    fc_cols = [col for col in merged.columns if col.startswith(fc_prefix)]
    print(f"  Forecast feature columns: {fc_cols}")

    merged_dfs[symbol] = merged

print("\nFeature merging complete!")

In [None]:
# Assemble the Exp 2 model inputs: baseline columns followed by forecast columns.
use_volume_zscore = CONFIG.get("include_volume_zscore", True)
baseline_features = get_feature_columns(use_volume_zscore)
forecast_features = get_forecast_feature_names(
    CONFIG["feature_set"],
    CONFIG["forecast_prefix"],
)

all_feature_cols = baseline_features + forecast_features

n_baseline, n_forecast = len(baseline_features), len(forecast_features)
print(f"Baseline features ({n_baseline}): {baseline_features}")
print(f"\nForecast features ({n_forecast}): {forecast_features}")
print(f"\nTotal features: {len(all_feature_cols)}")

## 4. Train Tabular Model with Combined Features

In [None]:
# For each symbol: clean the merged data, build a purged + embargoed split,
# train the tabular model on baseline + forecast features, predict on the test
# window, and save predictions and metrics. Results are collected per symbol.
all_results = {}

label_col = CONFIG["label_col"]
barrier_bars = CONFIG["vertical_barrier_bars"]
banner = "=" * 60
rule = "-" * 40

for symbol in CONFIG["symbols_to_train"]:
    print(f"\n{banner}")
    print(f"TRAINING MODEL FOR: {symbol}")
    print(f"{banner}")

    merged_rows = merged_dfs[symbol].copy()

    # Keep only rows where every model feature and the label are present.
    clean_rows = merged_rows.dropna(subset=all_feature_cols + [label_col]).copy()
    n_before, n_after = len(merged_rows), len(clean_rows)

    print(f"\nData after cleaning:")
    print(f"  Original rows: {n_before}")
    print(f"  Clean rows: {n_after}")
    print(f"  Dropped: {n_before - n_after}")

    # Move the index into a regular column; if that column is called "index"
    # and no "timestamp" column exists, rename it to "timestamp".
    clean_rows = clean_rows.reset_index(drop=False)
    if "index" in clean_rows.columns and "timestamp" not in clean_rows.columns:
        clean_rows = clean_rows.rename(columns={"index": "timestamp"})

    # Build the split; a ValueError here is treated as "not enough data" and
    # the symbol is skipped rather than aborting the whole run.
    print("\nCreating time-series split with purging + embargo...")
    split_kwargs = {
        "n_samples": len(clean_rows),
        "vertical_barrier_bars": barrier_bars,
        "embargo_bars": CONFIG["embargo_bars"],
        "tune_window": CONFIG["tune_window"],
        "test_window": CONFIG["test_window"],
        "min_train_size": CONFIG["min_train_size"],
    }
    try:
        split = create_single_split(**split_kwargs)
    except ValueError as split_error:
        print(f"ERROR: {split_error}")
        print("Skipping this symbol due to insufficient data.")
        continue

    # A split that fails validation is a hard error.
    is_valid = validate_split_no_leakage(split, barrier_bars)
    print(f"  Split valid (no leakage): {is_valid}")
    if not is_valid:
        raise ValueError("Split validation failed!")

    train_df, tune_df, test_df = apply_split_to_dataframe(clean_rows, split)

    print(f"\nSplit sizes:")
    for part_label, part_df in (("Train:", train_df), ("Tune: ", tune_df), ("Test: ", test_df)):
        print(f"  {part_label} {len(part_df)} rows")

    print(f"\n{rule}")
    print("TRAINING AUTOGLUON MODEL")
    print(f"{rule}")

    # Each run gets its own model directory under models/exp2/<symbol>/<RUN_ID>.
    model_path = paths["models"] / "exp2" / symbol / RUN_ID
    model_path.mkdir(parents=True, exist_ok=True)

    print(f"\nFeatures used: {len(all_feature_cols)}")
    print(f"Training with presets='{CONFIG['presets']}', time_limit={CONFIG['time_limit_sec']}s...")

    predictor = train_tabular_baseline(
        train_df=train_df,
        tune_df=tune_df,
        feature_cols=all_feature_cols,
        label_col=label_col,
        model_path=model_path,
        time_limit=CONFIG["time_limit_sec"],
        presets=CONFIG["presets"],
        random_seed=CONFIG["random_seed"],
        verbosity=2,
    )
    print("\nTraining complete!")

    # Predict on the test window and attach the true labels (and timestamps,
    # when available) positionally via .values.
    print("\nGenerating predictions...")
    predictions_df = predict_tabular(predictor, test_df, all_feature_cols)
    predictions_df["actual_label"] = test_df[label_col].values
    if "timestamp" in test_df.columns:
        predictions_df["timestamp"] = test_df["timestamp"].values

    # Persist predictions under the run directory.
    run_dir = paths["runs"] / f"exp2_{RUN_ID}"
    run_dir.mkdir(parents=True, exist_ok=True)
    predictions_path = run_dir / f"predictions_{symbol}.parquet"
    predictions_df.to_parquet(predictions_path)
    print(f"Predictions saved to: {predictions_path}")

    print("\nComputing metrics...")
    metrics = compute_all_metrics(
        y_true=test_df[label_col],
        y_pred=predictions_df["predicted_label"],
        y_train=train_df[label_col],
        y_tune=tune_df[label_col],
    )

    # Record what produced these metrics so the run can be traced later.
    metrics["run_info"] = {
        "run_id": RUN_ID,
        "experiment": "exp2",
        "symbol": symbol,
        "timestamp": datetime.now().isoformat(),
        "config": CONFIG,
        "feature_count": len(all_feature_cols),
        "baseline_feature_count": len(baseline_features),
        "forecast_feature_count": len(forecast_features),
    }

    metrics_path = run_dir / f"metrics_{symbol}.json"
    save_metrics(metrics, metrics_path)
    print(f"Metrics saved to: {metrics_path}")

    print_metrics_summary(metrics)

    all_results[symbol] = {
        "metrics": metrics,
        "predictor": predictor,
        "predictions_path": predictions_path,
        "metrics_path": metrics_path,
        "model_path": model_path,
    }

print(f"\n{banner}")
print("ALL SYMBOLS COMPLETE!")
print(f"{banner}")

## 5. Summary

In [None]:
# Report the run configuration, a per-symbol metrics table and artifact locations.
heavy_rule = "=" * 60
light_rule = "-" * 50

print(f"\n{heavy_rule}")
print("EXPERIMENT 2 SUMMARY")
print(f"{heavy_rule}")

print(f"\nRun ID: {RUN_ID}")
print(f"Symbols trained: {list(all_results.keys())}")
print(f"Feature set: {CONFIG['feature_set']}")
print(f"Total features: {len(all_feature_cols)}")
print(f"  - Baseline: {len(baseline_features)}")
print(f"  - Forecast: {len(forecast_features)}")

# Fixed-width table: symbol left-aligned, scores right-aligned to 4 decimals.
print(f"\nPerformance Summary:")
print(light_rule)
print(f"{'Symbol':<10} {'Accuracy':>10} {'Bal Acc':>10} {'Macro F1':>10}")
print(light_rule)

score_keys = ("accuracy", "balanced_accuracy", "macro_f1")
for symbol, result in all_results.items():
    scores = result["metrics"]["classification"]
    cells = [f"{symbol:<10}"]
    cells.extend(f"{scores[key]:>10.4f}" for key in score_keys)
    print(" ".join(cells))

runs_location = paths['runs'] / f'exp2_{RUN_ID}'
models_location = paths['models'] / 'exp2'
print(f"\nArtifacts saved to:")
print(f"  Runs: {runs_location}")
print(f"  Models: {models_location}")

In [None]:
# Compare balanced accuracy against Experiment 1, if any Exp 1 run is available.
print("\nComparison with Experiment 1 (if available):")
print("-" * 50)

# Only directories can hold per-symbol metrics files. Order them newest-first by
# modification time: sorting by name would order on the symbol part of the run
# name before the timestamp, so the last name is not necessarily the latest run.
exp1_runs = sorted(
    (p for p in paths["runs"].glob("exp1_*") if p.is_dir()),
    key=lambda p: p.stat().st_mtime,
    reverse=True,
)

if exp1_runs:
    print(f"Latest Exp 1 run: {exp1_runs[0].name}")

    for symbol, result in all_results.items():
        # Use the newest Exp 1 run that actually has metrics for this symbol;
        # the most recent run may have been trained on other symbols.
        metrics_name = f"metrics_{symbol}.json"
        exp1_metrics_path = next(
            (run / metrics_name for run in exp1_runs if (run / metrics_name).exists()),
            None,
        )
        if exp1_metrics_path is None:
            print(f"\n{symbol}: no Exp 1 metrics found; skipping comparison.")
            continue

        with open(exp1_metrics_path, encoding="utf-8") as f:
            exp1_metrics = json.load(f)

        exp1_ba = exp1_metrics["classification"]["balanced_accuracy"]
        exp2_ba = result["metrics"]["classification"]["balanced_accuracy"]
        diff = exp2_ba - exp1_ba
        # A relative change is undefined for a zero baseline; omit it instead of
        # raising ZeroDivisionError.
        relative = f" ({100*diff/exp1_ba:+.2f}%)" if exp1_ba else ""

        print(f"\n{symbol}:")
        print(f"  Exp 1 run: {exp1_metrics_path.parent.name}")
        print(f"  Exp 1 Balanced Accuracy: {exp1_ba:.4f}")
        print(f"  Exp 2 Balanced Accuracy: {exp2_ba:.4f}")
        print(f"  Improvement: {diff:+.4f}{relative}")
else:
    print("No Experiment 1 runs found for comparison.")

---

**Experiment 2 Complete!**

Next: Experiment 3 adds context asset forecast features (SPY, QQQ).