# Experiment 1: Tabular Baseline Model

This notebook trains and evaluates an AutoGluon TabularPredictor using baseline causal features
to predict triple-barrier labels {-1, 0, +1}.

**Key Features:**
- Time-series-correct evaluation with purging + embargo
- No future leakage from overlapping label horizons
- Comprehensive metrics and baselines

**Prerequisites:**
- Run Experiment 0 first to generate the labeled dataset; if the dataset is missing, this notebook regenerates it (which requires Alpaca API credentials stored as Colab secrets)
- Colab A100 recommended for faster training

## Configuration

All configurable parameters are defined here.

In [None]:
# ============================================================
# CONFIGURATION - every tunable knob for this notebook lives here
# ============================================================

CONFIG = dict(
    # Symbols to train on
    symbols_to_train=["SPY"],  # Can expand to ["SPY", "QQQ", "IWM", "AAPL", "MSFT"]

    # Data limits
    max_rows_per_symbol=6500,  # Max rows to use (set to None for all)

    # Label parameters (must match Experiment 0)
    label_col="label",
    vertical_barrier_bars=26,  # N - label horizon

    # Split parameters
    embargo_bars=26,  # Additional embargo after purge
    tune_window=260,  # ~1 month of 30-min bars
    test_window=520,  # ~2 months of 30-min bars
    min_train_size=2000,

    # AutoGluon parameters
    time_limit_sec=1200,  # 20 minutes (adjust based on compute)
    presets="best_quality",  # Options: "medium_quality", "high_quality", "best_quality"

    # Feature parameters
    include_volume_zscore=True,

    # Reproducibility
    random_seed=42,

    # Force regenerate data even if cached
    force_data_refresh=False,
)

# Echo the active settings, one "key: value" pair per line, so they are
# captured in the notebook output alongside the results.
print("Configuration:")
print("\n".join(f"  {key}: {value}" for key, value in CONFIG.items()))

## Setup

In [None]:
# Attach Google Drive to the Colab runtime at the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Install dependencies
print("Installing dependencies...")

# subprocess and sys are used by install_autogluon() below to run pip through
# the interpreter that backs this kernel (sys.executable -m pip).
import subprocess
import sys

def install_autogluon():
    """Install AutoGluon, falling back to progressively smaller extras.

    Tries ``autogluon.tabular[tabarena]``, then ``autogluon.tabular[all]``,
    then plain ``autogluon.tabular``, and stops at the first pip invocation
    that exits with status 0. pip is run as ``sys.executable -m pip`` so the
    package is installed for the interpreter running this code.

    Returns:
        bool: True if one of the candidates installed successfully, False if
        every attempt failed.
    """
    # (pip requirement, message printed before attempting it), in priority order.
    candidates = [
        ("autogluon.tabular[tabarena]", "Attempting to install autogluon.tabular[tabarena]..."),
        ("autogluon.tabular[all]", "tabarena install failed, falling back to standard install..."),
        ("autogluon.tabular", "Full install failed, trying minimal install..."),
    ]

    for requirement, message in candidates:
        print(message)
        result = subprocess.run(
            [sys.executable, "-m", "pip", "install", "-U", "-q", requirement],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            return True

        # pip's output is captured, so a failure would otherwise be invisible;
        # show the last few stderr lines to make the cause diagnosable.
        error_tail = (result.stderr or "").strip().splitlines()[-5:]
        if error_tail:
            print(f"  pip install {requirement} failed (exit code {result.returncode}):")
            for line in error_tail:
                print(f"    {line}")

    return False

# Install AutoGluon
autogluon_installed = install_autogluon()

# Install other dependencies
!pip install -q pandas numpy pyarrow scikit-learn pytz alpaca-py

print("\nInstallation complete!")

In [None]:
# Clone/update repository
# Makes sure a checkout of the project exists at REPO_DIR on BRANCH and leaves
# the notebook's working directory inside that checkout.
import os

REPO_URL = "https://github.com/mh122333/ETF-Dual-Foundation-Project-CC-Version.git"
REPO_DIR = "/content/ETF-Dual-Foundation-Project-CC-Version"
BRANCH = "claude/build-pipeline-sanity-exp-iVs65"  # Branch with Experiment 1 code

if os.path.exists(REPO_DIR):
    # Existing checkout: fetch, switch to BRANCH, then pull it.
    print("Repository exists, updating...")
    %cd {REPO_DIR}
    !git fetch origin
    !git checkout {BRANCH}
    !git pull origin {BRANCH}
else:
    # No checkout yet: clone first, then switch to BRANCH.
    print("Cloning repository...")
    !git clone {REPO_URL} {REPO_DIR}
    %cd {REPO_DIR}
    !git checkout {BRANCH}

# NOTE(review): this prints the configured BRANCH value; the exit status of the
# git commands above is not checked, so it does not prove the checkout succeeded.
print(f"\nOn branch: {BRANCH}")

In [None]:
# Add src to path and set random seeds
import os
import sys
import random
import numpy as np

# Derive the source directory from REPO_DIR (defined in the clone cell) so the
# import path cannot drift from the checkout location.
SRC_DIR = os.path.join(REPO_DIR, "src")
if SRC_DIR not in sys.path:  # re-running this cell must not stack duplicates
    sys.path.insert(0, SRC_DIR)

# Set random seeds for reproducibility (Python's random module and NumPy's
# legacy global generator).
random.seed(CONFIG["random_seed"])
np.random.seed(CONFIG["random_seed"])

print(f"Random seed set to: {CONFIG['random_seed']}")

In [None]:
# Imports
from datetime import datetime
from pathlib import Path
import json

import pandas as pd
import numpy as np

# Project imports
from etf_pipeline.utils.paths import get_drive_paths, ensure_dirs, get_labeled_dataset_path
from etf_pipeline.splits.purged_walkforward import (
    create_single_split,
    apply_split_to_dataframe,
    validate_split_no_leakage,
)
from etf_pipeline.models.tabular_baseline import (
    get_feature_columns_for_training,
    train_tabular_baseline,
    predict_tabular,
    run_leakage_smoke_test,
    LABEL_LEAK_COLUMNS,
)
from etf_pipeline.metrics.classification import (
    compute_all_metrics,
    save_metrics,
    print_metrics_summary,
    compute_label_distribution,
)

print("Imports successful!")

In [None]:
# Resolve the output directories. ensure_dirs() returns a name -> path mapping
# (presumably creating the directories - confirm in etf_pipeline.utils.paths).
paths = ensure_dirs()
print("Output directories:")
for dir_label, dir_path in paths.items():
    print(f"  {dir_label}: {dir_path}")

# Build the run identifier: exp1_<symbols joined by "_">_<YYYYmmdd_HHMMSS>
symbols_str = "_".join(CONFIG["symbols_to_train"])
run_timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
RUN_ID = "_".join(["exp1", symbols_str, run_timestamp])
print(f"\nRun ID: {RUN_ID}")

## 1. Load or Generate Data

Load the labeled dataset from Experiment 0, or regenerate it if it is missing or `force_data_refresh` is set in the configuration.

In [None]:
# Load the cached labeled dataset, or rebuild it (bars -> labels -> features)
# when it is missing or CONFIG["force_data_refresh"] is set.
labeled_dataset_path = get_labeled_dataset_path()
print(f"Looking for labeled dataset at: {labeled_dataset_path}")

dataset_cached = labeled_dataset_path.exists()

if dataset_cached and not CONFIG["force_data_refresh"]:
    print("\nLoading existing labeled dataset...")
    full_df = pd.read_parquet(labeled_dataset_path)
    print(f"Loaded {len(full_df)} rows")
else:
    # Report the real reason for the rebuild: a forced refresh must not be
    # described as a missing file.
    if dataset_cached:
        print("\nforce_data_refresh is set. Re-running Experiment 0 pipeline...")
    else:
        print("\nLabeled dataset not found. Running Experiment 0 pipeline...")

    # Import Experiment 0 components (only needed on this rebuild path)
    from google.colab import userdata
    from alpaca.data.historical import StockHistoricalDataClient
    import pytz

    from etf_pipeline.data.alpaca import load_all_symbols
    from etf_pipeline.labels.triple_barrier import compute_labels_multi
    from etf_pipeline.features.baseline import compute_baseline_features_multi

    # Initialize Alpaca client; credentials are read from Colab's secret store
    # rather than being hard-coded in the notebook.
    api_key = userdata.get("PAPER_KEY")
    api_secret = userdata.get("PAPER_SEC")
    client = StockHistoricalDataClient(api_key, api_secret)

    # Fetch data (fixed window: 2024-07-01 to 2025-12-31, US/Eastern)
    eastern = pytz.timezone("US/Eastern")
    start = eastern.localize(datetime(2024, 7, 1))
    end = eastern.localize(datetime(2025, 12, 31))

    # The dataset is built for this full symbol list, not only for
    # CONFIG["symbols_to_train"].
    all_symbols = ["SPY", "QQQ", "IWM", "AAPL", "MSFT"]

    print(f"Fetching bars for {all_symbols}...")
    bars_df = load_all_symbols(client, all_symbols, start, end, cache=True)

    if bars_df.empty:
        raise ValueError("No data fetched! Check Alpaca API credentials.")

    print(f"Fetched {len(bars_df)} bars")

    # Compute labels
    # NOTE(review): atr_window, k_up and k_dn are hard-coded here while CONFIG
    # says label parameters must match Experiment 0 - confirm they agree.
    print("\nComputing triple-barrier labels...")
    labeled_df = compute_labels_multi(
        bars_df,
        atr_window=14,
        k_up=2.0,
        k_dn=1.0,
        n_bars=CONFIG["vertical_barrier_bars"],
    )

    # Compute features
    print("Computing baseline features...")
    full_df = compute_baseline_features_multi(
        labeled_df,
        vol_window=20,
        vol_zscore_window=50,
        include_volume_zscore=CONFIG["include_volume_zscore"],
    )

    # Save; make sure the destination directory exists so the write cannot
    # fail on a missing parent.
    labeled_dataset_path.parent.mkdir(parents=True, exist_ok=True)
    full_df.to_parquet(labeled_dataset_path)
    print(f"Saved labeled dataset to: {labeled_dataset_path}")

print(f"\nDataset shape: {full_df.shape}")
print(f"Columns: {list(full_df.columns)}")

In [None]:
# Pick up the requested symbols and confirm each one is present in the dataset.
symbols_to_train = CONFIG["symbols_to_train"]

# The symbol identifier is either an index level or a regular column.
has_symbol_level = isinstance(full_df.index, pd.MultiIndex)
symbol_values = full_df.index.get_level_values("symbol") if has_symbol_level else full_df["symbol"]
available_symbols = symbol_values.unique().tolist()

print(f"Available symbols: {available_symbols}")
print(f"Symbols to train: {symbols_to_train}")

# Fail fast if any requested symbol is absent from the data.
missing = set(symbols_to_train).difference(available_symbols)
if missing:
    raise ValueError(f"Symbols not found in data: {missing}")

## 2. Train Model Per Symbol

For each symbol:
1. Create purged + embargoed split
2. Train AutoGluon TabularPredictor
3. Evaluate on test set
4. Save artifacts

In [None]:
# Train, evaluate and persist one model per symbol.
# all_results maps symbol -> metrics, predictor and artifact paths.
all_results = {}
# Symbols skipped because a valid split could not be built from their data.
skipped_symbols = []

for symbol in symbols_to_train:
    print("\n" + "=" * 60)
    print(f"TRAINING MODEL FOR: {symbol}")
    print("=" * 60)
    
    # --------------------------------------------------------
    # 2.1 Extract symbol data
    # --------------------------------------------------------
    if isinstance(full_df.index, pd.MultiIndex):
        # Select by level name (as the symbol-filter cell does) so the lookup
        # does not depend on "symbol" being the outermost index level.
        symbol_df = full_df.xs(symbol, level="symbol").copy()
    else:
        symbol_df = full_df[full_df["symbol"] == symbol].copy()
    
    # Sort by the index (assumed to be the bar timestamp - TODO confirm for
    # the non-MultiIndex layout).
    symbol_df = symbol_df.sort_index()
    
    # Apply row limit if specified
    max_rows = CONFIG["max_rows_per_symbol"]
    if max_rows and len(symbol_df) > max_rows:
        # Take most recent data
        symbol_df = symbol_df.iloc[-max_rows:]
        print(f"Limited to last {max_rows} rows")
    
    # Get feature columns (exclude label leak columns)
    feature_cols = get_feature_columns_for_training(
        symbol_df, 
        include_volume_zscore=CONFIG["include_volume_zscore"]
    )
    print(f"\nFeature columns: {feature_cols}")
    
    # Drop rows with NaN in features or labels
    # NOTE(review): rows are dropped before the split is built, so split
    # indices count cleaned rows rather than raw bars - confirm the purge and
    # embargo widths still cover the label horizon if NaNs occur mid-series.
    required_cols = feature_cols + [CONFIG["label_col"]]
    symbol_df_clean = symbol_df.dropna(subset=required_cols).copy()
    
    print(f"\nData after cleaning:")
    print(f"  Original rows: {len(symbol_df)}")
    print(f"  Clean rows: {len(symbol_df_clean)}")
    print(f"  Dropped: {len(symbol_df) - len(symbol_df_clean)}")
    
    # Reset index for proper slicing: the old index becomes a column, and an
    # unnamed index (which pandas calls "index") is renamed to "timestamp".
    symbol_df_clean = symbol_df_clean.reset_index(drop=False)
    if "timestamp" not in symbol_df_clean.columns and "index" in symbol_df_clean.columns:
        symbol_df_clean = symbol_df_clean.rename(columns={"index": "timestamp"})
    
    # --------------------------------------------------------
    # 2.2 Create purged + embargoed split
    # --------------------------------------------------------
    print("\nCreating time-series split with purging + embargo...")
    
    try:
        split = create_single_split(
            n_samples=len(symbol_df_clean),
            vertical_barrier_bars=CONFIG["vertical_barrier_bars"],
            embargo_bars=CONFIG["embargo_bars"],
            tune_window=CONFIG["tune_window"],
            test_window=CONFIG["test_window"],
            min_train_size=CONFIG["min_train_size"],
        )
    except ValueError as e:
        print(f"ERROR: {e}")
        print("Skipping this symbol due to insufficient data.")
        skipped_symbols.append(symbol)
        continue
    
    # Validate no leakage
    is_valid = validate_split_no_leakage(split, CONFIG["vertical_barrier_bars"])
    print(f"  Split valid (no leakage): {is_valid}")
    if not is_valid:
        raise ValueError("Split validation failed! Possible label leakage.")
    
    # Apply split
    train_df, tune_df, test_df = apply_split_to_dataframe(symbol_df_clean, split)
    
    print(f"\nSplit sizes:")
    print(f"  Train: {len(train_df)} rows (indices {split.train_start}-{split.train_end})")
    print(f"  Tune:  {len(tune_df)} rows (indices {split.tune_start}-{split.tune_end})")
    print(f"  Test:  {len(test_df)} rows (indices {split.test_start}-{split.test_end})")
    
    # Date ranges
    if "timestamp" in train_df.columns:
        print(f"\nDate ranges:")
        print(f"  Train: {train_df['timestamp'].min()} to {train_df['timestamp'].max()}")
        print(f"  Tune:  {tune_df['timestamp'].min()} to {tune_df['timestamp'].max()}")
        print(f"  Test:  {test_df['timestamp'].min()} to {test_df['timestamp'].max()}")
    
    # Label distributions (percentage of each class per partition)
    print("\nLabel distributions:")
    for name, df in [("Train", train_df), ("Tune", tune_df), ("Test", test_df)]:
        dist = df[CONFIG["label_col"]].value_counts(normalize=True).sort_index() * 100
        print(f"  {name}: " + ", ".join([f"{k}: {v:.1f}%" for k, v in dist.items()]))
    
    # --------------------------------------------------------
    # 2.3 Train AutoGluon model
    # --------------------------------------------------------
    print("\n" + "-" * 40)
    print("TRAINING AUTOGLUON MODEL")
    print("-" * 40)
    
    # Model save path
    model_path = paths["models"] / "exp1" / symbol / RUN_ID
    model_path.mkdir(parents=True, exist_ok=True)
    print(f"\nModel will be saved to: {model_path}")
    
    # Track the settings that actually produce the predictor so the saved
    # metadata stays truthful when the fallback below is used.
    training_presets = CONFIG["presets"]
    training_time_limit = CONFIG["time_limit_sec"]
    used_fallback = False
    
    # Train
    print(f"\nTraining with presets='{CONFIG['presets']}', time_limit={CONFIG['time_limit_sec']}s...")
    print("This may take a while...\n")
    
    try:
        predictor = train_tabular_baseline(
            train_df=train_df,
            tune_df=tune_df,
            feature_cols=feature_cols,
            label_col=CONFIG["label_col"],
            model_path=model_path,
            time_limit=training_time_limit,
            presets=training_presets,
            random_seed=CONFIG["random_seed"],
            verbosity=2,
        )
        print("\nTraining complete!")
    except Exception as e:
        # Deliberate best-effort: any failure with the configured preset
        # triggers one retry with a cheaper preset; a second failure propagates.
        print(f"\nERROR during training: {e}")
        print("Trying fallback with simpler presets...")
        
        used_fallback = True
        training_presets = "medium_quality"  # Simpler preset
        training_time_limit = 600  # Shorter time
        
        # Fallback to simpler preset
        predictor = train_tabular_baseline(
            train_df=train_df,
            tune_df=tune_df,
            feature_cols=feature_cols,
            label_col=CONFIG["label_col"],
            model_path=model_path,
            time_limit=training_time_limit,
            presets=training_presets,
            random_seed=CONFIG["random_seed"],
            verbosity=2,
        )
        print("\nFallback training complete!")
    
    # --------------------------------------------------------
    # 2.4 Generate predictions on test set
    # --------------------------------------------------------
    print("\nGenerating predictions on test set...")
    predictions_df = predict_tabular(predictor, test_df, feature_cols)
    
    # Add actual labels (positional assignment via .values, ignoring index)
    predictions_df["actual_label"] = test_df[CONFIG["label_col"]].values
    
    # Add timestamp if available
    if "timestamp" in test_df.columns:
        predictions_df["timestamp"] = test_df["timestamp"].values
    
    # Save predictions
    # NOTE(review): RUN_ID already begins with "exp1_", so this directory name
    # carries a doubled prefix; kept as-is because the summary and artifact
    # cells rebuild the same path.
    run_dir = paths["runs"] / f"exp1_{RUN_ID}"
    run_dir.mkdir(parents=True, exist_ok=True)
    predictions_path = run_dir / f"predictions_{symbol}.parquet"
    predictions_df.to_parquet(predictions_path)
    print(f"Predictions saved to: {predictions_path}")
    
    # --------------------------------------------------------
    # 2.5 Compute metrics
    # --------------------------------------------------------
    print("\nComputing metrics...")
    
    y_true = test_df[CONFIG["label_col"]]
    y_pred = predictions_df["predicted_label"]
    y_train = train_df[CONFIG["label_col"]]
    y_tune = tune_df[CONFIG["label_col"]]
    
    metrics = compute_all_metrics(
        y_true=y_true,
        y_pred=y_pred,
        y_train=y_train,
        y_tune=y_tune,
    )
    
    # Add run metadata. CONFIG is snapshotted so later edits to the global
    # dict cannot alter this record; "effective_training" holds the settings
    # that actually trained the predictor (they differ from CONFIG on fallback).
    metrics["run_info"] = {
        "run_id": RUN_ID,
        "symbol": symbol,
        "timestamp": datetime.now().isoformat(),
        "config": dict(CONFIG),
        "effective_training": {
            "presets": training_presets,
            "time_limit_sec": training_time_limit,
            "used_fallback": used_fallback,
        },
        "split_info": split.to_dict(),
    }
    
    # --------------------------------------------------------
    # 2.6 Leakage smoke test
    # --------------------------------------------------------
    print("\nRunning leakage smoke test...")
    leakage_results = run_leakage_smoke_test(
        train_df=train_df,
        test_df=test_df,
        feature_cols=feature_cols,
        label_col=CONFIG["label_col"],
        random_seed=CONFIG["random_seed"],
    )
    metrics["leakage_test"] = leakage_results
    print(f"  {leakage_results.get('interpretation', 'N/A')}")
    
    # Save metrics
    metrics_path = run_dir / f"metrics_{symbol}.json"
    save_metrics(metrics, metrics_path)
    print(f"\nMetrics saved to: {metrics_path}")
    
    # Print summary
    print_metrics_summary(metrics)
    
    # Store results
    all_results[symbol] = {
        "metrics": metrics,
        "predictor": predictor,
        "predictions_path": predictions_path,
        "metrics_path": metrics_path,
        "model_path": model_path,
    }

print("\n" + "=" * 60)
print("ALL SYMBOLS COMPLETE!")
print("=" * 60)
# Make skipped symbols visible so the banner is not read as full success.
if skipped_symbols:
    print(f"Skipped (insufficient data): {skipped_symbols}")

## 3. Summary

In [None]:
# Experiment-level recap: run id, trained symbols, headline metrics per
# symbol, and where the artifacts were written.
banner = "=" * 60
rule = "-" * 40

print("\n" + banner)
print("EXPERIMENT 1 SUMMARY")
print(banner)

print(f"\nRun ID: {RUN_ID}")
print(f"Symbols trained: {list(all_results.keys())}")

print("\nPerformance Summary:")
print(rule)
print(f"{'Symbol':<10} {'Accuracy':>10} {'Bal Acc':>10} {'Macro F1':>10}")
print(rule)

# One row per trained symbol; each score is right-aligned to 10 characters
# with 4 decimal places.
for trained_symbol, trained_result in all_results.items():
    scores = trained_result["metrics"]["classification"]
    score_cells = " ".join(
        f"{scores[metric_key]:>10.4f}"
        for metric_key in ("accuracy", "balanced_accuracy", "macro_f1")
    )
    print(f"{trained_symbol:<10} {score_cells}")

runs_location = paths["runs"] / f"exp1_{RUN_ID}"
models_location = paths["models"] / "exp1"
print("\nArtifacts saved to:")
print(f"  Runs: {runs_location}")
print(f"  Models: {models_location}")

In [None]:
# Inventory of what this run wrote: files in the run directory (with sizes in
# KB) and the per-symbol model directories.
print("\nSaved artifacts:")
run_dir = paths["runs"] / f"exp1_{RUN_ID}"

if run_dir.exists():
    for artifact in sorted(run_dir.glob("*")):
        print(f"  {artifact.name} ({artifact.stat().st_size / 1024:.1f} KB)")

print("\nModel directories:")
model_base = paths["models"] / "exp1"
if model_base.exists():
    # Only directories are listed; stray files under the model base are ignored.
    for model_dir in (entry for entry in sorted(model_base.glob("*")) if entry.is_dir()):
        print(f"  {model_dir}")

## 4. Optional: Inspect Model Details

In [None]:
# Display the AutoGluon leaderboard of the first trained symbol; nothing is
# shown when no symbol finished training.
if all_results:
    first_symbol = next(iter(all_results))
    predictor = all_results[first_symbol]["predictor"]
    
    print(f"\nAutoGluon Leaderboard for {first_symbol}:")
    try:
        leaderboard = predictor.leaderboard(silent=True)
        leaderboard_text = leaderboard.to_string()
    except Exception as e:
        # Optional inspection step: report the problem and carry on.
        print(f"Could not get leaderboard: {e}")
    else:
        print(leaderboard_text)

In [None]:
# Show feature importance (if available)
if all_results:
    # test_df and feature_cols are leftovers from the final iteration of the
    # training loop. They match a predictor only when that final symbol
    # finished training, so use that symbol instead of blindly pairing the
    # first symbol's predictor with another symbol's held-out data.
    importance_symbol = symbols_to_train[-1]
    
    if importance_symbol not in all_results:
        print(
            f"\nFeature importance skipped: {importance_symbol} did not finish training, "
            "so the held-out data in memory does not match any stored predictor."
        )
    else:
        predictor = all_results[importance_symbol]["predictor"]
        
        print(f"\nFeature Importance for {importance_symbol}:")
        try:
            importance = predictor.feature_importance(
                test_df[feature_cols + [CONFIG["label_col"]]],
                silent=True
            )
            print(importance.to_string())
        except Exception as e:
            # Optional inspection step: report the problem and carry on.
            print(f"Could not compute feature importance: {e}")

---

**Experiment 1 Complete!**

Next steps (Experiment 2+):
- Add time-series forecasting features (Chronos/TimeSeriesPredictor)
- Implement more sophisticated feature engineering
- Multi-fold cross-validation