# Experiment 0: Pipeline Sanity Check

This notebook implements the full Experiment 0 pipeline:

1. Download and cache 30-minute bars from Alpaca
2. Compute ATR and triple-barrier labels
3. Generate baseline features
4. Drop rows with missing features or labels
5. Run sanity checks and leakage smoke test
6. Save artifacts to Google Drive

**Prerequisites:** Run `00_setup_colab.ipynb` first.

## Setup

In [None]:
# Mount Google Drive at /content/drive (this notebook saves its artifacts to Drive).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install dependencies
%%capture
!pip install alpaca-py pandas numpy pyarrow scikit-learn pytz

In [None]:
# Clone/update repository
# Brings the project checkout at REPO_DIR onto BRANCH and makes REPO_DIR the
# kernel's working directory (via the %cd magic).
import os

# Remote to clone from, local checkout location, and the branch to use.
REPO_URL = "https://github.com/mh122333/ETF-Dual-Foundation-Project-CC-Version.git"
REPO_DIR = "/content/ETF-Dual-Foundation-Project-CC-Version"
BRANCH = "claude/build-pipeline-sanity-exp-iVs65"  # Branch with latest code

# Existing checkout: fetch, switch to BRANCH, then pull it.
# No checkout yet: clone first, then switch to BRANCH.
if os.path.exists(REPO_DIR):
    print("Repository exists, updating...")
    %cd {REPO_DIR}
    !git fetch origin
    !git checkout {BRANCH}
    !git pull origin {BRANCH}
else:
    print("Cloning repository...")
    !git clone {REPO_URL} {REPO_DIR}
    %cd {REPO_DIR}
    !git checkout {BRANCH}

# NOTE: this echoes the configured BRANCH value; it does not query git, so it is
# printed even if one of the git commands above failed.
print(f"\nOn branch: {BRANCH}")

In [None]:
# Add the repository's src/ directory to the import path.
# The location is derived from REPO_DIR (defined in the clone/update cell)
# instead of repeating the literal path, so the two cannot drift apart.
import sys

SRC_DIR = f"{REPO_DIR}/src"

# Guard so that re-running this cell does not pile up duplicate sys.path entries.
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

In [None]:
# Imports
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
import numpy as np
import pytz

from google.colab import userdata
from alpaca.data.historical import StockHistoricalDataClient

# Project imports
from etf_pipeline.data.alpaca import load_all_symbols
from etf_pipeline.labels.triple_barrier import compute_labels_multi
from etf_pipeline.features.baseline import compute_baseline_features_multi, get_feature_columns
from etf_pipeline.sanity.checks import run_sanity_checks, save_summary
from etf_pipeline.utils.paths import ensure_dirs, get_labeled_dataset_path, get_summary_path

## Configuration

In [None]:
# Experiment configuration: every tunable knob of this notebook lives here.
CONFIG = dict(
    # Tickers to download
    symbols=["SPY", "QQQ", "IWM", "AAPL", "MSFT"],

    # Date window (roughly the last 18 months)
    start_date="2024-07-01",
    end_date="2025-12-31",

    # ATR look-back, in bars
    atr_window=14,

    # Triple-barrier settings
    k_up=2.0,   # take-profit multiplier
    k_dn=1.0,   # stop-loss multiplier
    n_bars=26,  # vertical barrier, in bars

    # Feature settings
    vol_window=20,
    vol_zscore_window=50,
    include_volume_zscore=True,

    # Cache behaviour
    force_refresh=False,  # flip to True to re-fetch data
)

# Echo the settings so they are recorded in the notebook output.
print("Configuration:")
for name in CONFIG:
    print(f"  {name}: {CONFIG[name]}")

In [None]:
# Obtain the output directory mapping from the project helper and list it.
# (ensure_dirs presumably creates the directories as well; see
# etf_pipeline.utils.paths to confirm.)
paths = ensure_dirs()
print("Output directories:")
for label in paths:
    print(f"  {label}: {paths[label]}")

## 1. Fetch Market Data

In [None]:
# Initialize Alpaca client
# Credentials are read from Colab's userdata store under the names
# PAPER_KEY and PAPER_SEC; they are never written into the notebook.
# NEVER print these keys!
api_key = userdata.get("PAPER_KEY")
api_secret = userdata.get("PAPER_SEC")

# Historical market-data client used by the fetch cell below.
client = StockHistoricalDataClient(api_key, api_secret)
print("Alpaca client initialized.")

In [None]:
# Turn the configured date strings into timezone-aware datetimes.
# Both bounds are parsed from a date-only format, so each one is 00:00
# US/Eastern on its calendar day; `end` therefore marks the START of end_date.
eastern = pytz.timezone("US/Eastern")
start, end = (
    eastern.localize(datetime.strptime(CONFIG[bound], "%Y-%m-%d"))
    for bound in ("start_date", "end_date")
)

print(f"Fetching data from {start} to {end}")
print(f"Symbols: {CONFIG['symbols']}")

In [None]:
# Load bars for every configured symbol through the project loader.
# Caching is switched on; CONFIG["force_refresh"] is forwarded as force_refresh.
print("\nFetching 30-minute bars...")
bars_df = load_all_symbols(
    client=client,
    symbols=CONFIG["symbols"],
    start=start,
    end=end,
    force_refresh=CONFIG["force_refresh"],
    cache=True,
)

print(f"\nFetched {bars_df.shape[0]} total bars")
print(f"Shape: {bars_df.shape}")

# Stop right here when nothing came back; every later cell needs data.
if bars_df.empty:
    raise ValueError(
        "No data fetched! Please check:\n"
        "  1. Your Alpaca API keys are valid (PAPER_KEY, PAPER_SEC)\n"
        "  2. The date range is valid and in the past\n"
        "  3. Your Alpaca account has market data access\n"
        "  4. Try setting force_refresh=True in CONFIG"
    )

# Row count per symbol (the frame is grouped on its "symbol" index level).
print(f"\nBars per symbol:")
print(bars_df.groupby(level="symbol").size())

In [None]:
# Quick look at the data
# The trailing bare expression is rendered as the cell's output (first 10 rows).
print("Sample data:")
bars_df.head(10)

## 2. Compute Triple-Barrier Labels

In [None]:
# Report the labelling parameters, then label every bar.
print("Computing triple-barrier labels...")
print(f"  ATR window: {CONFIG['atr_window']}")
print(f"  k_up (TP): {CONFIG['k_up']}")
print(f"  k_dn (SL): {CONFIG['k_dn']}")
print(f"  n_bars (vertical): {CONFIG['n_bars']}")

# The keyword names expected by compute_labels_multi match the CONFIG keys,
# so the arguments are forwarded straight from CONFIG.
labeled_df = compute_labels_multi(
    bars_df,
    **{key: CONFIG[key] for key in ("atr_window", "k_up", "k_dn", "n_bars")},
)

print(f"\nLabeled {len(labeled_df)} rows")

In [None]:
# Check label distribution
# Counts per label value and per first-hit value on the not-yet-cleaned frame.
# NOTE: pandas value_counts() omits NaN by default, so rows whose label is NaN
# do not appear in these counts.
print("Label distribution (before dropping NaN):")
print(labeled_df["label"].value_counts().sort_index())
print("\n")
print(labeled_df["first_hit"].value_counts())

## 3. Compute Baseline Features

In [None]:
# Add the baseline feature columns on top of the labelled bars.
print("Computing baseline features...")

# Keyword names of compute_baseline_features_multi match the CONFIG keys.
featured_df = compute_baseline_features_multi(
    labeled_df,
    **{
        key: CONFIG[key]
        for key in ("vol_window", "vol_zscore_window", "include_volume_zscore")
    },
)

print(f"\nFeatures computed. Shape: {featured_df.shape}")
print(f"\nColumns: {list(featured_df.columns)}")

In [None]:
# Look up the expected feature column names and report how many non-null
# values each one has. Names that are absent from featured_df are skipped.
feature_cols = get_feature_columns(CONFIG["include_volume_zscore"])
print("Feature columns:")
for col in feature_cols:
    if col not in featured_df.columns:
        continue
    print(f"  {col}: {featured_df[col].notna().sum()} non-null values")

## 4. Clean Dataset

Drop rows with NaN in features or labels (typically first few rows due to rolling windows).

In [None]:
# Columns needed for the clean dataset: every feature plus the label.
required_cols = feature_cols + ["label"]
# Only columns that actually exist can be NaN-checked / passed to dropna.
available_required = [c for c in required_cols if c in featured_df.columns]

# Surface expected-but-absent columns instead of dropping them silently:
# a missing feature (or a missing "label") means an upstream step did not
# produce what this notebook assumes, and the cleaning below would not cover it.
missing_required = [c for c in required_cols if c not in featured_df.columns]
if missing_required:
    print(f"WARNING: expected columns missing from featured_df (skipped): {missing_required}")

# Count NaN before cleaning
print("NaN counts before cleaning:")
for col in available_required:
    nan_count = featured_df[col].isna().sum()
    print(f"  {col}: {nan_count}")

In [None]:
# Keep only rows that have a value in every required column. The result is an
# independent copy, so featured_df itself is left untouched.
clean_df = featured_df.dropna(subset=available_required).copy()

rows_before = len(featured_df)
rows_after = len(clean_df)

print(f"\nRows before cleaning: {rows_before}")
print(f"Rows after cleaning: {rows_after}")
print(f"Rows dropped: {rows_before - rows_after}")

In [None]:
# Post-condition of the cleaning step: no required column may contain NaN.
# The per-column NaN totals are computed in one pass, then reported and
# asserted column by column in the same order as available_required.
print("\nNaN counts after cleaning:")
for col, nan_count in clean_df[available_required].isna().sum().items():
    print(f"  {col}: {nan_count}")
    assert nan_count == 0, f"NaN found in {col}!"

## 5. Run Sanity Checks

In [None]:
# Run the project's sanity checks on the cleaned dataset. The returned
# `results` mapping is read by the display cells below (keys
# "label_distribution" and "leakage_test") and passed to save_summary.
print("Running sanity checks...\n")

results = run_sanity_checks(
    clean_df,
    feature_cols=feature_cols,
    label_col="label",
    config=CONFIG,
)

In [None]:
# Pretty-print the label distribution, overall and per symbol.
print("\n" + "="*50)
print("LABEL DISTRIBUTION")
print("="*50)

label_dist = results["label_distribution"]

# Human-readable names; a label outside the table is shown via str(label).
overall_names = {-1: "SL (-1)", 0: "Timeout (0)", 1: "TP (+1)"}
symbol_names = {-1: "SL", 0: "TO", 1: "TP"}

print("\nOverall:")
overall_pcts = label_dist["overall"]["percentages"]
for label, count in label_dist["overall"]["counts"].items():
    pct = overall_pcts[label]
    # Keys are normalised with int() before the name lookup.
    label_name = overall_names.get(int(label), str(label))
    print(f"  {label_name}: {count:,} ({pct:.1f}%)")

print("\nPer Symbol:")
for symbol, data in label_dist["per_symbol"].items():
    print(f"\n  {symbol}:")
    symbol_pcts = data["percentages"]
    for label, count in data["counts"].items():
        pct = symbol_pcts[label]
        label_name = symbol_names.get(int(label), str(label))
        print(f"    {label_name}: {count:,} ({pct:.1f}%)")

In [None]:
# Report the outcome of the leakage smoke test.
print("\n" + "="*50)
print("LEAKAGE SMOKE TEST")
print("="*50)

lt = results["leakage_test"]
if "error" not in lt:
    # Normal case: accuracies, their difference, and the verdict text.
    print(f"\nOriginal features accuracy: {lt['original_accuracy']:.4f}")
    print(f"Shifted features accuracy:  {lt['shifted_accuracy']:.4f}")
    print(f"Degradation:                {lt['degradation']:.4f}")
    print(f"\nResult: {lt['interpretation']}")
    print(f"\nTrain samples: {lt['train_samples']:,}")
    print(f"Test samples:  {lt['test_samples']:,}")
else:
    # The test reported an error; show only that message.
    print(f"\nError: {lt['error']}")

## 6. Save Artifacts

In [None]:
# Save labeled dataset
# Writes the cleaned frame (clean_df) as Parquet to the location supplied by
# the project helper, then reports the on-disk size in MB (bytes / 1024 / 1024).
dataset_path = get_labeled_dataset_path()
clean_df.to_parquet(dataset_path)
print(f"Labeled dataset saved to: {dataset_path}")
print(f"Size: {dataset_path.stat().st_size / 1024 / 1024:.2f} MB")

In [None]:
# Save summary JSON
# Persists the sanity-check results via the project helper. The return value is
# kept in summary_path (presumably the written file's path; confirm in
# etf_pipeline.sanity.checks); it is not used elsewhere in this notebook.
summary_path = save_summary(results)

In [None]:
# List all saved artifacts as an indented tree, with file sizes in MB.
print("\n" + "="*50)
print("SAVED ARTIFACTS")
print("="*50)

base_path = paths["base"]
print(f"\nBase directory: {base_path}")

# List files
for root, dirs, files in os.walk(base_path):
    # os.walk yields entries in arbitrary order; sorting `dirs` in place fixes
    # the traversal order (top-down walk) so the listing is reproducible.
    dirs.sort()

    # Depth below base_path. relpath is used instead of str.replace so that a
    # trailing separator on base_path, or the base string recurring inside a
    # sub-path, cannot skew the indentation.
    rel_root = os.path.relpath(root, base_path)
    level = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1

    indent = ' ' * 2 * level
    # normpath strips any trailing separator so basename is never empty.
    print(f"{indent}{os.path.basename(os.path.normpath(root))}/")
    subindent = ' ' * 2 * (level + 1)
    for file in sorted(files):
        filepath = Path(root) / file
        size_mb = filepath.stat().st_size / 1024 / 1024
        print(f"{subindent}{file} ({size_mb:.2f} MB)")

## Summary

If every cell above ran without errors, the Experiment 0 pipeline completed successfully. The cell below prints a recap of the run.

In [None]:
# Final recap: dataset facts, label distribution, leakage verdict, output location.
print("\n" + "="*50)
print("EXPERIMENT 0 COMPLETE")
print("="*50)

print(f"\nDataset summary:")
print(f"  Symbols: {CONFIG['symbols']}")
print(f"  Date range: {CONFIG['start_date']} to {CONFIG['end_date']}")
print(f"  Total rows (clean): {len(clean_df):,}")
print(f"  Features: {feature_cols}")

print(f"\nLabel distribution:")
overall = results["label_distribution"]["overall"]
# The label-distribution display cell normalises label keys with int(label)
# before using them. Do the same here so that keys which are not plain ints
# (e.g. floats or numeric strings) are still matched instead of being skipped
# silently by the membership test.
counts_by_label = {int(key): value for key, value in overall["counts"].items()}
pcts_by_label = {int(key): value for key, value in overall["percentages"].items()}
for label in [-1, 0, 1]:
    if label in counts_by_label:
        print(f"  {label}: {counts_by_label[label]:,} ({pcts_by_label[label]:.1f}%)")

# Anything other than a truthy "passed" entry (including a missing one) is
# reported as CHECK RESULTS.
print(f"\nLeakage test: {'PASSED' if results['leakage_test'].get('passed', False) else 'CHECK RESULTS'}")

print(f"\nArtifacts saved to:")
print(f"  {paths['base']}")