# XAUUSD Condition-Based Signal Engine - Exploration Notebook

This notebook demonstrates the complete workflow of the Dr. Chen-style signal generation system:

1. **Data Loading**: Load minute OHLCV and quotes data
2. **Feature Engineering**: Build the feature matrix
3. **Labeling**: Generate labels for 5m, 15m, 30m horizons
4. **Model Training**: Train XGBoost classifiers
5. **Signal Generation**: Generate signals with stop-loss (SL) and take-profit (TP) levels
6. **Backtesting & Evaluation**: Run an out-of-sample backtest and analyze model and trade performance


In [None]:
# Standard imports
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Add the project root to sys.path so the `src` package can be imported.
# The root is taken as the parent of the current working directory, i.e. the
# notebook is assumed to be run from a direct subfolder of the project.
project_root = Path.cwd().parent
# Guard the insert so re-running this cell does not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Project imports
from src.data_loader import (
    load_minute_bars,
    load_quotes,
    get_combined_dataset,
    get_combined_dataset_multi_year
)
from src.feature_engineering import build_feature_matrix, get_feature_info
from src.labeling import generate_labels_for_all_horizons, get_label_statistics
from src.model_training import (
    train_all_horizon_models,
    get_feature_importance_ranking,
    time_series_train_test_split
)
from src.signal_generator import (
    generate_signals_for_latest_row,
    format_signals_summary
)
from src.backtest import run_backtest_all_horizons
from src.evaluation import evaluate_backtest_results, compare_horizons
from src.utils.plotting_utils import (
    set_plotting_style,
    plot_feature_distributions,
    plot_label_distribution,
    plot_confusion_matrix,
    plot_feature_importance,
    plot_equity_curve
)
from src.config import (
    HORIZONS,
    VOL_PARAMS,
    FEATURE_COLUMNS,
    MODEL_DIR,
    MINUTE_OHLCV_DIR,
    QUOTES_DIR
)

# Apply the project's plotting style before any figure is created
set_plotting_style()

# Confirm the setup finished and show which project root was resolved
print("Imports complete!", f"Project root: {project_root}", sep="\n")


## 1. Data Loading

Load minute OHLCV data and quotes, then align them to create a combined dataset.


In [None]:
# Data locations and the years to explore - edit these to match your setup.
# The "Data" folder is looked up next to the project directory.

# Years to load (start with 1-2 years for faster exploration)
YEARS_TO_LOAD = [2023, 2024]  # Adjust as needed

DATA_DIR = project_root.parent.joinpath("Data")
MINUTE_DIR = DATA_DIR.joinpath("ohlcv_minute")
QUOTES_DIR_PATH = DATA_DIR.joinpath("quotes")

print(f"Data directory: {DATA_DIR}")
print(f"Loading years: {YEARS_TO_LOAD}")


In [None]:
# Load the combined minute-bar + quotes dataset for the configured years.
# If the multi-year loader cannot find its files, fall back to a single
# yearly parquet pair so the rest of the notebook can still run.
print("Loading and combining data...")

try:
    df = get_combined_dataset_multi_year(
        minute_dir=str(MINUTE_DIR),
        quotes_dir=str(QUOTES_DIR_PATH),
        years=YEARS_TO_LOAD,
        symbol="XAUUSD"
    )
except FileNotFoundError as e:
    print(f"Error: {e}")
    print("\nTrying single year load...")
    # Use the most recent configured year instead of a hard-coded one, so the
    # fallback follows YEARS_TO_LOAD; 2024 remains the default if it is empty.
    fallback_year = max(YEARS_TO_LOAD, default=2024)
    df = get_combined_dataset(
        minute_path=str(MINUTE_DIR / f"XAUUSD_minute_{fallback_year}.parquet"),
        quotes_path=str(QUOTES_DIR_PATH / f"XAUUSD_quotes_{fallback_year}.parquet")
    )

# Report size and coverage whichever loader succeeded.
# NOTE(review): assumes the single-file loader returns a frame indexed the
# same way as the multi-year loader - confirm against src.data_loader.
print(f"\nLoaded {len(df):,} rows")
print(f"Date range: {df.index.min()} to {df.index.max()}")


In [None]:
# Quick look at the raw combined data: column names first, then a sample
raw_columns = df.columns.tolist()
print("Raw data columns:")
print(raw_columns)

print("\nFirst few rows:")
df.head()


## 2. Feature Engineering

Build the complete feature matrix including:
- Microstructure features (mid, spread, imbalance)
- Volatility features (log returns, sigma)
- VWAP features
- Time features
- News placeholders


In [None]:
# List every available feature with its description, grouped by category
feature_info = get_feature_info()
print("Available feature categories:")
for category in feature_info:
    print(f"\n{category.upper()}:")
    for name, desc in feature_info[category].items():
        print(f"  - {name}: {desc}")


In [None]:
# Build the feature matrix; the result replaces df for all later cells.
# Both the volatility and the VWAP lookback are set to 60.
print("Building feature matrix...")
lookbacks = {"vol_lookback": 60, "vwap_lookback": 60}
df = build_feature_matrix(df, **lookbacks)

print(f"\nFeature matrix shape: {df.shape}")
print(f"\nAll columns: {df.columns.tolist()}")


In [None]:
# Plot the distribution of a hand-picked subset of features on one figure
features_to_plot = [
    'mid',
    'spread_pct',
    'imbalance',
    'log_ret',
    'sigma',
    'vwap_deviation',
    'minute_of_day',
    'day_of_week',
]
fig = plot_feature_distributions(
    df,
    features_to_plot,
    ncols=4,
    figsize=(16, 8),
)
# y > 1 places the title just above the top edge of the figure
plt.suptitle("Feature Distributions", y=1.02)
plt.show()


## 3. Labeling

Generate environment labels for 5m, 15m, and 30m horizons using volatility-based SL/TP.


In [None]:
# Show the configured parameters (minutes, k1, k2) of every horizon
print("Horizon Parameters:")
for horizon in HORIZONS:
    params = HORIZONS[horizon]
    print(f"  {horizon}: minutes={params['minutes']}, k1={params['k1']}, k2={params['k2']}")


In [None]:
# Generate labels
# The frame returned by the labeling helper replaces df, so the cells below
# operate on its output. verbose=True is passed through to the helper
# (presumably enabling progress output - see src.labeling).
print("Generating labels for all horizons...")
df = generate_labels_for_all_horizons(df, verbose=True)


In [None]:
# Summarise each label column: labeled count, NaN count and class balance
class_names = {-1: 'Short', 0: 'Flat', 1: 'Long'}  # unknown classes print as-is

label_stats = get_label_statistics(df)
print("\nLabel Statistics:")
for label_col, stats in label_stats.items():
    print(f"\n{label_col}:")
    print(f"  Total labeled: {stats['count']:,}")
    print(f"  NaN count: {stats['nan_count']:,}")
    print("  Class distribution:")
    for cls, pct in stats['class_pcts'].items():
        cls_name = class_names.get(cls, str(cls))
        print(f"    {cls_name}: {pct:.1f}%")


In [None]:
# Plot label distributions
# The helper receives the labeled frame and returns the figure, which is
# then rendered with plt.show().
fig = plot_label_distribution(df)
plt.show()


## 4. Model Training

Train XGBoost classifiers for each horizon with a time-based train/test split.


In [None]:
# Make sure a "models" folder exists under the project root
models_dir = project_root.joinpath("models")
models_dir.mkdir(exist_ok=True)  # no error if the folder is already there
print(f"Models will be saved to: {models_dir}")


In [None]:
# Train one model per horizon; 80% of the data is used for training
print("Training models for all horizons...\n")
training_options = {
    "model_dir": str(models_dir),
    "train_ratio": 0.8,
    "verbose": True,
}
training_results = train_all_horizon_models(df=df, **training_options)


In [None]:
# Rank features by importance for each horizon and print the top entries.
# One constant drives the requested ranking size, the header text and the
# printed slice, so the list always matches what the header promises
# (previously the header said 15 while only 10 rows were printed).
TOP_N_FEATURES = 15
importance_rankings = get_feature_importance_ranking(training_results, top_n=TOP_N_FEATURES)

print(f"Top {TOP_N_FEATURES} Most Important Features by Horizon:\n")
for horizon, ranking in importance_rankings.items():
    print(f"\n{horizon}:")
    # Slice defensively in case a ranking holds more than TOP_N_FEATURES entries
    for i, (feat, imp) in enumerate(ranking[:TOP_N_FEATURES], 1):
        print(f"  {i}. {feat}: {imp:.4f}")


In [None]:
# One feature-importance chart per horizon that has training results
for horizon in ("5m", "15m", "30m"):
    if horizon not in training_results:
        continue  # no model was trained for this horizon

    importances = training_results[horizon]["feature_importance"]
    fig = plot_feature_importance(
        importances,
        top_n=15,
        title=f"Feature Importance - {horizon} Horizon"
    )
    plt.show()


In [None]:
# One confusion-matrix chart per horizon that has training results
for horizon in ("5m", "15m", "30m"):
    if horizon not in training_results:
        continue  # no model was trained for this horizon

    matrix = training_results[horizon]["confusion_matrix"]
    fig = plot_confusion_matrix(
        matrix,
        title=f"Confusion Matrix - {horizon} Horizon"
    )
    plt.show()


## 5. Signal Generation

Generate signals for the most recent row of the dataset using the trained models.


In [None]:
# Produce signals for the most recent row using the models saved in models_dir
print("Generating signals for the latest data point...\n")

signal_options = {
    "model_dir": str(models_dir),
    "vol_params": VOL_PARAMS,
    "threshold": 0.6,
}
signals = generate_signals_for_latest_row(df=df, **signal_options)

# Render the signals as a text summary and print it
summary_text = format_signals_summary(signals)
print(summary_text)


In [None]:
# Print the full per-horizon breakdown of the generated signals
print("Detailed Signal Information:\n")
for horizon, sig_data in signals["signals"].items():
    print(f"{horizon} Horizon:")
    print(f"  Signal: {sig_data['signal']}")
    print(f"  Confidence: {sig_data['confidence']:.2%}")

    # Probabilities are read positionally and labeled Short, Flat, Long
    probs = sig_data['probabilities']
    prob_parts = [
        f"Short={probs[0]:.2%}",
        f"Flat={probs[1]:.2%}",
        f"Long={probs[2]:.2%}",
    ]
    print("  Probabilities: " + ", ".join(prob_parts))

    # Price levels are shown only when sl_price is truthy (skipped for None
    # or any other falsy value)
    if sig_data['sl_price']:
        print(f"  SL Price: ${sig_data['sl_price']:.2f}")
        print(f"  TP Price: ${sig_data['tp_price']:.2f}")
    print()


## 6. Backtesting

Run a simple backtest on the out-of-sample portion of the data (the final 20%, using the same 0.8 train ratio as model training).


In [None]:
# Hold out the last 20% of the rows as the backtest sample.
# NOTE(review): assumes this reproduces the split boundary used inside
# train_all_horizon_models (same 0.8 ratio) - confirm, otherwise the backtest
# window could overlap the training data.
split_frames = time_series_train_test_split(df, train_ratio=0.8)
train_df, test_df = split_frames

print(f"Backtest period: {test_df.index.min()} to {test_df.index.max()}")
print(f"Backtest rows: {len(test_df):,}")


In [None]:
# Backtest every horizon on the held-out frame (may take a few minutes)
print("Running backtest...")

backtest_options = {
    "confidence_threshold": 0.6,
    "max_trades_per_horizon": 500,  # Limit for faster testing
    "verbose": True,
}
backtest_results = run_backtest_all_horizons(
    df=test_df,
    model_dir=str(models_dir),
    **backtest_options,
)


In [None]:
# Evaluate backtest results
# The per-horizon backtest output is passed to the evaluation helper; the
# returned summaries are kept for the horizon comparison in the next cell.
summaries = evaluate_backtest_results(backtest_results, verbose=True)


In [None]:
# Compare horizons
# Builds a comparison object from the evaluation summaries. Leaving the bare
# name as the last expression makes the notebook render it as the cell output.
comparison_df = compare_horizons(summaries)
print("\nHorizon Comparison:")
comparison_df


In [None]:
# One equity-curve chart per horizon that produced at least one trade
for horizon in ("5m", "15m", "30m"):
    if horizon not in backtest_results:
        continue  # horizon was not backtested
    horizon_trades = backtest_results[horizon]
    if len(horizon_trades) == 0:
        continue  # nothing to plot

    fig = plot_equity_curve(
        horizon_trades,
        title=f"Equity Curve - {horizon} Horizon"
    )
    plt.show()


## 7. Analysis & Insights

Additional analysis of the results.


In [None]:
# Per horizon, report trade count, win rate and average return for long and
# short trades separately
direction_names = {1: "Long", -1: "Short"}  # iterated in this order

for horizon in ("5m", "15m", "30m"):
    has_trades = horizon in backtest_results and len(backtest_results[horizon]) > 0
    if not has_trades:
        continue

    trades = backtest_results[horizon]
    print(f"\n{horizon} Trades by Direction:")

    for direction, name in direction_names.items():
        subset = trades[trades["direction"] == direction]
        if len(subset) == 0:
            continue  # no trades in this direction

        pnl = subset["pnl_ret"]
        win_rate = (pnl > 0).mean()  # share of trades with a positive return
        avg_ret = pnl.mean()
        print(f"  {name}: {len(subset)} trades, Win Rate: {win_rate:.1%}, Avg Return: {avg_ret:.4%}")


In [None]:
# Per horizon, report trade count, win rate and average return for each
# distinct exit reason found in the trades
for horizon in ("5m", "15m", "30m"):
    has_trades = horizon in backtest_results and len(backtest_results[horizon]) > 0
    if not has_trades:
        continue

    trades = backtest_results[horizon]
    print(f"\n{horizon} Trades by Exit Reason:")

    exit_reasons = trades["exit_reason"].unique()
    for reason in exit_reasons:
        subset = trades[trades["exit_reason"] == reason]
        if len(subset) == 0:
            continue  # the equality filter matched no rows for this value

        pnl = subset["pnl_ret"]
        win_rate = (pnl > 0).mean()  # share of trades with a positive return
        avg_ret = pnl.mean()
        print(f"  {reason}: {len(subset)} trades, Win Rate: {win_rate:.1%}, Avg Return: {avg_ret:.4%}")


## Summary

This notebook demonstrated the complete workflow of the XAUUSD Condition-Based Signal Engine:

1. **Data Loading**: Successfully loaded and aligned minute OHLCV with quotes data
2. **Feature Engineering**: Built comprehensive features including microstructure, volatility, VWAP, and time
3. **Labeling**: Generated environment labels using volatility-based SL/TP thresholds
4. **Model Training**: Trained XGBoost classifiers for each horizon and inspected their feature importances and confusion matrices
5. **Signal Generation**: Generated signals with SL/TP levels for the latest data point
6. **Backtesting**: Evaluated out-of-sample performance

### Next Steps

- Tune confidence thresholds for better precision vs. recall trade-off
- Experiment with different k1/k2 multipliers
- Add actual news calendar integration
- Implement more sophisticated position sizing
- Consider walk-forward validation for more robust estimates
