In [2]:
# ==================================================
# 02_Model_Training_Selection.ipynb
# Multi-target, Regime-Aware, Confidence-Based Trading Models
# ==================================================

import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import TimeSeriesSplit
import warnings
warnings.filterwarnings('ignore')

print("Advanced AI Trading - Multi-Strategy Modeling")
print("=" * 60)

# Verify data freshness
conn = sqlite3.connect("enhanced_trading_dataset_v2.db")
df_check = pd.read_sql("SELECT MIN(date) as start_date, MAX(date) as end_date, COUNT(*) as records FROM enhanced_trading_data", conn)
conn.close()

print(f"Data Range: {df_check['start_date'].iloc[0]} to {df_check['end_date'].iloc[0]}")
print(f"Total Records: {df_check['records'].iloc[0]}")

if df_check['records'].iloc[0] < 2500:
    print("Warning: Using old dataset! Run feature engineering first!")

# ==================================================
# Step 1: Load Enhanced Dataset V2
# ==================================================
print("Step 1: Loading Enhanced Dataset V2")
print("-" * 40)

# Load from enhanced dataset v2
conn = sqlite3.connect("enhanced_trading_dataset_v2.db")
df = pd.read_sql("SELECT * FROM enhanced_trading_data", conn)
feature_info = pd.read_sql("SELECT * FROM feature_metadata", conn)
conn.close()

df['date'] = pd.to_datetime(df['date'])

print(f"Enhanced dataset v2 loaded: {df.shape}")
print(f"Date range: {df['date'].min().date()} to {df['date'].max().date()}")

# Get safe features
safe_features = feature_info['feature_name'].tolist()
print(f"Using {len(safe_features)} enhanced features")

# ==================================================
# Step 2: Multi-Target Strategy Definition
# ==================================================
print("\n" + "=" * 60)
print("Step 2: Multi-Target Strategy Analysis")
print("=" * 60)

# Define multiple trading strategies
STRATEGIES = {
    'direction': {
        'target': 'target_direction_5d',
        'description': 'Basic 5-day direction prediction',
        'baseline': df['target_direction_5d'].mean()
    },
    'strong_moves': {
        'target': 'target_strong_5d', 
        'description': 'Focus on significant moves only',
        'baseline': df['target_strong_5d'].mean()
    },
    'beat_spy': {
        'target': 'target_beat_spy_5d',
        'description': 'Outperform SPY benchmark',
        'baseline': df['target_beat_spy_5d'].mean()
    }
}

print("Strategy Baselines:")
for strategy, info in STRATEGIES.items():
    print(f"   {strategy:15}: {info['baseline']:.1%} - {info['description']}")

# ==================================================
# Step 3: Regime-Aware Data Preparation
# ==================================================
print("\n" + "=" * 60)
print("Step 3: Regime-Aware Data Preparation")
print("=" * 60)

def prepare_regime_aware_data(df, features, target_col):
    """Prepare data with regime-aware splitting"""
    
    # Sort by date for time series
    df_sorted = df.sort_values('date').reset_index(drop=True)
    X = df_sorted[features]
    y = df_sorted[target_col]
    
    # Time-based split (80/20)
    split_date = df_sorted['date'].quantile(0.8)
    train_mask = df_sorted['date'] <= split_date
    test_mask = df_sorted['date'] > split_date
    
    X_train, X_test = X[train_mask], X[test_mask]
    y_train, y_test = y[train_mask], y[test_mask]
    
    # Get regime information
    train_regimes = df_sorted[train_mask]['volatility_regime']
    test_regimes = df_sorted[test_mask]['volatility_regime']
    
    print(f"Data split:")
    print(f"   Training: {X_train.shape} ({train_mask.mean():.1%})")
    print(f"   Testing:  {X_test.shape} ({test_mask.mean():.1%})")
    print(f"   Train regimes: {train_regimes.value_counts().to_dict()}")
    print(f"   Test regimes:  {test_regimes.value_counts().to_dict()}")
    
    return X_train, X_test, y_train, y_test, train_regimes, test_regimes

# ==================================================
# Step 4: Advanced Modeling Pipeline
# ==================================================
print("\n" + "=" * 60)
print("Step 4: Advanced Modeling Pipeline")
print("=" * 60)

class AdvancedTradingModel:
    def __init__(self):
        self.models = {
            'HistGradientBoosting': HistGradientBoostingClassifier(
                max_iter=200, random_state=42, early_stopping=True,
                validation_fraction=0.1, max_depth=8, learning_rate=0.05,
                min_samples_leaf=20, l2_regularization=0.1
            ),
            'RandomForest': RandomForestClassifier(
                n_estimators=150, random_state=42, max_depth=12,
                min_samples_split=15, min_samples_leaf=10,
                class_weight='balanced', max_features='sqrt'
            )
        }
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='median')
        
    def train_strategy(self, X_train, y_train, strategy_name):
        """Train models for a specific strategy"""
        print(f"Training {strategy_name} strategy...")
        
        # Preprocess data
        X_imputed = self.imputer.fit_transform(X_train)
        X_scaled = self.scaler.fit_transform(X_imputed)
        
        # Apply SMOTE for class balancing
        smote = SMOTE(random_state=42)
        X_balanced, y_balanced = smote.fit_resample(X_scaled, y_train)
        
        print(f"   Data: {X_train.shape} → {X_balanced.shape} (balanced)")
        
        # Train all models
        results = {}
        for name, model in self.models.items():
            model.fit(X_balanced, y_balanced)
            results[name] = model
            
        return results, self.scaler, self.imputer
    
    def evaluate_strategy(self, models, scaler, imputer, X_test, y_test, test_regimes, strategy_name):
        """Comprehensive strategy evaluation"""
        print(f"Evaluating {strategy_name} strategy...")
        
        # Preprocess test data
        X_imputed = imputer.transform(X_test)
        X_scaled = scaler.transform(X_imputed)
        
        strategy_results = {}
        
        for model_name, model in models.items():
            # Get predictions and probabilities
            y_pred = model.predict(X_scaled)
            y_proba = model.predict_proba(X_scaled)[:, 1]
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_proba)
            
            # Regime-specific performance
            regime_performance = {}
            for regime in test_regimes.unique():
                regime_mask = test_regimes == regime
                if regime_mask.sum() > 10:  # Minimum samples
                    regime_accuracy = accuracy_score(y_test[regime_mask], y_pred[regime_mask])
                    regime_performance[regime] = regime_accuracy
            
            strategy_results[model_name] = {
                'accuracy': accuracy,
                'f1_score': f1,
                'roc_auc': auc,
                'predictions': y_pred,
                'probabilities': y_proba,
                'regime_performance': regime_performance
            }
            
            print(f"   {model_name:20}: Accuracy={accuracy:.3f}, F1={f1:.3f}, AUC={auc:.3f}")
            
        return strategy_results

# ==================================================
# Step 5: Confidence-Based Trading Strategy
# ==================================================
print("\n" + "=" * 60)
print("Step 5: Confidence-Based Trading")
print("=" * 60)

def confidence_based_trading(y_proba, thresholds):
    """Implement confidence-based trading"""
    low_thresh, high_thresh = thresholds
    decisions = np.full_like(y_proba, -1)  # -1 = no trade
    
    # High confidence trades only
    decisions[y_proba > high_thresh] = 1   # Confident BUY
    decisions[y_proba < low_thresh] = 0    # Confident SELL
    
    return decisions

def optimize_confidence_thresholds(y_proba, y_true):
    """Find optimal confidence thresholds"""
    best_accuracy = 0
    best_thresholds = (0.4, 0.6)
    
    threshold_combinations = [
        (0.35, 0.65),  # Conservative
        (0.4, 0.6),    # Moderate  
        (0.45, 0.55),  # Aggressive
        (0.3, 0.7),    # Very conservative
        (0.25, 0.75)   # Ultra conservative
    ]
    
    print("Optimizing confidence thresholds...")
    for thresholds in threshold_combinations:
        decisions = confidence_based_trading(y_proba, thresholds)
        trade_mask = decisions != -1
        
        if trade_mask.sum() > 20:  # Reasonable sample size
            trade_accuracy = accuracy_score(y_true[trade_mask], decisions[trade_mask])
            coverage = trade_mask.mean()
            
            print(f"   Thresholds {thresholds}: Accuracy={trade_accuracy:.3f}, Coverage={coverage:.1%}")
            
            if trade_accuracy > best_accuracy:
                best_accuracy = trade_accuracy
                best_thresholds = thresholds
    
    print(f"Best thresholds: {best_thresholds} (Accuracy: {best_accuracy:.3f})")
    return best_thresholds

# ==================================================
# Step 6: Comprehensive Strategy Testing
# ==================================================
print("\n" + "=" * 60)
print("Step 6: Comprehensive Strategy Testing")
print("=" * 60)

# Initialize model
trading_model = AdvancedTradingModel()

# Test all strategies
all_strategy_results = {}

for strategy_name, strategy_info in STRATEGIES.items():
    print(f"\n{'='*50}")
    print(f"TESTING STRATEGY: {strategy_name.upper()}")
    print(f"{'='*50}")
    
    target_col = strategy_info['target']
    baseline = strategy_info['baseline']
    
    # Prepare data
    X_train, X_test, y_train, y_test, train_regimes, test_regimes = prepare_regime_aware_data(
        df, safe_features, target_col
    )
    
    # Train models
    models, scaler, imputer = trading_model.train_strategy(X_train, y_train, strategy_name)
    
    # Evaluate
    strategy_results = trading_model.evaluate_strategy(
        models, scaler, imputer, X_test, y_test, test_regimes, strategy_name
    )
    
    # Confidence optimization for best model
    best_model_name = max(strategy_results.keys(), 
                         key=lambda x: strategy_results[x]['roc_auc'])
    best_proba = strategy_results[best_model_name]['probabilities']
    
    optimal_thresholds = optimize_confidence_thresholds(best_proba, y_test)
    
    # Store results
    all_strategy_results[strategy_name] = {
        'models': models,
        'scaler': scaler,
        'imputer': imputer,
        'results': strategy_results,
        'optimal_thresholds': optimal_thresholds,
        'baseline': baseline,
        'best_model': best_model_name
    }

# ==================================================
# Step 7: Strategy Comparison & Selection
# ==================================================
print("\n" + "=" * 60)
print("Step 7: Strategy Comparison & Selection")
print("=" * 60)

print("\nStrategy Performance Summary:")
print("="*80)
print(f"{'Strategy':<15} {'Best Model':<20} {'Accuracy':<10} {'Baseline':<10} {'Improvement':<12} {'AUC':<8}")
print("-"*80)

best_overall_strategy = None
best_overall_performance = 0

for strategy_name, strategy_data in all_strategy_results.items():
    best_model = strategy_data['best_model']
    results = strategy_data['results'][best_model]
    baseline = strategy_data['baseline']
    
    accuracy = results['accuracy']
    improvement = accuracy - baseline
    auc = results['roc_auc']
    
    print(f"{strategy_name:<15} {best_model:<20} {accuracy:.3f}     {baseline:.3f}     {improvement:+.3f}      {auc:.3f}")
    
    # Track best overall strategy
    if improvement > best_overall_performance:
        best_overall_performance = improvement
        best_overall_strategy = strategy_name

print("-"*80)
print(f"Best Overall Strategy: {best_overall_strategy} (Improvement: {best_overall_performance:+.3f})")

# ==================================================
# Step 8: Regime-Specific Performance Analysis
# ==================================================
print("\n" + "=" * 60)
print("Step 8: Regime-Specific Performance")
print("=" * 60)

for strategy_name, strategy_data in all_strategy_results.items():
    best_model = strategy_data['best_model']
    regime_perf = strategy_data['results'][best_model]['regime_performance']
    
    print(f"\n{strategy_name.upper()} - Regime Performance:")
    for regime, accuracy in regime_perf.items():
        print(f"   {regime:15}: {accuracy:.3f} accuracy")

# ==================================================
# Step 9: Save Final Models & Results
# ==================================================
print("\n" + "=" * 60)
print("Step 9: Saving Models & Results")
print("=" * 60)

import joblib

# Save best strategy
best_strategy_data = all_strategy_results[best_overall_strategy]
best_model = best_strategy_data['best_model']

final_artifacts = {
    'best_strategy': best_overall_strategy,
    'best_model': best_strategy_data['models'][best_model],
    'scaler': best_strategy_data['scaler'],
    'imputer': best_strategy_data['imputer'],
    'feature_names': safe_features,
    'confidence_thresholds': best_strategy_data['optimal_thresholds'],
    'all_strategy_results': all_strategy_results,
    'performance_summary': {
        'best_strategy': best_overall_strategy,
        'best_accuracy': best_strategy_data['results'][best_model]['accuracy'],
        'best_auc': best_strategy_data['results'][best_model]['roc_auc'],
        'baseline_improvement': best_overall_performance,
        'regime_performance': best_strategy_data['results'][best_model]['regime_performance']
    }
}

joblib.dump(final_artifacts, 'advanced_trading_models.pkl')
print("Advanced models saved: 'advanced_trading_models.pkl'")

# Save performance report
performance_df = pd.DataFrame([
    {
        'strategy': strategy,
        'best_model': data['best_model'],
        'accuracy': data['results'][data['best_model']]['accuracy'],
        'auc': data['results'][data['best_model']]['roc_auc'],
        'baseline': data['baseline'],
        'improvement': data['results'][data['best_model']]['accuracy'] - data['baseline'],
        'confidence_thresholds': str(data['optimal_thresholds'])
    }
    for strategy, data in all_strategy_results.items()
])

performance_df.to_csv('strategy_performance_report.csv', index=False)
print("Performance report saved: 'strategy_performance_report.csv'")

# ==================================================
# Step 10: Trading Recommendations
# ==================================================
print("\n" + "=" * 60)
print("Step 10: Trading Recommendations")
print("=" * 60)

best_data = all_strategy_results[best_overall_strategy]
best_model_name = best_data['best_model']
best_results = best_data['results'][best_model_name]

print(f"\nRecommended Trading Strategy:")
print(f"   Strategy: {best_overall_strategy}")
print(f"   Model: {best_model_name}")
print(f"   Accuracy: {best_results['accuracy']:.3f} (Baseline: {best_data['baseline']:.3f})")
print(f"   Improvement: {best_results['accuracy'] - best_data['baseline']:+.3f}")
print(f"   Confidence Thresholds: {best_data['optimal_thresholds']}")

print(f"\nRegime-Specific Guidance:")
for regime, accuracy in best_results['regime_performance'].items():
    performance = "STRONG" if accuracy > 0.55 else "MODERATE" if accuracy > 0.52 else "WEAK"
    print(f"   {regime:15}: {accuracy:.3f} - {performance}")

print(f"\nRisk Management Recommendations:")
if best_overall_performance > 0.03:
    print("   Good predictive power - Consider active trading")
    print("   • Use confidence-based position sizing")
    print("   • Focus on high-regime-performance periods")
elif best_overall_performance > 0.01:
    print("   Moderate predictive power - Conservative approach")
    print("   • Use very conservative confidence thresholds")
    print("   • Small position sizes only")
else:
    print("   Limited predictive power - Focus on risk management")
    print("   • Use as secondary confirmation only")
    print("   • Consider alternative strategies")

Advanced AI Trading - Multi-Strategy Modeling
Data Range: 2023-01-03 00:00:00 to 2025-10-24 00:00:00
Total Records: 2824
Step 1: Loading Enhanced Dataset V2
----------------------------------------
Enhanced dataset v2 loaded: (2824, 35)
Date range: 2023-01-03 to 2025-10-24
Using 25 enhanced features

Step 2: Multi-Target Strategy Analysis
Strategy Baselines:
   direction      : 60.8% - Basic 5-day direction prediction
   strong_moves   : 40.0% - Focus on significant moves only
   beat_spy       : 39.9% - Outperform SPY benchmark

Step 3: Regime-Aware Data Preparation

Step 4: Advanced Modeling Pipeline

Step 5: Confidence-Based Trading

Step 6: Comprehensive Strategy Testing

TESTING STRATEGY: DIRECTION
Data split:
   Training: (2260, 25) (80.0%)
   Testing:  (564, 25) (20.0%)
   Train regimes: {'MEDIUM': 955, 'LOW': 662, 'HIGH': 643}
   Test regimes:  {'HIGH': 198, 'MEDIUM': 187, 'LOW': 179}
Training direction strategy...
   Data: (2260, 25) → (2668, 25) (balanced)
Evaluating directio