# Baseline Model Training (XGBoost)

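This notebook trains and evaluates a baseline XGBoost model on the `AAPL_1h` feature file, using `target_binary` as the classification target. It prints cross-validation metrics, refits the model on all rows, lists the top 10 feature importances, and saves the fitted model with `joblib`.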

In [None]:
import pandas as pd
import sys
import os
import joblib

# Add src to path
sys.path.append(os.path.abspath('../src'))
from models import ModelFactory, TimeSeriesTrainer

# Location of the precomputed feature table (relative to the notebook directory)
data_path = '../data/features/AAPL_1h_features.parquet'

if not os.path.exists(data_path):
    # No feature file: report it and skip the whole training pipeline.
    print(f"File not found: {data_path}")
else:
    df = pd.read_parquet(data_path)
    print(f"Loaded {len(df)} rows")

    # Columns named here are excluded; every remaining column is used as a
    # model input, in the order it appears in the frame.
    non_feature_cols = (
        'target_return', 'target_direction', 'target_binary',
        'target_next_close', 'symbol', 'timestamp', 'datetime',
    )
    feature_cols = [col for col in df.columns if col not in non_feature_cols]

    # Design matrix and binary classification label
    X, y = df[feature_cols], df['target_binary']

    # Cross-validated evaluation through the project's trainer wrapper
    print("Training XGBoost...")
    model = ModelFactory.get_xgboost_model()
    trainer = TimeSeriesTrainer(model)
    metrics = trainer.train_evaluate_xgboost(X, y)
    print("\nCross-Validation Results:")
    print(metrics)

    # Refit on the full dataset so the saved model has seen every row
    model.fit(X, y)

    # Rank features by the fitted model's importance scores, highest first
    importance = pd.DataFrame(
        {'feature': feature_cols, 'importance': model.feature_importances_}
    ).sort_values(by='importance', ascending=False)

    print("\nTop 10 Features:")
    print(importance.head(10))

    # Persist the fitted model, creating the output directory if needed
    os.makedirs('../models', exist_ok=True)
    joblib.dump(model, '../models/xgboost_baseline.joblib')
    print("Model saved to models/xgboost_baseline.joblib")