# LSTM Model for Stock Price Prediction

This notebook trains an LSTM (Long Short-Term Memory) neural network to predict stock prices using historical OHLCV data from the trading system database.

## Objectives
- Load historical market data from PostgreSQL
- Engineer features (technical indicators, returns, volatility)
- Create time-aware train/validation/test splits
- Train an LSTM model using PyTorch
- Evaluate model performance with financial metrics
- Visualize predictions and residuals

## Requirements
- PyTorch (install with: `pip install torch`)
- All other dependencies from `requirements.txt`

In [None]:
"""
Setup and Imports
"""
import sys
from pathlib import Path
from datetime import datetime, timedelta, timezone
from typing import Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
project_root = Path().resolve().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Database
from sqlalchemy import select, desc
from src.shared.database.base import db_readonly_session
from src.shared.database.models.market_data import MarketData

# Plot styling: apply seaborn's "whitegrid" theme and set the default
# matplotlib figure size used by figures created without an explicit figsize.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 6)

# Seed every random number generator used in this notebook so that repeated
# runs start from the same state.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    # Seed all visible GPUs, not only the current device.
    torch.cuda.manual_seed_all(RANDOM_SEED)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run;
    # disable it so GPU results stay repeatable.
    torch.backends.cudnn.benchmark = False

# Report the runtime environment for the record.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

## 1. Data Loading

Load historical market data from the database for a specific symbol.

In [None]:
def load_market_data(
    symbol: str,
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None,
    data_source: str = "yahoo",
    min_records: int = 1000
) -> pd.DataFrame:
    """
    Load market data from database for a specific symbol.

    Only records flagged as complete (``record.is_complete``) are kept, and
    duplicate timestamps are collapsed (last one wins). The ``min_records``
    check is applied to the rows that are actually returned, so the caller
    never receives fewer usable rows than requested.

    Args:
        symbol: Stock symbol (e.g., 'AAPL'); upper-cased before querying
        start_date: Start date (default: 1 year ago)
        end_date: End date (default: now, UTC)
        data_source: Data source filter ('yahoo', 'polygon', 'alpaca');
            lower-cased before querying
        min_records: Minimum number of usable records required

    Returns:
        DataFrame with columns open/high/low/close/volume, indexed by
        timestamp and sorted in ascending order

    Raises:
        ValueError: If fewer than ``min_records`` usable records remain after
            dropping incomplete and duplicate rows
    """
    if start_date is None:
        start_date = datetime.now(timezone.utc) - timedelta(days=365)
    if end_date is None:
        end_date = datetime.now(timezone.utc)

    columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume']

    with db_readonly_session() as session:
        query = (
            select(MarketData)
            .where(MarketData.symbol == symbol.upper())
            .where(MarketData.data_source == data_source.lower())
            .where(MarketData.timestamp >= start_date)
            .where(MarketData.timestamp <= end_date)
            .order_by(MarketData.timestamp)
        )
        records = session.execute(query).scalars().all()

        # Read the ORM attributes while the session is still open so the
        # conversion does not depend on instances staying loaded after close.
        rows = [
            {
                'timestamp': record.timestamp,
                'open': float(record.open),
                'high': float(record.high),
                'low': float(record.low),
                'close': float(record.close),
                # A missing (None) volume is stored as 0.
                'volume': int(record.volume) if record.volume else 0,
            }
            for record in records
            if record.is_complete  # Only include complete OHLCV records
        ]

    # Passing `columns` keeps the frame well-formed even when `rows` is empty;
    # otherwise set_index('timestamp') would fail with a KeyError.
    df = pd.DataFrame(rows, columns=columns)
    df.set_index('timestamp', inplace=True)
    df.sort_index(inplace=True)

    # Remove duplicates (keep last)
    df = df[~df.index.duplicated(keep='last')]

    if len(df) < min_records:
        raise ValueError(
            f"Insufficient data: {len(df)} records found, "
            f"minimum {min_records} required "
            f"({len(records)} rows fetched before dropping incomplete "
            f"and duplicate rows)"
        )

    print(f"Loaded {len(df)} records for {symbol}")
    if not df.empty:
        print(f"Date range: {df.index.min()} to {df.index.max()}")
    print(f"Data completeness: {df.isnull().sum().sum()} missing values")

    return df

# Load data for a symbol (change as needed).
# SYMBOL is reused further down for the plot title and the checkpoint filename.
SYMBOL = "AAPL"
# Date window and min_records are left at the function defaults.
df_raw = load_market_data(SYMBOL, data_source="yahoo")
# Last expression of the cell: displays the first rows as the cell output.
df_raw.head()

## 2. Feature Engineering

Create technical indicators and derived features for the model.

In [None]:
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Engineer features from OHLCV data.

    Features include:
    - Returns (log and simple)
    - Volatility (rolling standard deviation)
    - Technical indicators (RSI, MACD, moving averages)
    - Price ratios (high/low, close/open)
    - Volume features

    Degenerate bars are given neutral values instead of being silently
    dropped: a bar with high == low gets price_position 0.5, a window with no
    price movement gets RSI 50, and a window with zero average volume gets
    volume_ratio 1.0. Any remaining non-finite value (NaN or +/-inf) causes
    the row to be removed, so the result is safe to feed into a scaler.

    Args:
        df: DataFrame with 'open', 'high', 'low', 'close', 'volume' columns.
            The input is not modified.

    Returns:
        Copy of the input with the additional feature columns. Rows that
        contain non-finite values (mainly the rolling-window warm-up period)
        are removed.
    """
    df = df.copy()
    close = df['close']

    # Returns
    df['returns'] = close.pct_change()
    df['log_returns'] = np.log(close / close.shift(1))

    # Volatility (rolling 20-period)
    df['volatility'] = df['returns'].rolling(window=20).std()

    # Price ratios
    df['high_low_ratio'] = df['high'] / df['low']
    df['close_open_ratio'] = close / df['open']

    # Moving averages
    for window in (5, 20, 50):
        df[f'sma_{window}'] = close.rolling(window=window).mean()

    # RSI (Relative Strength Index), simple-moving-average variant
    delta = close.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    # With no gains and no losses in the window, rs is 0/0 (NaN); treat that
    # as neutral rather than losing the row. Warm-up rows stay NaN because
    # NaN == 0 is False.
    flat_window = (gain == 0) & (loss == 0)
    df['rsi'] = rsi.mask(flat_window, 50.0)

    # MACD
    exp1 = close.ewm(span=12, adjust=False).mean()
    exp2 = close.ewm(span=26, adjust=False).mean()
    df['macd'] = exp1 - exp2
    df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
    df['macd_hist'] = df['macd'] - df['macd_signal']

    # Volume features
    df['volume_ma'] = df['volume'].rolling(window=20).mean()
    # A zero average means every bar in the window had zero volume; use the
    # neutral ratio 1.0 instead of the 0/0 NaN.
    df['volume_ratio'] = (df['volume'] / df['volume_ma']).where(
        df['volume_ma'] != 0, 1.0
    )

    # Price position (where close is relative to high-low range)
    price_range = df['high'] - df['low']
    # A zero-range bar has no meaningful position; place it mid-range.
    df['price_position'] = ((close - df['low']) / price_range).where(
        price_range != 0, 0.5
    )

    # dropna() does not remove +/-inf (e.g. from a division by a zero price),
    # so convert infinities to NaN first, then drop incomplete rows.
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    print(f"Feature engineering complete. Shape: {df.shape}")
    print(f"Features: {list(df.columns)}")

    return df

# Build the feature table from the raw OHLCV frame; df_raw itself is left
# untouched because engineer_features works on a copy.
df_features = engineer_features(df_raw)
# Last expression of the cell: displays the first rows as the cell output.
df_features.head()

## 3. Data Preprocessing

Prepare data for LSTM: select features, normalize, and create sequences.

In [None]:
# Configuration
SEQUENCE_LENGTH = 60  # Number of time steps to look back
PREDICTION_HORIZON = 1  # Predict next 1 period
TARGET_COLUMN = 'close'  # What we're predicting
# Share of sequences used for training. Must equal the train_ratio passed to
# time_aware_split() below, because it decides which rows the scalers may see.
TRAIN_RATIO = 0.7

# Input features. Note that the target column ('close') is also an input:
# the model sees past closes and predicts a future one.
feature_columns = [
    'open', 'high', 'low', 'close', 'volume',
    'returns', 'log_returns', 'volatility',
    'high_low_ratio', 'close_open_ratio',
    'sma_5', 'sma_20', 'sma_50',
    'rsi', 'macd', 'macd_signal', 'macd_hist',
    'volume_ma', 'volume_ratio', 'price_position'
]

# Ensure all columns exist
available_features = [col for col in feature_columns if col in df_features.columns]
print(f"Using {len(available_features)} features: {available_features}")

# Extract feature matrix and target
X = df_features[available_features].values
y = df_features[TARGET_COLUMN].values

print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Work out which rows belong to the training period. Sequence i uses input
# rows [i, i + SEQUENCE_LENGTH) and the target at row
# i + SEQUENCE_LENGTH + PREDICTION_HORIZON - 1, so the first n_train_sequences
# sequences touch exactly the first n_train_rows rows.
n_sequences = len(X) - SEQUENCE_LENGTH - PREDICTION_HORIZON + 1
n_train_sequences = int(n_sequences * TRAIN_RATIO)
if n_train_sequences <= 0:
    raise ValueError(
        f"Not enough rows ({len(X)}) to build training sequences of length "
        f"{SEQUENCE_LENGTH} with horizon {PREDICTION_HORIZON}"
    )
n_train_rows = n_train_sequences + SEQUENCE_LENGTH + PREDICTION_HORIZON - 1

# Fit the scalers on the training rows only. Fitting on the full dataset would
# leak validation/test statistics (mean, std, min, max) into training.
scaler_X = StandardScaler()
scaler_y = MinMaxScaler()
scaler_X.fit(X[:n_train_rows])
scaler_y.fit(y[:n_train_rows].reshape(-1, 1))

# Apply the training-period scaling to every row. Values outside the training
# range are expected later on (e.g. a scaled target above 1).
X_scaled = scaler_X.transform(X)
y_scaled = scaler_y.transform(y.reshape(-1, 1)).flatten()

print(f"Normalized feature range: [{X_scaled.min():.2f}, {X_scaled.max():.2f}]")
print(f"Normalized target range: [{y_scaled.min():.2f}, {y_scaled.max():.2f}]")

In [None]:
def create_sequences(
    X: np.ndarray,
    y: np.ndarray,
    seq_length: int,
    prediction_horizon: int = 1
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Create sliding-window sequences for LSTM training.

    Sequence ``i`` consists of rows ``X[i : i + seq_length]`` and is paired
    with the target ``y[i + seq_length + prediction_horizon - 1]``.

    Args:
        X: Feature matrix (n_samples, n_features)
        y: Target vector (n_samples,)
        seq_length: Length of input sequences (>= 1)
        prediction_horizon: Steps ahead to predict (>= 1)

    Returns:
        X_seq: Sequences (n_sequences, seq_length, n_features)
        y_seq: Targets (n_sequences,)
        where n_sequences = n_samples - seq_length - prediction_horizon + 1.
        When the input is too short for a single sequence, empty arrays with
        the correct trailing dimensions are returned.

    Raises:
        ValueError: If X and y differ in length, or if seq_length or
            prediction_horizon is smaller than 1
    """
    X = np.asarray(X)
    y = np.asarray(y)

    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )
    if seq_length < 1 or prediction_horizon < 1:
        raise ValueError(
            "seq_length and prediction_horizon must both be >= 1, got "
            f"{seq_length} and {prediction_horizon}"
        )

    n_sequences = len(X) - seq_length - prediction_horizon + 1
    if n_sequences <= 0:
        # Keep the window/feature dimensions so that downstream code such as
        # X_seq.shape[2] still works on an empty result.
        empty_X = np.empty((0, seq_length) + X.shape[1:], dtype=X.dtype)
        empty_y = np.empty((0,) + y.shape[1:], dtype=y.dtype)
        return empty_X, empty_y

    X_seq = np.stack([X[start:start + seq_length] for start in range(n_sequences)])

    # The targets are a contiguous slice of y; copy so the result does not
    # alias the caller's array.
    first_target = seq_length + prediction_horizon - 1
    y_seq = y[first_target:first_target + n_sequences].copy()

    return X_seq, y_seq

# Create sequences from the scaled features and scaled target, using the
# look-back window and horizon configured above.
X_seq, y_seq = create_sequences(X_scaled, y_scaled, SEQUENCE_LENGTH, PREDICTION_HORIZON)

# Sanity-check the resulting array shapes.
print(f"Sequence shape: {X_seq.shape}")
print(f"Target shape: {y_seq.shape}")
print(f"Total sequences: {len(X_seq)}")

## 4. Time-Aware Data Splitting

Split data chronologically to avoid look-ahead bias (critical for financial data).

In [None]:
def time_aware_split(
    X: np.ndarray,
    y: np.ndarray,
    train_ratio: float = 0.7,
    val_ratio: float = 0.15
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Split data chronologically (no shuffling for time series).

    The first ``train_ratio`` share of the samples becomes the training set,
    the next ``val_ratio`` share the validation set, and whatever remains the
    test set. Sample order is preserved.

    Args:
        X: Feature sequences
        y: Target values (same length as X)
        train_ratio: Proportion for training
        val_ratio: Proportion for validation

    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test

    Raises:
        ValueError: If X and y differ in length, if a ratio is negative, or
            if the two ratios sum to more than 1
    """
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )
    # Small tolerance so that sums such as 0.7 + 0.3 are not rejected because
    # of floating-point rounding.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at "
            f"most 1, got {train_ratio} and {val_ratio}"
        )

    n_samples = len(X)
    train_end = int(n_samples * train_ratio)
    val_end = int(n_samples * (train_ratio + val_ratio))

    X_train, X_val, X_test = X[:train_end], X[train_end:val_end], X[val_end:]
    y_train, y_val, y_test = y[:train_end], y[train_end:val_end], y[val_end:]

    for label, part in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
        # Guard the percentage so an empty input does not divide by zero.
        share = len(part) / n_samples * 100 if n_samples else 0.0
        print(f"{label}: {len(part)} samples ({share:.1f}%)")

    return X_train, X_val, X_test, y_train, y_val, y_test

# Split data chronologically: 70% train, 15% validation, and the remaining
# sequences as the test set.
X_train, X_val, X_test, y_train, y_val, y_test = time_aware_split(
    X_seq, y_seq, train_ratio=0.7, val_ratio=0.15
)

## 5. PyTorch Dataset and DataLoader

Create PyTorch datasets for efficient batching.

In [None]:
class TimeSeriesDataset(Dataset):
    """Map-style dataset that pairs each input window with its target."""

    def __init__(self, X: np.ndarray, y: np.ndarray):
        """
        Convert both arrays to float tensors once, up front.

        Args:
            X: Feature sequences (n_samples, seq_length, n_features)
            y: Target values (n_samples,)
        """
        features, targets = (torch.FloatTensor(array) for array in (X, y))
        self.X = features
        self.y = targets

    def __len__(self) -> int:
        """Return the number of stored sequences."""
        n_sequences = len(self.X)
        return n_sequences

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the (sequence, target) pair stored at position ``idx``."""
        sequence = self.X[idx]
        target = self.y[idx]
        return sequence, target

# Create datasets: one per chronological split.
train_dataset = TimeSeriesDataset(X_train, y_train)
val_dataset = TimeSeriesDataset(X_val, y_val)
test_dataset = TimeSeriesDataset(X_test, y_test)

# Create data loaders. All three use shuffle=False, so batches are served in
# the stored (chronological) order. The evaluation and plotting cells rely on
# the test loader keeping that order.
# NOTE(review): the training loader could probably be shuffled without
# introducing look-ahead, since the split itself is chronological - confirm
# before changing.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# len(loader) is the number of batches per pass over the split.
print(f"Train batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")

## 6. LSTM Model Architecture

Define the LSTM model with dropout for regularization.

In [None]:
class LSTMPredictor(nn.Module):
    """
    Sequence-to-one regressor built on a stacked LSTM.

    Pipeline:
    - stacked LSTM reads the whole input window
    - the hidden output of the final time step is kept
    - dropout, then a single linear layer maps it to the prediction
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int = 64,
        num_layers: int = 2,
        dropout: float = 0.2,
        output_size: int = 1
    ):
        """
        Args:
            input_size: Number of features per time step
            hidden_size: Number of LSTM hidden units
            num_layers: Number of stacked LSTM layers
            dropout: Dropout probability (between LSTM layers and before the
                output layer)
            output_size: Size of output (typically 1 for price prediction)
        """
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # nn.LSTM applies dropout only between stacked layers, so it is
        # switched off for a single-layer network.
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=inter_layer_dropout,
            batch_first=True,
        )

        # Regularisation applied to the last time step's features.
        self.dropout = nn.Dropout(dropout)

        # Output head.
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Run the network on a batch of windows.

        Args:
            x: Input tensor (batch_size, seq_length, input_size)

        Returns:
            Output tensor (batch_size, output_size); the trailing dimension
            is squeezed away when output_size is 1.
        """
        sequence_out, _ = self.lstm(x)

        # Only the final time step feeds the output head.
        final_step = sequence_out[:, -1, :]

        prediction = self.fc(self.dropout(final_step))
        return prediction.squeeze(-1)

# Initialize model. These constants are also stored in the checkpoint's
# 'model_config' when the model is saved.
INPUT_SIZE = X_train.shape[2]  # Number of features (last axis of the sequence array)
HIDDEN_SIZE = 64
NUM_LAYERS = 2
DROPOUT = 0.2

model = LSTMPredictor(
    input_size=INPUT_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT
)

# Count parameters (total, and those that receive gradients)
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Model architecture:")
print(f"  Input size: {INPUT_SIZE}")
print(f"  Hidden size: {HIDDEN_SIZE}")
print(f"  LSTM layers: {NUM_LAYERS}")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")

# Move to GPU if available; `device` is reused by the training, evaluation
# and model-loading code below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
print(f"Using device: {device}")

## 7. Training Setup

Define loss function, optimizer, and learning rate scheduler.

In [None]:
# Loss function (MSE for regression)
criterion = nn.MSELoss()

# Optimizer (Adam with weight decay for regularization)
LEARNING_RATE = 0.001
WEIGHT_DECAY = 1e-5
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Learning rate scheduler: halve the LR after 5 epochs without improvement in
# the monitored value. The deprecated `verbose` argument is intentionally not
# passed; the training loop already prints the current learning rate.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5
)

# Training parameters
NUM_EPOCHS = 50
EARLY_STOPPING_PATIENCE = 10  # Epochs without improvement before stopping
MIN_DELTA = 1e-6  # Minimum decrease in validation loss that counts as improvement

print(f"Training configuration:")
print(f"  Loss function: MSE")
print(f"  Optimizer: Adam (lr={LEARNING_RATE}, weight_decay={WEIGHT_DECAY})")
print(f"  Scheduler: ReduceLROnPlateau")
print(f"  Epochs: {NUM_EPOCHS}")
print(f"  Early stopping patience: {EARLY_STOPPING_PATIENCE}")

## 8. Training Loop

Train the model with early stopping and validation monitoring.

In [None]:
def train_epoch(model, train_loader, criterion, optimizer, device):
    """
    Run one optimisation pass over ``train_loader``.

    The model is put into training mode and updated once per batch.

    Returns:
        The unweighted mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []

    for features, labels in train_loader:
        features = features.to(device)
        labels = labels.to(device)

        # Clear stale gradients, then compute the loss for this batch.
        optimizer.zero_grad()
        batch_loss = criterion(model(features), labels)

        # Backpropagate and update the weights.
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses, 0.0) / len(batch_losses)

def validate(model, val_loader, criterion, device):
    """
    Measure the loss on ``val_loader`` without updating the model.

    The model is switched to evaluation mode and gradients are disabled.

    Returns:
        The unweighted mean of the per-batch loss values.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for features, labels in val_loader:
            features = features.to(device)
            labels = labels.to(device)
            batch_loss = criterion(model(features), labels)
            batch_losses.append(batch_loss.item())

    return sum(batch_losses, 0.0) / len(batch_losses)

# Training loop
import copy  # local to this cell: needed to snapshot the best weights

train_losses = []
val_losses = []
best_val_loss = float('inf')
patience_counter = 0
# Stays None until the first improvement, so the restore step below is safe
# even if the validation loss never improves (e.g. it is NaN).
best_model_state = None

print("Starting training...")
print("=" * 60)

for epoch in range(NUM_EPOCHS):
    # Train
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    train_losses.append(train_loss)

    # Validate
    val_loss = validate(model, val_loader, criterion, device)
    val_losses.append(val_loss)

    # Learning rate scheduling
    scheduler.step(val_loss)

    # Early stopping check
    if val_loss < best_val_loss - MIN_DELTA:
        best_val_loss = val_loss
        patience_counter = 0
        # state_dict() returns references to the live parameter tensors, and
        # dict.copy() is shallow, so a deep copy is required to freeze the
        # weights of the best epoch.
        best_model_state = copy.deepcopy(model.state_dict())
    else:
        patience_counter += 1

    # Print progress
    if (epoch + 1) % 5 == 0 or epoch == 0:
        print(
            f"Epoch [{epoch+1}/{NUM_EPOCHS}] | "
            f"Train Loss: {train_loss:.6f} | "
            f"Val Loss: {val_loss:.6f} | "
            f"LR: {optimizer.param_groups[0]['lr']:.6f}"
        )

    # Early stopping
    if patience_counter >= EARLY_STOPPING_PATIENCE:
        print(f"\nEarly stopping at epoch {epoch+1}")
        break

# Restore the best weights whether training stopped early or ran for all
# epochs, so evaluation and saving use the best model.
if best_model_state is not None:
    model.load_state_dict(best_model_state)

print("=" * 60)
print("Training completed!")

## 9. Training Visualization

Plot training and validation loss curves.

In [None]:
# Draw the loss history twice, side by side: once on a linear axis and once
# on a logarithmic axis.
plt.figure(figsize=(12, 5))

loss_panels = [
    (1, 'Loss (MSE)', 'Training and Validation Loss', None),
    (2, 'Loss (MSE) - Log Scale', 'Training and Validation Loss (Log Scale)', 'log'),
]

for panel_position, y_label, panel_title, y_scale in loss_panels:
    plt.subplot(1, 2, panel_position)
    plt.plot(train_losses, label='Train Loss', alpha=0.7)
    plt.plot(val_losses, label='Validation Loss', alpha=0.7)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(panel_title)
    if y_scale is not None:
        plt.yscale(y_scale)
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary of the last epoch and of the best validation result.
final_train_loss = train_losses[-1]
final_val_loss = val_losses[-1]
print(f"Final train loss: {final_train_loss:.6f}")
print(f"Final validation loss: {final_val_loss:.6f}")
print(f"Best validation loss: {best_val_loss:.6f}")

## 10. Model Evaluation

Evaluate on test set and calculate financial metrics.

In [None]:
def evaluate_model(model, test_loader, scaler_y, device):
    """
    Run the model over ``test_loader`` and undo the target scaling.

    Returns:
        (predictions, targets) as flat arrays, both passed through
        ``scaler_y.inverse_transform``.
    """
    model.eval()
    scaled_predictions, scaled_targets = [], []

    with torch.no_grad():
        for features, labels in test_loader:
            batch_output = model(features.to(device))
            scaled_predictions += list(batch_output.cpu().numpy())
            scaled_targets += list(labels.numpy())

    def _to_original_scale(values):
        # The scaler works on a 2-D column; flatten back to 1-D afterwards.
        column = np.array(values).reshape(-1, 1)
        return scaler_y.inverse_transform(column).flatten()

    predictions_denorm = _to_original_scale(scaled_predictions)
    targets_denorm = _to_original_scale(scaled_targets)

    return predictions_denorm, targets_denorm

# Evaluate on test set (values are back in price units)
y_pred_test, y_true_test = evaluate_model(model, test_loader, scaler_y, device)

# Calculate error metrics
mse = mean_squared_error(y_true_test, y_pred_test)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true_test, y_pred_test)
mape = np.mean(np.abs((y_true_test - y_pred_test) / y_true_test)) * 100

# Directional accuracy: how often the predicted move has the same sign as the
# realised move. Both moves are measured from the last price that was known
# when the forecast was made (PREDICTION_HORIZON steps before the target).
# Comparing consecutive predictions with each other would measure something
# the model never forecasts.
last_known_price = y_true_test[:-PREDICTION_HORIZON]
returns_true = (y_true_test[PREDICTION_HORIZON:] - last_known_price) / last_known_price
returns_pred = (y_pred_test[PREDICTION_HORIZON:] - last_known_price) / last_known_price
direction_accuracy = np.mean((returns_true * returns_pred) > 0) * 100

print("Test Set Performance:")
print("=" * 60)
print(f"RMSE: ${rmse:.2f}")
print(f"MAE: ${mae:.2f}")
print(f"MAPE: {mape:.2f}%")
print(f"Directional Accuracy: {direction_accuracy:.2f}%")
print("=" * 60)

## 11. Visualization of Predictions

Visualize predictions vs actual prices.

In [None]:
# Create the time index for the test set. Test sequence number k (counted over
# all sequences) has its target at row
# k + SEQUENCE_LENGTH + PREDICTION_HORIZON - 1 of df_features, and the test
# sequences start right after the train and validation sequences.
test_start_idx = len(X_train) + len(X_val) + SEQUENCE_LENGTH + PREDICTION_HORIZON - 1
test_indices = df_features.index[test_start_idx:test_start_idx + len(y_pred_test)]

# Plot predictions vs actual
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# Full test set
axes[0].plot(test_indices, y_true_test, label='Actual', alpha=0.7, linewidth=1.5)
axes[0].plot(test_indices, y_pred_test, label='Predicted', alpha=0.7, linewidth=1.5)
axes[0].set_xlabel('Date')
axes[0].set_ylabel('Price ($)')
axes[0].set_title(f'{SYMBOL} - Actual vs Predicted Prices (Test Set)')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].tick_params(axis='x', rotation=45)

# Residuals over time (positive = the model under-predicted)
residuals = y_true_test - y_pred_test
axes[1].plot(test_indices, residuals, alpha=0.7, color='red', linewidth=1)
axes[1].axhline(y=0, color='black', linestyle='--', linewidth=1)
axes[1].set_xlabel('Date')
axes[1].set_ylabel('Residual ($)')
axes[1].set_title('Residuals (Actual - Predicted)')
axes[1].grid(True, alpha=0.3)
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Distribution of residuals
plt.figure(figsize=(10, 4))

# Left panel: histogram of residuals with a reference line at zero.
plt.subplot(1, 2, 1)
plt.hist(residuals, bins=50, alpha=0.7, edgecolor='black')
plt.xlabel('Residual ($)')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.grid(True, alpha=0.3)

# Right panel: residuals against the predicted price, to reveal errors that
# depend on the price level.
plt.subplot(1, 2, 2)
plt.scatter(y_pred_test, residuals, alpha=0.5)
plt.xlabel('Predicted Price ($)')
plt.ylabel('Residual ($)')
plt.title('Residuals vs Predicted Values')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 12. Save Model

Save the trained model and scalers for future use.

In [None]:
# Create models directory if it doesn't exist
models_dir = project_root / "models"
models_dir.mkdir(exist_ok=True)

# Save model: one checkpoint file per symbol, bundling the weights with
# everything needed to rebuild the model and preprocess new data.
# NOTE(review): the checkpoint stores the scaler objects themselves, not just
# tensors, so it is pickled as arbitrary Python objects. Loading it will
# presumably require torch.load(..., weights_only=False) on PyTorch versions
# where weights_only defaults to True - confirm against the installed version.
model_path = models_dir / f"lstm_{SYMBOL.lower()}_model.pth"
torch.save({
    'model_state_dict': model.state_dict(),
    # Keyword arguments for LSTMPredictor(...)
    'model_config': {
        'input_size': INPUT_SIZE,
        'hidden_size': HIDDEN_SIZE,
        'num_layers': NUM_LAYERS,
        'dropout': DROPOUT,
    },
    # Preprocessing state
    'scaler_X': scaler_X,
    'scaler_y': scaler_y,
    'sequence_length': SEQUENCE_LENGTH,
    'feature_columns': available_features,
    'target_column': TARGET_COLUMN,
    'symbol': SYMBOL,
    # Metrics recorded for reference; losses are on the scaled target,
    # RMSE/MAE are on the denormalized predictions.
    'training_metrics': {
        'best_val_loss': best_val_loss,
        'final_train_loss': train_losses[-1],
        'final_val_loss': val_losses[-1],
        'test_rmse': rmse,
        'test_mae': mae,
        'test_mape': mape,
        'direction_accuracy': direction_accuracy,
    }
}, model_path)

print(f"Model saved to: {model_path}")
# st_size is in bytes; convert to MB for display.
print(f"Model size: {model_path.stat().st_size / 1024 / 1024:.2f} MB")

## 13. Model Loading Example

Example of how to load the saved model for inference.

In [None]:
# Example: Load model for inference
def load_model_for_inference(model_path: Path):
    """
    Load a saved model for inference.

    Args:
        model_path: Path to a checkpoint written by the "Save Model" cell

    Returns:
        (model, checkpoint): the rebuilt LSTMPredictor in eval mode on the
        notebook's ``device``, and the full checkpoint dict (scalers, feature
        columns, metrics, ...)
    """
    # The checkpoint holds pickled scaler objects in addition to tensors, so
    # it cannot be read with the restricted weights-only loader.
    # SECURITY: weights_only=False unpickles arbitrary Python objects and can
    # execute code; only load checkpoints from a trusted source.
    checkpoint = torch.load(model_path, map_location=device, weights_only=False)

    # Recreate the model with the saved hyperparameters, then load the weights
    model_config = checkpoint['model_config']
    model = LSTMPredictor(**model_config)
    model.load_state_dict(checkpoint['model_state_dict'])
    model = model.to(device)
    # Inference mode: disables dropout
    model.eval()

    return model, checkpoint

# Load model (example)
# loaded_model, checkpoint = load_model_for_inference(model_path)
# print("Model loaded successfully!")
# print(f"Model was trained on: {checkpoint['symbol']}")
# print(f"Test RMSE: ${checkpoint['training_metrics']['test_rmse']:.2f}")

## Notes and Next Steps

### Improvements to Consider:
1. **Hyperparameter Optimization**: Use Optuna or grid search to find optimal hyperparameters
2. **Feature Selection**: Identify most important features using SHAP or feature importance
3. **Ensemble Methods**: Combine multiple models for better predictions
4. **Multi-step Forecasting**: Predict multiple steps ahead
5. **Regime Detection**: Adapt model based on market conditions
6. **Attention Mechanisms**: Add attention layers to focus on important time steps
7. **Transformer Models**: Experiment with Transformer architectures
8. **Walk-Forward Validation**: Implement proper walk-forward analysis for backtesting

### Financial Metrics to Add:
- Sharpe Ratio
- Sortino Ratio
- Maximum Drawdown
- Win Rate
- Profit Factor

### Model Deployment:
- Create a prediction service/API
- Set up model versioning with MLflow
- Implement real-time inference pipeline
- Add model monitoring and retraining schedule