In [1]:
# Core libraries
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Data processing and analysis
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib
from typing import Tuple, List, Optional, NamedTuple
from dataclasses import dataclass, asdict




# Progress tracking and logging
from tqdm import tqdm
import logging

# Random seed
import random

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class DataConfig:
    """Configuration for data preprocessing"""
    seq_length: int = 5
    start_season: int = 2016
    min_pa: int = 50
    input_features: List[str] = None
    train_ratio: float = 0.7
    valid_ratio: float = 0.2
    random_seed: int = 17

class SequenceHandler:
    """Handles creation and padding of sequences for LSTM input"""
    def __init__(self, seq_length: int, feature_dim: int):
        self.seq_length = seq_length
        self.feature_dim = feature_dim
        self.pad_value = 0
        
    def create_sequence(self, player_data: pd.DataFrame, input_features: List[str]) -> Tuple[np.ndarray, torch.Tensor]:
        available_seasons = len(player_data)
        
        if available_seasons >= self.seq_length:
            # Take most recent seasons
            sequence = player_data.iloc[-self.seq_length:][input_features].values
            mask = torch.ones(self.seq_length, dtype=torch.bool)
        else:
            # Create padding
            padding_size = self.seq_length - available_seasons
            real_data = player_data[input_features].values
            padding = np.full((padding_size, len(input_features)), self.pad_value)
            sequence = np.vstack([padding, real_data])
            mask = torch.zeros(self.seq_length, dtype=torch.bool)
            mask[padding_size:] = 1
            
        return sequence, mask
def prepare_sequences(df: pd.DataFrame, 
                     input_features: List[str], 
                     seq_length: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Create sequences for LSTM input"""
    sequences = []
    masks = []
    handler = SequenceHandler(seq_length, len(input_features))
    
    for _, player_data in df.groupby('IDfg'):
        player_data = player_data.sort_values(by='Season')
        
        for i in range(len(player_data) - 1):
            history = player_data.iloc[:i+1]
            target = player_data.iloc[i+1][input_features].values
            
            sequence, mask = handler.create_sequence(history, input_features)
            sequences.append((sequence, target))
            masks.append(mask)
    
    return sequences, masks

def validate_features(df: pd.DataFrame, features: List[str]) -> None:
    """Validate that all required features exist in dataframe"""
    missing_features = [f for f in features if f not in df.columns]
    if missing_features:
        raise ValueError(f"Missing required features: {missing_features}")

def load_and_validate_data(file_path: str, config: DataConfig) -> pd.DataFrame:
    """Load data and perform initial validation"""
    logger.info(f"Loading data from {file_path}")
    try:
        df = pd.read_csv(file_path, low_memory=False)
        validate_features(df, config.input_features)
        return df
    except Exception as e:
        logger.error(f"Error loading data: {str(e)}")
        raise
def split_data(sequences: List[Tuple], 
               masks: List[torch.Tensor],
               train_ratio: float = 0.7,
               valid_ratio: float = 0.2) -> Tuple:
    """Split data into train, validation and test sets"""
    # Validate ratios
    if not 0 < train_ratio + valid_ratio < 1:
        raise ValueError("Train and validation ratios must sum to less than 1")
    
    logger.info("Splitting data into train, validation, and test sets")
    n = len(sequences)
    indices = np.random.permutation(n)
    
    train_size = int(n * train_ratio)
    valid_size = int(n * valid_ratio)
    
    train_indices = indices[:train_size]
    valid_indices = indices[train_size:train_size + valid_size]
    test_indices = indices[train_size + valid_size:]
    
    # Split sequences and masks
    train_data = ([sequences[i] for i in train_indices], [masks[i] for i in train_indices])
    valid_data = ([sequences[i] for i in valid_indices], [masks[i] for i in valid_indices])
    test_data = ([sequences[i] for i in test_indices], [masks[i] for i in test_indices])
    
    logger.info(f"Split sizes - Train: {len(train_indices)}, Valid: {len(valid_indices)}, Test: {len(test_indices)}")
    
    return train_data, valid_data, test_data
def filter_data(df: pd.DataFrame, config: DataConfig) -> pd.DataFrame:
    """Filter data based on configuration"""
    logger.info("Filtering data...")
    initial_size = len(df)
    
    # Filter by season
    df = df[df['Season'] >= config.start_season]
    
    # Filter by minimum PA
    df = df[df['PA'] >= config.min_pa]
    
    # Drop NaN values in input features early
    df = df.dropna(subset=config.input_features)
    
    # Log statistics before filtering
    logger.info("NaN counts before filtering:")
    for col in config.input_features:
        nan_count = df[col].isna().sum()
        if nan_count > 0:
            logger.warning(f"{col}: {nan_count} NaN values")
            
    logger.info(f"Filtered from {initial_size} to {len(df)} rows")
    return df

def convert_column_types(df: pd.DataFrame, features: List[str]) -> pd.DataFrame:
    """Convert columns to float32 for LSTM compatibility"""
    logger.info("Converting column types...")
    
    for col in features:
        try:
            # Convert percentage strings to floats if needed
            if df[col].dtype == object and df[col].str.contains('%').any():
                df[col] = df[col].str.rstrip('%').astype('float32') / 100
            else:
                df[col] = df[col].astype('float32')
        except Exception as e:
            logger.error(f"Error converting column {col}: {str(e)}")
            raise
            
    return df

from sklearn.preprocessing import MinMaxScaler

def scale_features(df: pd.DataFrame, 
                  features: List[str], 
                  scaler: Optional[MinMaxScaler] = None) -> Tuple[pd.DataFrame, MinMaxScaler]:
    """Scale features using MinMaxScaler and add player-specific normalized features"""
    
    # Calculate player career averages
    player_stats = df.groupby('IDfg')[features].transform('mean')
    
    # Create deviation from career average features
    for feature in features:
        df[f'{feature}_vs_career'] = df[feature] - player_stats[feature]
    
    # Combine original and new features
    all_features = features + [f'{feature}_vs_career' for feature in features]
    
    if scaler is None:
        scaler = MinMaxScaler(feature_range=(-1, 1))  # Use (-1,1) range for better neural network training
        scaled_data = scaler.fit_transform(df[all_features])
        joblib.dump(scaler, 'scaler.pkl')
        logger.info(f"Created new MinMaxScaler")
        logger.info(f"Scaler data range: [{scaled_data.min():.4f}, {scaled_data.max():.4f}]")
    else:
        scaled_data = scaler.transform(df[all_features])
    
    # Validate scaled data
    if np.isnan(scaled_data).any():
        raise ValueError("NaN values found after scaling")
    if np.isinf(scaled_data).any():
        raise ValueError("Infinite values found after scaling")
    
    # Update DataFrame with scaled values
    scaled_df = pd.DataFrame(scaled_data, columns=all_features, index=df.index)
    df[all_features] = scaled_df
    
    return df, scaler

def prepare_sequences(df: pd.DataFrame, 
                      input_features: List[str],
                      seq_length: int) -> Tuple[List, List]:
    """Create sequences for LSTM input with padding for shorter histories."""
    sequences = []
    masks = []
    handler = SequenceHandler(seq_length, len(input_features))
    skipped_sequences = 0

    # Convert types before processing
    df = convert_column_types(df, input_features)

    # Sort by 'IDfg' and 'Season' to ensure correct order
    df = df.sort_values(['IDfg', 'Season'])

    for player_id, player_data in df.groupby('IDfg'):
        player_data = player_data.reset_index(drop=True)
        num_seasons = len(player_data)

        if num_seasons < 2:
            continue  # Need at least two seasons to create a sequence

        # Generate sequences starting from the second season
        for i in range(1, num_seasons):
            history = player_data.iloc[:i]
            target = player_data.iloc[i][input_features].values

            if history[input_features].isna().any().any() or pd.isna(target).any():
                skipped_sequences += 1
                continue

            sequence, mask = handler.create_sequence(history, input_features)

            if np.isnan(sequence).any() or np.isinf(sequence).any():
                skipped_sequences += 1
                continue

            sequences.append((sequence, target))
            masks.append(mask)

    logger.info(f"Created {len(sequences)} valid sequences")
    logger.info(f"Skipped {skipped_sequences} sequences due to invalid values")

    if not sequences:
        raise ValueError("No valid sequences created after filtering")

    return sequences, masks

def to_tensor(sequences: List[Tuple], masks: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Convert sequences and masks to PyTorch tensors with validation"""
    sequences_array = np.array([s[0] for s in sequences], dtype=np.float32)
    targets_array = np.array([s[1] for s in sequences], dtype=np.float32)
    
    # Validate arrays before conversion
    if np.isnan(sequences_array).any():
        raise ValueError("NaN values found in input sequences")
    if np.isnan(targets_array).any():
        raise ValueError("NaN values found in target values")
        
    X = torch.FloatTensor(sequences_array)
    y = torch.FloatTensor(targets_array)
    masks = torch.stack(masks)
    
    logger.info(f"Tensor shapes - X: {X.shape}, y: {y.shape}, masks: {masks.shape}")
    logger.info(f"Value ranges - X: [{X.min():.2f}, {X.max():.2f}], y: [{y.min():.2f}, {y.max():.2f}]")
    
    return X, y, masks

def preprocess_data(file_path: str, config: DataConfig) -> Tuple:
    """Main preprocessing function with enhanced validation"""
    try:
        # Set random seeds
        torch.manual_seed(config.random_seed)
        np.random.seed(config.random_seed)
        
        # Load and validate raw data
        df = load_and_validate_data(file_path, config)
        
        # Filter and clean data
        df = filter_data(df, config)
        
        # Scale features before sequence creation
        df, scaler = scale_features(df, config.input_features)
        
        # Create sequences
        sequences, masks = prepare_sequences(df, config.input_features, config.seq_length)
        
        # Split data
        train_data, valid_data, test_data = split_data(sequences, masks, 
                                                      train_ratio=config.train_ratio,
                                                      valid_ratio=config.valid_ratio)
        
        # Convert to tensors
        X_train, y_train, train_masks = to_tensor(*train_data)
        X_valid, y_valid, valid_masks = to_tensor(*valid_data)
        X_test, y_test, test_masks = to_tensor(*test_data)
        
        logger.info(f"Created datasets - Train: {len(X_train)}, Valid: {len(X_valid)}, Test: {len(X_test)}")
        
        return (X_train, y_train, X_valid, y_valid, X_test, y_test, 
                train_masks, valid_masks, test_masks)
    
    except Exception as e:
        logger.error(f"Error in preprocessing: {str(e)}")
        raise

# Usage
config = DataConfig(
    input_features=[
        'Age', 'BB%', 'K%', 'ISO', 'BABIP', 'OBP', 'SLG', 'wOBA', 'wRC+',
        'Barrel%', 'HardHit%', 'EV', 'LA', 'maxEV', 'xBA', 'xSLG', 'xwOBA',
        'GB%', 'FB%', 'LD%', 'Pull%', 'Oppo%', 'O-Swing%', 'Z-Swing%',
        'Contact%', 'SwStr%', 'CSW%', 'TTO%'
    ]
)

data = preprocess_data('../data/mlb_batting_data_2010_2024.csv', config)

INFO:__main__:Loading data from ../data/mlb_batting_data_2010_2024.csv
INFO:__main__:Filtering data...
INFO:__main__:NaN counts before filtering:
INFO:__main__:Filtered from 20503 to 4784 rows
INFO:__main__:Created new MinMaxScaler
INFO:__main__:Scaler data range: [-1.0000, 1.0000]
INFO:__main__:Converting column types...
INFO:__main__:Created 3434 valid sequences
INFO:__main__:Skipped 0 sequences due to invalid values
INFO:__main__:Splitting data into train, validation, and test sets
INFO:__main__:Split sizes - Train: 2403, Valid: 686, Test: 345
INFO:__main__:Tensor shapes - X: torch.Size([2403, 5, 28]), y: torch.Size([2403, 28]), masks: torch.Size([2403, 5])
INFO:__main__:Value ranges - X: [-1.00, 1.00], y: [-1.00, 1.00]
INFO:__main__:Tensor shapes - X: torch.Size([686, 5, 28]), y: torch.Size([686, 28]), masks: torch.Size([686, 5])
INFO:__main__:Value ranges - X: [-1.00, 1.00], y: [-1.00, 1.00]
INFO:__main__:Tensor shapes - X: torch.Size([345, 5, 28]), y: torch.Size([345, 28]), masks

In [3]:
class MultiHeadAttention(nn.Module):
    def __init__(
        self, 
        hidden_size: int,
        num_heads: int = 8,
        dropout: float = 0.1,
        bias: bool = True
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.scaling = self.head_dim ** -0.5
        
        assert self.head_dim * num_heads == hidden_size, "hidden_size must be divisible by num_heads"
        
        # Linear projections
        self.q_proj = nn.Linear(hidden_size, hidden_size, bias=bias)
        self.k_proj = nn.Linear(hidden_size, hidden_size, bias=bias)
        self.v_proj = nn.Linear(hidden_size, hidden_size, bias=bias)
        self.out_proj = nn.Linear(hidden_size, hidden_size, bias=bias)
        
        # Dropout
        self.dropout = nn.Dropout(dropout)
        
        # Initialize parameters
        self._reset_parameters()
    
    def _reset_parameters(self):
        # Use Xavier uniform initialization
        nn.init.xavier_uniform_(self.q_proj.weight)
        nn.init.xavier_uniform_(self.k_proj.weight)
        nn.init.xavier_uniform_(self.v_proj.weight)
        nn.init.xavier_uniform_(self.out_proj.weight)
        if self.q_proj.bias is not None:
            nn.init.zeros_(self.q_proj.bias)
            nn.init.zeros_(self.k_proj.bias)
            nn.init.zeros_(self.v_proj.bias)
            nn.init.zeros_(self.out_proj.bias)
    
    def forward(
        self,
        query: torch.Tensor,
        key: Optional[torch.Tensor] = None,
        value: Optional[torch.Tensor] = None,
        key_padding_mask: Optional[torch.Tensor] = None,
        need_weights: bool = False
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Set key and value to query if not provided
        if key is None:
            key = query
        if value is None:
            value = query
            
        batch_size, seq_len, _ = query.size()
        
        # Project inputs
        q = self.q_proj(query)
        k = self.k_proj(key)
        v = self.v_proj(value)
        
        # Reshape for multi-head attention
        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        
        # Compute attention scores
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) * self.scaling
        
        # Apply key padding mask if provided
        if key_padding_mask is not None:
            attn_weights = attn_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                float('-inf')
            )
        
        # Apply softmax and dropout
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_weights = self.dropout(attn_weights)
        
        # Get attention output
        attn_output = torch.matmul(attn_weights, v)
        
        # Reshape and project output
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, seq_len, self.hidden_size)
        attn_output = self.out_proj(attn_output)
        
        if need_weights:
            return attn_output, attn_weights
        return attn_output, None

In [4]:
class ResidualBlock(nn.Module):
    def __init__(self, hidden_size: int, dropout: float = 0.1):
        super().__init__()
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.layers = nn.Sequential(
            nn.Linear(hidden_size, hidden_size * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size * 4, hidden_size)
        )
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.layers(self.layer_norm(x))

class ImprovedLSTM(nn.Module):
    def __init__(
        self, 
        input_size: int,
        hidden_size: int,
        num_layers: int,
        output_size: int,
        dropout: float = 0.4,
        bidirectional: bool = True,
        num_heads: int = 8
    ):
        super().__init__()
        
        # Reduce model complexity
        self.hidden_size = hidden_size // 2  # Reduce hidden size
        self.num_layers = 2  # Reduce from 4 to 2 layers
        self.directions = 2 if bidirectional else 1
        
        # Add batch normalization after input
        self.input_proj = nn.Linear(input_size, self.hidden_size)
        
        # LSTM layers with better gradient flow
        self.lstm_layers = nn.ModuleList([
            nn.ModuleDict({
                'lstm': nn.LSTM(
                    self.hidden_size if i == 0 else self.hidden_size * self.directions,
                    self.hidden_size,
                    1,
                    batch_first=True,
                    bidirectional=bidirectional,
                    dropout=0
                ),
                'norm': nn.LayerNorm(self.hidden_size * self.directions),
                'dropout': nn.Dropout(dropout/2)  # Reduce dropout
            }) for i in range(self.num_layers)
        ])
        
        # Keep existing attention and output layers
        self.attention = MultiHeadAttention(
            self.hidden_size * self.directions,
            num_heads=4,  # Reduce number of heads
            dropout=dropout/2
        )
        
        # Add skip connection in feature mixer
        self.feature_mixer = nn.Sequential(
            nn.Linear(self.hidden_size * self.directions, self.hidden_size * 2),
            nn.GELU(),
            nn.Dropout(dropout/2),
            nn.Linear(self.hidden_size * 2, self.hidden_size * self.directions)
        )
        
        # Simplify output layers but keep structure
        self.output_layers = nn.ModuleList([
            ResidualBlock(self.hidden_size * self.directions, dropout/2)
            for _ in range(2)  # Reduce from 3 to 2
        ])
        
        self.final_norm = nn.LayerNorm(self.hidden_size * self.directions)
        self.final_layer = nn.Linear(self.hidden_size * self.directions, output_size)

    
    def forward(
        self,
        x: torch.Tensor,
        lengths: torch.Tensor
    ) -> torch.Tensor:
        batch_size, seq_len, _ = x.size()
        
        # Input processing
        x = self.input_proj(x)
        
        # Create padding mask
        mask = torch.arange(seq_len, device=x.device)[None, :] < lengths[:, None]
        
        # Process LSTM layers
        for layer in self.lstm_layers:
            # Pack padded sequence
            packed = pack_padded_sequence(
                x, lengths.cpu(),
                batch_first=True,
                enforce_sorted=False
            )
            
            # LSTM forward pass
            packed_output, _ = layer['lstm'](packed)
            lstm_out, _ = pad_packed_sequence(
                packed_output,
                batch_first=True,
                total_length=seq_len
            )
            
            # Apply normalization and dropout
            lstm_out = layer['norm'](lstm_out)
            lstm_out = layer['dropout'](lstm_out)
            
            # Residual connection if shapes match
            if lstm_out.size(-1) == x.size(-1):
                x = x + lstm_out
            else:
                x = lstm_out
        
        # Apply attention mechanism
        attended, _ = self.attention(
            query=x,
            key=x,
            value=x,
            key_padding_mask=~mask  # Invert mask for key_padding_mask
        )
        
        # Mix features with residual connection
        mixed = self.feature_mixer(attended)
        mixed = mixed + attended
        
        # Process through output layers
        output = mixed
        for layer in self.output_layers:
            output = layer(output)
        
        # Final normalization
        output = self.final_norm(output)
        
        # Get final states using sequence lengths
        batch_indices = torch.arange(batch_size, device=output.device)
        final_states = output[batch_indices, lengths - 1]
        
        # Project to output size
        return self.final_layer(final_states)

In [5]:
# Data Loading and Configuration Cell

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class DataBatch(NamedTuple):
    """Container for training data batches."""
    train: TensorDataset
    valid: TensorDataset
    test: TensorDataset

@dataclass
class Config:
    """Advanced configuration for LSTM-based baseball statistics prediction."""
    
    # Dynamic sizes from data
    input_size: int = None
    output_size: int = None
    
    # Model Architecture 
    hidden_size: int = 512
    num_layers: int = 4     
    num_heads: int = 8
    bidirectional: bool = True
    attention_dropout: float = 0.1
    residual_dropout: float = 0.2
    layer_norm_eps: float = 1e-5
    
    # Training Parameters
    batch_size: int = 32
    dropout: float = 0.3    # Reduced from 0.4
    learning_rate: float = 1e-3  # Increased from 3e-4
    weight_decay: float = 1e-5
    gradient_clip: float = 1.0    # Increased from 0.5
    num_epochs: int = 20
    warmup_epochs: int = 5
    
    # Learning Rate Schedule
    lr_schedule: str = 'cosine'
    min_lr: float = 1e-6
    lr_decay_rate: float = 0.1
    lr_patience: int = 5
    
    # Early Stopping
    early_stopping_patience: int = 10
    early_stopping_min_delta: float = 1e-4
    
    # Hardware Optimization
    mixed_precision: bool = True
    num_workers: int = 0  # For Windows
    pin_memory: bool = True
    
    # Logging
    log_interval: int = 100
    checkpoint_interval: int = 1
    
    def __init__(self, X_train: torch.Tensor, y_train: torch.Tensor):
        """Initialize and validate configuration with data dimensions."""
        self.input_size = X_train.shape[2]
        self.output_size = y_train.shape[1]
        self._validate_config()
        self._log_config()
    
    def _validate_config(self) -> None:
        """Validate configuration parameters."""
        assert self.hidden_size % self.num_heads == 0, \
            "Hidden size must be divisible by number of attention heads"
        assert self.hidden_size >= self.input_size, \
            "Hidden size must be greater than or equal to input size"
        assert 0 <= self.dropout <= 1, "Dropout must be between 0 and 1"
        assert self.num_layers >= 1, "Must have at least one LSTM layer"
        assert self.batch_size > 0, "Batch size must be positive"
        assert self.learning_rate > 0, "Learning rate must be positive"
        assert self.lr_schedule in ['cosine', 'linear', 'exponential'], \
            "Invalid learning rate schedule"
    
    def _log_config(self) -> None:
        """Log configuration parameters."""
        logger.info("Model Configuration:")
        for key, value in asdict(self).items():
            logger.info(f"{key}: {value}")
    
    @property
    def device(self) -> torch.device:
        """Get appropriate device for training."""
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def create_data_loaders(data_tuple: tuple) -> DataBatch:
    """Create DataLoader objects from preprocessed data tuple."""
    try:
        X_train, y_train, X_valid, y_valid, X_test, y_test, \
        train_masks, valid_masks, test_masks = data_tuple
        
        # Create datasets
        train_dataset = TensorDataset(X_train, train_masks, y_train)
        valid_dataset = TensorDataset(X_valid, valid_masks, y_valid)
        test_dataset = TensorDataset(X_test, test_masks, y_test)
        
        return DataBatch(train_dataset, valid_dataset, test_dataset)
    
    except ValueError as e:
        logger.error(f"Error unpacking data: {str(e)}")
        raise

# Initialize everything
try:
    # Create data loaders
    data_batch = create_data_loaders(data)
    
    # Initialize config
    config = Config(data_batch.train.tensors[0], data_batch.train.tensors[2])
    
    # Create DataLoaders
    train_loader = DataLoader(
        data_batch.train,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory
    )
    
    valid_loader = DataLoader(
        data_batch.valid,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory
    )
    
    test_loader = DataLoader(
        data_batch.test,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory
    )

    # Initialize model
    model = ImprovedLSTM(
        input_size=config.input_size,
        hidden_size=config.hidden_size,
        num_layers=config.num_layers,
        output_size=config.output_size,
        dropout=config.dropout,
        bidirectional=config.bidirectional,
        num_heads=config.num_heads
    ).to(config.device)
    
    # Initialize optimizer
    optimizer = optim.AdamW(
        model.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay
    )
    
    # Initialize learning rate scheduler
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=config.learning_rate,
        epochs=config.num_epochs,
        steps_per_epoch=len(train_loader),
        pct_start=config.warmup_epochs / config.num_epochs,
        anneal_strategy='cos',
        final_div_factor=1e3
    )
    
    # Initialize loss function
    criterion = nn.MSELoss()
    
    logger.info(f"Data loaded successfully. Device: {config.device}")
    logger.info(f"Model initialized with {sum(p.numel() for p in model.parameters())} parameters")
    logger.info(f"Training batches: {len(train_loader)}")
    logger.info(f"Validation batches: {len(valid_loader)}")
    logger.info(f"Test batches: {len(test_loader)}")

except Exception as e:
    logger.error(f"Error during initialization: {str(e)}")
    raise

INFO:__main__:Model Configuration:
INFO:__main__:input_size: 28


INFO:__main__:output_size: 28
INFO:__main__:hidden_size: 512
INFO:__main__:num_layers: 4
INFO:__main__:num_heads: 8
INFO:__main__:bidirectional: True
INFO:__main__:attention_dropout: 0.1
INFO:__main__:residual_dropout: 0.2
INFO:__main__:layer_norm_eps: 1e-05
INFO:__main__:batch_size: 32
INFO:__main__:dropout: 0.3
INFO:__main__:learning_rate: 0.001
INFO:__main__:weight_decay: 1e-05
INFO:__main__:gradient_clip: 1.0
INFO:__main__:num_epochs: 20
INFO:__main__:warmup_epochs: 5
INFO:__main__:lr_schedule: cosine
INFO:__main__:min_lr: 1e-06
INFO:__main__:lr_decay_rate: 0.1
INFO:__main__:lr_patience: 5
INFO:__main__:early_stopping_patience: 10
INFO:__main__:early_stopping_min_delta: 0.0001
INFO:__main__:mixed_precision: True
INFO:__main__:num_workers: 0
INFO:__main__:pin_memory: True
INFO:__main__:log_interval: 100
INFO:__main__:checkpoint_interval: 1
INFO:__main__:Data loaded successfully. Device: cuda
INFO:__main__:Model initialized with 8431900 parameters
INFO:__main__:Training batches: 76
I

In [6]:
def train_model(
    model: nn.Module,
    train_loader: DataLoader,
    valid_loader: DataLoader,
    config: Config,
    optimizer: optim.Optimizer,
    scheduler: optim.lr_scheduler._LRScheduler,
    criterion: nn.Module,
    checkpoint_dir: str = './checkpoints'
) -> dict:
    """Train LSTM model with advanced optimizations and monitoring."""
    import os
    
    logger.info(f"Starting training on device: {config.device}")
    model = model.to(config.device)
    
    # Mixed precision training
    scaler = torch.cuda.amp.GradScaler(enabled=config.mixed_precision)
    
    # Training state tracking
    best_val_loss = float('inf')
    early_stopping_counter = 0
    train_metrics = {
        'train_losses': [],
        'val_losses': [],
        'learning_rates': [],
        'best_epoch': 0
    }
    
    # Create checkpoint directory
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    for epoch in range(config.num_epochs):
        # Training phase
        model.train()
        epoch_loss = 0.0
        
        with tqdm(train_loader, desc=f'Epoch {epoch+1}/{config.num_epochs}') as pbar:
            for batch_idx, (data, masks, targets) in enumerate(pbar):
                try:
                    # Move data to device
                    data = data.to(config.device)
                    masks = masks.to(config.device)
                    targets = targets.to(config.device)
                    
                    # Calculate sequence lengths from masks
                    lengths = masks.sum(1).clamp(min=1)  # Ensure minimum length of 1
                    
                    # Debug info
                    if batch_idx == 0 and epoch == 0:
                        logger.info(f"Batch shapes - Data: {data.shape}, Masks: {masks.shape}, "
                                  f"Targets: {targets.shape}, Lengths: {lengths.shape}")
                    
                    # Forward pass with mixed precision
                    with torch.cuda.amp.autocast(enabled=config.mixed_precision):
                        outputs = model(data, lengths)
                        if batch_idx == 0 and epoch == 0:
                            logger.info(f"Output shape: {outputs.shape}")
                        loss = criterion(outputs, targets)
                    
                    # Backward pass with gradient scaling
                    optimizer.zero_grad(set_to_none=True)
                    scaler.scale(loss).backward()
                    
                    # Gradient clipping
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.gradient_clip)
                    
                    # Optimizer step with scaler
                    scaler.step(optimizer)
                    scaler.update()
                    scheduler.step()
                    
                    # Update metrics
                    epoch_loss += loss.item()
                    current_lr = scheduler.get_last_lr()[0]
                    
                    # Update progress bar
                    pbar.set_postfix({
                        'loss': f'{loss.item():.3f}',
                        'lr': f'{current_lr:.2e}'
                    })
                    
                except RuntimeError as e:
                    logger.error(f"Error in batch {batch_idx}: {str(e)}")
                    logger.error(f"Data shapes - Input: {data.shape}, Mask: {masks.shape}, "
                               f"Target: {targets.shape}, Lengths: {lengths.shape}")
                    raise
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        
        with torch.no_grad():
            for data, masks, targets in valid_loader:
                try:
                    data = data.to(config.device)
                    masks = masks.to(config.device)
                    targets = targets.to(config.device)
                    lengths = masks.sum(1).clamp(min=1)
                    
                    with torch.cuda.amp.autocast(enabled=config.mixed_precision):
                        outputs = model(data, lengths)
                        loss = criterion(outputs, targets)
                        val_loss += loss.item()
                        
                except RuntimeError as e:
                    logger.error(f"Error in validation: {str(e)}")
                    raise
        
        # Calculate epoch metrics
        epoch_loss /= len(train_loader)
        val_loss /= len(valid_loader)
        
        # Update training metrics
        train_metrics['train_losses'].append(epoch_loss)
        train_metrics['val_losses'].append(val_loss)
        train_metrics['learning_rates'].append(current_lr)
        
        # Model checkpointing
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            train_metrics['best_epoch'] = epoch
            early_stopping_counter = 0
            
            # Save checkpoint
            checkpoint_path = os.path.join(checkpoint_dir, 'best_model.pth')
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'val_loss': val_loss,
                'config': asdict(config),
                'metrics': train_metrics,
                'scaler_state_dict': scaler.state_dict()
            }, checkpoint_path)
            
            logger.info(f'New best model saved with validation loss: {val_loss:.4f}')
        else:
            early_stopping_counter += 1
        
        # Log epoch metrics
        logger.info(
            f'Epoch {epoch+1}: '
            f'Train Loss = {epoch_loss:.4f}, '
            f'Val Loss = {val_loss:.4f}, '
            f'LR = {current_lr:.2e}'
        )
        
        # Early stopping check
        if early_stopping_counter >= config.early_stopping_patience:
            logger.info(f'Early stopping triggered after {epoch+1} epochs')
            break
    
    return train_metrics

# Initialize training
try:
    metrics = train_model(
        model=model,
        train_loader=train_loader,
        valid_loader=valid_loader,
        config=config,
        optimizer=optimizer,
        scheduler=scheduler,
        criterion=criterion
    )
except Exception as e:
    logger.error(f"Training failed: {str(e)}")
    raise

INFO:__main__:Starting training on device: cuda
Epoch 1/20:   0%|          | 0/76 [00:00<?, ?it/s]

INFO:__main__:Batch shapes - Data: torch.Size([32, 5, 28]), Masks: torch.Size([32, 5]), Targets: torch.Size([32, 28]), Lengths: torch.Size([32])
INFO:__main__:Output shape: torch.Size([32, 28])
Epoch 1/20: 100%|██████████| 76/76 [00:09<00:00,  8.30it/s, loss=0.030, lr=1.32e-04]
INFO:__main__:New best model saved with validation loss: 0.0443
INFO:__main__:Epoch 1: Train Loss = 0.0896, Val Loss = 0.0443, LR = 1.32e-04
Epoch 2/20: 100%|██████████| 76/76 [00:09<00:00,  8.20it/s, loss=0.057, lr=3.73e-04]
INFO:__main__:Epoch 2: Train Loss = 0.0508, Val Loss = 0.0488, LR = 3.73e-04
Epoch 3/20: 100%|██████████| 76/76 [00:08<00:00,  8.50it/s, loss=0.068, lr=6.71e-04]
INFO:__main__:Epoch 3: Train Loss = 0.0503, Val Loss = 0.0686, LR = 6.71e-04
Epoch 4/20: 100%|██████████| 76/76 [00:08<00:00,  8.56it/s, loss=0.028, lr=9.10e-04]
INFO:__main__:Epoch 4: Train Loss = 0.0498, Val Loss = 0.0574, LR = 9.10e-04
Epoch 5/20: 100%|██████████| 76/76 [00:09<00:00,  8.29it/s, loss=0.032, lr=1.00e-03]
INFO:__ma

In [10]:
def load_model_from_checkpoint(checkpoint_path: str, data_config, device: torch.device) -> nn.Module:
    """Load model with proper error handling and validation"""
    try:
        logger.info(f"Loading model from {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path, map_location=device)
        
        # Initialize model with correct parameters
        model = ImprovedLSTM(
            input_size=len(data_config.input_features),
            hidden_size=512,
            num_layers=4,
            output_size=len(data_config.input_features),
            dropout=0.2,
            bidirectional=True,
            num_heads=8
        ).to(device)
        
        # Load state dict
        model.load_state_dict(checkpoint['model_state_dict'])
        model.eval()
        
        # Validate model configuration
        if hasattr(checkpoint, 'config'):
            logger.info(f"Loaded model config: {checkpoint['config']}")
        
        return model
        
    except FileNotFoundError:
        logger.error(f"Checkpoint file not found: {checkpoint_path}")
        raise
    except KeyError as e:
        logger.error(f"Invalid checkpoint structure: {str(e)}")
        raise
    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise

def predict_future_stats(player_id, input_features, model, scaler, raw_df, player_names):
    """Predict future stats for a given player"""
    # Get player data
    player_data = raw_df[raw_df['IDfg'] == player_id].sort_values('Season')
    if len(player_data) < 3:
        logger.warning(f"Insufficient data for player {player_id}")
        return None
        
    # Get player info
    player_name = player_names[player_names['IDfg'] == player_id]['Name'].iloc[0]
    last_season = player_data['Season'].max()
    last_age = player_data[player_data['Season'] == last_season]['Age'].iloc[0]
    
    logger.info(f"\nGenerating predictions for {player_name}")
    logger.info(f"Last season: {last_season}, Last age: {last_age}")
    
    # Get device from model
    device = next(model.parameters()).device
    
    # Get most recent sequence and calculate career stats
    recent_data = player_data[input_features].iloc[-3:]
    career_stats = player_data[input_features].mean()
    
    # Create enhanced features with career stats
    enhanced_features = []
    for idx, row in recent_data.iterrows():
        # Combine original features with career deviation
        career_dev = row - career_stats
        combined = np.concatenate([row.values, career_dev.values])
        enhanced_features.append(combined)
    
    sequence = np.array(enhanced_features)
    
    # Scale the sequence
    sequence_scaled = scaler.transform(sequence)
    # Take only the first 28 features (original features without career stats)
    sequence_scaled = sequence_scaled[:, :28]
    
    mask = torch.ones(3, dtype=torch.bool, device=device)
    
    predictions = []
    with torch.no_grad():
        # Change range from (1,4) to (1,4) to generate 2025-2027
        for year_offset in range(1, 4):
            current_year = last_season + year_offset
            current_age = last_age + year_offset
            
            data = torch.FloatTensor(sequence_scaled).unsqueeze(0).to(device)
            lengths = torch.tensor([3], dtype=torch.int64, device=device)
            
            output = model(data, lengths)
            pred_numpy = output.cpu().numpy()[0]
            
            # Pad predictions with zeros for career stats before inverse transform
            pred_padded = np.pad(pred_numpy, (0, 28), 'constant')
            unscaled_pred = scaler.inverse_transform(pred_padded.reshape(1, -1))[0][:28]
            
            prediction = {
                'Name': player_name,
                'Age': current_age,
                'Year': current_year,
                'IDfg': player_id
            }
            
            for i, feature in enumerate(input_features):
                prediction[feature] = unscaled_pred[i]
            
            predictions.append(prediction)
            
            # Update sequence for next prediction
            sequence_scaled = np.vstack([sequence_scaled[1:], pred_numpy])
    
    return predictions
def predict_all_2024_players(raw_df, player_names, model, scaler, input_features):
    """Predict 2025-2027 stats for all 2024 players"""
    logger.info("Starting predictions for all 2024 players")
    
    # Get all players from 2024 with minimum PA
    players_2024 = raw_df[
        (raw_df['Season'] == 2024) & 
        (raw_df['PA'] >= 100)
    ]['IDfg'].unique()
    
    logger.info(f"Found {len(players_2024)} players from 2024")
    
    all_predictions = []
    
    for player_id in tqdm(players_2024):
        try:
            predictions = predict_future_stats(
                player_id=player_id,
                input_features=input_features,
                model=model,
                scaler=scaler,
                raw_df=raw_df,
                player_names=player_names
            )
            if predictions:
                all_predictions.extend(predictions)
        except Exception as e:
            logger.error(f"Error predicting for player {player_id}: {str(e)}")
            continue
            
    if all_predictions:
        # Convert to DataFrame and save
        predictions_df = pd.DataFrame(all_predictions)
        predictions_df.to_csv('predictions_2025_2027.csv', index=False)
        logger.info(f"Saved predictions for {len(players_2024)} players")
    else:
        logger.warning("No predictions were generated")
    
    # Convert to DataFrame
    predictions_df = pd.DataFrame(all_predictions)
    
    # Sort by year first, then wRC+ within each year
    predictions_df = predictions_df.sort_values(
        ['Year', 'wRC+'], 
        ascending=[True, False]
    )
    
    # Export to CSV
    output_path = '../data/predictions_2025_2027.csv'
    predictions_df.to_csv(output_path, index=False)
    logger.info(f"Predictions exported to {output_path}")
    
    return predictions_df

raw_df = pd.read_csv('../data/mlb_batting_data_2010_2024.csv')
player_names = pd.read_csv('../data/player_names.csv')
scaler = joblib.load('scaler.pkl')

# Recreate data_config
data_config = DataConfig(
    input_features=[
        'Age', 'BB%', 'K%', 'ISO', 'BABIP', 'OBP', 'SLG', 'wOBA', 'wRC+',
        'Barrel%', 'HardHit%', 'EV', 'LA', 'maxEV', 'xBA', 'xSLG', 'xwOBA',
        'GB%', 'FB%', 'LD%', 'Pull%', 'Oppo%', 'O-Swing%', 'Z-Swing%',
        'Contact%', 'SwStr%', 'CSW%', 'TTO%'
    ]
)

# Load model
model = load_model_from_checkpoint(
    checkpoint_path='checkpoints/best_model.pth',
    data_config=data_config,
    device=device
)

predictions_df = predict_all_2024_players(
    raw_df=raw_df,
    player_names=player_names,
    model=model,
    scaler=scaler,
    input_features=data_config.input_features
)

# Display top 10 predicted performers for 2024
print("\nTop 10 Predicted Performers for 2024:")
print(predictions_df[predictions_df['Year'] == 2024][
    ['Name', 'Age', 'wRC+']
].head(10))

INFO:__main__:Loading model from checkpoints/best_model.pth
INFO:__main__:Starting predictions for all 2024 players
INFO:__main__:Found 455 players from 2024
  0%|          | 0/455 [00:00<?, ?it/s]INFO:__main__:
Generating predictions for Aaron Judge
INFO:__main__:Last season: 2024, Last age: 32
INFO:__main__:
Generating predictions for Bobby Witt Jr.
INFO:__main__:Last season: 2024, Last age: 24
  0%|          | 2/455 [00:00<00:36, 12.34it/s]INFO:__main__:
Generating predictions for Shohei Ohtani
INFO:__main__:Last season: 2024, Last age: 29
INFO:__main__:
Generating predictions for Juan Soto
INFO:__main__:Last season: 2024, Last age: 25
INFO:__main__:
Generating predictions for Gunnar Henderson
INFO:__main__:Last season: 2024, Last age: 23
  1%|          | 5/455 [00:00<00:24, 18.23it/s]INFO:__main__:
Generating predictions for Francisco Lindor
INFO:__main__:Last season: 2024, Last age: 30
INFO:__main__:
Generating predictions for Jarren Duran
INFO:__main__:Last season: 2024, Last age


Top 10 Predicted Performers for 2024:
Empty DataFrame
Columns: [Name, Age, wRC+]
Index: []
