In [111]:
# Core libraries
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Data processing and analysis
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib
from typing import Tuple, List, Optional, NamedTuple, Dict, Any
from dataclasses import dataclass, asdict, field




# Progress tracking and logging
from tqdm import tqdm
import logging

# Random seed
import random

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [112]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class PositionGroups:
    INFIELDERS = ['1B', '2B', '3B', 'SS']
    OUTFIELDERS = ['LF', 'CF', 'RF']
    CATCHERS = ['C']

    # Position-specific features
    INFIELD_FEATURES = ['Age', 'DRS/150', 'UZR/150', 'OAA/150','RngR/150', 'ErrR/150', 'DPR/150', 'Inn']
    OUTFIELD_FEATURES = ['Age', 'DRS/150', 'UZR/150', 'OAA/150', 'ARM/150', 'RngR/150', 'Inn'] 
    CATCHER_FEATURES = ['Age', 'DRS/150', 'FRM/150', 'rSB/150', 'rCERA/150', 'Inn']
@dataclass
class DataConfig:
    position_group: str
    seq_length: int = 6
    start_season: int = 2016
    min_innings: int = 50
    input_features: List[str] = None
    train_ratio: float = 0.7
    valid_ratio: float = 0.2
    random_seed: int = 42
    
    def __post_init__(self):
        # Set features based on position group
        if self.position_group == 'INFIELD':
            self.valid_positions = PositionGroups.INFIELDERS
            self.input_features = PositionGroups.INFIELD_FEATURES
        elif self.position_group == 'OUTFIELD':
            self.valid_positions = PositionGroups.OUTFIELDERS
            self.input_features = PositionGroups.OUTFIELD_FEATURES
        elif self.position_group == 'CATCHER':
            self.valid_positions = PositionGroups.CATCHERS
            self.input_features = PositionGroups.CATCHER_FEATURES

class SequenceHandler:
    """Handles creation and padding of sequences for LSTM input"""
    def __init__(self, seq_length: int, feature_dim: int):
        self.seq_length = seq_length
        self.feature_dim = feature_dim
        self.pad_value = 0
        
    def create_sequence(self, player_data: pd.DataFrame, input_features: List[str]) -> Tuple[np.ndarray, torch.Tensor]:
        available_seasons = len(player_data)
        
        if available_seasons >= self.seq_length:
            # Take most recent seasons
            sequence = player_data.iloc[-self.seq_length:][input_features].values
            mask = torch.ones(self.seq_length, dtype=torch.bool)
        else:
            # Create padding
            padding_size = self.seq_length - available_seasons
            real_data = player_data[input_features].values
            padding = np.full((padding_size, len(input_features)), self.pad_value)
            sequence = np.vstack([padding, real_data])
            mask = torch.zeros(self.seq_length, dtype=torch.bool)
            mask[padding_size:] = 1
            
        return sequence, mask
def prepare_sequences(df: pd.DataFrame, 
                     input_features: List[str],
                     seq_length: int,
                     valid_positions: List[str]) -> Tuple[List, List]:
    """Create sequences for LSTM input with better handling of incomplete seasons"""
    sequences = []
    masks = []
    handler = SequenceHandler(seq_length, len(input_features))
    skipped_sequences = 0

    # Group by player and position
    for (player_id, pos), player_data in df.groupby(['IDfg', 'Pos']):
        if pos not in valid_positions:
            continue
            
        # Remove seasons with any NaN values in features
        player_data = player_data.dropna(subset=input_features)
        player_data = player_data.sort_values('Season')
        num_seasons = len(player_data)

        if num_seasons < 1:  # Need at least one season
            continue

        # Generate sequences for each valid target season
        for i in range(1, num_seasons):
            history = player_data.iloc[:i]
            target = player_data.iloc[i][input_features].values

            # Create sequence and mask using handler (will handle padding automatically)
            sequence, mask = handler.create_sequence(history, input_features)

            if np.isnan(sequence).any() or np.isinf(sequence).any():
                skipped_sequences += 1
                continue

            sequences.append({
                'player_id': player_id,
                'position': pos,
                'sequence': sequence,
                'target': target
            })
            masks.append(mask)

    logger.info(f"Created {len(sequences)} valid sequences")
    logger.info(f"Skipped {skipped_sequences} sequences due to invalid values")

    return sequences, masks

def calculate_rate_stats(df: pd.DataFrame, position_group: str) -> pd.DataFrame:
    """Calculate position-specific rate statistics"""
    df = df.copy()
    
    # Base rates per 150 innings for all positions
    rate_stats = {
        'DRS': 'DRS/150'
    }
    
    # Add position-specific rates
    if position_group == 'INFIELD':
        rate_stats.update({
            'RngR': 'RngR/150',
            'ErrR': 'ErrR/150',
            'DPR': 'DPR/150',
            'UZR': 'UZR/150',
            'OAA': 'OAA/150'
        })
    elif position_group == 'OUTFIELD':
        rate_stats.update({
            'RngR': 'RngR/150',
            'ARM': 'ARM/150',
            'UZR': 'UZR/150',
            'OAA': 'OAA/150'
        })
    elif position_group == 'CATCHER':
        rate_stats.update({
            'FRM': 'FRM/150',
            'rSB': 'rSB/150',
            'rCERA': 'rCERA/150'
        })
    
    # Calculate rates
    for raw_stat, rate_stat in rate_stats.items():
        if raw_stat in df.columns:
            df[rate_stat] = df[raw_stat] / df['G'] * 150
            
    return df

def validate_features(df: pd.DataFrame, features: List[str]) -> None:
    """Validate that all required features exist in dataframe"""
    missing_features = [f for f in features if f not in df.columns]
    if missing_features:
        raise ValueError(f"Missing required features: {missing_features}")

def load_and_validate_data(file_path: str, config: DataConfig) -> pd.DataFrame:
    """Load data and perform initial validation"""
    logger.info(f"Loading data from {file_path}")
    try:
        df = pd.read_csv(file_path, low_memory=False)
        # Calculate rate stats before validation
        df = calculate_rate_stats(df, config.position_group)
        # Then validate all features including new rate stats
        validate_features(df, config.input_features)
        return df
    except Exception as e:
        logger.error(f"Error loading data: {str(e)}")
        raise
def split_data(sequences: List[Tuple], 
               masks: List[torch.Tensor],
               train_ratio: float = 0.7,
               valid_ratio: float = 0.2) -> Tuple:
    """Split data into train, validation and test sets"""
    # Validate ratios
    if not 0 < train_ratio + valid_ratio < 1:
        raise ValueError("Train and validation ratios must sum to less than 1")
    
    logger.info("Splitting data into train, validation, and test sets")
    n = len(sequences)
    indices = np.random.permutation(n)
    
    train_size = int(n * train_ratio)
    valid_size = int(n * valid_ratio)
    
    train_indices = indices[:train_size]
    valid_indices = indices[train_size:train_size + valid_size]
    test_indices = indices[train_size + valid_size:]
    
    # Split sequences and masks
    train_data = ([sequences[i] for i in train_indices], [masks[i] for i in train_indices])
    valid_data = ([sequences[i] for i in valid_indices], [masks[i] for i in valid_indices])
    test_data = ([sequences[i] for i in test_indices], [masks[i] for i in test_indices])
    
    logger.info(f"Split sizes - Train: {len(train_indices)}, Valid: {len(valid_indices)}, Test: {len(test_indices)}")
    
    return train_data, valid_data, test_data
def filter_data(df: pd.DataFrame, config: DataConfig) -> pd.DataFrame:
    """Filter data based on position groups"""
    logger.info(f"Filtering data for {config.position_group}")
    initial_size = len(df)
    
    # Filter by season and position
    df = df[
        (df['Season'] >= config.start_season) & 
        (df['Pos'].isin(config.valid_positions))
    ]
    
    # Position-specific minimum innings
    min_innings = config.min_innings
    if config.position_group == 'CATCHER':
        min_innings = min_innings * 0.8  # Adjust for catchers
    
    df = df[df['Inn'] >= min_innings]
    df = calculate_rate_stats(df, config.position_group)
    df = df.dropna(subset=config.input_features)
    
    logger.info(f"Position distribution:")
    for pos in config.valid_positions:
        count = len(df[df['Pos'] == pos])
        logger.info(f"{pos}: {count} records")
    
    return df

def convert_column_types(df: pd.DataFrame, features: List[str]) -> pd.DataFrame:
    """Convert columns to float32 for LSTM compatibility"""
    logger.info("Converting column types...")
    
    for col in features:
        try:
            # Convert percentage strings to floats if needed
            if df[col].dtype == object and df[col].str.contains('%').any():
                df[col] = df[col].str.rstrip('%').astype('float32') / 100
            else:
                df[col] = df[col].astype('float32')
        except Exception as e:
            logger.error(f"Error converting column {col}: {str(e)}")
            raise
            
    return df

from sklearn.preprocessing import MinMaxScaler

def scale_features(df: pd.DataFrame, 
                  features: List[str], 
                  position_group: str,
                  scaler: Optional[MinMaxScaler] = None) -> Tuple[pd.DataFrame, MinMaxScaler]:
    """Scale features using MinMaxScaler with position-specific career averages"""
    
    # Calculate position-specific career averages
    player_pos_stats = df.groupby(['IDfg', 'Pos'])[features].transform('mean')
    
    # Create deviation from career average features
    for feature in features:
        df[f'{feature}_vs_career'] = df[feature] - player_pos_stats[feature]
    
    all_features = features + [f'{feature}_vs_career' for feature in features]
    
    if scaler is None:
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaled_data = scaler.fit_transform(df[all_features])
        joblib.dump(scaler, f'fielding_scaler_{position_group.lower()}.pkl')
        logger.info(f"Created new MinMaxScaler for {position_group}")
    else:
        scaled_data = scaler.transform(df[all_features])
    
    scaled_df = pd.DataFrame(scaled_data, columns=all_features, index=df.index)
    df[all_features] = scaled_df
    
    return df, scaler


def to_tensor(sequences: List[Dict], masks: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Convert sequences and masks to PyTorch tensors with validation"""
    sequences_array = np.array([s['sequence'] for s in sequences], dtype=np.float32)
    targets_array = np.array([s['target'] for s in sequences], dtype=np.float32)
    
    # Validate arrays before conversion
    if np.isnan(sequences_array).any():
        raise ValueError("NaN values found in input sequences")
    if np.isnan(targets_array).any():
        raise ValueError("NaN values found in target values")
        
    X = torch.FloatTensor(sequences_array)
    y = torch.FloatTensor(targets_array)
    masks = torch.stack(masks)
    
    logger.info(f"Tensor shapes - X: {X.shape}, y: {y.shape}, masks: {masks.shape}")
    logger.info(f"Value ranges - X: [{X.min():.2f}, {X.max():.2f}], y: [{y.min():.2f}, {y.max():.2f}]")
    
    return X, y, masks

def preprocess_data(file_path: str, config: DataConfig) -> Dict[str, Tuple]:
    """Process data for a specific position group"""
    try:
        # Load raw data once
        df = load_and_validate_data(file_path, config)
        
        # Filter data for this position group
        df = filter_data(df, config)
        
        # Scale features with position-specific scaler
        df, scaler = scale_features(
            df, 
            config.input_features,
            config.position_group
        )
        
        # Create sequences
        sequences, masks = prepare_sequences(
            df, 
            config.input_features, 
            config.seq_length,
            config.valid_positions
        )
        
        # Split and convert to tensors
        train_data, valid_data, test_data = split_data(sequences, masks)
        X_train, y_train, train_masks = to_tensor(*train_data)
        X_valid, y_valid, valid_masks = to_tensor(*valid_data)
        X_test, y_test, test_masks = to_tensor(*test_data)
        
        logger.info(f"{config.position_group} datasets - Train: {len(X_train)}, Valid: {len(X_valid)}, Test: {len(X_test)}")
        
        return (X_train, y_train, X_valid, y_valid, X_test, y_test,
                train_masks, valid_masks, test_masks)
        
    except Exception as e:
        logger.error(f"Error in preprocessing: {str(e)}")
        raise
        
    except Exception as e:
        logger.error(f"Error in preprocessing: {str(e)}")
        raise


# Initialize configs for each position group
infield_config = DataConfig(position_group='INFIELD')
outfield_config = DataConfig(position_group='OUTFIELD')
catcher_config = DataConfig(position_group='CATCHER')

# Process data for each group
infield_data = preprocess_data(
    '../data/mlb_fielding_data_2000_2024_with_age.csv',
    infield_config
)

outfield_data = preprocess_data(
    '../data/mlb_fielding_data_2000_2024_with_age.csv',
    outfield_config
)

catcher_data = preprocess_data(
    '../data/mlb_fielding_data_2000_2024_with_age.csv',
    catcher_config
)

INFO:__main__:Loading data from ../data/mlb_fielding_data_2000_2024_with_age.csv
INFO:__main__:Filtering data for INFIELD
INFO:__main__:Position distribution:
INFO:__main__:1B: 618 records
INFO:__main__:2B: 886 records
INFO:__main__:3B: 866 records
INFO:__main__:SS: 700 records
INFO:__main__:Created new MinMaxScaler for INFIELD
INFO:__main__:Converting column types...
INFO:__main__:Created 1881 valid sequences
INFO:__main__:Skipped 0 sequences due to invalid values
INFO:__main__:Splitting data into train, validation, and test sets
INFO:__main__:Split sizes - Train: 1316, Valid: 376, Test: 189
INFO:__main__:Tensor shapes - X: torch.Size([1316, 6, 8]), y: torch.Size([1316, 8]), masks: torch.Size([1316, 6])
INFO:__main__:Value ranges - X: [-1.00, 1.00], y: [-1.00, 1.00]
INFO:__main__:Tensor shapes - X: torch.Size([376, 6, 8]), y: torch.Size([376, 8]), masks: torch.Size([376, 6])
INFO:__main__:Value ranges - X: [-1.00, 0.99], y: [-1.00, 1.00]
INFO:__main__:Tensor shapes - X: torch.Size([18

In [113]:
class MultiHeadAttention(nn.Module):
    def __init__(
        self, 
        hidden_size: int,
        num_heads: int = 8,
        dropout: float = 0.1,
        bias: bool = True
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.scaling = self.head_dim ** -0.5
        
        assert self.head_dim * num_heads == hidden_size, "hidden_size must be divisible by num_heads"
        
        # Linear projections
        self.q_proj = nn.Linear(hidden_size, hidden_size, bias=bias)
        self.k_proj = nn.Linear(hidden_size, hidden_size, bias=bias)
        self.v_proj = nn.Linear(hidden_size, hidden_size, bias=bias)
        self.out_proj = nn.Linear(hidden_size, hidden_size, bias=bias)
        
        # Dropout
        self.dropout = nn.Dropout(dropout)
        
        # Initialize parameters
        self._reset_parameters()
    
    def _reset_parameters(self):
        # Use Xavier uniform initialization
        nn.init.xavier_uniform_(self.q_proj.weight)
        nn.init.xavier_uniform_(self.k_proj.weight)
        nn.init.xavier_uniform_(self.v_proj.weight)
        nn.init.xavier_uniform_(self.out_proj.weight)
        if self.q_proj.bias is not None:
            nn.init.zeros_(self.q_proj.bias)
            nn.init.zeros_(self.k_proj.bias)
            nn.init.zeros_(self.v_proj.bias)
            nn.init.zeros_(self.out_proj.bias)
    
    def forward(
        self,
        query: torch.Tensor,
        key: Optional[torch.Tensor] = None,
        value: Optional[torch.Tensor] = None,
        key_padding_mask: Optional[torch.Tensor] = None,
        need_weights: bool = False
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Set key and value to query if not provided
        if key is None:
            key = query
        if value is None:
            value = query
            
        batch_size, seq_len, _ = query.size()
        
        # Project inputs
        q = self.q_proj(query)
        k = self.k_proj(key)
        v = self.v_proj(value)
        
        # Reshape for multi-head attention
        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        
        # Compute attention scores
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) * self.scaling
        
        # Apply key padding mask if provided
        if key_padding_mask is not None:
            attn_weights = attn_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                float('-inf')
            )
        
        # Apply softmax and dropout
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_weights = self.dropout(attn_weights)
        
        # Get attention output
        attn_output = torch.matmul(attn_weights, v)
        
        # Reshape and project output
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, seq_len, self.hidden_size)
        attn_output = self.out_proj(attn_output)
        
        if need_weights:
            return attn_output, attn_weights
        return attn_output, None

In [114]:
class ResidualBlock(nn.Module):
    def __init__(self, hidden_size: int, dropout: float = 0.1):
        super().__init__()
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.layers = nn.Sequential(
            nn.Linear(hidden_size, hidden_size * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size * 4, hidden_size)
        )
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.layers(self.layer_norm(x))

class ImprovedLSTM(nn.Module):
    def __init__(
        self, 
        input_size: int,
        hidden_size: int = 512,
        num_layers: int = 4,
        output_size: int = None,
        dropout: float = 0.3,
        bidirectional: bool = True,
        num_heads: int = 8  # Keep this parameter for compatibility
    ):
        super().__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size // 2  # Reduce internal hidden size
        self.num_layers = 2  # Use fewer layers internally
        self.output_size = output_size or input_size
        self.bidirectional = bidirectional
        self.directions = 2 if bidirectional else 1
        
        # Learned padding token
        self.pad_token = nn.Parameter(torch.randn(1, 1, input_size))
        
        # Input projection with Layer Normalization
        self.input_projection = nn.Sequential(
            nn.Linear(input_size, self.hidden_size),
            nn.LayerNorm(self.hidden_size),
            nn.GELU()
        )
        
        # LSTM layers with residual connections and layer normalization
        self.lstm_layers = nn.ModuleList([
            nn.ModuleDict({
                'lstm': nn.LSTM(
                    self.hidden_size * self.directions if i > 0 else self.hidden_size,
                    self.hidden_size,
                    num_layers=1,
                    batch_first=True,
                    bidirectional=bidirectional
                ),
                'norm': nn.LayerNorm(self.hidden_size * self.directions),
                'dropout': nn.Dropout(dropout/2)  # Reduce dropout
            }) for i in range(self.num_layers)
        ])
        
        # Attention mechanism
        self.attention = MultiHeadAttention(
            self.hidden_size * self.directions,
            num_heads=4,  # Reduced from num_heads parameter
            dropout=dropout/2
        )
        
        # Output projection
        self.output_projection = nn.Sequential(
            nn.Linear(self.hidden_size * self.directions, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.GELU(),
            nn.Dropout(dropout/2),
            nn.Linear(hidden_size, self.output_size)
        )

    def forward(self, x: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, _ = x.size()
        
        # Replace zero padding with learned padding token
        padding_mask = (x.sum(dim=-1) == 0).unsqueeze(-1)
        x = torch.where(padding_mask, self.pad_token.expand(batch_size, seq_len, -1), x)
        
        # Create attention mask
        attention_mask = torch.arange(seq_len, device=x.device)[None, :] < lengths[:, None]
        
        # Input projection
        x = self.input_projection(x)
        
        # Process LSTM layers with residual connections
        for layer in self.lstm_layers:
            # Pack padded sequence
            packed_x = pack_padded_sequence(
                x, lengths.cpu(),
                batch_first=True,
                enforce_sorted=False
            )
            
            # LSTM forward pass
            lstm_out, _ = layer['lstm'](packed_x)
            lstm_out, _ = pad_packed_sequence(
                lstm_out,
                batch_first=True,
                total_length=seq_len
            )
            
            # Apply normalization and dropout
            lstm_out = layer['norm'](lstm_out)
            lstm_out = layer['dropout'](lstm_out)
            
            # Residual connection if shapes match
            if lstm_out.size(-1) == x.size(-1):
                x = x + lstm_out
            else:
                x = lstm_out
        
        # Apply attention with proper masking
        attended, _ = self.attention(
            x, x, x,
            key_padding_mask=~attention_mask
        )
        
        # Get final states using sequence lengths
        batch_indices = torch.arange(batch_size, device=x.device)
        final_states = attended[batch_indices, lengths - 1]
        
        # Project to output size
        output = self.output_projection(final_states)
        
        return output

In [115]:
#Model configuration
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class DataBatch(NamedTuple):
    """Container for training data batches."""
    train: TensorDataset
    valid: TensorDataset
    test: TensorDataset

@dataclass
class Config:
    """Advanced configuration for LSTM-based baseball statistics prediction."""
    
    # Dynamic sizes from data
    input_size: int = None
    output_size: int = None
    
    # Model Architecture 
    hidden_size: int = 256
    num_layers: int = 4
    num_heads: int = 4
    bidirectional: bool = True
    attention_dropout: float = 0.1
    residual_dropout: float = 0.2
    layer_norm_eps: float = 1e-5
    
    # Training Parameters
    batch_size: int = 16
    dropout: float = 0.3
    learning_rate: float = 1e-3
    weight_decay: float = 1e-5
    gradient_clip: float = 1.0
    num_epochs: int = 50
    warmup_epochs: int = 5
    
    # Learning Rate Schedule
    lr_schedule: str = 'cosine'
    min_lr: float = 1e-6
    lr_decay_rate: float = 0.1
    lr_patience: int = 5
    
    # Early Stopping
    early_stopping_patience: int = 10
    early_stopping_min_delta: float = 1e-4
    
    # Loss Function Parameters
    diversity_alpha: float = 0.1  # Weight for diversity penalty
    consistency_beta: float = 0.05  # Weight for consistency penalty
    
    # Hardware Optimization
    mixed_precision: bool = True
    num_workers: int = 0
    pin_memory: bool = True
    
    # Logging
    log_interval: int = 100
    checkpoint_interval: int = 1
    
    def __init__(self, X_train: torch.Tensor, y_train: torch.Tensor):
        self.input_size = X_train.shape[2]
        self.output_size = y_train.shape[1]
        self._validate_config()
        self._log_config()
    
    def _validate_config(self) -> None:
        assert self.hidden_size % self.num_heads == 0, \
            "Hidden size must be divisible by number of attention heads"
        assert self.hidden_size >= self.input_size, \
            "Hidden size must be greater than or equal to input size"
        assert 0 <= self.dropout <= 1, "Dropout must be between 0 and 1"
        assert self.num_layers >= 1, "Must have at least one LSTM layer"
        assert self.batch_size > 0, "Batch size must be positive"
        assert self.learning_rate > 0, "Learning rate must be positive"
        assert self.lr_schedule in ['cosine', 'linear', 'exponential'], \
            "Invalid learning rate schedule"
        assert 0 <= self.diversity_alpha <= 1, "Diversity alpha must be between 0 and 1"
        assert 0 <= self.consistency_beta <= 1, "Consistency beta must be between 0 and 1"
    
    def _log_config(self) -> None:
        logger.info("Model Configuration:")
        for key, value in asdict(self).items():
            logger.info(f"{key}: {value}")
    
    @property
    def device(self) -> torch.device:
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class PlayerDifferentiationLoss(nn.Module):
    def __init__(self, alpha: float = 0.1, beta: float = 0.05):
        super().__init__()
        self.mse = nn.MSELoss()
        self.alpha = alpha
        self.beta = beta

    def forward(self, predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        # Base MSE loss
        mse_loss = self.mse(predictions, targets)
        
        # Diversity penalty - encourage different predictions within batch
        batch_mean = predictions.mean(dim=0, keepdim=True)
        diversity_loss = -torch.mean(torch.abs(predictions - batch_mean))
        
        # Consistency penalty - predictions should be stable
        pred_std = predictions.std(dim=0).mean()
        consistency_loss = torch.abs(pred_std - targets.std(dim=0).mean())
        
        # Combine losses
        total_loss = mse_loss + self.alpha * diversity_loss + self.beta * consistency_loss
        
        return total_loss
    
    @property
    def device(self) -> torch.device:
        """Get appropriate device for training."""
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def create_data_loaders(data_tuple: tuple) -> DataBatch:
    """Create DataLoader objects from preprocessed data tuple."""
    try:
        X_train, y_train, X_valid, y_valid, X_test, y_test, \
        train_masks, valid_masks, test_masks = data_tuple
        
        # Create datasets
        train_dataset = TensorDataset(X_train, train_masks, y_train)
        valid_dataset = TensorDataset(X_valid, valid_masks, y_valid)
        test_dataset = TensorDataset(X_test, test_masks, y_test)
        
        return DataBatch(train_dataset, valid_dataset, test_dataset)
    
    except ValueError as e:
        logger.error(f"Error unpacking data: {str(e)}")
        raise



Init IF

In [116]:
# Initialize everything
try:
    # Create data loaders
    data_batch = create_data_loaders(infield_data)
    
    # Initialize config
    config = Config(data_batch.train.tensors[0], data_batch.train.tensors[2])
    
    # Create DataLoaders
    train_loader = DataLoader(
        data_batch.train,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory
    )
    
    valid_loader = DataLoader(
        data_batch.valid,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory
    )
    
    test_loader = DataLoader(
        data_batch.test,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory
    )

    # Initialize model
    model = ImprovedLSTM(
        input_size=config.input_size,
        hidden_size=config.hidden_size,
        num_layers=config.num_layers,
        output_size=config.output_size,
        dropout=config.dropout,
        bidirectional=config.bidirectional,
        num_heads=config.num_heads
    ).to(config.device)
    
    # Initialize optimizer
    optimizer = optim.AdamW(
        model.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay
    )
    
    # Initialize learning rate scheduler
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=config.learning_rate,
        epochs=config.num_epochs,
        steps_per_epoch=len(train_loader),
        pct_start=config.warmup_epochs / config.num_epochs,
        anneal_strategy='cos',
        final_div_factor=1e3
    )
    
    # Initialize loss function
    criterion = nn.MSELoss()
    
    logger.info(f"Data loaded successfully. Device: {config.device}")
    logger.info(f"Model initialized with {sum(p.numel() for p in model.parameters())} parameters")
    logger.info(f"Training batches: {len(train_loader)}")
    logger.info(f"Validation batches: {len(valid_loader)}")
    logger.info(f"Test batches: {len(test_loader)}")

except Exception as e:
    logger.error(f"Error during initialization: {str(e)}")
    raise

INFO:__main__:Model Configuration:
INFO:__main__:input_size: 8
INFO:__main__:output_size: 8
INFO:__main__:hidden_size: 256
INFO:__main__:num_layers: 4
INFO:__main__:num_heads: 4
INFO:__main__:bidirectional: True
INFO:__main__:attention_dropout: 0.1
INFO:__main__:residual_dropout: 0.2
INFO:__main__:layer_norm_eps: 1e-05
INFO:__main__:batch_size: 16
INFO:__main__:dropout: 0.3
INFO:__main__:learning_rate: 0.001
INFO:__main__:weight_decay: 1e-05
INFO:__main__:gradient_clip: 1.0
INFO:__main__:num_epochs: 50
INFO:__main__:warmup_epochs: 5
INFO:__main__:lr_schedule: cosine
INFO:__main__:min_lr: 1e-06
INFO:__main__:lr_decay_rate: 0.1
INFO:__main__:lr_patience: 5
INFO:__main__:early_stopping_patience: 10
INFO:__main__:early_stopping_min_delta: 0.0001
INFO:__main__:diversity_alpha: 0.1
INFO:__main__:consistency_beta: 0.05
INFO:__main__:mixed_precision: True
INFO:__main__:num_workers: 0
INFO:__main__:pin_memory: True
INFO:__main__:log_interval: 100
INFO:__main__:checkpoint_interval: 1
INFO:__main

In [117]:
def train_model(
    model: nn.Module,
    train_loader: DataLoader,
    valid_loader: DataLoader,
    config: Config,
    optimizer: optim.Optimizer,
    scheduler: optim.lr_scheduler._LRScheduler,
    criterion: nn.Module,
    checkpoint_dir: str = './checkpoints'
) -> dict:
    """Train LSTM model with advanced optimizations and monitoring."""
    import os
    
    logger.info(f"Starting training on device: {config.device}")
    model = model.to(config.device)
    
    # Mixed precision training
    scaler = torch.cuda.amp.GradScaler(enabled=config.mixed_precision)
    
    # Training state tracking
    best_val_loss = float('inf')
    early_stopping_counter = 0
    train_metrics = {
        'train_losses': [],
        'val_losses': [],
        'learning_rates': [],
        'best_epoch': 0
    }
    
    # Create checkpoint directory
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    for epoch in range(config.num_epochs):
        # Training phase
        model.train()
        epoch_loss = 0.0
        
        with tqdm(train_loader, desc=f'Epoch {epoch+1}/{config.num_epochs}') as pbar:
            for batch_idx, (data, masks, targets) in enumerate(pbar):
                try:
                    # Move data to device
                    data = data.to(config.device)
                    masks = masks.to(config.device)
                    targets = targets.to(config.device)
                    
                    # Calculate sequence lengths from masks
                    lengths = masks.sum(1).clamp(min=1)  # Ensure minimum length of 1
                    
                    # Debug info
                    if batch_idx == 0 and epoch == 0:
                        logger.info(f"Batch shapes - Data: {data.shape}, Masks: {masks.shape}, "
                                  f"Targets: {targets.shape}, Lengths: {lengths.shape}")
                    
                    # Forward pass with mixed precision
                    with torch.cuda.amp.autocast(enabled=config.mixed_precision):
                        outputs = model(data, lengths)
                        if batch_idx == 0 and epoch == 0:
                            logger.info(f"Output shape: {outputs.shape}")
                        loss = criterion(outputs, targets)
                    
                    # Backward pass with gradient scaling
                    optimizer.zero_grad(set_to_none=True)
                    scaler.scale(loss).backward()
                    
                    # Gradient clipping
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.gradient_clip)
                    
                    # Optimizer step with scaler
                    scaler.step(optimizer)
                    scaler.update()
                    scheduler.step()
                    
                    # Update metrics
                    epoch_loss += loss.item()
                    current_lr = scheduler.get_last_lr()[0]
                    
                    # Update progress bar
                    pbar.set_postfix({
                        'loss': f'{loss.item():.3f}',
                        'lr': f'{current_lr:.2e}'
                    })
                    
                except RuntimeError as e:
                    logger.error(f"Error in batch {batch_idx}: {str(e)}")
                    logger.error(f"Data shapes - Input: {data.shape}, Mask: {masks.shape}, "
                               f"Target: {targets.shape}, Lengths: {lengths.shape}")
                    raise
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        
        with torch.no_grad():
            for data, masks, targets in valid_loader:
                try:
                    data = data.to(config.device)
                    masks = masks.to(config.device)
                    targets = targets.to(config.device)
                    lengths = masks.sum(1).clamp(min=1)
                    
                    with torch.cuda.amp.autocast(enabled=config.mixed_precision):
                        outputs = model(data, lengths)
                        loss = criterion(outputs, targets)
                        val_loss += loss.item()
                        
                except RuntimeError as e:
                    logger.error(f"Error in validation: {str(e)}")
                    raise
        
        # Calculate epoch metrics
        epoch_loss /= len(train_loader)
        val_loss /= len(valid_loader)
        
        # Update training metrics
        train_metrics['train_losses'].append(epoch_loss)
        train_metrics['val_losses'].append(val_loss)
        train_metrics['learning_rates'].append(current_lr)
        
        # Model checkpointing
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            train_metrics['best_epoch'] = epoch
            early_stopping_counter = 0
            
            # Save checkpoint
            checkpoint_path = os.path.join(checkpoint_dir, 'IF_fielding_model.pth')
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'val_loss': val_loss,
                'config': asdict(config),
                'metrics': train_metrics,
                'scaler_state_dict': scaler.state_dict()
            }, checkpoint_path)
            
            logger.info(f'New best model saved with validation loss: {val_loss:.4f}')
        else:
            early_stopping_counter += 1
        
        # Log epoch metrics
        logger.info(
            f'Epoch {epoch+1}: '
            f'Train Loss = {epoch_loss:.4f}, '
            f'Val Loss = {val_loss:.4f}, '
            f'LR = {current_lr:.2e}'
        )
        
        # Early stopping check
        if early_stopping_counter >= config.early_stopping_patience:
            logger.info(f'Early stopping triggered after {epoch+1} epochs')
            break
    
    return train_metrics

# Initialize training
try:
    metrics = train_model(
        model=model,
        train_loader=train_loader,
        valid_loader=valid_loader,
        config=config,
        optimizer=optimizer,
        scheduler=scheduler,
        criterion=criterion
    )
except Exception as e:
    logger.error(f"Training failed: {str(e)}")
    raise

INFO:__main__:Starting training on device: cuda
Epoch 1/50:   0%|          | 0/83 [00:00<?, ?it/s]INFO:__main__:Batch shapes - Data: torch.Size([16, 6, 8]), Masks: torch.Size([16, 6]), Targets: torch.Size([16, 8]), Lengths: torch.Size([16])


INFO:__main__:Output shape: torch.Size([16, 8])
Epoch 1/50: 100%|██████████| 83/83 [00:02<00:00, 34.97it/s, loss=0.068, lr=1.32e-04]
INFO:__main__:New best model saved with validation loss: 0.0664
INFO:__main__:Epoch 1: Train Loss = 0.1208, Val Loss = 0.0664, LR = 1.32e-04
Epoch 2/50: 100%|██████████| 83/83 [00:02<00:00, 35.28it/s, loss=0.065, lr=3.73e-04]
INFO:__main__:Epoch 2: Train Loss = 0.0852, Val Loss = 0.0665, LR = 3.73e-04
Epoch 3/50: 100%|██████████| 83/83 [00:02<00:00, 31.80it/s, loss=0.075, lr=6.70e-04]
INFO:__main__:New best model saved with validation loss: 0.0628
INFO:__main__:Epoch 3: Train Loss = 0.0768, Val Loss = 0.0628, LR = 6.70e-04
Epoch 4/50: 100%|██████████| 83/83 [00:02<00:00, 31.15it/s, loss=0.080, lr=9.10e-04]
INFO:__main__:Epoch 4: Train Loss = 0.0731, Val Loss = 0.0639, LR = 9.10e-04
Epoch 5/50: 100%|██████████| 83/83 [00:02<00:00, 30.99it/s, loss=0.048, lr=1.00e-03]
INFO:__main__:Epoch 5: Train Loss = 0.0725, Val Loss = 0.0644, LR = 1.00e-03
Epoch 6/50: 10

OF Init & Train

In [118]:
# Initialize everything
try:
    # Create data loaders
    data_batch = create_data_loaders(outfield_data)
    
    # Initialize config
    config = Config(data_batch.train.tensors[0], data_batch.train.tensors[2])
    
    # Create DataLoaders
    train_loader = DataLoader(
        data_batch.train,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory
    )
    
    valid_loader = DataLoader(
        data_batch.valid,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory
    )
    
    test_loader = DataLoader(
        data_batch.test,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory
    )

    # Initialize model
    model = ImprovedLSTM(
        input_size=config.input_size,
        hidden_size=config.hidden_size,
        num_layers=config.num_layers,
        output_size=config.output_size,
        dropout=config.dropout,
        bidirectional=config.bidirectional,
        num_heads=config.num_heads
    ).to(config.device)
    
    # Initialize optimizer
    optimizer = optim.AdamW(
        model.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay
    )
    
    # Initialize learning rate scheduler
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=config.learning_rate,
        epochs=config.num_epochs,
        steps_per_epoch=len(train_loader),
        pct_start=config.warmup_epochs / config.num_epochs,
        anneal_strategy='cos',
        final_div_factor=1e3
    )
    
    # Initialize loss function
    criterion = nn.MSELoss()
    
    logger.info(f"Data loaded successfully. Device: {config.device}")
    logger.info(f"Model initialized with {sum(p.numel() for p in model.parameters())} parameters")
    logger.info(f"Training batches: {len(train_loader)}")
    logger.info(f"Validation batches: {len(valid_loader)}")
    logger.info(f"Test batches: {len(test_loader)}")

except Exception as e:
    logger.error(f"Error during initialization: {str(e)}")
    raise

INFO:__main__:Model Configuration:
INFO:__main__:input_size: 7
INFO:__main__:output_size: 7
INFO:__main__:hidden_size: 256
INFO:__main__:num_layers: 4
INFO:__main__:num_heads: 4
INFO:__main__:bidirectional: True
INFO:__main__:attention_dropout: 0.1
INFO:__main__:residual_dropout: 0.2
INFO:__main__:layer_norm_eps: 1e-05
INFO:__main__:batch_size: 16
INFO:__main__:dropout: 0.3
INFO:__main__:learning_rate: 0.001
INFO:__main__:weight_decay: 1e-05
INFO:__main__:gradient_clip: 1.0
INFO:__main__:num_epochs: 50
INFO:__main__:warmup_epochs: 5
INFO:__main__:lr_schedule: cosine
INFO:__main__:min_lr: 1e-06
INFO:__main__:lr_decay_rate: 0.1
INFO:__main__:lr_patience: 5
INFO:__main__:early_stopping_patience: 10
INFO:__main__:early_stopping_min_delta: 0.0001
INFO:__main__:diversity_alpha: 0.1
INFO:__main__:consistency_beta: 0.05
INFO:__main__:mixed_precision: True
INFO:__main__:num_workers: 0
INFO:__main__:pin_memory: True
INFO:__main__:log_interval: 100
INFO:__main__:checkpoint_interval: 1
INFO:__main

In [119]:
def train_model(
    model: nn.Module,
    train_loader: DataLoader,
    valid_loader: DataLoader,
    config: Config,
    optimizer: optim.Optimizer,
    scheduler: optim.lr_scheduler._LRScheduler,
    criterion: nn.Module,
    checkpoint_dir: str = './checkpoints'
) -> dict:
    """Train LSTM model with advanced optimizations and monitoring."""
    import os
    
    logger.info(f"Starting training on device: {config.device}")
    model = model.to(config.device)
    
    # Mixed precision training
    scaler = torch.cuda.amp.GradScaler(enabled=config.mixed_precision)
    
    # Training state tracking
    best_val_loss = float('inf')
    early_stopping_counter = 0
    train_metrics = {
        'train_losses': [],
        'val_losses': [],
        'learning_rates': [],
        'best_epoch': 0
    }
    
    # Create checkpoint directory
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    for epoch in range(config.num_epochs):
        # Training phase
        model.train()
        epoch_loss = 0.0
        
        with tqdm(train_loader, desc=f'Epoch {epoch+1}/{config.num_epochs}') as pbar:
            for batch_idx, (data, masks, targets) in enumerate(pbar):
                try:
                    # Move data to device
                    data = data.to(config.device)
                    masks = masks.to(config.device)
                    targets = targets.to(config.device)
                    
                    # Calculate sequence lengths from masks
                    lengths = masks.sum(1).clamp(min=1)  # Ensure minimum length of 1
                    
                    # Debug info
                    if batch_idx == 0 and epoch == 0:
                        logger.info(f"Batch shapes - Data: {data.shape}, Masks: {masks.shape}, "
                                  f"Targets: {targets.shape}, Lengths: {lengths.shape}")
                    
                    # Forward pass with mixed precision
                    with torch.cuda.amp.autocast(enabled=config.mixed_precision):
                        outputs = model(data, lengths)
                        if batch_idx == 0 and epoch == 0:
                            logger.info(f"Output shape: {outputs.shape}")
                        loss = criterion(outputs, targets)
                    
                    # Backward pass with gradient scaling
                    optimizer.zero_grad(set_to_none=True)
                    scaler.scale(loss).backward()
                    
                    # Gradient clipping
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.gradient_clip)
                    
                    # Optimizer step with scaler
                    scaler.step(optimizer)
                    scaler.update()
                    scheduler.step()
                    
                    # Update metrics
                    epoch_loss += loss.item()
                    current_lr = scheduler.get_last_lr()[0]
                    
                    # Update progress bar
                    pbar.set_postfix({
                        'loss': f'{loss.item():.3f}',
                        'lr': f'{current_lr:.2e}'
                    })
                    
                except RuntimeError as e:
                    logger.error(f"Error in batch {batch_idx}: {str(e)}")
                    logger.error(f"Data shapes - Input: {data.shape}, Mask: {masks.shape}, "
                               f"Target: {targets.shape}, Lengths: {lengths.shape}")
                    raise
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        
        with torch.no_grad():
            for data, masks, targets in valid_loader:
                try:
                    data = data.to(config.device)
                    masks = masks.to(config.device)
                    targets = targets.to(config.device)
                    lengths = masks.sum(1).clamp(min=1)
                    
                    with torch.cuda.amp.autocast(enabled=config.mixed_precision):
                        outputs = model(data, lengths)
                        loss = criterion(outputs, targets)
                        val_loss += loss.item()
                        
                except RuntimeError as e:
                    logger.error(f"Error in validation: {str(e)}")
                    raise
        
        # Calculate epoch metrics
        epoch_loss /= len(train_loader)
        val_loss /= len(valid_loader)
        
        # Update training metrics
        train_metrics['train_losses'].append(epoch_loss)
        train_metrics['val_losses'].append(val_loss)
        train_metrics['learning_rates'].append(current_lr)
        
        # Model checkpointing
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            train_metrics['best_epoch'] = epoch
            early_stopping_counter = 0
            
            # Save checkpoint
            checkpoint_path = os.path.join(checkpoint_dir, 'OF_fielding_model.pth')
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'val_loss': val_loss,
                'config': asdict(config),
                'metrics': train_metrics,
                'scaler_state_dict': scaler.state_dict()
            }, checkpoint_path)
            
            logger.info(f'New best model saved with validation loss: {val_loss:.4f}')
        else:
            early_stopping_counter += 1
        
        # Log epoch metrics
        logger.info(
            f'Epoch {epoch+1}: '
            f'Train Loss = {epoch_loss:.4f}, '
            f'Val Loss = {val_loss:.4f}, '
            f'LR = {current_lr:.2e}'
        )
        
        # Early stopping check
        if early_stopping_counter >= config.early_stopping_patience:
            logger.info(f'Early stopping triggered after {epoch+1} epochs')
            break
    
    return train_metrics

# Initialize training
try:
    metrics = train_model(
        model=model,
        train_loader=train_loader,
        valid_loader=valid_loader,
        config=config,
        optimizer=optimizer,
        scheduler=scheduler,
        criterion=criterion
    )
except Exception as e:
    logger.error(f"Training failed: {str(e)}")
    raise

INFO:__main__:Starting training on device: cuda
Epoch 1/50:   0%|          | 0/82 [00:00<?, ?it/s]INFO:__main__:Batch shapes - Data: torch.Size([16, 6, 7]), Masks: torch.Size([16, 6]), Targets: torch.Size([16, 7]), Lengths: torch.Size([16])
INFO:__main__:Output shape: torch.Size([16, 7])
Epoch 1/50: 100%|██████████| 82/82 [00:02<00:00, 32.23it/s, loss=0.081, lr=1.32e-04]
INFO:__main__:New best model saved with validation loss: 0.0651
INFO:__main__:Epoch 1: Train Loss = 0.1224, Val Loss = 0.0651, LR = 1.32e-04
Epoch 2/50: 100%|██████████| 82/82 [00:02<00:00, 32.72it/s, loss=0.066, lr=3.73e-04]
INFO:__main__:Epoch 2: Train Loss = 0.0829, Val Loss = 0.0683, LR = 3.73e-04
Epoch 3/50: 100%|██████████| 82/82 [00:02<00:00, 32.73it/s, loss=0.074, lr=6.70e-04]
INFO:__main__:New best model saved with validation loss: 0.0644
INFO:__main__:Epoch 3: Train Loss = 0.0752, Val Loss = 0.0644, LR = 6.70e-04
Epoch 4/50: 100%|██████████| 82/82 [00:02<00:00, 33.95it/s, loss=0.078, lr=9.10e-04]
INFO:__main_

Catcher Init & Train

In [120]:
# Initialize everything
try:
    # Create data loaders
    data_batch = create_data_loaders(catcher_data)
    
    # Initialize config
    config = Config(data_batch.train.tensors[0], data_batch.train.tensors[2])
    
    # Create DataLoaders
    train_loader = DataLoader(
        data_batch.train,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory
    )
    
    valid_loader = DataLoader(
        data_batch.valid,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory
    )
    
    test_loader = DataLoader(
        data_batch.test,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=config.num_workers,
        pin_memory=config.pin_memory
    )

    # Initialize model
    model = ImprovedLSTM(
        input_size=config.input_size,
        hidden_size=config.hidden_size,
        num_layers=config.num_layers,
        output_size=config.output_size,
        dropout=config.dropout,
        bidirectional=config.bidirectional,
        num_heads=config.num_heads
    ).to(config.device)
    
    # Initialize optimizer
    optimizer = optim.AdamW(
        model.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay
    )
    
    # Initialize learning rate scheduler
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=config.learning_rate,
        epochs=config.num_epochs,
        steps_per_epoch=len(train_loader),
        pct_start=config.warmup_epochs / config.num_epochs,
        anneal_strategy='cos',
        final_div_factor=1e3
    )
    
    # Initialize loss function
    criterion = nn.MSELoss()
    
    logger.info(f"Data loaded successfully. Device: {config.device}")
    logger.info(f"Model initialized with {sum(p.numel() for p in model.parameters())} parameters")
    logger.info(f"Training batches: {len(train_loader)}")
    logger.info(f"Validation batches: {len(valid_loader)}")
    logger.info(f"Test batches: {len(test_loader)}")

except Exception as e:
    logger.error(f"Error during initialization: {str(e)}")
    raise

INFO:__main__:Model Configuration:
INFO:__main__:input_size: 6
INFO:__main__:output_size: 6
INFO:__main__:hidden_size: 256
INFO:__main__:num_layers: 4
INFO:__main__:num_heads: 4
INFO:__main__:bidirectional: True
INFO:__main__:attention_dropout: 0.1
INFO:__main__:residual_dropout: 0.2
INFO:__main__:layer_norm_eps: 1e-05
INFO:__main__:batch_size: 16
INFO:__main__:dropout: 0.3
INFO:__main__:learning_rate: 0.001
INFO:__main__:weight_decay: 1e-05
INFO:__main__:gradient_clip: 1.0
INFO:__main__:num_epochs: 50
INFO:__main__:warmup_epochs: 5
INFO:__main__:lr_schedule: cosine
INFO:__main__:min_lr: 1e-06
INFO:__main__:lr_decay_rate: 0.1
INFO:__main__:lr_patience: 5
INFO:__main__:early_stopping_patience: 10
INFO:__main__:early_stopping_min_delta: 0.0001
INFO:__main__:diversity_alpha: 0.1
INFO:__main__:consistency_beta: 0.05
INFO:__main__:mixed_precision: True
INFO:__main__:num_workers: 0
INFO:__main__:pin_memory: True
INFO:__main__:log_interval: 100
INFO:__main__:checkpoint_interval: 1
INFO:__main

In [121]:
def train_model(
    model: nn.Module,
    train_loader: DataLoader,
    valid_loader: DataLoader,
    config: Config,
    optimizer: optim.Optimizer,
    scheduler: optim.lr_scheduler._LRScheduler,
    criterion: nn.Module,
    checkpoint_dir: str = './checkpoints'
) -> dict:
    """Train LSTM model with advanced optimizations and monitoring."""
    import os
    
    logger.info(f"Starting training on device: {config.device}")
    model = model.to(config.device)
    
    # Mixed precision training
    scaler = torch.cuda.amp.GradScaler(enabled=config.mixed_precision)
    
    # Training state tracking
    best_val_loss = float('inf')
    early_stopping_counter = 0
    train_metrics = {
        'train_losses': [],
        'val_losses': [],
        'learning_rates': [],
        'best_epoch': 0
    }
    
    # Create checkpoint directory
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    for epoch in range(config.num_epochs):
        # Training phase
        model.train()
        epoch_loss = 0.0
        
        with tqdm(train_loader, desc=f'Epoch {epoch+1}/{config.num_epochs}') as pbar:
            for batch_idx, (data, masks, targets) in enumerate(pbar):
                try:
                    # Move data to device
                    data = data.to(config.device)
                    masks = masks.to(config.device)
                    targets = targets.to(config.device)
                    
                    # Calculate sequence lengths from masks
                    lengths = masks.sum(1).clamp(min=1)  # Ensure minimum length of 1
                    
                    # Debug info
                    if batch_idx == 0 and epoch == 0:
                        logger.info(f"Batch shapes - Data: {data.shape}, Masks: {masks.shape}, "
                                  f"Targets: {targets.shape}, Lengths: {lengths.shape}")
                    
                    # Forward pass with mixed precision
                    with torch.cuda.amp.autocast(enabled=config.mixed_precision):
                        outputs = model(data, lengths)
                        if batch_idx == 0 and epoch == 0:
                            logger.info(f"Output shape: {outputs.shape}")
                        loss = criterion(outputs, targets)
                    
                    # Backward pass with gradient scaling
                    optimizer.zero_grad(set_to_none=True)
                    scaler.scale(loss).backward()
                    
                    # Gradient clipping
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.gradient_clip)
                    
                    # Optimizer step with scaler
                    scaler.step(optimizer)
                    scaler.update()
                    scheduler.step()
                    
                    # Update metrics
                    epoch_loss += loss.item()
                    current_lr = scheduler.get_last_lr()[0]
                    
                    # Update progress bar
                    pbar.set_postfix({
                        'loss': f'{loss.item():.3f}',
                        'lr': f'{current_lr:.2e}'
                    })
                    
                except RuntimeError as e:
                    logger.error(f"Error in batch {batch_idx}: {str(e)}")
                    logger.error(f"Data shapes - Input: {data.shape}, Mask: {masks.shape}, "
                               f"Target: {targets.shape}, Lengths: {lengths.shape}")
                    raise
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        
        with torch.no_grad():
            for data, masks, targets in valid_loader:
                try:
                    data = data.to(config.device)
                    masks = masks.to(config.device)
                    targets = targets.to(config.device)
                    lengths = masks.sum(1).clamp(min=1)
                    
                    with torch.cuda.amp.autocast(enabled=config.mixed_precision):
                        outputs = model(data, lengths)
                        loss = criterion(outputs, targets)
                        val_loss += loss.item()
                        
                except RuntimeError as e:
                    logger.error(f"Error in validation: {str(e)}")
                    raise
        
        # Calculate epoch metrics
        epoch_loss /= len(train_loader)
        val_loss /= len(valid_loader)
        
        # Update training metrics
        train_metrics['train_losses'].append(epoch_loss)
        train_metrics['val_losses'].append(val_loss)
        train_metrics['learning_rates'].append(current_lr)
        
        # Model checkpointing
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            train_metrics['best_epoch'] = epoch
            early_stopping_counter = 0
            
            # Save checkpoint
            checkpoint_path = os.path.join(checkpoint_dir, 'C_fielding_model.pth')
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'val_loss': val_loss,
                'config': asdict(config),
                'metrics': train_metrics,
                'scaler_state_dict': scaler.state_dict()
            }, checkpoint_path)
            
            logger.info(f'New best model saved with validation loss: {val_loss:.4f}')
        else:
            early_stopping_counter += 1
        
        # Log epoch metrics
        logger.info(
            f'Epoch {epoch+1}: '
            f'Train Loss = {epoch_loss:.4f}, '
            f'Val Loss = {val_loss:.4f}, '
            f'LR = {current_lr:.2e}'
        )
        
        # Early stopping check
        if early_stopping_counter >= config.early_stopping_patience:
            logger.info(f'Early stopping triggered after {epoch+1} epochs')
            break
    
    return train_metrics

# Initialize training
try:
    metrics = train_model(
        model=model,
        train_loader=train_loader,
        valid_loader=valid_loader,
        config=config,
        optimizer=optimizer,
        scheduler=scheduler,
        criterion=criterion
    )
except Exception as e:
    logger.error(f"Training failed: {str(e)}")
    raise

INFO:__main__:Starting training on device: cuda
Epoch 1/50:   0%|          | 0/27 [00:00<?, ?it/s]INFO:__main__:Batch shapes - Data: torch.Size([16, 6, 6]), Masks: torch.Size([16, 6]), Targets: torch.Size([16, 6]), Lengths: torch.Size([16])
INFO:__main__:Output shape: torch.Size([16, 6])
Epoch 1/50: 100%|██████████| 27/27 [00:00<00:00, 31.54it/s, loss=0.092, lr=1.33e-04]
INFO:__main__:New best model saved with validation loss: 0.1137
INFO:__main__:Epoch 1: Train Loss = 0.1717, Val Loss = 0.1137, LR = 1.33e-04
Epoch 2/50: 100%|██████████| 27/27 [00:00<00:00, 34.48it/s, loss=0.145, lr=3.76e-04]
INFO:__main__:Epoch 2: Train Loss = 0.1303, Val Loss = 0.1233, LR = 3.76e-04
Epoch 3/50: 100%|██████████| 27/27 [00:00<00:00, 34.77it/s, loss=0.100, lr=6.75e-04]
INFO:__main__:Epoch 3: Train Loss = 0.1183, Val Loss = 0.1171, LR = 6.75e-04
Epoch 4/50: 100%|██████████| 27/27 [00:00<00:00, 32.75it/s, loss=0.115, lr=9.14e-04]
INFO:__main__:Epoch 4: Train Loss = 0.1100, Val Loss = 0.1160, LR = 9.14e-04

In [133]:
def load_model_from_checkpoint(checkpoint_path: str, data_config, device: torch.device) -> nn.Module:
    """Load model with proper error handling and validation"""
    try:
        logger.info(f"Loading model from {checkpoint_path}")
        checkpoint = torch.load(checkpoint_path, map_location=device)
        
        # Initialize model with correct parameters
        model = ImprovedLSTM(
            input_size=len(data_config.input_features),
            hidden_size=256,
            num_layers=4,
            output_size=len(data_config.input_features),
            dropout=0.2,
            bidirectional=True,
            num_heads=4
        ).to(device)
        
        # Load state dict
        model.load_state_dict(checkpoint['model_state_dict'])
        model.eval()
        
        # Validate model configuration
        if hasattr(checkpoint, 'config'):
            logger.info(f"Loaded model config: {checkpoint['config']}")
        
        return model
        
    except FileNotFoundError:
        logger.error(f"Checkpoint file not found: {checkpoint_path}")
        raise
    except KeyError as e:
        logger.error(f"Invalid checkpoint structure: {str(e)}")
        raise
    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise


def load_position_specific_models(device: torch.device) -> Dict[str, Tuple[nn.Module, MinMaxScaler]]:
    """Load position-specific models and scalers"""
    models = {}
    position_configs = {
        'INFIELD': DataConfig(position_group='INFIELD'),
        'OUTFIELD': DataConfig(position_group='OUTFIELD'),
        'CATCHER': DataConfig(position_group='CATCHER')
    }
    
    for group, config in position_configs.items():
        model = load_model_from_checkpoint(
            f'checkpoints/{group[0:2]}_fielding_model.pth',
            config,
            device
        )
        scaler = joblib.load(f'fielding_scaler_{group.lower()}.pkl')
        models[group] = (model, scaler, config)
    
    return models

def predict_future_stats(player_id, position, input_features, model, scaler, raw_df, player_names):
    """Predict future stats for a given player at a specific position"""
    # Get player position-specific data
    player_data = raw_df[
        (raw_df['IDfg'] == player_id) & 
        (raw_df['Pos'] == position)
    ].sort_values('Season')
    
    # Add debugging logs
    logger.info(f"\nDebugging prediction for player {player_id}")
    logger.info(f"Position: {position}")
    logger.info(f"Seasons of data: {len(player_data)}")
    logger.info(f"Features available: {player_data[input_features].columns}")
    logger.info(f"Any NaN in features: {player_data[input_features].isna().any().any()}")
    
    if len(player_data) < 1:
        logger.warning(f"No data found for player {player_id} at position {position}")
        return None
        
    # Get player info
    player_name = player_names[player_names['IDfg'] == player_id]['Name'].iloc[0]
    last_season = player_data['Season'].max()
    last_age = player_data[player_data['Season'] == last_season]['Age'].iloc[0]
    
    logger.info(f"Player name: {player_name}")
    logger.info(f"Last season: {last_season}")
    logger.info(f"Recent data shape: {player_data[input_features].shape}")
    logger.info(f"Recent data:\n{player_data[input_features].tail()}")
    
    logger.info(f"\nGenerating predictions for {player_name} at {position}")
    logger.info(f"Last season: {last_season}, Last age: {last_age}")
    
    # Get device from model
    device = next(model.parameters()).device
    
    # Get most recent sequence with padding if needed
    seq_length = 3  # Desired sequence length
    available_seasons = len(player_data)
    
    if available_seasons >= seq_length:
        recent_data = player_data[input_features].iloc[-seq_length:].copy()
    else:
        # Create padding for missing seasons
        padding_size = seq_length - available_seasons
        padding_data = pd.DataFrame(0, index=range(padding_size), columns=input_features)
        recent_data = pd.concat([padding_data, player_data[input_features]], axis=0)
    
    # Calculate career stats (using actual data only)
    career_stats = player_data[input_features].mean()
    
    
    # Create enhanced features with career stats
    enhanced_features = []
    for idx, row in recent_data.iterrows():
        career_dev = row - career_stats
        combined = np.concatenate([row.values, career_dev.values])
        enhanced_features.append(combined)
    
    sequence = np.array(enhanced_features)
    sequence_scaled = scaler.transform(sequence)
    
    # Dynamically get feature dimensions
    n_features = len(input_features)
    sequence_scaled = sequence_scaled[:, :n_features]
    
    predictions = []
    with torch.no_grad():
        for year_offset in range(1, 16):
            current_year = last_season + year_offset
            current_age = last_age + year_offset
            
            data = torch.FloatTensor(sequence_scaled).unsqueeze(0).to(device)
            lengths = torch.tensor([3], dtype=torch.int64, device=device)
            
            output = model(data, lengths)
            pred_numpy = output.cpu().numpy()[0]
            
            # Dynamic padding based on scaler dimensions
            scaler_dim = scaler.n_features_in_
            pred_padded = np.pad(pred_numpy, (0, scaler_dim - n_features), 'constant')
            unscaled_pred = scaler.inverse_transform(pred_padded.reshape(1, -1))[0][:n_features]
            
            prediction = {
                'Name': player_name,
                'Age': current_age,
                'Year': current_year,
                'IDfg': player_id,
                'Pos': position  # Add position to predictions
            }
            
            for i, feature in enumerate(input_features):
                if feature == 'Age':
                    prediction[feature] = current_age
                else:
                    prediction[feature] = unscaled_pred[i]
            
            predictions.append(prediction)
            
            # Update sequence for next prediction
            age_index = input_features.index('Age')
            # Create zero array of correct scaler dimension
            age_update = np.zeros(scaler_dim)
            age_update[age_index] = current_age
            pred_numpy[age_index] = scaler.transform([age_update])[0][age_index]
            sequence_scaled = np.vstack([sequence_scaled[1:], pred_numpy])
    
    return predictions

def predict_all_2024_players(raw_df, player_names, position_models, position_group_map):
    all_predictions = []
    
    for group, (model, scaler, config) in position_models.items():
        # Calculate rate stats for this position group first
        group_df = raw_df[raw_df['Pos'].isin(config.valid_positions)].copy()
        group_df = calculate_rate_stats(group_df, group)
        
        # Filter players for this position group
        players_2024 = group_df[
            (group_df['Season'] == 2024) & 
            (group_df['Inn'] >= config.min_innings) &
            (group_df['Pos'].isin(config.valid_positions))
        ][['IDfg', 'Pos']].drop_duplicates()
        
        logger.info(f"\nProcessing {group} - {len(players_2024)} players")
        
        for _, row in tqdm(players_2024.iterrows(), desc=group):
            predictions = predict_future_stats(
                player_id=row['IDfg'],
                position=row['Pos'],
                input_features=config.input_features,  # Use position-specific features
                model=model,
                scaler=scaler,
                raw_df=group_df,  # Use position-specific dataframe
                player_names=player_names
            )
            if predictions:
                all_predictions.extend(predictions)
    
    return pd.DataFrame(all_predictions) if all_predictions else None




In [134]:
# Load data
raw_df = pd.read_csv('../data/mlb_fielding_data_2000_2024_with_age.csv')
player_names = pd.read_csv('../data/batter_names.csv')

# Load position-specific models and configs
position_models = {
    'INFIELD': (
        load_model_from_checkpoint('checkpoints/IF_fielding_model.pth', infield_config, device),
        joblib.load('fielding_scaler_infield.pkl'),
        infield_config
    ),
    'OUTFIELD': (
        load_model_from_checkpoint('checkpoints/OF_fielding_model.pth', outfield_config, device),
        joblib.load('fielding_scaler_outfield.pkl'),
        outfield_config
    ),
    'CATCHER': (
        load_model_from_checkpoint('checkpoints/C_fielding_model.pth', catcher_config, device),
        joblib.load('fielding_scaler_catcher.pkl'),
        catcher_config
    )
}

# Map positions to position groups
position_group_map = {
    '1B': 'INFIELD', '2B': 'INFIELD', '3B': 'INFIELD', 'SS': 'INFIELD',
    'LF': 'OUTFIELD', 'CF': 'OUTFIELD', 'RF': 'OUTFIELD',
    'C': 'CATCHER'
}
# Calculate rate stats for each position group
for group, (model, scaler, config) in position_models.items():
    # Filter for position group
    group_df = raw_df[raw_df['Pos'].isin(config.valid_positions)]
    # Calculate rate stats
    group_df = calculate_rate_stats(group_df, group)
    # Update raw_df with calculated stats
    raw_df.update(group_df)

# Generate predictions
predictions_df = predict_all_2024_players(raw_df, player_names, position_models, position_group_map)

# After predictions are generated
if predictions_df is not None:
    # Save predictions for each year
    for year in range(2025, 2040):
        year_predictions = predictions_df[predictions_df['Year'] == year].copy()
        filename = f'../data/generated/Fielding_Predictions_{year}.csv'
        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        year_predictions.to_csv(filename, index=False)
        logger.info(f"Saved predictions for {year} to {filename}")

INFO:__main__:Loading model from checkpoints/IF_fielding_model.pth
INFO:__main__:Loading model from checkpoints/OF_fielding_model.pth
INFO:__main__:Loading model from checkpoints/C_fielding_model.pth
INFO:__main__:
Processing INFIELD - 405 players
INFIELD: 0it [00:00, ?it/s]INFO:__main__:
Debugging prediction for player 18314
INFO:__main__:Position: SS
INFO:__main__:Seasons of data: 9
INFO:__main__:Features available: Index(['Age', 'DRS/150', 'UZR/150', 'OAA/150', 'RngR/150', 'ErrR/150',
       'DPR/150', 'Inn'],
      dtype='object')
INFO:__main__:Any NaN in features: False
INFO:__main__:Player name: Dansby Swanson
INFO:__main__:Last season: 2024
INFO:__main__:Recent data shape: (9, 8)
INFO:__main__:Recent data:
        Age    DRS/150   UZR/150    OAA/150  RngR/150  ErrR/150   DPR/150  \
39357  26.0  22.500000  0.500000  15.000000 -3.750000  4.000000  0.250000   
41249  27.0  -6.603774  1.037736   2.830189  0.754717  0.188679  0.094340   
43670  28.0   8.385093  0.838509  18.633540  0

In [132]:
def analyze_2024_data_quality(raw_df, position_models):
    """Analyze data quality for 2024 players by position group"""
    for group, (model, scaler, config) in position_models.items():
        print(f"\n{'-'*50}")
        print(f"Analyzing {group} players")
        print(f"{'-'*50}")
        
        # Get 2024 players meeting innings requirement
        players = raw_df[
            (raw_df['Season'] == 2024) & 
            (raw_df['Inn'] >= config.min_innings) &
            (raw_df['Pos'].isin(config.valid_positions))
        ]
        
        total_players = len(players)
        print(f"\nTotal players meeting innings requirement: {total_players}")
        
        # Calculate rate stats
        players = calculate_rate_stats(players, group)
        
        # Check each feature
        print("\nMissing data analysis:")
        for feature in config.input_features:
            missing = players[feature].isna().sum()
            if missing > 0:
                print(f"{feature}: {missing} missing ({(missing/total_players)*100:.1f}%)")
                
                # Show example players with missing data
                print("\nExample players with missing data:")
                missing_players = players[players[feature].isna()]
                print(missing_players[['Name', 'Pos', 'Inn', feature]].head())
        
        # Check completely empty players
        empty_players = players[players[config.input_features].isna().all(axis=1)]
        if len(empty_players) > 0:
            print(f"\nPlayers with all features missing: {len(empty_players)}")
            print(empty_players[['Name', 'Pos', 'Inn']].head())

# Run analysis
analyze_2024_data_quality(raw_df, position_models)


--------------------------------------------------
Analyzing INFIELD players
--------------------------------------------------

Total players meeting innings requirement: 405

Missing data analysis:
UZR/150: 1 missing (0.2%)

Example players with missing data:
                 Name Pos   Inn  UZR/150
49282  Ezequiel Duran  1B  99.0      NaN
RngR/150: 1 missing (0.2%)

Example players with missing data:
                 Name Pos   Inn  RngR/150
49282  Ezequiel Duran  1B  99.0       NaN
ErrR/150: 1 missing (0.2%)

Example players with missing data:
                 Name Pos   Inn  ErrR/150
49282  Ezequiel Duran  1B  99.0       NaN
DPR/150: 61 missing (15.1%)

Example players with missing data:
                  Name Pos    Inn  DPR/150
48526       Ryan Bliss  2B  177.0      NaN
48545    Danny Mendick  2B   98.1      NaN
48549     Aaron Schunk  3B   89.0      NaN
48593     Bligh Madris  1B  165.2      NaN
48595  Oswaldo Cabrera  2B   77.1      NaN

--------------------------------------