# Clean LSTM Implementation for Store Sales Forecasting

This notebook builds a Long Short-Term Memory (LSTM) network for store-sales time series forecasting. It covers loading the preprocessed data, preparing per-series input sequences, defining the model architecture, training, and evaluation.

In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import warnings
import time
import joblib
import wandb
import torch.nn as nn
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')
from evaluation.metrics import summary

# Pick the compute device: CUDA when it is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Seed NumPy and PyTorch so repeated runs draw the same random numbers.
np.random.seed(42)
torch.manual_seed(42)
if device.type == 'cuda':
    torch.cuda.manual_seed(42)

# Input data location and the output folder for this model's artifacts.
# The output folder (and any missing parents) is created on the spot.
DATA_DIR = Path('..') / 'content' / 'data_processed'
RESULTS_DIR = Path('..') / 'results' / 'neural_networks' / 'lstm'
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Data directory: {DATA_DIR}")
print(f"Results directory: {RESULTS_DIR}")

In [None]:
# Read the three data splits and the fitted scalers from DATA_DIR
print("Loading preprocessed data...")

# Parquet splits written by 02_data_preprocessing.ipynb, read in
# train / validation / test order.
train_df, val_df, test_df = (
    pd.read_parquet(DATA_DIR / filename)
    for filename in ('train.parquet', 'val.parquet', 'test.parquet')
)

# Scalers object persisted with joblib next to the splits.
scalers = joblib.load(DATA_DIR / 'scalers.pkl')

# Shapes, column names and date coverage of each split.
print(f"Train data shape: {train_df.shape}")
print(f"Validation data shape: {val_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Available features: {train_df.columns.tolist()}")
print(f"Date range - Train: {train_df['date'].min()} to {train_df['date'].max()}")
print(f"Date range - Val: {val_df['date'].min()} to {val_df['date'].max()}")
print(f"Date range - Test: {test_df['date'].min()} to {test_df['date'].max()}")

# Column dtypes / memory usage, then summary statistics of the sales column.
print("\nTrain data info:")
train_df.info()
print("\nSales statistics in train data:")
print(train_df['sales'].describe())

In [None]:
# Hyperparameters and runtime settings shared by the dataset, model,
# data-loader and trainer cells of this notebook.
CONFIG = dict(
    # --- Model architecture (forwarded to LSTMModel) ---
    sequence_length=15,         # time steps per input window (used as seq_length)
    hidden_size=16,             # LSTM hidden units per direction
    num_layers=3,               # stacked LSTM layers
    bidirectional=False,
    dropout=0.3,
    output_size=1,              # NOTE(review): not passed to LSTMModel; confirm it is still needed

    # --- Training (read by the trainer) ---
    learning_rate=1e-4,         # AdamW learning rate
    weight_decay=1e-3,          # AdamW weight decay
    batch_size=256,             # DataLoader batch size
    epochs=30,                  # maximum number of epochs
    patience=10,                # epochs without improvement before early stopping
    min_delta=1e-4,             # minimum drop in validation loss that counts as improvement
    gradient_clip=0.5,          # max gradient norm; clipping is skipped when <= 0

    # --- Extra regularization knobs ---
    # NOTE(review): verify these keys are consumed downstream before relying on them.
    label_smoothing=0.1,
    noise_factor=0.01,
    validation_frequency=1,

    # --- Data ---
    target_column='sales',      # column the model predicts

    # --- Data loading / performance ---
    num_workers=2,              # DataLoader workers (the loader cell only uses them on CUDA)
    pin_memory=True,            # only honoured on CUDA by the loader cell
    persistent_workers=False,
    # NOTE(review): verify the remaining keys are consumed downstream before relying on them.
    compile_model=False,
    mixed_precision=False,
    eval_frequency=1,
    log_frequency=25,
)

In [None]:
# Debug aid: summarize the target column in every split and list the columns
target_col = CONFIG['target_column']

print(f"\nTarget ({target_col}) statistics:")
for name, frame in (('Train', train_df), ('Val', val_df), ('Test', test_df)):
    # A split without the target column is silently skipped.
    if target_col not in frame.columns:
        continue
    target_values = frame[target_col]
    stats = target_values.describe()
    print(f"  {name}: min={stats['min']:.2f}, max={stats['max']:.2f}, mean={stats['mean']:.2f}, std={stats['std']:.2f}")
    # Share of rows whose target is below zero, as a percentage.
    negative_pct = (target_values < 0).mean() * 100
    print(f"    Negative values: {negative_pct:.2f}%")

# Column inventory of the training split
print(f"\nAvailable columns ({len(train_df.columns)}):")
print(train_df.columns.tolist())

In [None]:
# Build float32 feature/target arrays per split and run basic sanity checks
print("Preparing features from preprocessed data...")

# Every column except the target and the date is treated as a model input.
exclude_cols = [target_col, 'date']
all_feature_cols = [name for name in train_df.columns if name not in exclude_cols]

print("Feature preparation:")
print(f"  Total columns: {len(train_df.columns)}")
print(f"  Excluded: {exclude_cols}")
print(f"  Feature columns: {len(all_feature_cols)}")
print(f"  Sample features: {all_feature_cols[:10]}{'...' if len(all_feature_cols) > 10 else ''}")

# Feature matrices, then target vectors, in train / validation / test order.
X_train, X_val, X_test = (
    frame[all_feature_cols].values.astype(np.float32)
    for frame in (train_df, val_df, test_df)
)
y_train, y_val, y_test = (
    frame[target_col].values.astype(np.float32)
    for frame in (train_df, val_df, test_df)
)

print("\nData shapes:")
print(f"  X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"  X_val: {X_val.shape}, y_val: {y_val.shape}")
print(f"  X_test: {X_test.shape}, y_test: {y_test.shape}")

# Global (all-column) range and moments of the training data.
print("\nData scaling check:")
print(f"  Feature range: [{X_train.min():.3f}, {X_train.max():.3f}]")
print(f"  Feature mean: {X_train.mean():.3f}, std: {X_train.std():.3f}")
print(f"  Target range: [{y_train.min():.3f}, {y_train.max():.3f}]")
print(f"  Target mean: {y_train.mean():.3f}, std: {y_train.std():.3f}")

# Heuristic: a spread above 100 or a std above 10 suggests unscaled features.
feature_range = X_train.max() - X_train.min()
poorly_scaled = feature_range > 100 or X_train.std() > 10
if not poorly_scaled:
    print("Features appear to be reasonably scaled")
else:
    print("WARNING: Features may not be properly scaled")
    print("Large feature values can cause training instability")

# Target heuristics: very high variance first, then presence of negatives.
if y_train.std() > 1000:
    print("WARNING: Target has very high variance")
    print("Consider log transformation or different loss function")
elif (y_train < 0).any():
    print("INFO: Target contains negative values")
else:
    print("Target distribution looks reasonable")

# Count and report NaN / Inf entries in the training arrays.
print("\nData quality check:")
print(f"  X_train - NaN: {np.isnan(X_train).sum()}, Inf: {np.isinf(X_train).sum()}")
print(f"  y_train - NaN: {np.isnan(y_train).sum()}, Inf: {np.isinf(y_train).sum()}")

has_nan = np.isnan(X_train).any() or np.isnan(y_train).any()
has_inf = np.isinf(X_train).any() or np.isinf(y_train).any()
if has_nan:
    print("ERROR: Data contains NaN values")
elif has_inf:
    print("ERROR: Data contains Inf values")
else:
    print("Data quality check passed")

In [None]:
# Prepare Time Series Dataset for LSTM with Proper Grouping
from typing import List


class TimeSeriesDataset(Dataset):
    """Sliding-window dataset whose windows never span two different series.

    Rows are grouped by whichever of ``store_nbr``, ``family`` and ``type``
    are present in ``df``. When none of them is present the whole frame is
    treated as a single series. Each group is sorted by ``date`` and turned
    into windows of ``seq_length`` consecutive rows; the target of a window
    is the target value of the row that immediately follows it.

    Args:
        df: Frame holding a ``date`` column, the feature columns and the
            target column.
        feature_cols: Columns used as model inputs.
        target_col: Column to predict.
        seq_length: Number of time steps per input window.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1, if no group has more
            than ``seq_length`` rows, or if the resulting tensors contain NaN.

    Attributes set on the instance:
        sequences: float32 tensor, (num_windows, seq_length, num_features).
        targets: float32 tensor, (num_windows,).
    """

    # Candidate columns that identify one individual time series.
    GROUP_COLUMNS = ('store_nbr', 'family', 'type')

    def __init__(self, df: pd.DataFrame, feature_cols: List[str],
                 target_col: str, seq_length: int = 7):
        if seq_length < 1:
            raise ValueError(f"seq_length must be at least 1, got {seq_length}")

        self.seq_length = seq_length

        # Group only by the identifier columns that actually exist. Without
        # any of them there is nothing to group on, so the frame is one series
        # (grouping by absent columns would just raise KeyError).
        group_cols = [col for col in self.GROUP_COLUMNS if col in df.columns]
        if group_cols:
            groups = (group for _, group in df.groupby(group_cols))
        else:
            groups = iter((df,))

        windows = []
        window_targets = []
        for group in groups:
            # A group needs at least seq_length + 1 rows: one full window
            # plus the row that supplies its target.
            n_windows = len(group) - seq_length
            if n_windows <= 0:
                continue

            group = group.sort_values('date')
            features = group[feature_cols].values.astype(np.float32)
            targets = group[target_col].values.astype(np.float32)

            windows.extend(
                features[start:start + seq_length] for start in range(n_windows)
            )
            # Window i is paired with targets[i + seq_length], i.e. the tail
            # of the target vector starting at seq_length.
            window_targets.append(targets[seq_length:])

        if not windows:
            raise ValueError("No sequences created! Check your data and sequence length.")

        self.sequences = torch.from_numpy(np.stack(windows))
        self.targets = torch.from_numpy(np.concatenate(window_targets))

        # Explicit raises instead of assert so the checks survive `python -O`.
        if torch.isnan(self.sequences).any():
            raise ValueError("Sequences contain NaN values")
        if torch.isnan(self.targets).any():
            raise ValueError("Targets contain NaN values")

    def __len__(self):
        """Return the number of windows."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``(window, target)`` pair at position ``idx``."""
        return self.sequences[idx], self.targets[idx]

    @property
    def input_dim(self):
        """Number of features per time step."""
        return self.sequences.shape[2]
    
def prepare_lstm_data_correctly(train_df, val_df, test_df, feature_cols, target_col, seq_length=7):
    """Build one ``TimeSeriesDataset`` per split.

    Prints the window length, feature count and target column, then
    constructs the datasets in train, validation, test order using the
    same feature columns, target column and window length for all three.

    Returns:
        A ``(train_dataset, val_dataset, test_dataset)`` tuple.
    """
    print("Preparing LSTM data with proper grouping...")
    print(f"  Sequence length: {seq_length}")
    print(f"  Feature columns: {len(feature_cols)}")
    print(f"  Target column: {target_col}")

    # Each split is windowed independently of the others.
    datasets = tuple(
        TimeSeriesDataset(frame, feature_cols, target_col, seq_length)
        for frame in (train_df, val_df, test_df)
    )
    return datasets

In [None]:
# Create the grouped LSTM datasets for all three splits
print("=" * 60)
print("CREATING GROUPED LSTM DATASETS")
print("=" * 60)

train_dataset, val_dataset, test_dataset = prepare_lstm_data_correctly(
    train_df, val_df, test_df,
    feature_cols=all_feature_cols,
    target_col=target_col,
    seq_length=CONFIG['sequence_length']
)

print("\nSuccessfully created LSTM datasets!")
print("Dataset sizes:")
print(f"  Train: {len(train_dataset)} sequences")
print(f"  Validation: {len(val_dataset)} sequences")
# Report the test split's own size (this line used to print the validation size).
print(f"  Test: {len(test_dataset)} sequences")
print(f"  Input dimension: {test_dataset.input_dim}")
print(f"  Sequence length: {CONFIG['sequence_length']}")

print("\nFinal dataset verification:")
print(f"  Train sequences: {len(train_dataset)}")
print(f"  Validation sequences: {len(val_dataset)}")
print(f"  Test sequences: {len(test_dataset)}")
# Per-window shape: (sequence length, number of features).
print(f"  Input shape per sequence: {train_dataset.sequences.shape[1:]}")
print("=" * 60)

In [None]:
class LSTMModel(nn.Module):
    """LSTM regressor that maps a window of features to one value per sample.

    Pipeline: input dropout -> optional Gaussian input noise (training only)
    -> stacked LSTM -> final hidden state of the last layer -> small MLP head
    (Dropout, Linear, LayerNorm, ReLU, Dropout, Linear).

    Args:
        input_dim: Number of features per time step.
        hidden_size: LSTM hidden units per direction (at least 4).
        num_layers: Number of stacked LSTM layers.
        bidirectional: Run the LSTM in both time directions.
        dropout: Base dropout rate. The input uses ``0.5 * dropout``, the head
            uses ``0.7 * dropout`` and ``0.5 * dropout``, and the LSTM uses
            ``dropout`` between layers when ``num_layers > 1``.
        noise_factor: Standard deviation of the Gaussian noise added to the
            inputs when ``forward(..., add_noise=True)`` is called in
            training mode.

    Raises:
        ValueError: If ``hidden_size`` is smaller than 4.
    """

    def __init__(self, input_dim: int, hidden_size: int = 16, num_layers: int = 1,
                 bidirectional: bool = False, dropout: float = 0.3,
                 noise_factor: float = 0.01):
        super().__init__()

        # Ensure minimum hidden size for stability
        if hidden_size < 4:
            raise ValueError(f"hidden_size must be at least 4, got {hidden_size}")

        self.input_dim = input_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout = dropout
        self.noise_factor = noise_factor

        # Input dropout for regularization
        self.input_dropout = nn.Dropout(dropout * 0.5)

        # nn.LSTM applies its dropout only between stacked layers, so it is
        # switched off for a single-layer network.
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0
        )

        # The head sees both directions' states when the LSTM is bidirectional.
        fc_input_dim = hidden_size * 2 if bidirectional else hidden_size
        fc_hidden_dim = max(hidden_size // 2, 4)  # Ensure minimum size

        self.fc = nn.Sequential(
            nn.Dropout(dropout * 0.7),
            nn.Linear(fc_input_dim, fc_hidden_dim),
            nn.LayerNorm(fc_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout * 0.5),
            nn.Linear(fc_hidden_dim, 1)
        )

        self._initialize_weights()

    def _initialize_weights(self):
        """Apply small-gain initialization to the LSTM and linear layers."""
        for name, param in self.named_parameters():
            if 'weight_ih' in name:
                # Xavier initialization for input weights
                nn.init.xavier_normal_(param, gain=0.5)
            elif 'weight_hh' in name:
                # Orthogonal initialization for recurrent weights
                nn.init.orthogonal_(param, gain=0.5)
            elif 'bias_ih' in name or 'bias_hh' in name:
                nn.init.zeros_(param)
                # LSTM biases are laid out in four equal gate chunks; the
                # second chunk is the forget gate. It is set to 1 in both the
                # input-hidden and the hidden-hidden bias.
                gate_size = param.size(0) // 4
                with torch.no_grad():
                    param[gate_size:2 * gate_size].fill_(1.0)

        # Initialize linear layers conservatively
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_normal_(module.weight, gain=0.5)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    def forward(self, x, add_noise=False):
        """Predict one value per sample.

        Args:
            x: Input windows; the LSTM is batch-first, so the layout is
                (batch, seq_len, input_dim).
            add_noise: Add Gaussian noise with std ``noise_factor`` to the
                inputs. Ignored in eval mode.

        Returns:
            Tensor of shape (batch,).
        """
        x = self.input_dropout(x)

        # Add noise during training for regularization
        if add_noise and self.training:
            x = x + torch.randn_like(x) * self.noise_factor

        lstm_out, _ = self.lstm(x)

        if self.bidirectional:
            # The forward direction finishes at the last time step, but the
            # reverse direction finishes at the FIRST one. Taking
            # lstm_out[:, -1] for both would give the reverse direction a
            # state that has only seen a single step.
            forward_final = lstm_out[:, -1, :self.hidden_size]
            reverse_final = lstm_out[:, 0, self.hidden_size:]
            final_out = torch.cat((forward_final, reverse_final), dim=1)
        else:
            final_out = lstm_out[:, -1, :]

        return self.fc(final_out).squeeze(-1)

    def get_num_parameters(self):
        """Return the number of trainable parameters."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [None]:
class RMSLELoss(nn.Module):
    """Root mean squared logarithmic error with NaN-safe inputs and gradients.

    Computes ``sqrt(mean((log1p(p) - log1p(t)) ** 2))`` where ``p`` and ``t``
    are the predictions and targets clamped to be non-negative and shifted by
    ``epsilon``.

    Args:
        epsilon: Small offset added to both clamped inputs before ``log1p``.
    """

    # Added under the square root: the derivative of sqrt is infinite at 0,
    # so a perfectly fitted batch would otherwise back-propagate NaN.
    # The floor limits the reported loss to a minimum of 1e-6.
    _SQRT_FLOOR = 1e-12

    def __init__(self, epsilon=1e-6):
        super().__init__()
        self.epsilon = epsilon

    def forward(self, predictions, targets):
        """Return the scalar RMSLE between ``predictions`` and ``targets``."""
        # RMSLE is only defined for non-negative values. Both sides are
        # clamped the same way; log1p of a target <= -1 would be NaN/-inf.
        pred_clamped = torch.clamp(predictions, min=0) + self.epsilon
        true_clamped = torch.clamp(targets, min=0) + self.epsilon

        # log1p for numerical stability near zero
        log_pred = torch.log1p(pred_clamped)
        log_true = torch.log1p(true_clamped)

        mse_log = torch.mean((log_pred - log_true) ** 2)
        return torch.sqrt(mse_log + self._SQRT_FLOOR)


class AntiOverfittingTrainer:
    """Training loop with RMSLE loss, early stopping and overfitting checks.

    The trainer optimizes with AdamW, lowers the learning rate with
    ``ReduceLROnPlateau`` on the validation loss, logs every epoch to
    Weights & Biases, keeps a snapshot of the best weights and restores it
    when training finishes normally.

    Args:
        model: Module to train. If its ``forward`` accepts an ``add_noise``
            argument, training batches are run with ``add_noise=True``.
        train_loader: Iterable of ``(features, targets)`` training batches.
        val_loader: Iterable of ``(features, targets)`` validation batches.
        test_loader: Iterable of ``(features, targets)`` test batches.
        config: Settings dict. Reads ``learning_rate``, ``weight_decay``,
            ``epochs``, ``patience`` and ``min_delta`` (required) and
            ``gradient_clip`` and ``log_frequency`` (optional).
        device: Device that batches are moved to.
    """

    def __init__(self, model, train_loader, val_loader, test_loader,
                 config: dict, device):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.config = config
        self.device = device

        # Initialize WandB
        wandb.init(
            project="store-sales-forecasting",
            name=f"lstm-rmsle-{int(time.time())}",
            config=config,
            tags=["lstm", "anti-overfitting", "rmsle-loss", "time-series"],
            reinit=True,
        )

        # RMSLE loss
        self.criterion = RMSLELoss(epsilon=1e-6)
        print("Using RMSLE loss function for time series forecasting")

        # Optimizer
        self.optimizer = optim.AdamW(
            model.parameters(),
            lr=config['learning_rate'],
            weight_decay=config['weight_decay'],
            betas=(0.9, 0.999),
            eps=1e-8
        )

        # Cut the learning rate to 30% after 2 epochs without improvement.
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode='min',
            factor=0.3,
            patience=2,
            min_lr=1e-7,
        )

        # Training state with validation tracking
        self.best_val_loss = float('inf')
        self.best_model_state = None
        self.train_losses = []
        self.val_losses = []
        self.patience_counter = 0
        self.val_metrics_history = []

        # Validation loss above this multiple of the training loss is
        # reported as overfitting.
        self.overfitting_threshold = 1.5

        print(f"Anti-overfitting trainer initialized with RMSLE loss:")
        print(f"  RMSLE epsilon: {1e-6}")
        print(f"  Weight decay: {config['weight_decay']}")
        print(f"  Learning rate: {config['learning_rate']}")
        print(f"  Early stopping patience: {config['patience']}")

    def _snapshot_model_state(self):
        """Return a copy of the model weights that is detached from training.

        ``state_dict()`` hands out references to the live parameter tensors,
        and ``dict.copy()`` is shallow, so a copied state dict keeps changing
        with every optimizer step. Cloning each tensor freezes the values.
        """
        return {
            name: tensor.detach().clone()
            for name, tensor in self.model.state_dict().items()
        }

    def train_epoch(self):
        """Run one pass over the training loader and return the mean batch loss.

        Raises:
            ValueError: If the training loader yields no batches.
        """
        # Function-scope import: this cell does not rely on the notebook's
        # import cell providing `inspect`.
        import inspect

        self.model.train()
        total_loss = 0.0
        num_batches = 0

        # Decide once per epoch, not per batch, whether noise can be injected.
        try:
            supports_noise = 'add_noise' in inspect.signature(self.model.forward).parameters
        except (TypeError, ValueError):
            supports_noise = False

        # Progress is printed every `log_frequency` batches (default 25).
        log_every = max(int(self.config.get('log_frequency', 25)), 1)

        for batch_idx, (features, targets) in enumerate(self.train_loader):
            features, targets = features.to(self.device), targets.to(self.device)

            if supports_noise:
                outputs = self.model(features, add_noise=True)
            else:
                outputs = self.model(features)

            loss = self.criterion(outputs, targets)

            self.optimizer.zero_grad()
            loss.backward()

            # Optional gradient-norm clipping; disabled when the setting is <= 0.
            if self.config.get('gradient_clip', 0) > 0:
                torch.nn.utils.clip_grad_norm_(
                    self.model.parameters(),
                    self.config['gradient_clip']
                )

            self.optimizer.step()

            total_loss += loss.item()
            num_batches += 1

            if batch_idx % log_every == 0:
                print(f"\rBatch {batch_idx}/{len(self.train_loader)} - Loss: {loss.item():.6f}", end="")

        print()
        if num_batches == 0:
            raise ValueError(
                "Training loader produced no batches; check the dataset size, "
                "batch_size and drop_last."
            )
        return total_loss / num_batches

    def evaluate(self, dataloader):
        """Evaluate the model on ``dataloader`` in eval mode, without noise.

        Returns:
            ``(avg_loss, predictions, actuals)`` where ``avg_loss`` is the
            mean of the per-batch losses and the other two are 1-D NumPy
            arrays covering every sample in loader order.

        Raises:
            ValueError: If the loader yields no batches.
        """
        self.model.eval()
        total_loss = 0.0
        num_batches = 0
        predictions = []
        actuals = []

        with torch.no_grad():
            for features, targets in dataloader:
                features, targets = features.to(self.device), targets.to(self.device)
                outputs = self.model(features)
                loss = self.criterion(outputs, targets)

                total_loss += loss.item()
                num_batches += 1
                # atleast_1d keeps concatenation valid for scalar outputs.
                predictions.append(np.atleast_1d(outputs.cpu().numpy()))
                actuals.append(np.atleast_1d(targets.cpu().numpy()))

        if num_batches == 0:
            raise ValueError("Evaluation loader produced no batches.")

        avg_loss = total_loss / num_batches
        return avg_loss, np.concatenate(predictions), np.concatenate(actuals)

    def detect_overfitting(self, train_loss, val_loss, epoch):
        """Return True when the losses show an overfitting pattern.

        Two signals are checked: the validation loss exceeding
        ``overfitting_threshold`` times the training loss, and the last three
        entries of ``self.val_losses`` being non-decreasing.
        """
        overfitting_detected = False

        if val_loss > train_loss * self.overfitting_threshold:
            print(f"Overfitting detected: val_loss ({val_loss:.6f}) > {self.overfitting_threshold}x train_loss ({train_loss:.6f})")
            overfitting_detected = True

        # Check if validation loss is trending upward
        if len(self.val_losses) >= 3:
            recent_val_losses = self.val_losses[-3:]
            if all(earlier <= later for earlier, later in zip(recent_val_losses, recent_val_losses[1:])):
                print(f"Validation loss trending upward for 3 epochs")
                overfitting_detected = True

        return overfitting_detected

    def train(self):
        """Train for up to ``config['epochs']`` epochs.

        Stops early after ``config['patience']`` epochs without an
        improvement larger than ``config['min_delta']``, or when overfitting
        is detected after the sixth epoch. On normal completion the best
        weights are restored. The WandB run is closed even if training
        raises or is interrupted.
        """
        print("Starting LSTM training with RMSLE loss...")
        print(f"Model parameters: {self.model.get_num_parameters():,}")

        try:
            for epoch in range(self.config['epochs']):
                start_time = time.time()

                # Train
                train_loss = self.train_epoch()

                # Validate
                val_loss, val_preds, val_actuals = self.evaluate(self.val_loader)

                # Constant predictions get infinite metrics instead of being scored.
                if val_preds.std() > 1e-8:
                    val_metrics = summary(val_actuals, val_preds)
                else:
                    val_metrics = {'MAE': float('inf'), 'RMSE': float('inf'), 'RMSLE': float('inf'), 'MAPE': float('inf'), 'SMAPE': float('inf')}
                self.val_metrics_history.append(val_metrics)

                # Record this epoch before it is inspected or checkpointed so
                # the trend check and the saved history include it.
                self.train_losses.append(train_loss)
                self.val_losses.append(val_loss)

                # Detect overfitting
                overfitting = self.detect_overfitting(train_loss, val_loss, epoch)

                # Learning rate scheduling
                self.scheduler.step(val_loss)

                # Early stopping and best model saving
                if val_loss < self.best_val_loss - self.config['min_delta']:
                    self.best_val_loss = val_loss
                    self.best_model_state = self._snapshot_model_state()
                    self.patience_counter = 0
                    self.save_checkpoint(epoch, val_metrics)
                    status = "Best"
                else:
                    self.patience_counter += 1
                    status = f" {self.patience_counter}/{self.config['patience']}"

                # Calculate logging variables
                epoch_time = time.time() - start_time
                lr = self.optimizer.param_groups[0]['lr']

                # Log to WandB
                wandb.log({
                    'epoch': epoch + 1,
                    'train_rmsle_loss': train_loss,
                    'val_rmsle_loss': val_loss,
                    'val_mae': val_metrics['MAE'],
                    'val_rmse': val_metrics['RMSE'],
                    'val_rmsle': val_metrics['RMSLE'],
                    'val_mape': val_metrics['MAPE'],
                    'val_smape': val_metrics['SMAPE'],
                    'learning_rate': lr,
                    'epoch_time': epoch_time,
                    'patience_counter': self.patience_counter
                })

                # Display progress
                print(f"Epoch {epoch+1:3d}/{self.config['epochs']} | "
                      f"Train RMSLE: {train_loss:.6f} | Val RMSLE: {val_loss:.6f} | "
                      f"Ratio: {val_loss/train_loss:.2f} | "
                      f"Val RMSLE Metric: {val_metrics['RMSLE']:.6f} | "
                      f"LR: {lr:.2e} | Time: {epoch_time:.1f}s | {status}")

                # Early stopping
                if self.patience_counter >= self.config['patience']:
                    print(f"Early stopping at epoch {epoch+1}")
                    break

                # Force stop on severe overfitting
                if overfitting and epoch > 5:
                    print(f"Stopping due to overfitting detection at epoch {epoch+1}")
                    break

            # Load best model
            if self.best_model_state is not None:
                self.model.load_state_dict(self.best_model_state)
                print(f"\nLoaded best model with validation loss: {self.best_val_loss:.6f}")
        finally:
            # Close the run on success, failure and KeyboardInterrupt alike.
            wandb.finish()

    def save_checkpoint(self, epoch, val_metrics):
        """Write model, optimizer and loss history to ``RESULTS_DIR``.

        The file ``anti_overfitting_model.pt`` is overwritten on every call.
        """
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'val_loss': self.best_val_loss,
            'val_metrics': val_metrics,
            'config': self.config,
            'train_losses': self.train_losses,
            'val_losses': self.val_losses
        }
        torch.save(checkpoint, RESULTS_DIR / 'anti_overfitting_model.pt')

    def final_evaluation(self):
        """Evaluate on the test loader and print a train/val/test comparison.

        Returns:
            ``(test_metrics, test_preds, test_actuals)``.
        """
        print("\n" + "="*50)
        print("FINAL ANTI-OVERFITTING EVALUATION")
        print("="*50)

        test_loss, test_preds, test_actuals = self.evaluate(self.test_loader)
        test_metrics = summary(test_actuals, test_preds)

        print(f"Test RMSLE Loss: {test_loss:.6f}")
        print("\nTest Metrics:")
        for metric, value in test_metrics.items():
            print(f"  {metric}: {value:.6f}")

        # Losses of the last completed epoch; infinity when no epoch ran.
        final_train_loss = self.train_losses[-1] if self.train_losses else float('inf')
        final_val_loss = self.val_losses[-1] if self.val_losses else float('inf')

        print(f"\nOverfitting Analysis:")
        print(f"  Final train RMSLE: {final_train_loss:.6f}")
        print(f"  Final val RMSLE: {final_val_loss:.6f}")
        print(f"  Test RMSLE: {test_loss:.6f}")
        print(f"  Val/Train ratio: {final_val_loss/final_train_loss:.2f}")
        print(f"  Test/Train ratio: {test_loss/final_train_loss:.2f}")

        return test_metrics, test_preds, test_actuals


# Backwards-compatible name: code that still refers to ``Trainer`` gets the
# anti-overfitting trainer without any change in behaviour.
class Trainer(AntiOverfittingTrainer):
    """Alias of ``AntiOverfittingTrainer`` kept under the original class name."""


print("Updated trainer with anti-overfitting measures")

In [None]:
# Build the train / validation / test DataLoaders from the grouped datasets

# Fail fast, with a readable message, before any dataset is dereferenced.
missing_datasets = [
    name for name in ('train_dataset', 'val_dataset', 'test_dataset')
    if name not in globals()
]
if missing_datasets:
    print("❌ Error: No datasets found. Please run the dataset creation cell first.")
    raise ValueError("Datasets not created. Run the previous cells first.")

print("Creating optimized data loaders with properly grouped sequences...")

print("Using datasets:")
print(f"  Train: {len(train_dataset)} sequences")
print(f"  Validation: {len(val_dataset)} sequences")
print(f"  Test: {len(test_dataset)} sequences")
print(f"  Input dimension: {train_dataset.input_dim}")
print(f"  Sequence length: {CONFIG['sequence_length']}")

# Worker processes and pinned memory are only used when running on CUDA.
is_cuda = device.type == 'cuda'
num_workers = CONFIG.get('num_workers', 4) if is_cuda else 0
pin_memory = CONFIG.get('pin_memory', True) and is_cuda
persistent_workers = CONFIG.get('persistent_workers', True) and num_workers > 0

print("\nData loader settings:")
print(f"  Device: {device}")
print(f"  Batch size: {CONFIG['batch_size']}")
print(f"  Num workers: {num_workers}")
print(f"  Pin memory: {pin_memory}")
print(f"  Persistent workers: {persistent_workers}")

# Arguments shared by all three loaders.
loader_kwargs = {
    'batch_size': CONFIG['batch_size'],
    'num_workers': num_workers,
    'pin_memory': pin_memory,
}
if num_workers > 0:
    # Worker-only options are passed only when workers exist; some PyTorch
    # releases reject them when num_workers == 0.
    loader_kwargs['persistent_workers'] = persistent_workers
    loader_kwargs['prefetch_factor'] = 2

# Training batches are shuffled, and the last partial batch is dropped so
# every training batch has the same size.
train_loader = DataLoader(train_dataset, shuffle=True, drop_last=True, **loader_kwargs)

# Evaluation loaders keep the dataset order and every sample.
val_loader = DataLoader(val_dataset, shuffle=False, drop_last=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, drop_last=False, **loader_kwargs)

print("\n Data loaders created successfully with proper grouping!")
print("Data loader batch counts:")
print(f"  Train: {len(train_loader)} batches")
print(f"  Validation: {len(val_loader)} batches")
print(f"  Test: {len(test_loader)} batches")

# Pull one batch to confirm the loaders produce the expected tensor shapes.
try:
    sample_batch = next(iter(train_loader))
    print("\nSample batch test successful:")
    print(f"  Batch input shape: {sample_batch[0].shape}")
    print(f"  Batch target shape: {sample_batch[1].shape}")
    print(f"  Expected: (batch_size={CONFIG['batch_size']}, seq_len={CONFIG['sequence_length']}, features={train_dataset.input_dim})")
except Exception as e:
    # Report, then re-raise so the notebook stops on a broken loader.
    print(f"Error in sample batch: {e}")
    raise

In [None]:
# Instantiate the LSTM from CONFIG and move it to the selected device
model = LSTMModel(
    train_dataset.input_dim,
    **{key: CONFIG[key] for key in ('hidden_size', 'num_layers', 'bidirectional', 'dropout')},
)
model = model.to(device)

# Layer summary followed by the trainable-parameter count.
print("Anti-Overfitting Model Architecture:")
print(model)
print(f"Total parameters: {model.get_num_parameters():,}")
print(f"Model size reduction: {model.get_num_parameters()} vs previous larger models")

In [None]:
# Train the model, timing the run and reporting a short summary afterwards
print("STARTING ANTI-OVERFITTING LSTM TRAINING")
for _ in range(2):
    print("=" * 60)

# Arguments in constructor order: model, the three loaders, config, device.
trainer = AntiOverfittingTrainer(
    model, train_loader, val_loader, test_loader, CONFIG, device
)

start_training_time = time.time()

try:
    trainer.train()
    total_training_time = time.time() - start_training_time
    print(f"\nTraining completed in {total_training_time:.2f} seconds")

    print("\nTraining Summary:")
    print(f"  Best validation loss: {trainer.best_val_loss:.6f}")
    print(f"  Final train/val loss ratio: {trainer.val_losses[-1]/trainer.train_losses[-1]:.2f}")
    print(f"  Early stopping triggered: {trainer.patience_counter >= CONFIG['patience']}")
    print(f"  Epochs completed: {len(trainer.train_losses)}")

except KeyboardInterrupt:
    # A manual interrupt is reported but not re-raised, so later cells can
    # still use the partially trained model.
    print("\nTraining interrupted by user")
    total_training_time = time.time() - start_training_time
    print(f"Training time before interruption: {total_training_time:.2f} seconds")
except Exception as e:
    # Any other failure is reported with the elapsed time and re-raised.
    print(f"\nTraining failed with error: {e}")
    total_training_time = time.time() - start_training_time
    print(f"Training time before failure: {total_training_time:.2f} seconds")
    raise


In [None]:
# Final evaluation: score the trained model on the test loader.
# Returns the metrics computed by `summary` together with the raw test
# predictions and targets, which the visualization cell reuses.
test_metrics, test_preds, test_actuals = trainer.final_evaluation()

In [None]:
# Visualization: 2x2 grid of diagnostic plots, then a printed summary
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Training curves. The trainer records RMSLE losses, so the axis is
#    labelled RMSLE (it used to say "MSE Loss").
axes[0, 0].plot(trainer.train_losses, label='Training Loss', alpha=0.8)
axes[0, 0].plot(trainer.val_losses, label='Validation Loss', alpha=0.8)
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('RMSLE Loss')
axes[0, 0].set_title('Training Progress')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Predictions vs actuals, with the y = x reference line spanning the
#    combined range of both arrays.
axes[0, 1].scatter(test_actuals, test_preds, alpha=0.6, s=1)
min_val = min(test_actuals.min(), test_preds.min())
max_val = max(test_actuals.max(), test_preds.max())
axes[0, 1].plot([min_val, max_val], [min_val, max_val], 'r--', lw=2)
axes[0, 1].set_xlabel('Actual Sales')
axes[0, 1].set_ylabel('Predicted Sales')
axes[0, 1].set_title('Predictions vs Actuals')
axes[0, 1].grid(True, alpha=0.3)

# 3. Residuals (actual - predicted) against the predicted value
residuals = test_actuals - test_preds
axes[1, 0].scatter(test_preds, residuals, alpha=0.6, s=1)
axes[1, 0].axhline(y=0, color='r', linestyle='--', lw=2)
axes[1, 0].set_xlabel('Predicted Sales')
axes[1, 0].set_ylabel('Residuals')
axes[1, 0].set_title('Residuals Plot')
axes[1, 0].grid(True, alpha=0.3)

# 4. Error distribution as a density-normalized histogram
axes[1, 1].hist(residuals, bins=50, alpha=0.7, density=True)
axes[1, 1].axvline(x=0, color='r', linestyle='--', lw=2)
axes[1, 1].set_xlabel('Residuals')
axes[1, 1].set_ylabel('Density')
axes[1, 1].set_title('Residuals Distribution')
axes[1, 1].grid(True, alpha=0.3)

# Save the figure next to the other model artifacts, then display it.
plt.tight_layout()
plt.savefig(RESULTS_DIR / 'lstm_evaluation_plots.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Plots saved to {RESULTS_DIR / 'lstm_evaluation_plots.png'}")

# Summary statistics
print("\nFINAL PERFORMANCE SUMMARY")
print(f"{'='*50}")
print(f"Model: LSTM with {model.get_num_parameters():,} parameters")
print(f"Architecture: {CONFIG['hidden_size']} hidden units, {CONFIG['num_layers']} layers")
print(f"Bidirectional: {CONFIG['bidirectional']}")
print()
print("Test Metrics (Original Scale):")
for metric, value in test_metrics.items():
    print(f"  {metric:6s}: {value:10.6f}")
print()
print("Prediction Quality:")
print(f"  Actual range:    [{test_actuals.min():8.2f}, {test_actuals.max():8.2f}]")
print(f"  Predicted range: [{test_preds.min():8.2f}, {test_preds.max():8.2f}]")
print(f"  Mean error:      {residuals.mean():8.2f}")
print(f"  Std error:       {residuals.std():8.2f}")
print(f"{'='*50}")