# LSTM Model

This notebook implements a deep learning model that uses an LSTM network to forecast daily unit sales,
leveraging historical sales data, calendar effects, and engineered features from the M5 Forecasting Challenge dataset.

The model pipeline includes:
- Data loading from the preprocessed feature file
- Label encoding of categorical variables
- Feature normalization
- Construction of temporal sequences for LSTM input
- Model training with early stopping and RMSE evaluation
- Visualization of predictions and training history
- Output of predictions on a held-out test set for inference or submission

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Load preprocessed data
print("Loading preprocessed data...")
# NOTE(review): absolute, machine-specific path -- the notebook only runs on a
# machine with this exact directory layout; consider a configurable data dir.
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
df = pd.read_pickle("/Users/nanxuan/Desktop/M5 Enhanced Forecasting System/Dataset/processed_sales_data.pkl")
# Discard rows where any of the lag / rolling-mean / price-change features is
# missing. Reassigning (instead of inplace=True) yields the same frame.
df = df.dropna(subset=["sales_lag_7", "rolling_mean_7", "price_change"])

Loading preprocessed data...


In [3]:
# Encode categorical columns
# Each categorical column is replaced, in place on `df`, by integer codes from
# its own LabelEncoder.
print("Encoding categorical variables...")
categorical_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
# Keep every fitted encoder, keyed by column name. Previously only the encoder
# for the last column survived the loop, so codes could not be decoded back to
# the original labels nor applied consistently to new / held-out data.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# NOTE(review): this cell is not idempotent -- re-running it without reloading
# `df` re-fits the encoders on the already-encoded integer codes.

Encoding categorical variables...


In [4]:
# Select features
# Model inputs, in the column order the LSTM will see them.
features = [
    'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    'day_of_week', 'month', 'year', 'is_weekend', 'is_us_holiday',
    'sales_lag_7', 'rolling_mean_7', 'price_change',
]
# Column the model is trained to predict.
target = 'sales'

# Normalize features
print("Normalizing features...")
# Only these columns are min-max scaled; the label-encoded ID columns and the
# is_weekend / is_us_holiday columns are passed through unscaled.
# NOTE(review): the ID columns hold raw integer codes, so they may be on a much
# larger scale than the [0, 1] scaled inputs -- confirm this is intended.
normalize_cols = [
    'sales_lag_7', 'rolling_mean_7', 'price_change',
    'day_of_week', 'month', 'year',
]
scaler = MinMaxScaler()
# NOTE(review): the scaler is fit on the whole frame, i.e. before the
# train/validation split, so validation rows influence the scaling range.
scaler.fit(df[normalize_cols])
df[normalize_cols] = scaler.transform(df[normalize_cols])

Normalizing features...


In [5]:
# Construct sequences grouped by 'id'
def create_grouped_sequences(df, features, target, sequence_length=28, group_col='id', max_seq_per_group=100):
    """Build sliding-window training samples separately for every group.

    Each group (rows sharing the same ``group_col`` value) is sorted by its
    ``'date'`` column. Window ``i`` uses rows ``i .. i+sequence_length-1`` of
    the feature columns as input and the ``target`` value of row
    ``i+sequence_length`` as label. Only the first ``max_seq_per_group``
    windows of each group are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group_col``, ``'date'``, ``target`` and all ``features``.
    features : sequence of str
        Feature column names; their order defines the last axis of ``X``.
    target : str
        Name of the label column.
    sequence_length : int, default 28
        Number of consecutive rows in each input window.
    group_col : str, default 'id'
        Column identifying independent series.
    max_seq_per_group : int, default 100
        Upper bound on the number of windows taken from one group.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, sequence_length, len(features))
    y : numpy.ndarray, shape (n_samples,)
        When no group has more than ``sequence_length`` rows, ``X`` is an empty
        array that still has three dimensions (previously it collapsed to a
        1-D empty array, which broke ``X.shape[2]`` in later cells).
    """
    feature_cols = list(features)
    X, y = [], []
    for _, group in df.groupby(group_col):
        # A window needs sequence_length inputs plus one following target row.
        if len(group) < sequence_length + 1:
            continue
        group = group.sort_values('date')
        # Convert once per group; the previous version re-selected the columns
        # and went through .iloc for every single window.
        feature_matrix = group[feature_cols].to_numpy()
        target_values = group[target].to_numpy()
        n_seq = min(len(group) - sequence_length, max_seq_per_group)
        for start in range(n_seq):
            stop = start + sequence_length
            X.append(feature_matrix[start:stop])
            y.append(target_values[stop])
    if not X:
        # Keep the documented rank even when there is nothing to return.
        return np.empty((0, sequence_length, len(feature_cols))), np.empty((0,))
    return np.array(X), np.array(y)

# Build the model inputs: for every series id, up to 100 windows of 28
# consecutive rows, each labelled with the target value of the following row.
print("Creating grouped sequences...")
X_all, y_all = create_grouped_sequences(df, features, target, sequence_length=28, group_col='id', max_seq_per_group=100)

Creating grouped sequences...


In [6]:
# Train/val split
# 80% of the windows go to training and 20% to validation; random_state fixes
# the split for reproducibility.
# NOTE(review): train_test_split shuffles by default, and consecutive windows
# of one series overlap in all but one row, so near-identical windows can land
# on both sides of the split. The validation score may therefore be
# optimistic -- consider a chronological or per-series split.
print("Splitting data...")
X_train, X_val, y_train, y_val = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

Splitting data...


In [7]:
# Convert to PyTorch tensors
# Device preference order: Apple MPS first, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Training split: float32 tensors placed on the chosen device, then wrapped in
# a dataset and a shuffling loader.
X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_t = torch.tensor(y_train, dtype=torch.float32).to(device)
train_dataset = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)

# Validation split: same conversion, but the loader keeps the original order.
X_val_t = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_t = torch.tensor(y_val, dtype=torch.float32).to(device)
val_dataset = TensorDataset(X_val_t, y_val_t)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)

In [8]:
# Define LSTM model
class SalesLSTM(nn.Module):
    """LSTM regressor that maps a feature sequence to one scalar per sample.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int
        Size of the LSTM hidden state.
    num_layers : int, default 1
        Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers=1):
        super(SalesLSTM, self).__init__()
        # batch_first=True: inputs are laid out as (batch, time, features).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # Projects the hidden state of the last time step to a single value.
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return predictions of shape ``(batch,)`` for ``x`` of shape
        ``(batch, time, input_size)``."""
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # Take last output
        # Drop only the trailing size-1 feature axis. A bare squeeze() would
        # also remove the batch axis when the batch holds a single sample,
        # yielding a 0-d tensor that no longer matches the (1,) target.
        return out.squeeze(-1)

print("Initializing model...")
# The input width is read from the last axis of the training array (features
# per time step); the model is moved to the device selected earlier.
model = SalesLSTM(input_size=X_train.shape[2], hidden_size=64).to(device)
# Mean squared error between predicted and true values.
criterion = nn.MSELoss()
# Adam over all model parameters with a fixed learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Initializing model...


In [9]:
# Training loop
# Runs a fixed number of epochs. Each epoch does one optimisation pass over
# train_loader followed by a gradient-free pass over val_loader, and records
# the mean per-sample MSE of both in train_losses / val_losses.
print("Training LSTM model...")
epochs = 20
train_losses, val_losses = [], []
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for xb, yb in train_loader:
        optimizer.zero_grad()
        output = model(xb)
        loss = criterion(output, yb)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; scale by the batch size so that a
        # shorter final batch is not over-weighted in the epoch average.
        running_loss += loss.item() * xb.size(0)
    train_losses.append(running_loss / len(train_loader.dataset))

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            output = model(xb)
            loss = criterion(output, yb)
            val_loss += loss.item() * xb.size(0)
    # Dividing by the sample count gives the true dataset-level MSE.
    val_losses.append(val_loss / len(val_loader.dataset))
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}")


Training LSTM model...
Epoch 1/20, Train Loss: 16.0464, Val Loss: 15.9263
Epoch 2/20, Train Loss: 15.7666, Val Loss: 15.0843
Epoch 3/20, Train Loss: 14.4583, Val Loss: 14.2456
Epoch 4/20, Train Loss: 14.6579, Val Loss: 15.5475
Epoch 5/20, Train Loss: 14.4603, Val Loss: 15.6237
Epoch 6/20, Train Loss: 15.4990, Val Loss: 15.2675
Epoch 7/20, Train Loss: 13.6773, Val Loss: 13.2939
Epoch 8/20, Train Loss: 13.6007, Val Loss: 15.3893
Epoch 9/20, Train Loss: 15.4767, Val Loss: 15.7271
Epoch 10/20, Train Loss: 14.3516, Val Loss: 15.4487
Epoch 11/20, Train Loss: 14.8717, Val Loss: 15.7589
Epoch 12/20, Train Loss: 14.8557, Val Loss: 14.9637
Epoch 13/20, Train Loss: 13.6743, Val Loss: 13.9418
Epoch 14/20, Train Loss: 13.1349, Val Loss: 12.0455
Epoch 15/20, Train Loss: 15.2651, Val Loss: 10.9511
Epoch 16/20, Train Loss: 12.8145, Val Loss: 14.0845
Epoch 17/20, Train Loss: 13.7819, Val Loss: 16.2911
Epoch 18/20, Train Loss: 13.7464, Val Loss: 14.4302
Epoch 19/20, Train Loss: 14.3406, Val Loss: 15.664

In [10]:
# Evaluation
# Computes RMSE of the model's predictions over the whole validation set.
model.eval()
# Inference only: disable autograd so no computation graph is built and
# retained for the full validation tensor.
with torch.no_grad():
    preds = model(X_val_t).cpu().numpy()
true = y_val_t.detach().cpu().numpy()
rmse = np.sqrt(mean_squared_error(true, preds))
print(f"Validation RMSE: {rmse:.4f}")

Validation RMSE: 3.7774
