<a href="https://colab.research.google.com/github/lkhok22/ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting/blob/main/model_experiment_PatchTST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import libraries
import os
import zipfile
import glob
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import Dataset, DataLoader
import wandb
from datetime import datetime
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Unzip dataset
zip_path = "/content/drive/MyDrive/ML-FinalProject/data.zip"
extract_path = "/content/data"
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
for zip_file in glob.glob(f"{extract_path}/*.csv.zip"):
    with zipfile.ZipFile(zip_file, 'r') as z:
        z.extractall(extract_path)

# Set random seed
torch.manual_seed(42)
np.random.seed(42)

# Initialize wandb
wandb.init(project="walmart-sales-forecasting", config={
    "learning_rate": 0.0005,
    "batch_size": 64,
    "epochs": 30,
    "patch_length": 4,
    "n_patches": 13,
    "d_model": 256,
    "n_heads": 8,
    "n_layers": 4
})
config = wandb.config



Mounted at /content/drive


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlkhok22[0m ([33mlkhok22-free-university-of-tbilisi-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# Load data
train = pd.read_csv(f"{extract_path}/train.csv")
test = pd.read_csv(f"{extract_path}/test.csv")
stores = pd.read_csv(f"{extract_path}/stores.csv")
features = pd.read_csv(f"{extract_path}/features.csv")

# Rename IsHoliday in features to avoid merge conflicts
features = features.rename(columns={"IsHoliday": "IsHoliday_features"})

# Merge datasets
train = train.merge(stores, on="Store", how="left").merge(features, on=["Store", "Date"], how="left")
test = test.merge(stores, on="Store", how="left").merge(features, on=["Store", "Date"], how="left")

# Convert Date to datetime
train["Date"] = pd.to_datetime(train["Date"])
test["Date"] = pd.to_datetime(test["Date"])

# Verify merges
print("Train columns:", train.columns.tolist())
print("Test columns:", test.columns.tolist())
print(f"Train rows: {len(train)}, Test rows: {len(test)}")



Train columns: ['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday', 'Type', 'Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday_features']
Test columns: ['Store', 'Dept', 'Date', 'IsHoliday', 'Type', 'Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday_features']
Train rows: 421570, Test rows: 115064


In [3]:
# Handle missing values
for col in ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]:
    train[col] = train[col].fillna(0)
    test[col] = test[col].fillna(0)
train[["CPI", "Unemployment"]] = train[["CPI", "Unemployment"]].ffill()
test[["CPI", "Unemployment"]] = test[["CPI", "Unemployment"]].ffill()

# Encode categorical variables
le_store = LabelEncoder()
le_dept = LabelEncoder()
le_type = LabelEncoder()
train["Store"] = le_store.fit_transform(train["Store"])
test["Store"] = le_store.transform(test["Store"])
train["Dept"] = le_dept.fit_transform(train["Dept"])
test["Dept_original"] = test["Dept"]  # Preserve original Dept for Ids
test["Dept"] = test["Dept"].apply(lambda x: x if x in le_dept.classes_ else le_dept.classes_[-1])
test["Dept"] = le_dept.transform(test["Dept"])
train["Type"] = le_type.fit_transform(train["Type"])
test["Type"] = le_type.transform(test["Type"])

# Add holiday indicators
holiday_dates = {
    "Super Bowl": ["2010-02-12", "2011-02-11", "2012-02-10", "2013-02-08"],
    "Labor Day": ["2010-09-10", "2011-09-09", "2012-09-07", "2013-09-06"],
    "Thanksgiving": ["2010-11-26", "2011-11-25", "2012-11-23", "2013-11-29"],
    "Christmas": ["2010-12-31", "2011-12-30", "2012-12-28", "2013-12-27"]
}
for holiday, dates in holiday_dates.items():
    dates = pd.to_datetime(dates)
    for df in [train, test]:
        df[holiday] = df["Date"].isin(dates).astype(int)

# Extract date features and holiday weight
for df in [train, test]:
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month
    df["Week"] = df["Date"].dt.isocalendar().week
    df["IsHolidayWeight"] = df["IsHoliday"].apply(lambda x: 5 if x else 1)

# Add lag-52 feature
train["Lag_52"] = train.groupby(["Store", "Dept"])["Weekly_Sales"].shift(52)
# Create a lagged date column in train
train["Date_lag"] = train["Date"] + pd.Timedelta(weeks=52)
test = test.merge(
    train[["Store", "Dept", "Date_lag", "Weekly_Sales"]].copy().rename(columns={"Weekly_Sales": "Weekly_Sales_lag"}),
    left_on=["Store", "Dept", "Date"],
    right_on=["Store", "Dept", "Date_lag"],
    how="left"
)
train["Lag_52"] = train["Lag_52"].fillna(train["Weekly_Sales"].mean())
test["Lag_52"] = test["Weekly_Sales_lag"].fillna(train["Weekly_Sales"].mean())
# Drop temporary column
train = train.drop(columns=["Date_lag"])
test = test.drop(columns=["Weekly_Sales_lag", "Date_lag"], errors="ignore")

# Define features
num_features = ["Size", "Temperature", "Fuel_Price", "MarkDown1", "MarkDown2", "MarkDown3",
                "MarkDown4", "MarkDown5", "CPI", "Unemployment", "Year", "Month", "Week",
                "Super Bowl", "Labor Day", "Thanksgiving", "Christmas", "Lag_52"]
cat_features = ["Store", "Dept", "Type"]
target = "Weekly_Sales"

# Scale numerical features
scaler = StandardScaler()
train[num_features] = scaler.fit_transform(train[num_features])
test[num_features] = scaler.transform(test[num_features])

# Scale target
target_scaler = StandardScaler()
train["Weekly_Sales"] = target_scaler.fit_transform(train[["Weekly_Sales"]]).flatten()

# Verify preprocessing
print(f"Train columns after preprocessing: {train.columns.tolist()}")
print(f"Test columns after preprocessing: {test.columns.tolist()}")
print(f"Train NaNs:\n{train[num_features].isnull().sum()}")
print(f"Test NaNs:\n{test[num_features].isnull().sum()}")

Train columns after preprocessing: ['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday', 'Type', 'Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday_features', 'Super Bowl', 'Labor Day', 'Thanksgiving', 'Christmas', 'Year', 'Month', 'Week', 'IsHolidayWeight', 'Lag_52']
Test columns after preprocessing: ['Store', 'Dept', 'Date', 'IsHoliday', 'Type', 'Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday_features', 'Dept_original', 'Super Bowl', 'Labor Day', 'Thanksgiving', 'Christmas', 'Year', 'Month', 'Week', 'IsHolidayWeight', 'Lag_52']
Train NaNs:
Size            0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
Year            0
Month           0
Week            0
Super Bowl      0
Labor Day       0
Thanksgi

In [4]:
def create_sequences(train_df, test_df, seq_length=52):
    train_sequences = []
    train_targets = []
    train_weights = []
    test_sequences = []
    test_ids = []
    n_features = len(num_features + cat_features)

    # Training sequences
    for (store, dept), group in train_df.groupby(["Store", "Dept"]):
        group = group.sort_values("Date")
        features = group[num_features + cat_features].values.astype(np.float64)
        sales = group[target].values.astype(np.float64)
        weights = group["IsHolidayWeight"].values.astype(np.float64)
        for i in range(len(group) - seq_length):
            train_sequences.append(features[i:i+seq_length])
            train_targets.append(sales[i+seq_length])
            train_weights.append(weights[i+seq_length])

    # Mean features for padding
    mean_features = train_df[num_features + cat_features].mean().values.astype(np.float64)

    # Test sequences
    for idx, row in test_df.iterrows():
        store = row["Store"]
        dept = row["Dept"]
        date = row["Date"]
        test_features = row[num_features + cat_features].values.astype(np.float64)
        group = train_df[(train_df["Store"] == store) & (train_df["Dept"] == dept)].sort_values("Date")
        historical = group[group["Date"] < date][num_features + cat_features].values.astype(np.float64)

        seq = np.zeros((seq_length, n_features), dtype=np.float64)
        seq[:] = mean_features
        if len(historical) > 0:
            start_idx = max(0, len(historical) - (seq_length - 1))
            seq[-(len(historical) - start_idx + 1):-1] = historical[start_idx:]
        seq[-1] = test_features
        test_sequences.append(seq)
        test_ids.append(f"{int(row['Store'])}_{int(row['Dept_original'])}_{date.strftime('%Y-%m-%d')}")

    train_sequences = np.array(train_sequences, dtype=np.float64)
    train_targets = np.array(train_targets, dtype=np.float64)
    train_weights = np.array(train_weights, dtype=np.float64)
    test_sequences = np.array(test_sequences, dtype=np.float64)

    print(f"train_sequences shape: {train_sequences.shape}")
    print(f"test_sequences shape: {test_sequences.shape}")
    print(f"Number of unique test_ids: {len(set(test_ids))}")

    return train_sequences, train_targets, train_weights, test_sequences, test_ids

seq_length = config.patch_length * config.n_patches  # 4 * 13 = 52
train_sequences, train_targets, train_weights, test_sequences, test_ids = create_sequences(train, test)

# Convert to tensors
train_sequences = torch.FloatTensor(train_sequences)
train_targets = torch.FloatTensor(train_targets)
train_weights = torch.FloatTensor(train_weights)
test_sequences = torch.FloatTensor(test_sequences)

train_sequences shape: (261083, 52, 21)
test_sequences shape: (115064, 52, 21)
Number of unique test_ids: 115064


In [5]:
class PatchTST(nn.Module):
    def __init__(self, seq_length, patch_length, n_patches, d_model, n_heads, n_layers, n_features):
        super(PatchTST, self).__init__()
        self.patch_length = patch_length
        self.n_patches = n_patches
        self.d_model = d_model

        assert seq_length >= n_patches * patch_length, f"Sequence length ({seq_length}) must be >= n_patches ({n_patches}) * patch_length ({patch_length})"

        self.patch_embedding = nn.Linear(patch_length * n_features, d_model)
        self.position_embedding = nn.Parameter(torch.randn(1, n_patches, d_model))

        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.fc = nn.Linear(d_model * n_patches, 1)

    def forward(self, x):
        batch_size = x.size(0)
        x = x[:, :self.n_patches * self.patch_length, :].view(batch_size, self.n_patches, self.patch_length * x.size(-1))
        x = self.patch_embedding(x) + self.position_embedding
        x = self.transformer(x)
        x = x.view(batch_size, -1)
        x = self.fc(x)
        return x.squeeze(-1)

class SalesDataset(Dataset):
    def __init__(self, sequences, targets, weights):
        self.sequences = sequences
        self.targets = targets
        self.weights = weights

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx], self.weights[idx]

In [6]:
# Split train/validation
val_size = int(0.2 * len(train_sequences))
train_seq, val_seq = train_sequences[:-val_size], train_sequences[-val_size:]
train_tgt, val_tgt = train_targets[:-val_size], train_targets[-val_size:]
train_wgt, val_wgt = train_weights[:-val_size], train_weights[-val_size:]

train_dataset = SalesDataset(train_seq, train_tgt, train_wgt)
val_dataset = SalesDataset(val_seq, val_tgt, val_wgt)
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config.batch_size)

# WMAE Loss
def wmae_loss(pred, target, weight):
    return torch.mean(weight * torch.abs(pred - target))

# Initialize model
n_features = len(num_features + cat_features)
model = PatchTST(seq_length=seq_length, patch_length=config.patch_length, n_patches=config.n_patches,
                 d_model=config.d_model, n_heads=config.n_heads, n_layers=config.n_layers,
                 n_features=n_features)
model = model.cuda() if torch.cuda.is_available() else model
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training loop
best_val_wmae = float("inf")
for epoch in range(config.epochs):
    model.train()
    train_loss = 0
    for seq, tgt, wgt in train_loader:
        seq, tgt, wgt = seq.to(device), tgt.to(device), wgt.to(device)
        optimizer.zero_grad()
        pred = model(seq)
        loss = wmae_loss(pred, tgt, wgt)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * seq.size(0)
    train_loss /= len(train_loader.dataset)

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for seq, tgt, wgt in val_loader:
            seq, tgt, wgt = seq.to(device), tgt.to(device), wgt.to(device)
            pred = model(seq)
            loss = wmae_loss(pred, tgt, wgt)
            val_loss += loss.item() * seq.size(0)
    val_loss /= len(val_loader.dataset)

    scheduler.step(val_loss)
    wandb.log({"epoch": epoch+1, "train_wmae": train_loss, "val_wmae": val_loss, "lr": optimizer.param_groups[0]['lr']})

    if val_loss < best_val_wmae:
        best_val_wmae = val_loss
        torch.save(model.state_dict(), "best_model.pt")

    print(f"Epoch {epoch+1}/{config.epochs}, Train WMAE: {train_loss:.4f}, Val WMAE: {val_loss:.4f}")

Epoch 1/30, Train WMAE: 0.3314, Val WMAE: 0.2009
Epoch 2/30, Train WMAE: 0.1996, Val WMAE: 0.1847
Epoch 3/30, Train WMAE: 0.1874, Val WMAE: 0.1450
Epoch 4/30, Train WMAE: 0.1818, Val WMAE: 0.1383
Epoch 5/30, Train WMAE: 0.1807, Val WMAE: 0.1627
Epoch 6/30, Train WMAE: 0.1785, Val WMAE: 0.1535
Epoch 7/30, Train WMAE: 0.1722, Val WMAE: 0.1621
Epoch 8/30, Train WMAE: 0.1754, Val WMAE: 0.1886
Epoch 9/30, Train WMAE: 0.1762, Val WMAE: 0.1869
Epoch 10/30, Train WMAE: 0.1814, Val WMAE: 0.2197
Epoch 11/30, Train WMAE: 0.1655, Val WMAE: 0.1659
Epoch 12/30, Train WMAE: 0.1609, Val WMAE: 0.1404
Epoch 13/30, Train WMAE: 0.1586, Val WMAE: 0.1310
Epoch 14/30, Train WMAE: 0.1564, Val WMAE: 0.1391
Epoch 15/30, Train WMAE: 0.1546, Val WMAE: 0.1614
Epoch 16/30, Train WMAE: 0.1550, Val WMAE: 0.1377
Epoch 17/30, Train WMAE: 0.1565, Val WMAE: 0.1476
Epoch 18/30, Train WMAE: 0.1673, Val WMAE: 0.1503
Epoch 19/30, Train WMAE: 0.1539, Val WMAE: 0.1589
Epoch 20/30, Train WMAE: 0.1478, Val WMAE: 0.1615
Epoch 21/

In [8]:
# Generate predictions for test set
model.load_state_dict(torch.load("best_model.pt"))
model.eval()
test_predictions = []
with torch.no_grad():
    for i in range(0, len(test_sequences), config.batch_size):
        batch_seq = test_sequences[i:i+config.batch_size].to(device)
        pred = model(batch_seq)
        pred = target_scaler.inverse_transform(pred.cpu().numpy().reshape(-1, 1)).flatten()
        test_predictions.extend(pred)

# Load test.csv to get expected Ids
test_csv = pd.read_csv(f"{extract_path}/test.csv")
test_csv["Date"] = pd.to_datetime(test_csv["Date"])
test_csv["Id"] = test_csv.apply(lambda row: f"{int(row['Store'])}_{int(row['Dept'])}_{row['Date'].strftime('%Y-%m-%d')}", axis=1)

# Create submission DataFrame
submission = pd.DataFrame({
    "Id": test_csv["Id"],
    "Weekly_Sales": [test_predictions[i] if i < len(test_predictions) else train["Weekly_Sales"].mean() for i in range(len(test_csv))]
})

# Clip negative predictions
submission["Weekly_Sales"] = submission["Weekly_Sales"].clip(lower=0)

# Ensure correct number of entries and no duplicates
assert len(submission) == len(test_csv), f"Submission has {len(submission)} rows, expected {len(test_csv)}"
assert len(set(submission["Id"])) == len(submission), f"Found {len(submission) - len(set(submission['Id']))} duplicate Ids"

# Sort by Id
submission = submission.sort_values("Id")

# Verify date range
submission["Date"] = submission["Id"].str.split("_").str[2]
submission["Date"] = pd.to_datetime(submission["Date"])
assert submission["Date"].min() == pd.to_datetime("2012-11-02"), f"Submission starts at {submission['Date'].min()}, expected 2012-11-02"
assert submission["Date"].max() <= pd.to_datetime("2013-07-26"), f"Submission ends at {submission['Date'].max()}, expected <= 2013-07-26"

# Save submission
submission[["Id", "Weekly_Sales"]].to_csv("submission.csv", index=False)
wandb.save("submission.csv")

# Verify submission
print(f"Submission rows: {len(submission)}")
print(f"Unique Ids: {len(set(submission['Id']))}")
print(submission.head())

Submission rows: 115064
Unique Ids: 115064
                     Id  Weekly_Sales       Date
24245  10_10_2012-11-02  46189.890625 2012-11-02
24246  10_10_2012-11-09  49435.960938 2012-11-09
24247  10_10_2012-11-16  48769.394531 2012-11-16
24248  10_10_2012-11-23  58300.500000 2012-11-23
24249  10_10_2012-11-30  48250.031250 2012-11-30


#Score: 5475.74688