<a href="https://colab.research.google.com/github/lkhok22/ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting/blob/main/model_experiment_DLinear.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# Install required libraries
!pip install pandas numpy matplotlib seaborn scikit-learn torch torchvision wandb pyyaml darts --quiet
import wandb
wandb.login(key="eccf2c915699fc032ad678daf0fd4b5ac60bf87c")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [16]:
# Mount Google Drive and extract data
from google.colab import drive
import zipfile
import os
drive.mount('/content/drive')
zip_path = '/content/drive/MyDrive/ML-FinalProject/data.zip'
extract_to = '/content/walmart_data/'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)
for file_name in os.listdir(extract_to):
    if file_name.endswith('.zip'):
        with zipfile.ZipFile(os.path.join(extract_to, file_name), 'r') as zip_ref:
            zip_ref.extractall(extract_to)
print("✅ Extracted files:", os.listdir(extract_to))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Extracted files: ['test.csv.zip', 'features.csv', 'train.csv.zip', 'train.csv', 'features.csv.zip', 'test.csv', 'stores.csv', 'sampleSubmission.csv.zip', 'sampleSubmission.csv']


In [17]:
# Import libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from datetime import timedelta

In [None]:
# Load and preprocess data
train = pd.read_csv('/content/walmart_data/train.csv')
features = pd.read_csv('/content/walmart_data/features.csv')
stores = pd.read_csv('/content/walmart_data/stores.csv')
test = pd.read_csv('/content/walmart_data/test.csv')

# Merge train with features and stores
df = pd.merge(train, features, on=['Store', 'Date'], how='left')
df = pd.merge(df, stores, on='Store', how='left')
df = df.drop(columns=['IsHoliday_x']).rename(columns={'IsHoliday_y': 'IsHoliday'})
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by=['Store', 'Dept', 'Date'])

# Handle missing values
df['MarkDown1'].fillna(0, inplace=True)
df['MarkDown2'].fillna(0, inplace=True)
df['MarkDown3'].fillna(0, inplace=True)
df['MarkDown4'].fillna(0, inplace=True)
df['MarkDown5'].fillna(0, inplace=True)
df['CPI'].fillna(df['CPI'].mean(), inplace=True)
df['Unemployment'].fillna(df['Unemployment'].mean(), inplace=True)
df['Temperature'].fillna(df['Temperature'].mean(), inplace=True)
df['Fuel_Price'].fillna(df['Fuel_Price'].mean(), inplace=True)
df['IsHoliday'] = df['IsHoliday'].astype(int)  # Convert boolean to 0/1
df['Type'] = df['Type'].map({'A': 0, 'B': 1, 'C': 2})
df['Size'].fillna(df['Size'].mean(), inplace=True)  # Fill missing Size values

# Check for NaN or inf in data
feature_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'MarkDown1', 'MarkDown2',
                'MarkDown3', 'MarkDown4', 'MarkDown5', 'Size', 'Type', 'IsHoliday']
assert not df[['Weekly_Sales'] + feature_cols].isna().any().any(), "NaN values found in data"
assert not df[['Weekly_Sales'] + feature_cols].isin([np.inf, -np.inf]).any().any(), "Inf values found in data"

# Normalize features with robust scaling
scaler_sales = StandardScaler()
scaler_features = StandardScaler()
df['Weekly_Sales'] = scaler_sales.fit_transform(df[['Weekly_Sales']].clip(lower=-1e5, upper=1e5))  # Clip outliers
df[feature_cols] = scaler_features.fit_transform(df[feature_cols].clip(lower=-1e5, upper=1e5))  # Clip outliers

# Create time series dictionary with exogenous features
store_dept_pairs = df[['Store', 'Dept']].drop_duplicates()
time_series_dict = {}
for _, row in store_dept_pairs.iterrows():
    store, dept = row['Store'], row['Dept']
    sub_df = df[(df['Store'] == store) & (df['Dept'] == dept)].sort_values('Date')

    # Ensure continuous weekly data
    date_range = pd.date_range(start=sub_df['Date'].min(), end=sub_df['Date'].max(), freq='W-FRI')
    sub_df = sub_df.set_index('Date').reindex(date_range, method='ffill').reset_index()  # Forward fill
    sub_df['Store'] = store
    sub_df['Dept'] = dept
    sub_df['Weekly_Sales'].fillna(0, inplace=True)
    sub_df['IsHoliday'].fillna(0, inplace=True)  # Fill missing holidays as non-holiday
    sub_df['Type'].fillna(df[df['Store'] == store]['Type'].iloc[0], inplace=True)  # Fill with store's Type
    sub_df['Size'].fillna(df[df['Store'] == store]['Size'].iloc[0], inplace=True)  # Fill with store's Size
    sub_df[feature_cols] = sub_df[feature_cols].fillna(0)  # Fill remaining feature_cols with 0

    # Check for NaN or inf after reindexing
    assert not sub_df[['Weekly_Sales'] + feature_cols].isna().any().any(), f"NaN in sub_df for Store {store}, Dept {dept}"
    assert not sub_df[['Weekly_Sales'] + feature_cols].isin([np.inf, -np.inf]).any().any(), f"Inf in sub_df for Store {store}, Dept {dept}"

    time_series_dict[(store, dept)] = {
        'sales': sub_df['Weekly_Sales'].values.astype(np.float32),
        'features': sub_df[feature_cols].values.astype(np.float32),
        'dates': sub_df['index'].values,
        'is_holiday': sub_df['IsHoliday'].values.astype(np.float32)
    }
print(f"Created time series for {len(time_series_dict)} store-department pairs.")

In [51]:
class WalmartSalesDataset(Dataset):
    def __init__(self, time_series_dict, seq_len=36, pred_len=6, train=True):
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.data = []
        self.holiday_weights = []

        for (store, dept), ts_data in time_series_dict.items():
            sales = ts_data['sales']
            features = ts_data['features']
            is_holiday = ts_data['is_holiday']
            n = len(sales)
            if n < seq_len + pred_len:
                continue
            split_idx = int(0.8 * (n - seq_len - pred_len + 1)) if train else 0
            start_idx = 0 if train else split_idx
            end_idx = split_idx if train else n - seq_len - pred_len + 1
            for i in range(start_idx, end_idx):
                x_sales = sales[i:i + seq_len]
                x_features = features[i:i + seq_len]
                y = sales[i + seq_len:i + seq_len + pred_len]
                w = is_holiday[i + seq_len:i + seq_len + pred_len] * 4 + 1
                # Validate data
                if np.any(np.isnan(x_sales)) or np.any(np.isnan(x_features)) or np.any(np.isnan(y)) or np.any(np.isnan(w)):
                    continue
                if np.any(np.isinf(x_sales)) or np.any(np.isinf(x_features)) or np.any(np.isinf(y)) or np.any(np.isinf(w)):
                    continue
                self.data.append((x_sales, x_features, y))
                self.holiday_weights.append(w)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x_sales, x_features, y = self.data[idx]
        w = self.holiday_weights[idx]
        return (torch.tensor(x_sales, dtype=torch.float32).unsqueeze(-1),
                torch.tensor(x_features, dtype=torch.float32),
                torch.tensor(y, dtype=torch.float32),
                torch.tensor(w, dtype=torch.float32))

In [52]:
# Create train and validation datasets
train_dataset = WalmartSalesDataset(time_series_dict, seq_len=36, pred_len=6, train=True)
val_dataset = WalmartSalesDataset(time_series_dict, seq_len=36, pred_len=6, train=False)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
print(f"Train samples: {len(train_dataset)}, Val samples: {len(val_dataset)}")

Train samples: 251133, Val samples: 316241


In [53]:
# Enhanced DLinear model with exogenous features
class DLinear(nn.Module):
    def __init__(self, seq_len, pred_len, n_features):
        super(DLinear, self).__init__()
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.n_features = n_features

        # Linear layers for sales (trend and seasonal)
        self.Linear_Trend = nn.Linear(seq_len, pred_len)
        self.Linear_Seasonal = nn.Linear(seq_len, pred_len)

        # Linear layer for exogenous features
        self.Linear_Exogenous = nn.Linear(seq_len * n_features, pred_len)

    def forward(self, x_sales, x_features):
        # x_sales: (batch_size, seq_len, 1)
        # x_features: (batch_size, seq_len, n_features)
        x_sales = x_sales.squeeze(-1)  # (batch_size, seq_len)
        trend = self.Linear_Trend(x_sales)
        seasonal = self.Linear_Seasonal(x_sales)

        # Flatten features and process
        x_features = x_features.view(x_features.size(0), -1)  # (batch_size, seq_len * n_features)
        exogenous = self.Linear_Exogenous(x_features)

        return trend + seasonal + exogenous

In [54]:
# Custom WMAE loss
def wmae_loss(preds, targets, weights):
    return torch.mean(weights * torch.abs(preds - targets))

In [56]:
# Training and validation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DLinear(seq_len=36, pred_len=6, n_features=12).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion_mse = nn.MSELoss()
criterion_wmae = wmae_loss

# Initialize WandB
wandb.init(project="walmart-dlinear", name="dlinear-enhanced-run", config={
    "seq_len": 36, "pred_len": 6, "batch_size": 32, "epochs": 20,
    "learning_rate": 0.001, "model": "DLinear-Enhanced", "n_features": 12
})

# Training and validation loop
for epoch in range(20):
    model.train()
    train_loss_mse, train_loss_wmae = 0.0, 0.0
    train_batches = 0
    for xb_sales, xb_features, yb, wb in train_loader:
        xb_sales, xb_features, yb, wb = xb_sales.to(device), xb_features.to(device), yb.to(device), wb.to(device)
        optimizer.zero_grad()
        preds = model(xb_sales, xb_features)
        loss_mse = criterion_mse(preds, yb)
        loss_wmae = criterion_wmae(preds, yb, wb)
        loss = loss_mse + loss_wmae
        if torch.isnan(loss) or torch.isinf(loss):
            continue  # Skip batch if loss is invalid
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Prevent exploding gradients
        optimizer.step()
        train_loss_mse += loss_mse.item() * xb_sales.size(0)
        train_loss_wmae += loss_wmae.item() * xb_sales.size(0)
        train_batches += xb_sales.size(0)

    train_loss_mse = train_loss_mse / train_batches if train_batches > 0 else float('nan')
    train_loss_wmae = train_loss_wmae / train_batches if train_batches > 0 else float('nan')

    # Validation
    model.eval()
    val_loss_mse, val_loss_wmae = 0.0, 0.0
    val_batches = 0
    with torch.no_grad():
        for xb_sales, xb_features, yb, wb in val_loader:
            xb_sales, xb_features, yb, wb = xb_sales.to(device), xb_features.to(device), yb.to(device), wb.to(device)
            preds = model(xb_sales, xb_features)
            loss_mse = criterion_mse(preds, yb)
            loss_wmae = criterion_wmae(preds, yb, wb)
            if torch.isnan(loss_mse) or torch.isinf(loss_mse) or torch.isnan(loss_wmae) or torch.isinf(loss_wmae):
                continue
            val_loss_mse += loss_mse.item() * xb_sales.size(0)
            val_loss_wmae += loss_wmae.item() * xb_sales.size(0)
            val_batches += xb_sales.size(0)

    val_loss_mse = val_loss_mse / val_batches if val_batches > 0 else float('nan')
    val_loss_wmae = val_loss_wmae / val_batches if val_batches > 0 else float('nan')

    print(f"Epoch {epoch+1}/20 — Train MSE: {train_loss_mse:.4f}, Train WMAE: {train_loss_wmae:.4f}, "
          f"Val MSE: {val_loss_mse:.4f}, Val WMAE: {val_loss_wmae:.4f}")
    wandb.log({"train_mse": train_loss_mse, "train_wmae": train_loss_wmae,
               "val_mse": val_loss_mse, "val_wmae": val_loss_wmae, "epoch": epoch+1})

Epoch 1/20 — Train MSE: 0.1074, Train WMAE: 0.2894, Val MSE: 0.1209, Val WMAE: 0.2594
Epoch 2/20 — Train MSE: 0.0961, Train WMAE: 0.2790, Val MSE: 0.1053, Val WMAE: 0.2376
Epoch 3/20 — Train MSE: 0.0957, Train WMAE: 0.2771, Val MSE: 0.1103, Val WMAE: 0.2672
Epoch 4/20 — Train MSE: 0.0955, Train WMAE: 0.2782, Val MSE: 0.1175, Val WMAE: 0.2670
Epoch 5/20 — Train MSE: 0.0961, Train WMAE: 0.2785, Val MSE: 0.1009, Val WMAE: 0.2485
Epoch 6/20 — Train MSE: 0.0956, Train WMAE: 0.2775, Val MSE: 0.1151, Val WMAE: 0.2935
Epoch 7/20 — Train MSE: 0.0955, Train WMAE: 0.2783, Val MSE: 0.1192, Val WMAE: 0.2720
Epoch 8/20 — Train MSE: 0.0957, Train WMAE: 0.2778, Val MSE: 0.1063, Val WMAE: 0.2626
Epoch 9/20 — Train MSE: 0.0955, Train WMAE: 0.2776, Val MSE: 0.1006, Val WMAE: 0.2502
Epoch 10/20 — Train MSE: 0.0955, Train WMAE: 0.2781, Val MSE: 0.1049, Val WMAE: 0.2695
Epoch 11/20 — Train MSE: 0.0958, Train WMAE: 0.2780, Val MSE: 0.1092, Val WMAE: 0.2520
Epoch 12/20 — Train MSE: 0.0954, Train WMAE: 0.2772,

In [57]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import os

# Ensure the model is in evaluation mode
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 1: Preprocess the test data
test = pd.read_csv('/content/walmart_data/test.csv')
features = pd.read_csv('/content/walmart_data/features.csv')
stores = pd.read_csv('/content/walmart_data/stores.csv')

# Merge test with features and stores
test_df = pd.merge(test, features, on=['Store', 'Date'], how='left')
test_df = pd.merge(test_df, stores, on='Store', how='left')
test_df = test_df.drop(columns=['IsHoliday_x']).rename(columns={'IsHoliday_y': 'IsHoliday'})
test_df['Date'] = pd.to_datetime(test_df['Date'])
test_df = test_df.sort_values(by=['Store', 'Dept', 'Date'])

# Handle missing values consistently with training
test_df['MarkDown1'].fillna(0, inplace=True)
test_df['MarkDown2'].fillna(0, inplace=True)
test_df['MarkDown3'].fillna(0, inplace=True)
test_df['MarkDown4'].fillna(0, inplace=True)
test_df['MarkDown5'].fillna(0, inplace=True)
test_df['CPI'].fillna(df['CPI'].mean(), inplace=True)  # Use training mean
test_df['Unemployment'].fillna(df['Unemployment'].mean(), inplace=True)
test_df['Temperature'].fillna(df['Temperature'].mean(), inplace=True)
test_df['Fuel_Price'].fillna(df['Fuel_Price'].mean(), inplace=True)
test_df['IsHoliday'] = test_df['IsHoliday'].astype(int)
test_df['Type'] = test_df['Type'].map({'A': 0, 'B': 1, 'C': 2})
test_df['Size'].fillna(df['Size'].mean(), inplace=True)

# Scale features using the same scaler as training
feature_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'MarkDown1', 'MarkDown2',
                'MarkDown3', 'MarkDown4', 'MarkDown5', 'Size', 'Type', 'IsHoliday']
test_df[feature_cols] = scaler_features.transform(test_df[feature_cols].clip(lower=-1e5, upper=1e5))

# Check for NaN or inf
assert not test_df[feature_cols].isna().any().any(), "NaN values in test data"
assert not test_df[feature_cols].isin([np.inf, -np.inf]).any().any(), "Inf values in test data"

# Step 2: Prepare test dataset for DLinear
def create_test_sequences(test_df, time_series_dict, seq_len=36):
    test_data = []
    test_ids = []
    store_dept_pairs = test_df[['Store', 'Dept']].drop_duplicates()

    for _, row in store_dept_pairs.iterrows():
        store, dept = row['Store'], row['Dept']
        test_sub_df = test_df[(test_df['Store'] == store) & (test_df['Dept'] == dept)].sort_values('Date')
        train_sub_df = df[(df['Store'] == store) & (df['Dept'] == dept)].sort_values('Date')

        if len(test_sub_df) == 0:
            continue

        # Get the last seq_len weeks from training data
        if (store, dept) in time_series_dict:
            train_sales = time_series_dict[(store, dept)]['sales']
            train_features = time_series_dict[(store, dept)]['features']
            train_dates = time_series_dict[(store, dept)]['dates']
        else:
            # If no training data, use zeros or mean values
            train_sales = np.zeros(seq_len, dtype=np.float32)
            train_features = np.zeros((seq_len, len(feature_cols)), dtype=np.float32)
            train_dates = test_sub_df['Date'].min() - pd.Timedelta(weeks=seq_len)
            train_dates = pd.date_range(end=train_dates, periods=seq_len, freq='W-FRI')

        # For each test date, create input sequence
        for date in test_sub_df['Date'].unique():
            test_row = test_sub_df[test_sub_df['Date'] == date]
            if len(test_row) == 0:
                continue

            # Find the index of the closest prior date in training
            last_train_date = train_dates[-1] if len(train_dates) > 0 else date - pd.Timedelta(weeks=seq_len)
            weeks_diff = (date - pd.to_datetime(last_train_date)).days // 7

            # Use the last seq_len weeks from training data
            x_sales = train_sales[-seq_len:] if len(train_sales) >= seq_len else np.pad(train_sales, (seq_len - len(train_sales), 0), mode='constant')
            x_features = train_features[-seq_len:] if len(train_features) >= seq_len else np.pad(train_features, ((seq_len - len(train_features), 0), (0, 0)), mode='constant')

            # Append test features (only for the current week)
            test_features = test_row[feature_cols].values.astype(np.float32)
            if len(test_features) > 0:
                x_features[-1] = test_features[0]  # Replace the last week's features with test week's features

            # Create Id for submission
            date_str = date.strftime('%Y-%m-%d')
            test_id = f"{int(store)}_{int(dept)}_{date_str}"

            test_data.append((x_sales, x_features))
            test_ids.append(test_id)

    return test_data, test_ids

# Create test sequences
test_data, test_ids = create_test_sequences(test_df, time_series_dict, seq_len=36)

# Convert to DataLoader
class TestDataset(Dataset):
    def __init__(self, test_data):
        self.test_data = test_data

    def __len__(self):
        return len(self.test_data)

    def __getitem__(self, idx):
        x_sales, x_features = self.test_data[idx]
        return (torch.tensor(x_sales, dtype=torch.float32).unsqueeze(-1),
                torch.tensor(x_features, dtype=torch.float32))

test_dataset = TestDataset(test_data)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Step 3: Generate predictions
predictions = []
with torch.no_grad():
    for xb_sales, xb_features in test_loader:
        xb_sales, xb_features = xb_sales.to(device), xb_features.to(device)
        preds = model(xb_sales, xb_features)  # Shape: (batch_size, pred_len)
        # Take only the first prediction (for the next week)
        preds = preds[:, 0]  # Shape: (batch_size,)
        predictions.append(preds.cpu().numpy())

predictions = np.concatenate(predictions, axis=0)

# Step 4: Inverse transform predictions
predictions = scaler_sales.inverse_transform(predictions.reshape(-1, 1)).flatten()

# Step 5: Create submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'Weekly_Sales': predictions
})

# Ensure no negative sales predictions
submission['Weekly_Sales'] = submission['Weekly_Sales'].clip(lower=0)

# Save submission file
submission.to_csv('/content/submission.csv', index=False)
print("✅ Submission file created: /content/submission.csv")

# Log submission to WandB
wandb.save('/content/submission.csv')
wandb.finish()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['MarkDown1'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['MarkDown2'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

✅ Submission file created: /content/submission.csv


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_mse,█▂▁▁▂▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁
train_wmae,█▃▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▂▂
val_mse,█▄▅▇▂▆█▄▂▃▅▂▁▃▄▆▃▃▂▄
val_wmae,▄▁▅▅▂█▅▄▃▅▃▄▃▃▅▇▃▆▄▆

0,1
epoch,20.0
train_mse,0.09562
train_wmae,0.27796
val_mse,0.10572
val_wmae,0.27525


# public score 6098