<a href="https://colab.research.google.com/github/lkhok22/ML-FinalProject-Walmart-Recruiting---Store-Sales-Forecasting/blob/main/model_experiment_DLinear.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install pandas numpy matplotlib seaborn scikit-learn torch torchvision wandb pyyaml darts --quiet
import wandb
wandb.login(key="eccf2c915699fc032ad678daf0fd4b5ac60bf87c")

In [2]:
# Mount Google Drive and extract data
from google.colab import drive
import zipfile
import os
drive.mount('/content/drive')
zip_path = '/content/drive/MyDrive/ML-FinalProject/data.zip'
extract_to = '/content/walmart_data/'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)
for file_name in os.listdir(extract_to):
    if file_name.endswith('.zip'):
        with zipfile.ZipFile(os.path.join(extract_to, file_name), 'r') as zip_ref:
            zip_ref.extractall(extract_to)
print("✅ Extracted files:", os.listdir(extract_to))

Mounted at /content/drive
✅ Extracted files: ['test.csv.zip', 'features.csv', 'train.csv.zip', 'train.csv', 'features.csv.zip', 'test.csv', 'stores.csv', 'sampleSubmission.csv.zip', 'sampleSubmission.csv']


In [3]:
# Import libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from datetime import timedelta

In [None]:
# Load and preprocess data
train = pd.read_csv('/content/walmart_data/train.csv')
features = pd.read_csv('/content/walmart_data/features.csv')
stores = pd.read_csv('/content/walmart_data/stores.csv')
test = pd.read_csv('/content/walmart_data/test.csv')

# Merge train with features and stores
df = pd.merge(train, features, on=['Store', 'Date'], how='left')
df = pd.merge(df, stores, on='Store', how='left')
df = df.drop(columns=['IsHoliday_x']).rename(columns={'IsHoliday_y': 'IsHoliday'})
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by=['Store', 'Dept', 'Date'])

# Add holiday-specific features
holiday_dates = {
    'SuperBowl': ['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'],
    'LaborDay': ['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'],
    'Thanksgiving': ['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'],
    'Christmas': ['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27']
}
for holiday, dates in holiday_dates.items():
    df[holiday] = df['Date'].isin(pd.to_datetime(dates)).astype(int)

# Add time-based features
df['WeekOfYear'] = df['Date'].dt.isocalendar().week
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year - df['Date'].dt.year.min()

# Add holiday proximity features
for holiday, dates in holiday_dates.items():
    for date in pd.to_datetime(dates):
        df[f'{holiday}_Before'] = ((df['Date'] < date) & (df['Date'] >= date - pd.Timedelta(weeks=2))).astype(int)
        df[f'{holiday}_After'] = ((df['Date'] > date) & (df['Date'] <= date + pd.Timedelta(weeks=2))).astype(int)

# Add lagged sales
df['Lag1'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(1)
df['Lag2'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(2)
df['Lag4'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(4)

# Handle missing values
df['MarkDown1'].fillna(0, inplace=True)
df['MarkDown2'].fillna(0, inplace=True)
df['MarkDown3'].fillna(0, inplace=True)
df['MarkDown4'].fillna(0, inplace=True)
df['MarkDown5'].fillna(0, inplace=True)
df['CPI'].fillna(df['CPI'].mean(), inplace=True)
df['Unemployment'].fillna(df['Unemployment'].mean(), inplace=True)
df['Temperature'].fillna(df['Temperature'].mean(), inplace=True)
df['Fuel_Price'].fillna(df['Fuel_Price'].mean(), inplace=True)
df['IsHoliday'] = df['IsHoliday'].astype(int)
df['Type'] = df['Type'].map({'A': 0, 'B': 1, 'C': 2})
df['Size'].fillna(df['Size'].mean(), inplace=True)
df[['Lag1', 'Lag2', 'Lag4']] = df[['Lag1', 'Lag2', 'Lag4']].fillna(0)

# Define feature columns
feature_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'MarkDown1', 'MarkDown2',
                'MarkDown3', 'MarkDown4', 'MarkDown5', 'Size', 'Type', 'IsHoliday',
                'SuperBowl', 'LaborDay', 'Thanksgiving', 'Christmas',
                'WeekOfYear', 'Month', 'Year', 'Lag1', 'Lag2', 'Lag4',
                'SuperBowl_Before', 'SuperBowl_After', 'LaborDay_Before', 'LaborDay_After']
assert len(feature_cols) == 26, f"Expected 26 features, got {len(feature_cols)}"

# Check for NaN or inf
assert not df[['Weekly_Sales'] + feature_cols].isna().any().any(), "NaN values found in data"
assert not df[['Weekly_Sales'] + feature_cols].isin([np.inf, -np.inf]).any().any(), "Inf values found in data"

# Normalize features
scaler_sales = StandardScaler()
scaler_features = StandardScaler()
df['Weekly_Sales'] = scaler_sales.fit_transform(df[['Weekly_Sales']].clip(lower=-1e5, upper=1e5))
df[feature_cols] = scaler_features.fit_transform(df[feature_cols].clip(lower=-1e5, upper=1e5))

# Create time series dictionary
store_dept_pairs = df[['Store', 'Dept']].drop_duplicates()
time_series_dict = {}
for _, row in store_dept_pairs.iterrows():
    store, dept = row['Store'], row['Dept']
    sub_df = df[(df['Store'] == store) & (df['Dept'] == dept)].sort_values('Date')

    date_range = pd.date_range(start=sub_df['Date'].min(), end=sub_df['Date'].max(), freq='W-FRI')
    sub_df = sub_df.set_index('Date').reindex(date_range, method='ffill').reset_index()
    sub_df['Store'] = store
    sub_df['Dept'] = dept
    sub_df['Weekly_Sales'].fillna(0, inplace=True)
    sub_df['IsHoliday'].fillna(0, inplace=True)
    sub_df['Type'].fillna(df[df['Store'] == store]['Type'].iloc[0], inplace=True)
    sub_df['Size'].fillna(df[df['Store'] == store]['Size'].iloc[0], inplace=True)
    for holiday in ['SuperBowl', 'LaborDay', 'Thanksgiving', 'Christmas',
                    'SuperBowl_Before', 'SuperBowl_After', 'LaborDay_Before', 'LaborDay_After',
                    'Thanksgiving_Before', 'Thanksgiving_After', 'Christmas_Before', 'Christmas_After']:
        sub_df[holiday].fillna(0, inplace=True)
    sub_df[['WeekOfYear', 'Month', 'Year', 'Lag1', 'Lag2', 'Lag4']] = sub_df[['WeekOfYear', 'Month', 'Year', 'Lag1', 'Lag2', 'Lag4']].fillna(0)
    sub_df[feature_cols] = sub_df[feature_cols].fillna(0)

    assert not sub_df[['Weekly_Sales'] + feature_cols].isna().any().any(), f"NaN in sub_df for Store {store}, Dept {dept}"
    assert not sub_df[['Weekly_Sales'] + feature_cols].isin([np.inf, -np.inf]).any().any(), f"Inf in sub_df for Store {store}, Dept {dept}"

    time_series_dict[(store, dept)] = {
        'sales': sub_df['Weekly_Sales'].values.astype(np.float32),
        'features': sub_df[feature_cols].values.astype(np.float32),
        'dates': sub_df['index'].values,
        'is_holiday': sub_df['IsHoliday'].values.astype(np.float32)
    }
print(f"Created time series for {len(time_series_dict)} store-department pairs.")

In [5]:
class WalmartSalesDataset(Dataset):
    def __init__(self, time_series_dict, seq_len=36, pred_len=6, train=True):
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.data = []
        self.holiday_weights = []

        for (store, dept), ts_data in time_series_dict.items():
            sales = ts_data['sales']
            features = ts_data['features']
            is_holiday = ts_data['is_holiday']
            n = len(sales)
            if n < seq_len + pred_len:
                continue
            split_idx = int(0.8 * (n - seq_len - pred_len + 1)) if train else 0
            start_idx = 0 if train else split_idx
            end_idx = split_idx if train else n - seq_len - pred_len + 1
            for i in range(start_idx, end_idx):
                x_sales = sales[i:i + seq_len]
                x_features = features[i:i + seq_len]
                y = sales[i + seq_len:i + seq_len + pred_len]
                w = is_holiday[i + seq_len:i + seq_len + pred_len] * 4 + 1
                # Validate data
                if np.any(np.isnan(x_sales)) or np.any(np.isnan(x_features)) or np.any(np.isnan(y)) or np.any(np.isnan(w)):
                    continue
                if np.any(np.isinf(x_sales)) or np.any(np.isinf(x_features)) or np.any(np.isinf(y)) or np.any(np.isinf(w)):
                    continue
                self.data.append((x_sales, x_features, y))
                self.holiday_weights.append(w)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x_sales, x_features, y = self.data[idx]
        w = self.holiday_weights[idx]
        return (torch.tensor(x_sales, dtype=torch.float32).unsqueeze(-1),
                torch.tensor(x_features, dtype=torch.float32),
                torch.tensor(y, dtype=torch.float32),
                torch.tensor(w, dtype=torch.float32))

In [6]:
# Create train and validation datasets
train_dataset = WalmartSalesDataset(time_series_dict, seq_len=36, pred_len=6, train=True)
val_dataset = WalmartSalesDataset(time_series_dict, seq_len=36, pred_len=6, train=False)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
print(f"Train samples: {len(train_dataset)}, Val samples: {len(val_dataset)}")

Train samples: 251133, Val samples: 316241


In [7]:
import torch
import torch.nn as nn
import numpy as np

class AttentionLayer(nn.Module):
    def __init__(self, feature_dim, attention_dim=128):
        super(AttentionLayer, self).__init__()
        self.query = nn.Linear(feature_dim, attention_dim)
        self.key = nn.Linear(feature_dim, attention_dim)
        self.value = nn.Linear(feature_dim, feature_dim)
        self.attention_dim = attention_dim
        self.scale = 1 / (attention_dim ** 0.5)

    def forward(self, x):
        # x: (batch_size, seq_len, n_features)
        query = self.query(x)  # (batch_size, seq_len, attention_dim)
        key = self.key(x)      # (batch_size, seq_len, attention_dim)
        value = self.value(x)  # (batch_size, seq_len, n_features)

        scores = torch.bmm(query, key.transpose(1, 2)) * self.scale  # (batch_size, seq_len, seq_len)
        weights = torch.softmax(scores, dim=-1)
        output = torch.bmm(weights, value)  # (batch_size, seq_len, n_features)
        return output

class EnhancedDLinear(nn.Module):
    def __init__(self, seq_len, pred_len, n_features, dropout=0.2, kernel_size=7):
        super(EnhancedDLinear, self).__init__()
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.n_features = n_features
        self.kernel_size = kernel_size

        # Moving average for trend decomposition
        self.avg_pool = nn.AvgPool1d(kernel_size=kernel_size, stride=1, padding=kernel_size//2)

        # Trend component
        self.Trend_MLP = nn.Sequential(
            nn.Linear(seq_len, seq_len),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(seq_len, seq_len // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(seq_len // 2, pred_len)
        )

        # Seasonal component
        self.Seasonal_MLP = nn.Sequential(
            nn.Linear(seq_len, seq_len),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(seq_len, seq_len // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(seq_len // 2, pred_len)
        )

        # Attention for exogenous features
        self.Attention = AttentionLayer(n_features, attention_dim=128)

        # Exogenous feature interaction layer
        self.Interaction_MLP = nn.Sequential(
            nn.Linear(seq_len * n_features, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, pred_len)
        )

        # Holiday-specific branch
        self.Holiday_MLP = nn.Sequential(
            nn.Linear(seq_len * 5, 256),  # 5 holiday features
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, pred_len)
        )

        # Final combination layer
        self.Combine = nn.Linear(pred_len * 4, pred_len)

    def forward(self, x_sales, x_features):
        # x_sales: (batch_size, seq_len, 1)
        # x_features: (batch_size, seq_len, n_features)
        x_sales = x_sales.squeeze(-1)  # (batch_size, seq_len)

        # Trend decomposition with moving average
        trend = self.avg_pool(x_sales.unsqueeze(1)).squeeze(1)  # (batch_size, seq_len)
        seasonal = x_sales - trend  # Residual for seasonality

        # Trend and seasonal predictions
        trend_pred = self.Trend_MLP(trend)  # (batch_size, pred_len)
        seasonal_pred = self.Seasonal_MLP(seasonal)  # (batch_size, pred_len)

        # Exogenous features with attention
        attended_features = self.Attention(x_features)  # (batch_size, seq_len, n_features)
        attended_features = attended_features.reshape(attended_features.size(0), -1)  # (batch_size, seq_len * n_features)
        exogenous_pred = self.Interaction_MLP(attended_features)  # (batch_size, pred_len)

        # Holiday-specific prediction (use holiday features)
        holiday_features = x_features[:, :, -5:]  # Last 5 features: IsHoliday, SuperBowl, LaborDay, Thanksgiving, Christmas
        holiday_features = holiday_features.reshape(holiday_features.size(0), -1)  # (batch_size, seq_len * 5)
        holiday_pred = self.Holiday_MLP(holiday_features)  # (batch_size, pred_len)

        # Combine all components
        combined = torch.cat([trend_pred, seasonal_pred, exogenous_pred, holiday_pred], dim=-1)  # (batch_size, pred_len * 4)
        output = self.Combine(combined)  # (batch_size, pred_len)

        return output

In [8]:
# Custom WMAE loss
def wmae_loss(preds, targets, weights):
    return torch.mean(weights * torch.abs(preds - targets))

In [9]:
# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EnhancedDLinear(seq_len=36, pred_len=6, n_features=26, dropout=0.2, kernel_size=7).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)
criterion_mse = nn.MSELoss()
criterion_wmae = wmae_loss

# Initialize WandB
wandb.init(project="walmart-dlinear", name="enhanced-dlinear-run-v3", config={
    "seq_len": 36, "pred_len": 6, "batch_size": 32, "epochs": 50,
    "learning_rate": 0.0001, "model": "EnhancedDLinear", "n_features": 26
})

# Training loop
best_val_wmae = float('inf')
best_model_state = None
patience = 10
counter = 0

for epoch in range(50):
    model.train()
    train_loss_mse, train_loss_wmae, train_wmae_unscaled = 0.0, 0.0, 0.0
    train_batches = 0
    for xb_sales, xb_features, yb, wb in train_loader:
        xb_sales, xb_features, yb, wb = xb_sales.to(device), xb_features.to(device), yb.to(device), wb.to(device)
        optimizer.zero_grad()
        preds = model(xb_sales, xb_features)
        loss_mse = criterion_mse(preds, yb)
        loss_wmae = criterion_wmae(preds, yb, wb)
        loss = loss_mse + 3 * loss_wmae
        if torch.isnan(loss) or torch.isinf(loss):
            continue
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss_mse += loss_mse.item() * xb_sales.size(0)
        train_loss_wmae += loss_wmae.item() * xb_sales.size(0)
        preds_unscaled = scaler_sales.inverse_transform(preds.detach().cpu().numpy())
        yb_unscaled = scaler_sales.inverse_transform(yb.detach().cpu().numpy())
        train_wmae_unscaled += np.mean(wb.cpu().numpy() * np.abs(preds_unscaled - yb_unscaled)) * xb_sales.size(0)
        train_batches += xb_sales.size(0)

    train_loss_mse /= train_batches
    train_loss_wmae /= train_batches
    train_wmae_unscaled /= train_batches

    # Validation
    model.eval()
    val_loss_mse, val_loss_wmae, val_wmae_unscaled = 0.0, 0.0, 0.0
    val_batches = 0
    with torch.no_grad():
        for xb_sales, xb_features, yb, wb in val_loader:
            xb_sales, xb_features, yb, wb = xb_sales.to(device), xb_features.to(device), yb.to(device), wb.to(device)
            preds = model(xb_sales, xb_features)
            loss_mse = criterion_mse(preds, yb)
            loss_wmae = criterion_wmae(preds, yb, wb)
            if torch.isnan(loss_mse) or torch.isinf(loss_mse) or torch.isnan(loss_wmae) or torch.isinf(loss_wmae):
                continue
            val_loss_mse += loss_mse.item() * xb_sales.size(0)
            val_loss_wmae += loss_wmae.item() * xb_sales.size(0)
            preds_unscaled = scaler_sales.inverse_transform(preds.cpu().numpy())
            yb_unscaled = scaler_sales.inverse_transform(yb.cpu().numpy())
            val_wmae_unscaled += np.mean(wb.cpu().numpy() * np.abs(preds_unscaled - yb_unscaled)) * xb_sales.size(0)
            val_batches += xb_sales.size(0)

    val_loss_mse /= val_batches
    val_loss_wmae /= val_batches
    val_wmae_unscaled /= val_batches

    print(f"Epoch {epoch+1}/50 — Train MSE: {train_loss_mse:.4f}, Train WMAE: {train_loss_wmae:.4f}, "
          f"Train WMAE Unscaled: {train_wmae_unscaled:.2f}, Val MSE: {val_loss_mse:.4f}, "
          f"Val WMAE: {val_loss_wmae:.4f}, Val WMAE Unscaled: {val_wmae_unscaled:.2f}")
    wandb.log({
        "train_mse": train_loss_mse, "train_wmae": train_loss_wmae, "train_wmae_unscaled": train_wmae_unscaled,
        "val_mse": val_loss_mse, "val_wmae": val_loss_wmae, "val_wmae_unscaled": val_wmae_unscaled,
        "lr": optimizer.param_groups[0]['lr'], "epoch": epoch+1
    })

    scheduler.step(val_loss_wmae)
    if val_loss_wmae < best_val_wmae:
        best_val_wmae = val_loss_wmae
        best_model_state = model.state_dict()
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

if best_model_state is not None:
    torch.save(best_model_state, '/content/drive/MyDrive/ML-FinalProject/enhanced_dlinear_model_best_v3.pth')
wandb.finish()



Epoch 1/50 — Train MSE: 0.1083, Train WMAE: 0.2435, Train WMAE Unscaled: 4986.68, Val MSE: 0.0754, Val WMAE: 0.1684, Val WMAE Unscaled: 3448.59
Epoch 2/50 — Train MSE: 0.0849, Train WMAE: 0.2021, Train WMAE Unscaled: 4137.93, Val MSE: 0.0713, Val WMAE: 0.1629, Val WMAE Unscaled: 3335.77
Epoch 3/50 — Train MSE: 0.0814, Train WMAE: 0.1940, Train WMAE Unscaled: 3972.61, Val MSE: 0.0700, Val WMAE: 0.1553, Val WMAE Unscaled: 3180.00
Epoch 4/50 — Train MSE: 0.0806, Train WMAE: 0.1886, Train WMAE Unscaled: 3862.44, Val MSE: 0.0695, Val WMAE: 0.1522, Val WMAE Unscaled: 3117.29
Epoch 5/50 — Train MSE: 0.0799, Train WMAE: 0.1854, Train WMAE Unscaled: 3796.65, Val MSE: 0.0700, Val WMAE: 0.1492, Val WMAE Unscaled: 3055.05
Epoch 6/50 — Train MSE: 0.0787, Train WMAE: 0.1827, Train WMAE Unscaled: 3741.36, Val MSE: 0.0680, Val WMAE: 0.1468, Val WMAE Unscaled: 3007.07
Epoch 7/50 — Train MSE: 0.0772, Train WMAE: 0.1798, Train WMAE Unscaled: 3681.43, Val MSE: 0.0659, Val WMAE: 0.1443, Val WMAE Unscaled: 

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_mse,█▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_wmae,█▅▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
train_wmae_unscaled,█▆▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
val_mse,█▇▇▇▇▆▆▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▂▂▁▁▂▁▁▁▁▁▁
val_wmae,██▇▆▆▆▆▆▅▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▂▁▂▂▂▁▁▁▁▁▁
val_wmae_unscaled,█▇▇▆▆▆▆▆▅▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▂▁▂▂▁▁▁▁▁▁

0,1
epoch,50.0
lr,0.0001
train_mse,0.04757
train_wmae,0.10999
train_wmae_unscaled,2252.3855
val_mse,0.04001
val_wmae,0.08354
val_wmae_unscaled,1710.73181


In [12]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import wandb

# Initialize WandB
wandb.init(project="walmart-dlinear", name="submission-enhanced-run-v4", config={
    "seq_len": 36, "pred_len": 6, "model": "EnhancedDLinear", "n_features": 26
})

# Load and preprocess test data
test = pd.read_csv('/content/walmart_data/test.csv')
features = pd.read_csv('/content/walmart_data/features.csv')
stores = pd.read_csv('/content/walmart_data/stores.csv')

# Merge test with features and stores
test_df = pd.merge(test, features, on=['Store', 'Date'], how='left')
test_df = pd.merge(test_df, stores, on='Store', how='left')
test_df = test_df.drop(columns=['IsHoliday_x']).rename(columns={'IsHoliday_y': 'IsHoliday'})
test_df['Date'] = pd.to_datetime(test_df['Date'])
test_df = test_df.sort_values(by=['Store', 'Dept', 'Date'])

# Add holiday-specific features
holiday_dates = {
    'SuperBowl': ['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'],
    'LaborDay': ['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'],
    'Thanksgiving': ['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'],
    'Christmas': ['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27']
}
for holiday, dates in holiday_dates.items():
    test_df[holiday] = test_df['Date'].isin(pd.to_datetime(dates)).astype(int)

# Add time-based features
test_df['WeekOfYear'] = test_df['Date'].dt.isocalendar().week
test_df['Month'] = test_df['Date'].dt.month
test_df['Year'] = test_df['Date'].dt.year - df['Date'].dt.year.min()

# Add holiday proximity features (only for SuperBowl and LaborDay, as in training)
for holiday, dates in holiday_dates.items():
    if holiday in ['SuperBowl', 'LaborDay']:
        for date in pd.to_datetime(dates):
            test_df[f'{holiday}_Before'] = ((test_df['Date'] < date) & (test_df['Date'] >= date - pd.Timedelta(weeks=2))).astype(int)
            test_df[f'{holiday}_After'] = ((test_df['Date'] > date) & (test_df['Date'] <= date + pd.Timedelta(weeks=2))).astype(int)

# Add lagged sales (using scaled Weekly_Sales from training data)
for _, row in test_df[['Store', 'Dept']].drop_duplicates().iterrows():
    store, dept = row['Store'], row['Dept']
    train_sub_df = df[(df['Store'] == store) & (df['Dept'] == dept)][['Date', 'Weekly_Sales']]
    for lag in [1, 2, 4]:
        test_df.loc[(test_df['Store'] == store) & (test_df['Dept'] == dept), f'Lag{lag}'] = \
            test_df[(test_df['Store'] == store) & (test_df['Dept'] == dept)]['Date'].map(
                lambda x: train_sub_df[train_sub_df['Date'] == x - pd.Timedelta(weeks=lag)]['Weekly_Sales'].iloc[0] if \
                (x - pd.Timedelta(weeks=lag)) in train_sub_df['Date'].values else 0
            )

# Handle missing values
test_df['MarkDown1'].fillna(0, inplace=True)
test_df['MarkDown2'].fillna(0, inplace=True)
test_df['MarkDown3'].fillna(0, inplace=True)
test_df['MarkDown4'].fillna(0, inplace=True)
test_df['MarkDown5'].fillna(0, inplace=True)
test_df['CPI'].fillna(df['CPI'].mean(), inplace=True)
test_df['Unemployment'].fillna(df['Unemployment'].mean(), inplace=True)
test_df['Temperature'].fillna(df['Temperature'].mean(), inplace=True)
test_df['Fuel_Price'].fillna(df['Fuel_Price'].mean(), inplace=True)
test_df['IsHoliday'] = test_df['IsHoliday'].astype(int)
test_df['Type'] = test_df['Type'].map({'A': 0, 'B': 1, 'C': 2})
test_df['Size'].fillna(df['Size'].mean(), inplace=True)
test_df[['Lag1', 'Lag2', 'Lag4']] = test_df[['Lag1', 'Lag2', 'Lag4']].fillna(0)
for holiday in ['SuperBowl_Before', 'SuperBowl_After', 'LaborDay_Before', 'LaborDay_After']:
    test_df[holiday].fillna(0, inplace=True)

# Define feature columns (consistent with training)
feature_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'MarkDown1', 'MarkDown2',
                'MarkDown3', 'MarkDown4', 'MarkDown5', 'Size', 'Type', 'IsHoliday',
                'SuperBowl', 'LaborDay', 'Thanksgiving', 'Christmas',
                'WeekOfYear', 'Month', 'Year', 'Lag1', 'Lag2', 'Lag4',
                'SuperBowl_Before', 'SuperBowl_After', 'LaborDay_Before', 'LaborDay_After']
assert len(feature_cols) == 26, f"Expected 26 features, got {len(feature_cols)}"

# Scale features
test_df[feature_cols] = scaler_features.transform(test_df[feature_cols].clip(lower=-1e5, upper=1e5))

# Check for NaN or inf
assert not test_df[feature_cols].isna().any().any(), "NaN values in test data"
assert not test_df[feature_cols].isin([np.inf, -np.inf]).any().any(), "Inf values in test data"

# Create test sequences
def create_test_sequences(test_df, time_series_dict, seq_len=36, pred_len=6):
    test_data = []
    test_ids = []
    store_dept_pairs = test_df[['Store', 'Dept']].drop_duplicates()

    for _, row in store_dept_pairs.iterrows():
        store, dept = row['Store'], row['Dept']
        test_sub_df = test_df[(test_df['Store'] == store) & (test_df['Dept'] == dept)].sort_values('Date')

        if len(test_sub_df) == 0:
            continue

        # Get training data for the store-dept pair
        if (store, dept) in time_series_dict:
            train_sales = time_series_dict[(store, dept)]['sales']
            train_features = time_series_dict[(store, dept)]['features']
            train_dates = time_series_dict[(store, dept)]['dates']
        else:
            # Use average from similar departments or store
            similar_depts = df[df['Store'] == store]['Dept'].unique()
            if len(similar_depts) > 0:
                avg_sales = df[(df['Store'] == store) & (df['Dept'].isin(similar_depts))]['Weekly_Sales'].mean()
                avg_features = df[(df['Store'] == store) & (df['Dept'].isin(similar_depts))][feature_cols].mean().values
            else:
                avg_sales = df['Weekly_Sales'].mean()
                avg_features = df[feature_cols].mean().values
            train_sales = np.full(seq_len, avg_sales, dtype=np.float32)
            train_features = np.tile(avg_features, (seq_len, 1)).astype(np.float32)
            train_dates = pd.date_range(end=test_sub_df['Date'].min() - pd.Timedelta(weeks=1), periods=seq_len, freq='W-FRI')

        # Ensure training sequences are of length seq_len
        if len(train_sales) < seq_len:
            train_sales = np.pad(train_sales, (seq_len - len(train_sales), 0), mode='constant')
            train_features = np.pad(train_features, ((seq_len - len(train_features), 0), (0, 0)), mode='constant')
            train_dates = np.pad(train_dates, (seq_len - len(train_dates), 0), mode='constant', constant_values=train_dates[-1] if len(train_dates) > 0 else test_sub_df['Date'].iloc[0])

        # Generate one sequence per test date
        test_dates = test_sub_df['Date'].unique()
        for date in test_dates:
            end_date = date - pd.Timedelta(weeks=1)
            if pd.to_datetime(end_date) <= pd.to_datetime(train_dates[-1]):
                train_mask = pd.to_datetime(train_dates) <= end_date
                x_sales = train_sales[train_mask][-seq_len:]
                x_features = train_features[train_mask][-seq_len:]
            else:
                x_sales = train_sales[-seq_len:]
                x_features = train_features[-seq_len:]

            if len(x_sales) < seq_len:
                x_sales = np.pad(x_sales, (seq_len - len(x_sales), 0), mode='constant')
                x_features = np.pad(x_features, ((seq_len - len(x_features), 0), (0, 0)), mode='constant')

            # Use features for the current test date
            test_features = test_sub_df[test_sub_df['Date'] == date][feature_cols].values.astype(np.float32)
            if len(test_features) > 0:
                x_features = np.roll(x_features, -1, axis=0)
                x_features[-1] = test_features[0]

                date_str = date.strftime('%Y-%m-%d')
                test_id = f"{int(store)}_{int(dept)}_{date_str}"
                test_data.append((x_sales, x_features))
                test_ids.append(test_id)

    return test_data, test_ids

# Create test sequences
test_data, test_ids = create_test_sequences(test_df, time_series_dict, seq_len=36, pred_len=6)

# Define test dataset
class TestDataset(Dataset):
    def __init__(self, test_data):
        self.test_data = test_data

    def __len__(self):
        return len(self.test_data)

    def __getitem__(self, idx):
        x_sales, x_features = self.test_data[idx]
        return (torch.tensor(x_sales, dtype=torch.float32).unsqueeze(-1),
                torch.tensor(x_features, dtype=torch.float32))

test_dataset = TestDataset(test_data)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Load best model and predict
model = EnhancedDLinear(seq_len=36, pred_len=6, n_features=26, dropout=0.2, kernel_size=7).to(device)
model.load_state_dict(torch.load('/content/drive/MyDrive/ML-FinalProject/enhanced_dlinear_model_best_v3.pth'))
model.eval()

predictions = []
with torch.no_grad():
    for xb_sales, xb_features in test_loader:
        xb_sales, xb_features = xb_sales.to(device), xb_features.to(device)
        preds = model(xb_sales, xb_features)  # Shape: (batch_size, pred_len)
        preds = preds[:, 0]  # Take first week for single-week prediction
        predictions.append(preds.cpu().numpy())

predictions = np.concatenate(predictions, axis=0).flatten()

# Verify lengths
print(f"Length of test_ids: {len(test_ids)}")
print(f"Length of predictions: {len(predictions)}")
assert len(test_ids) == len(predictions), f"Mismatch: len(test_ids)={len(test_ids)}, len(predictions)={len(predictions)}"

# Inverse transform predictions
predictions = scaler_sales.inverse_transform(predictions.reshape(-1, 1)).flatten()

# Create submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'Weekly_Sales': predictions
})
submission['Weekly_Sales'] = submission['Weekly_Sales'].clip(lower=0)
submission_path = '/content/submission_enhanced_v4.csv'
submission.to_csv(submission_path, index=False)
print(f"✅ Submission file created: {submission_path}")

# Log to WandB
wandb.save(submission_path)
wandb.log({
    "submission_mean_sales": submission['Weekly_Sales'].mean(),
    "submission_std_sales": submission['Weekly_Sales'].std(),
    "submission_min_sales": submission['Weekly_Sales'].min(),
    "submission_max_sales": submission['Weekly_Sales'].max()
})
wandb.finish()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['MarkDown1'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['MarkDown2'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

Length of test_ids: 115064
Length of predictions: 115064




✅ Submission file created: /content/submission_enhanced_v4.csv


0,1
submission_max_sales,▁
submission_mean_sales,▁
submission_min_sales,▁
submission_std_sales,▁

0,1
submission_max_sales,120749.07812
submission_mean_sales,14442.28125
submission_min_sales,0.0
submission_std_sales,18261.46289
