In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import random
from itertools import product
import torch

In [None]:
!pip install neuralforecast


Collecting neuralforecast
  Downloading neuralforecast-3.0.2-py3-none-any.whl.metadata (14 kB)
Collecting coreforecast>=0.0.6 (from neuralforecast)
  Downloading coreforecast-0.0.16-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting pytorch-lightning>=2.0.0 (from neuralforecast)
  Downloading pytorch_lightning-2.5.2-py3-none-any.whl.metadata (21 kB)
Collecting ray>=2.2.0 (from ray[tune]>=2.2.0->neuralforecast)
  Downloading ray-2.48.0-cp311-cp311-manylinux2014_x86_64.whl.metadata (19 kB)
Collecting optuna (from neuralforecast)
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting utilsforecast>=0.2.3 (from neuralforecast)
  Downloading utilsforecast-0.2.12-py3-none-any.whl.metadata (7.6 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning>=2.0.0->neuralforecast)
  Downloading torchmetrics-1.7.4-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning>=2.0.0->neuralforecast)
  Downloadin

# **Read Data**

In [None]:
# Colab only: mount Google Drive so the CSV files read below from
# /content/drive/MyDrive/... are reachable.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Training set columns:
#   Store         store number
#   Dept          department number
#   Date          the week
#   Weekly_Sales  sales for the given department in the given store
#   IsHoliday     whether the week is a special holiday week
import pandas as pd

train_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Walmart_Recruiting/Data/train.csv.zip'
)
train_df.head(n=5)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


In [None]:
# Inspect dtypes and non-null counts of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Store         421570 non-null  int64  
 1   Dept          421570 non-null  int64  
 2   Date          421570 non-null  object 
 3   Weekly_Sales  421570 non-null  float64
 4   IsHoliday     421570 non-null  bool   
dtypes: bool(1), float64(1), int64(2), object(1)
memory usage: 13.3+ MB


In [None]:
# Test set columns (same as the training set, minus Weekly_Sales):
#   Store      store number
#   Dept       department number
#   Date       the week
#   IsHoliday  whether the week is a special holiday week

test_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Walmart_Recruiting/Data/test.csv.zip'
)
test_df.head(n=5)

Unnamed: 0,Store,Dept,Date,IsHoliday
0,1,1,2012-11-02,False
1,1,1,2012-11-09,False
2,1,1,2012-11-16,False
3,1,1,2012-11-23,True
4,1,1,2012-11-30,False


In [None]:
# Store metadata: one Type and Size value per Store.
stores_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Walmart_Recruiting/Data/stores.csv'
)
stores_df.head(n=5)

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [None]:
# Inspect dtypes and non-null counts of the store metadata.
stores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Store   45 non-null     int64 
 1   Type    45 non-null     object
 2   Size    45 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.2+ KB


In [None]:
# Feature table columns:
#   Store         store number
#   Date          the week
#   Temperature   average temperature in the region
#   Fuel_Price    cost of fuel in the region
#   MarkDown1-5   anonymized data related to promotional markdowns that Walmart
#                 is running; only available after Nov 2011 and not for all
#                 stores all the time (missing values are marked NA)
#   CPI           the consumer price index
#   Unemployment  the unemployment rate
#   IsHoliday     whether the week is a special holiday week

features_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Walmart_Recruiting/Data/features.csv.zip'
)
features_df.head(n=5)

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,8.106,False


In [None]:
# Inspect dtypes and non-null counts; the MarkDown*, CPI and Unemployment
# columns are the ones with missing values.
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8190 entries, 0 to 8189
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         8190 non-null   int64  
 1   Date          8190 non-null   object 
 2   Temperature   8190 non-null   float64
 3   Fuel_Price    8190 non-null   float64
 4   MarkDown1     4032 non-null   float64
 5   MarkDown2     2921 non-null   float64
 6   MarkDown3     3613 non-null   float64
 7   MarkDown4     3464 non-null   float64
 8   MarkDown5     4050 non-null   float64
 9   CPI           7605 non-null   float64
 10  Unemployment  7605 non-null   float64
 11  IsHoliday     8190 non-null   bool   
dtypes: bool(1), float64(9), int64(1), object(1)
memory usage: 712.0+ KB


# **Processing**

In [None]:
class WalmartPatchTSTPreprocessor:
    """Merge, impute and per-series z-score the Walmart sales tables.

    ``transform`` joins the sales rows with the store-level feature table and
    the store metadata, fills missing values, and standardises every numeric
    column within each ``Store_Dept`` series. The output is guaranteed to
    contain no NaN in the returned feature columns.

    Attributes:
        observed_features: Columns that are only known historically (the target).
        known_continuous_features: Continuous covariates coming from ``features_df``.
        target_stats_: DataFrame indexed by ``series_id`` with the ``mean`` and
            ``std`` of ``Weekly_Sales`` used by the most recent
            ``transform(..., is_train=True)`` call, or ``None`` before that.
            Needed to map z-scored values back to sales units:
            ``sales = z * (std + 1e-8) + mean``.

    Note:
        The scaling statistics are computed from whichever frame is passed to
        ``transform``; ``fit`` does not learn anything.
    """

    def __init__(self):
        self.observed_features = ['Weekly_Sales']
        self.known_continuous_features = [
            'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
            'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5'
        ]
        self.target_stats_ = None

    def fit(self, train_df, features_df, stores_df):
        """No-op kept for API symmetry; returns ``self``."""
        return self

    def transform(self, df, features_df, stores_df, is_train=True):
        """Build the model-ready frame.

        Args:
            df: Sales rows with ``Store``, ``Dept``, ``Date``, ``IsHoliday`` and,
                optionally, ``Weekly_Sales`` (absent for the test set).
            features_df: Per-store weekly covariates, joined on
                ``Store``/``Date``/``IsHoliday``.
            stores_df: Store metadata, joined on ``Store``.
            is_train: When true and ``Weekly_Sales`` is present, the per-series
                target mean/std are stored in ``target_stats_``.

        Returns:
            ``(frame, features)`` where ``frame`` is indexed by ``Date``, sorted
            by ``Store``/``Dept``/``Date`` and carries ``series_id`` and
            ``OriginalDate`` columns, and ``features`` lists the normalised
            columns (``Weekly_Sales`` is omitted when ``df`` does not have it).
        """
        # merge() returns a new frame, so the caller's `df` is never mutated.
        df = df.merge(features_df, how='left', on=['Store', 'Date', 'IsHoliday'])
        df = df.merge(stores_df, how='left', on='Store')
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.sort_values(by=['Store', 'Dept', 'Date'])

        # A missing markdown is treated as "no markdown".
        markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
        df[markdown_cols] = df[markdown_cols].fillna(0)

        df['series_id'] = df['Store'].astype(str) + "_" + df['Dept'].astype(str)

        # Remaining covariates: carry the nearest known value within the series
        # (rows are date-sorted inside each series at this point).
        other_covariates = [
            col for col in self.known_continuous_features if col not in markdown_cols
        ]
        df[other_covariates] = df.groupby('series_id')[other_covariates].ffill()
        df[other_covariates] = df.groupby('series_id')[other_covariates].bfill()

        # The target is absent for test data; only impute it when present.
        target_cols = [col for col in self.observed_features if col in df.columns]
        if 'Weekly_Sales' in target_cols:
            df['Weekly_Sales'] = df['Weekly_Sales'].replace([np.inf, -np.inf], np.nan)
            df['Weekly_Sales'] = df.groupby('series_id')['Weekly_Sales'].transform(
                lambda x: x.interpolate(limit_direction='both').fillna(0))

        # Per-series z-score.
        features = target_cols + self.known_continuous_features
        for col in features:
            stats = df.groupby('series_id')[col].agg(['mean', 'std'])
            # Sample std of a single-row series is NaN; treat it as zero spread
            # so the series maps to 0 instead of NaN.
            stats['std'] = stats['std'].fillna(0.0)
            mean = df['series_id'].map(stats['mean'])
            std = df['series_id'].map(stats['std'])
            # Anything still undefined (e.g. a covariate missing for a whole
            # series) becomes 0, i.e. "at the series mean".
            df[col] = ((df[col] - mean) / (std + 1e-8)).fillna(0.0)
            if is_train and col == 'Weekly_Sales':
                self.target_stats_ = stats

        # Keep the date as a regular column as well as the index.
        df['OriginalDate'] = df['Date']
        df = df.set_index('Date')

        return df, features

    def fit_transform(self, train_df, features_df, stores_df):
        """Equivalent to ``fit(...)`` followed by ``transform(..., is_train=True)``."""
        self.fit(train_df, features_df, stores_df)
        return self.transform(train_df, features_df, stores_df, is_train=True)


In [None]:
import pandas as pd
import plotly.express as px

# Build the model-ready frame from the raw training tables.
preprocessor = WalmartPatchTSTPreprocessor()
processed_df, features = preprocessor.fit(train_df, features_df, stores_df).transform(
    train_df, features_df, stores_df, is_train=True
)

# `processed_df` is indexed by Date; flatten it so Date is an ordinary column of `df`.
df = processed_df.reset_index(drop=False)


In [None]:
# One line per Store_Dept series of the preprocessed Weekly_Sales.
fig = px.line(
    data_frame=df,
    x="OriginalDate",
    y="Weekly_Sales",
    color="series_id",
    title="Weekly Sales Over Time by Store-Dept Series",
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Per-series distribution of the preprocessed Weekly_Sales; only outliers are drawn as points.
fig = px.box(
    data_frame=df,
    x="series_id",
    y="Weekly_Sales",
    points="outliers",
    title="Distribution of Weekly Sales per Series",
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Weekday number (Monday=0) of each observation.
# NOTE(review): the data is weekly and the sample rows shown earlier are all
# Fridays, so this column probably holds a single value — confirm before
# reading a "day of week" pattern into this plot.
df['DayOfWeek'] = df['OriginalDate'].dt.dayofweek
fig = px.box(
    data_frame=df,
    x="DayOfWeek",
    y="Weekly_Sales",
    title="Sales Pattern by Day of the Week",
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Wide layout: one row per date, one column per series.
heatmap_df = df.pivot(values='Weekly_Sales', index='OriginalDate', columns='series_id')

# Transposed for display so that series run down the y-axis and time along the x-axis.
fig = px.imshow(
    heatmap_df.transpose(),
    labels={"x": "Date", "y": "Series", "color": "Sales"},
    title="Sales Heatmap (Time vs Series)",
    aspect="auto",
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Preprocessed sales against preprocessed fuel price, with an OLS trend line.
fig = px.scatter(
    data_frame=df,
    x="Fuel_Price",
    y="Weekly_Sales",
    opacity=0.4,
    trendline="ols",
    title="Weekly Sales vs Fuel Price",
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Long format: one (OriginalDate, variable, value) row per measurement.
melted = df.melt(
    id_vars="OriginalDate",
    value_vars=["Weekly_Sales", "CPI", "Unemployment"],
)
fig = px.line(
    data_frame=melted,
    x="OriginalDate",
    y="value",
    color="variable",
    facet_row="variable",
    title="Sales, CPI, and Unemployment Over Time",
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
import plotly.express as px
import pandas as pd

# Pick the 5 series with the largest total sales.
# The ranking must use the RAW sales in `train_df`: `df["Weekly_Sales"]` has been
# z-scored per series by the preprocessor, so its per-series sum is ~0 for every
# series and would yield an arbitrary "top 5".
raw_totals = (
    train_df
    .assign(series_id=train_df["Store"].astype(str) + "_" + train_df["Dept"].astype(str))
    .groupby("series_id")["Weekly_Sales"]
    .sum()
)
top_series = raw_totals.nlargest(5).index

# .copy() makes df_top an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning.
df_top = df[df["series_id"].isin(top_series)].copy()


In [None]:
# Preprocessed weekly sales of the selected series, one marked line each.
fig = px.line(
    data_frame=df_top,
    x="OriginalDate",
    y="Weekly_Sales",
    color="series_id",
    markers=True,
    labels={"Weekly_Sales": "Weekly Sales", "OriginalDate": "Date"},
    title="📈 Weekly Sales Over Time (Top 5 Store-Dept Series)",
)
fig.update_traces(line={"width": 2})
fig.show()

In [None]:
# assign() builds a new frame instead of writing into a slice of `df`, which is
# what triggered SettingWithCopyWarning.
# NOTE(review): the data is weekly and the sample rows shown earlier are all
# Fridays, so DayOfWeek probably has a single category — confirm.
df_top = df_top.assign(DayOfWeek=df_top['OriginalDate'].dt.day_name())
avg_sales_dow = df_top.groupby(["series_id", "DayOfWeek"])["Weekly_Sales"].mean().reset_index()

fig = px.bar(avg_sales_dow, x="DayOfWeek", y="Weekly_Sales", color="series_id",
             barmode="group",
             title="📊 Average Sales by Day of the Week",
             labels={"Weekly_Sales": "Average Sales"})
fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Distribution of preprocessed weekly sales for each selected series.
fig = px.box(
    data_frame=df_top,
    x="series_id",
    y="Weekly_Sales",
    labels={"Weekly_Sales": "Weekly Sales"},
    title="📦 Sales Distribution per Series (Top 5)",
    points="outliers",
)
fig.show()


In [None]:
# Sales against fuel price for the selected series, with one OLS trend line per series.
fig = px.scatter(
    data_frame=df_top,
    x="Fuel_Price",
    y="Weekly_Sales",
    color="series_id",
    opacity=0.6,
    trendline="ols",
    labels={"Fuel_Price": "Fuel Price", "Weekly_Sales": "Weekly Sales"},
    title="⛽ Sales vs Fuel Price (Top 5 Series)",
)
fig.show()


In [None]:
# Long format: one (OriginalDate, series_id, Variable, Value) row per measurement.
plot_df = df_top.melt(
    id_vars=["OriginalDate", "series_id"],
    value_vars=["Weekly_Sales", "CPI", "Unemployment"],
    var_name="Variable",
    value_name="Value",
)

# One facet per series, two facets per row.
fig = px.line(
    data_frame=plot_df,
    x="OriginalDate",
    y="Value",
    color="Variable",
    facet_col="series_id",
    facet_col_wrap=2,
    title="📉 Sales, CPI, and Unemployment Over Time (Top 5 Series)",
)
fig.show()


In [None]:
# Wide layout for the selected series: one row per date, one column per series.
heatmap_df = df_top.pivot(values='Weekly_Sales', index='OriginalDate', columns='series_id')

# Transposed for display so that series run down the y-axis and time along the x-axis.
fig = px.imshow(
    heatmap_df.transpose(),
    labels={"x": "Date", "y": "Series", "color": "Sales"},
    title="🔥 Heatmap of Weekly Sales Over Time",
    color_continuous_scale="Viridis",
    aspect="auto",
)
fig.show()


# **Train PatchTST model**

In [None]:
# Covariate names only: the target is carried separately as `y`.
# (Re-running this cell is safe; the filter is idempotent.)
features = [col for col in features if col != 'Weekly_Sales']

# Long format with the column names this notebook passes to NeuralForecast:
# unique_id (series key), ds (timestamp), y (target), plus the covariates.
patch_df = df.rename(columns={
    'OriginalDate': 'ds',
    'series_id': 'unique_id',
    'Weekly_Sales': 'y'
})[['unique_id', 'ds', 'y'] + features]

# Each series must have at most one row per timestamp.
assert not patch_df.duplicated(subset=['unique_id', 'ds']).any(), \
    "duplicate (unique_id, ds) rows in patch_df"



In [None]:
# Show the column names currently held in `features`.
print("Features:", features)


Features: ['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']


In [None]:
from neuralforecast.models import PatchTST
from neuralforecast.core import NeuralForecast

# Forecast horizon, in weeks
horizon = 24

# No exogenous column lists are passed to PatchTST below, so only the three
# core columns are handed to fit(). This also keeps NaNs in unused covariate
# columns from tripping NeuralForecast's missing-value check.
fit_df = patch_df[['unique_id', 'ds', 'y']]

# fit() rejects missing values. Drop every series that has a NaN target as a
# whole rather than dropping single rows, which would leave gaps in the series.
incomplete_ids = fit_df.loc[fit_df['y'].isna(), 'unique_id'].unique()
fit_df = fit_df[~fit_df['unique_id'].isin(incomplete_ids)]
assert not fit_df.isna().any().any(), "fit_df still contains missing values"

# The frequency alias must match the weekday of the timestamps. Plain 'W' means
# weeks ending on Sunday, whereas the dates in this dataset fall on Fridays.
assert (fit_df['ds'].dt.dayofweek == 4).all(), "expected every timestamp to be a Friday"

# Define the model
models = [
    PatchTST(
        h=horizon,
        input_size=96,
        max_steps=1000,
        scaler_type='robust',
    )
]

# Initialize NeuralForecast with the models and the Friday-anchored weekly frequency
nf = NeuralForecast(models=models, freq='W-FRI')

# Train the model; fit_df contains exactly 'unique_id', 'ds', 'y'
nf.fit(fit_df)


INFO:lightning_fabric.utilities.seed:Seed set to 1


ValueError: Found missing values in ['y', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5'].