# Store-Item Demand Forecasting Project

### Import Libraries 

In [None]:
pip install pandas numpy scikit-learn lightgbm shap matplotlib seaborn

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_absolute_error, mean_squared_error
from lightgbm import LGBMRegressor
import shap
import warnings
warnings.filterwarnings("ignore")


  from .autonotebook import tqdm as notebook_tqdm


### Load the Kaggle dataset

In [5]:
df = pd.read_csv("train.csv", parse_dates=["date"])
df = df.sort_values("date").reset_index(drop=True)

print("Shape:", df.shape)
df.head()


Shape: (913000, 4)


Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-01,7,12,26
2,2013-01-01,7,46,27
3,2013-01-01,8,12,54
4,2013-01-01,9,12,35


### Add date/time features

In [8]:
def add_time_features(data: pd.DataFrame) -> pd.DataFrame:
    data["year"] = data["date"].dt.year
    data["month"] = data["date"].dt.month
    data["day"] = data["date"].dt.day
    data["dayofweek"] = data["date"].dt.dayofweek
    data["is_weekend"] = (data["dayofweek"] >= 5).astype(int)
    data["weekofyear"] = data["date"].dt.isocalendar().week.astype(int)
    data["is_month_start"] = data["date"].dt.is_month_start.astype(int)
    data["is_month_end"] = data["date"].dt.is_month_end.astype(int)
    return data

df = add_time_features(df)
df.head()


Unnamed: 0,date,store,item,sales,year,month,day,dayofweek,is_weekend,weekofyear,is_month_start,is_month_end
0,2013-01-01,1,1,13,2013,1,1,1,0,1,1,0
1,2013-01-01,7,12,26,2013,1,1,1,0,1,1,0
2,2013-01-01,7,46,27,2013,1,1,1,0,1,1,0
3,2013-01-01,8,12,54,2013,1,1,1,0,1,1,0
4,2013-01-01,9,12,35,2013,1,1,1,0,1,1,0


### Add lag + rolling features (time-series magic)

In [9]:
def add_lag_features(data: pd.DataFrame,
                     group_cols=("store", "item"),
                     target_col="sales") -> pd.DataFrame:
    # Recent lags (previous days)
    for lag in [1, 2, 3, 7, 14, 30]:
        data[f"lag_{lag}"] = (
            data.groupby(list(group_cols))[target_col].shift(lag)
        )

    # Rolling stats using past values
    data["roll_mean_7"] = (
        data.groupby(list(group_cols))[target_col]
            .shift(1).rolling(window=7).mean()
    )
    data["roll_mean_30"] = (
        data.groupby(list(group_cols))[target_col]
            .shift(1).rolling(window=30).mean()
    )
    data["roll_std_7"] = (
        data.groupby(list(group_cols))[target_col]
            .shift(1).rolling(window=7).std()
    )

    return data

df = add_lag_features(df)

# Drop early rows that have NaNs from lags/rollings
df = df.dropna().reset_index(drop=True)
print("Shape after lag/rolling features:", df.shape)
df.head()


Shape after lag/rolling features: (898000, 21)


Unnamed: 0,date,store,item,sales,year,month,day,dayofweek,is_weekend,weekofyear,...,is_month_end,lag_1,lag_2,lag_3,lag_7,lag_14,lag_30,roll_mean_7,roll_mean_30,roll_std_7
0,2013-01-31,6,16,9,2013,1,31,3,0,5,...,1,14.0,7.0,8.0,9.0,12.0,13.0,31.0,29.666667,18.717194
1,2013-01-31,1,50,31,2013,1,31,3,0,5,...,1,19.0,24.0,23.0,35.0,34.0,30.0,26.0,29.1,16.03122
2,2013-01-31,5,24,28,2013,1,31,3,0,5,...,1,27.0,20.0,23.0,18.0,23.0,26.0,28.714286,28.966667,13.948886
3,2013-01-31,6,35,28,2013,1,31,3,0,5,...,1,24.0,33.0,23.0,26.0,29.0,23.0,25.857143,29.3,12.23967
4,2013-01-31,8,42,23,2013,1,31,3,0,5,...,1,25.0,31.0,24.0,30.0,29.0,20.0,22.285714,28.233333,6.156684


### Define features + train/test split (time-based)

In [10]:
feature_cols = [
    "store", "item",
    "year", "month", "day", "dayofweek", "weekofyear",
    "is_weekend", "is_month_start", "is_month_end",
    "lag_1", "lag_2", "lag_3", "lag_7", "lag_14", "lag_30",
    "roll_mean_7", "roll_mean_30", "roll_std_7"
]

target_col = "sales"

split_date = "2017-09-30"

train_df = df[df["date"] <= split_date].copy()
test_df  = df[df["date"] > split_date].copy()

print("Train range:", train_df["date"].min(), "to", train_df["date"].max())
print("Test range:", test_df["date"].min(), "to", test_df["date"].max())
print("Train samples:", len(train_df), "Test samples:", len(test_df))


Train range: 2013-01-31 00:00:00 to 2017-09-30 00:00:00
Test range: 2017-10-01 00:00:00 to 2017-12-31 00:00:00
Train samples: 852000 Test samples: 46000


### Baseline model (mean per item)

In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

X_train = train_df[feature_cols]
y_train = train_df[target_col]
X_test  = test_df[feature_cols]
y_test  = test_df[target_col]

# Baseline: mean sales per item (from training)
item_means = train_df.groupby("item")[target_col].mean()
baseline_pred = test_df["item"].map(item_means)

# In case any item in test doesn't exist in train (should be rare, but safe):
baseline_pred = baseline_pred.fillna(train_df[target_col].mean())

baseline_mae = mean_absolute_error(y_test, baseline_pred)

# Older sklearn: compute RMSE by taking sqrt of MSE
baseline_mse  = mean_squared_error(y_test, baseline_pred)
baseline_rmse = baseline_mse ** 0.5

print("=== Baseline (item mean) ===")
print("MAE :", baseline_mae)
print("RMSE:", baseline_rmse)


=== Baseline (item mean) ===
MAE : 13.457148346601345
RMSE: 17.815297878904435


### Prepare data for LightGBM (categoricals)

In [13]:
# Cast categorical columns
for col in ["store", "item"]:
    train_df[col] = train_df[col].astype("category")
    test_df[col]  = test_df[col].astype("category")

X_train = train_df[feature_cols]
X_test  = test_df[feature_cols]
y_train = train_df[target_col]
y_test  = test_df[target_col]


### Train base LightGBM model

In [15]:
base_lgbm = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    objective="regression"
)

base_lgbm.fit(X_train, y_train)
y_pred_base = base_lgbm.predict(X_test)

base_mae = mean_absolute_error(y_test, y_pred_base)
base_rmse = mean_squared_error(y_test, y_pred_base) ** 0.5   

print("=== LightGBM (base) ===")
print("MAE :", base_mae)
print("RMSE:", base_rmse)
print("Improvement over baseline RMSE: {:.2f}%".format(
    100 * (baseline_rmse - base_rmse) / baseline_rmse
))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079855 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2089
[LightGBM] [Info] Number of data points in the train set: 852000, number of used features: 19
[LightGBM] [Info] Start training from score 52.522494
=== LightGBM (base) ===
MAE : 6.07015332739005
RMSE: 7.857987709856682
Improvement over baseline RMSE: 55.89%


### Simple hyperparameter tuning (random search)

In [None]:
import random

# Use last ~2 months of train as validation
val_cut_date = "2017-08-01"

train_sub = train_df[train_df["date"] < val_cut_date]
val_sub   = train_df[train_df["date"] >= val_cut_date]

X_tr = train_sub[feature_cols]
y_tr = train_sub[target_col]
X_val = val_sub[feature_cols]
y_val = val_sub[target_col]

def random_search_lgbm(X_tr, y_tr, X_val, y_val, n_iter=8):
    best_config = None
    best_rmse = float("inf")

    for i in range(n_iter):
        params = {
            "n_estimators": random.choice([300, 500, 700]),
            "learning_rate": random.choice([0.01, 0.03, 0.05, 0.1]),
            "max_depth": random.choice([-1, 8, 10, 12]),
            "num_leaves": random.choice([31, 63, 127]),
            "subsample": random.choice([0.7, 0.8, 0.9, 1.0]),
            "colsample_bytree": random.choice([0.7, 0.8, 0.9, 1.0]),
            "min_child_samples": random.choice([20, 50, 100]),
            "random_state": 42,
            "objective": "regression",
            "n_jobs": -1,
        }

        model = LGBMRegressor(**params)
        model.fit(X_tr, y_tr)
        y_val_pred = model.predict(X_val)

        # ⬇️ FIXED: no 'squared=False', compute RMSE manually
        mse = mean_squared_error(y_val, y_val_pred)
        rmse = mse ** 0.5

        print(f"Iter {i+1}/{n_iter} - RMSE: {rmse:.4f}")

        if rmse < best_rmse:
            best_rmse = rmse
            best_config = params

    return best_config, best_rmse

best_params, best_val_rmse = random_search_lgbm(X_tr, y_tr, X_val, y_val, n_iter=8)
print("Best params:", best_params)
print("Best validation RMSE:", best_val_rmse)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059739 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2076
[LightGBM] [Info] Number of data points in the train set: 821500, number of used features: 19
[LightGBM] [Info] Start training from score 52.084600


TypeError: got an unexpected keyword argument 'squared'

### Train final tuned model + evaluate

In [None]:
tuned_lgbm = LGBMRegressor(**best_params)
tuned_lgbm.fit(X_train, y_train)

y_pred_tuned = tuned_lgbm.predict(X_test)

tuned_mae = mean_absolute_error(y_test, y_pred_tuned)
tuned_rmse = mean_squared_error(y_test, y_pred_tuned, squared=False)

print("=== LightGBM (tuned) ===")
print("MAE :", tuned_mae)
print("RMSE:", tuned_rmse)
print("Improvement over baseline RMSE: {:.2f}%".format(
    100 * (baseline_rmse - tuned_rmse) / baseline_rmse
))


### 