In [None]:
import json
import gc
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from tqdm import tqdm

import lightgbm as lgbm

In [None]:
# Constants

# Paths
INPUT_BASE_PATH = "/kaggle/input/"
# INPUT_BASE_PATH already ends with "/", so strip it before joining to avoid "//" in the paths.
RAW_DATA_INPUT_PATH = f"{INPUT_BASE_PATH.rstrip('/')}/m5-forecasting-accuracy"
PROCESSED_DATA_INPUT_PATH = f"{INPUT_BASE_PATH.rstrip('/')}/m5-acc"
OUTPUT_BASE_PATH = "/kaggle/working"
# Misspelled original name, kept as an alias so existing references keep working.
OUTPUT_BASE_BATH = OUTPUT_BASE_PATH

# Timestamps
# Largest value of the day index column "d" that is used for training / validation.
MAX_TRAIN_TIMESTAMP = 1941
# The test period starts on the day right after the last training day.
START_TEST_TIMESTAMP = MAX_TRAIN_TIMESTAMP + 1
# NOTE(review): presumably the `wm_yr_wk` code of the first test week -- confirm against the calendar data.
START_TEST_WM_YR_WK = 11617

# Data Input

In [None]:
def cast_category_types(dataset: pd.DataFrame) -> pd.DataFrame:
    """Convert the known categorical columns of ``dataset`` to ``category`` dtype.

    Columns named below that are not present in ``dataset`` are skipped
    silently. The frame is modified in place and also returned.
    """
    # Event columns exist for the current day and for shifts of -3..3 days.
    event_suffixes = ("", "_lag_-3", "_lag_-2", "_lag_-1", "_lag_1", "_lag_2", "_lag_3")
    candidate_cols = [
        # item hierarchy
        "dept_id", "cat_id", "store_id", "state_id",
        # calendar
        "weekday", "month", "quarter", "year",
    ]
    for suffix in event_suffixes:
        candidate_cols += [f"event_name{suffix}", f"event_type{suffix}"]
    candidate_cols += ["snap_CA", "snap_TX", "snap_WI", "item_on_sale"]

    for name in candidate_cols:
        if name in dataset.columns:
            dataset[name] = dataset[name].astype("category")

    return dataset


def cast_int_types(dataset: pd.DataFrame) -> pd.DataFrame:
    """Down-cast the ``d`` and ``count`` columns to ``int16`` where they exist.

    Missing columns are skipped silently. The frame is modified in place and
    also returned.
    """
    for name in ("d", "count"):
        if name not in dataset.columns:
            continue
        dataset[name] = dataset[name].astype(np.int16)

    return dataset


def cast_float_types(dataset: pd.DataFrame) -> pd.DataFrame:
    """Down-cast the known float feature columns to save memory.

    Sales-lag, price and cyclical date features are stored as ``float16``.
    ``item_id`` is stored as ``float32`` instead: ``float16`` has an 11-bit
    significand, so it cannot represent every integer above 2048 and distinct
    item codes beyond that would collapse onto the same value.
    (Assumes ``item_id`` holds integer codes -- TODO confirm against the
    preprocessing step.)

    Columns that are not present in ``dataset`` are skipped silently. The frame
    is modified in place and also returned.
    """
    rolling_suffixes = [
        f"rolling_{stat}_window_{window}"
        for window in (7, 14, 21, 28)
        for stat in ("mean", "std", "kurt")
    ]
    half_precision_cols = (
        [f"count_lag_{lag}" for lag in (28, 29, 30, 31)]
        + [f"count_lag_28_{suffix}" for suffix in rolling_suffixes]
        + ["sell_price"]
        + [f"sell_price_diff_{step}" for step in (1, 2, 3, 7)]
        + [f"sell_price_{suffix}" for suffix in rolling_suffixes]
        + ["weekday_sin", "weekday_cos", "month_sin", "month_cos", "quarter_sin", "quarter_cos"]
    )

    target_dtypes = {"item_id": np.float32}
    target_dtypes.update(dict.fromkeys(half_precision_cols, np.float16))

    for name, dtype in target_dtypes.items():
        if name in dataset.columns:
            dataset[name] = dataset[name].astype(dtype)

    return dataset

# Cast to lower resolution types to save memory
def cast_data_types(dataset: pd.DataFrame) -> pd.DataFrame:
    """Run the category, integer and float casting steps, in that order."""
    for cast_step in (cast_category_types, cast_int_types, cast_float_types):
        dataset = cast_step(dataset)
    return dataset

In [None]:
def drop_cols(dataset: pd.DataFrame) -> pd.DataFrame:
    """Return ``dataset`` without the columns that are not used downstream.

    Every listed column that exists is dropped; listed columns that are absent
    are ignored. (Dropping a whole group with a plain ``drop`` raises
    ``KeyError`` as soon as one member is missing, which would leave the
    remaining members of that group in place.)

    The input frame is not modified; a new frame is returned.
    """
    date_cols_to_drop = ["date", "weekday", "month", "quarter"]
    event_cols_to_drop = [
        "event_type",
        "event_type_lag_-3", "event_type_lag_-2", "event_type_lag_-1",
        "event_type_lag_3", "event_type_lag_2", "event_type_lag_1",
    ]
    sell_cols_to_drop = [
        f"sell_price_rolling_{stat}_window_{window}"
        for window in (7, 14, 21, 28)
        for stat in ("mean", "std", "kurt")
    ]
    # The 7- and 14-day windows of the lagged sales are kept; only the longer ones go.
    count_cols_to_drop = [
        f"count_lag_28_rolling_{stat}_window_{window}"
        for window in (21, 28)
        for stat in ("mean", "std", "kurt")
    ]

    cols_to_drop = date_cols_to_drop + event_cols_to_drop + sell_cols_to_drop + count_cols_to_drop
    dataset = dataset.drop(columns=cols_to_drop, errors="ignore")

    _ = gc.collect()
    return dataset

In [None]:
# Load train data in chunks
train_pq_file = pq.ParquetFile(f"{PROCESSED_DATA_INPUT_PATH}/m5-acc-train.parquet")

# Reduce every batch (drop unused columns, down-cast dtypes) as it is read and
# concatenate once at the end. Concatenating inside the loop would re-copy the
# whole accumulated frame for every batch.
train_batches = []
for batch in tqdm(train_pq_file.iter_batches(batch_size=131_072)):
    train_batches.append(cast_data_types(drop_cols(batch.to_pandas())))

# `pd.concat` raises on an empty list, so fall back to an empty frame.
train_data_set = pd.concat(train_batches, ignore_index=True) if train_batches else pd.DataFrame()

del train_batches
_ = gc.collect()

# Cast again on the full frame: categorical columns whose category sets differ
# between batches do not survive the concatenation as `category` dtype.
train_data_set = cast_data_types(train_data_set)

# LightGBM Model
Train the best-performing LightGBM model.

In [None]:
# Categorical model inputs. Of the shifted event names only the +-1 and +-2 day
# shifts are used; the +-3 day shifts are deliberately left out.
_EVENT_NAME_SHIFTS = (-2, -1, 1, 2)
CATEGORICAL_FEATURES = (
    ["dept_id", "cat_id", "store_id", "state_id", "year", "event_name"]
    + [f"event_name_lag_{shift}" for shift in _EVENT_NAME_SHIFTS]
    + ["snap_CA", "snap_TX", "snap_WI", "item_on_sale"]
)

# Continuous model inputs. Rolling statistics of the 28-day lagged sales are
# used for the 7- and 14-day windows only; the 21- and 28-day windows are left out.
_ROLLING_WINDOWS = (7, 14)
_ROLLING_STATS = ("mean", "std", "kurt")
CONTINOUS_FEATURES = (
    ["item_id"]
    + ["weekday_sin", "weekday_cos", "month_sin", "month_cos", "quarter_sin", "quarter_cos"]
    + ["sell_price"]
    + [f"sell_price_diff_{step}" for step in (1, 2, 3, 7)]
    + [f"count_lag_{lag}" for lag in (28, 29, 30, 31)]
    + [
        f"count_lag_28_rolling_{stat}_window_{window}"
        for window in _ROLLING_WINDOWS
        for stat in _ROLLING_STATS
    ]
)

# Column order of the model input: categorical features first, then continuous.
FEATURES = CATEGORICAL_FEATURES + CONTINOUS_FEATURES
LABEL = "count"

# Parameters
DATASET_PARAMETERS = dict()
TRAIN_PARAMETERS = dict(
    objective="tweedie",
    tweedie_variance_power=1.1,
    learning_rate=0.0125,
    num_leaves=255,  # 2 ** 8 - 1
    max_bin=127,  # 2 ** 7 - 1
    bagging_freq=1,
    bagging_fraction=0.8,
    feature_fraction=0.8,
    metric="rmse",
    force_col_wise=True,
    seed=1,
    histogram_pool_size=11_000,
)

In [None]:
# Train / validation split: hold out the last `horizon` days for validation.
horizon = 28
# Both bounds of the validation window are inclusive, hence the "+ 1":
# [MAX_TRAIN_TIMESTAMP - horizon + 1, MAX_TRAIN_TIMESTAMP] spans exactly `horizon` days.
start_t = MAX_TRAIN_TIMESTAMP - horizon + 1
end_t = MAX_TRAIN_TIMESTAMP

train_df = train_data_set[train_data_set["d"] < start_t]
valid_df = train_data_set[(train_data_set["d"] >= start_t) & (train_data_set["d"] <= end_t)]
print(f"Train idx (start, end): ({train_df['d'].min()}, {train_df['d'].max()})")
print(f"Valid idx (start, end): ({valid_df['d'].min()}, {valid_df['d'].max()})")
print("============")

X_train, y_train = train_df[FEATURES], train_df[LABEL]
X_valid, y_valid = valid_df[FEATURES], valid_df[LABEL]
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_valid shape: {X_valid.shape}, y_valid shape: {y_valid.shape}")

train_data = lgbm.Dataset(data=X_train, label=y_train)
# Tie the validation set to the training set, as LightGBM documents for validation data.
valid_data = lgbm.Dataset(data=X_valid, label=y_valid, reference=train_data)

# `valid_df` is kept: it is needed later to assemble the forecast frame.
del train_df
_ = gc.collect()

In [None]:
# Train model
# `record_evaluation` fills this dict in place with the per-iteration metric values.
train_valid_loss = {}
training_callbacks = [
    lgbm.early_stopping(stopping_rounds=5),
    lgbm.log_evaluation(period=10),
    lgbm.record_evaluation(train_valid_loss),
]
model = lgbm.train(
    params=TRAIN_PARAMETERS,
    train_set=train_data,
    num_boost_round=1000,
    valid_sets=[train_data, valid_data],
    callbacks=training_callbacks,
)

In [None]:
# Forecast the validation window and store the predictions next to the actuals
# Predictions are clipped at zero from below.
y_hat = np.clip(model.predict(data=X_valid), a_min=0, a_max=np.inf)

forecast_id_cols = ["item_id", "dept_id", "cat_id", "store_id", "state_id", "d"]
forecast_df = valid_df[forecast_id_cols].assign(**{"count": y_valid, "forecast": y_hat})

## 1. Train / valid loss curves

In [None]:
def learning_curve(
    train_loss: list[float],
    valid_loss: list[float],
    ylabel: str,
):
    """Plot the train and validation loss per iteration on one new axes.

    Iterations are numbered from 1 and taken from the length of ``train_loss``.
    Returns the matplotlib axes the curves were drawn on.
    """
    _, axis = plt.subplots()
    iterations = np.arange(1, len(train_loss) + 1)
    for losses, name in ((train_loss, "train"), (valid_loss, "valid")):
        axis.plot(iterations, losses, lw=2, label=name)
    axis.set(xlabel="Iteration", ylabel=ylabel)
    axis.legend()
    return axis

In [None]:
# "training" and "valid_1" are the keys under which the two evaluation sets were recorded.
train_rmse, valid_rmse = (
    train_valid_loss[eval_set]["rmse"] for eval_set in ("training", "valid_1")
)
learning_curve(train_loss=train_rmse, valid_loss=valid_rmse, ylabel="RMSE");

## 2. Residual analysis

In [None]:
def load_category_map(path: str) -> dict[str, int]:
    """Load a category mapping from the JSON file at ``path``.

    JSON object keys are always strings, so the returned dict is keyed by
    string. The values are presumably the integer codes used in the processed
    data set (callers invert this mapping to get code -> label) -- confirm
    against the files.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as fp:
        category_mapping = json.load(fp)
    return category_mapping

def reverse_map(mapping):
    """Return a new dict with keys and values of ``mapping`` swapped.

    If several keys share a value, the last one in iteration order wins.
    """
    swapped = {}
    for key, value in mapping.items():
        swapped[value] = key
    return swapped

# One inverted mapping per hierarchy level, each read from "<level>_map.json".
item_id_map, dept_id_map, cat_id_map, store_id_map, state_id_map = (
    reverse_map(load_category_map(f"{PROCESSED_DATA_INPUT_PATH}/{level}_map.json"))
    for level in ("item_id", "dept_id", "cat_id", "store_id", "state_id")
)

### 2.1 Residuals vs. predicted values

In [None]:
def residuals_plot(forecast_df: pd.DataFrame):
    """Scatter the residuals (``count - forecast``) against the forecast.

    Reads the ``count`` and ``forecast`` columns of ``forecast_df`` and returns
    the matplotlib axes of the new figure.
    """
    predicted = np.array(forecast_df["forecast"])
    residuals = np.array(forecast_df["count"]) - predicted

    _, axis = plt.subplots()
    # Zero line as visual reference for over- / under-prediction.
    axis.axhline(0, ls="--", color="gray", alpha=0.75)
    axis.scatter(predicted, residuals, alpha=0.3)
    axis.set(xlabel="y_pred", ylabel="y_true - y_pred")
    return axis


def residuals_plot_by_group(forecast_df: pd.DataFrame, group_by: str, category_map: dict[int, str]):
    """Scatter residuals against forecasts, one panel per value of ``group_by``.

    Args:
        forecast_df: frame with ``count``, ``forecast`` and ``group_by`` columns.
        group_by: name of the column whose unique values define the panels.
        category_map: maps each value of ``group_by`` to its legend label.

    Returns:
        Flat array of the axes in the grid (at most 5 panels per row).

    Raises:
        ValueError: if ``forecast_df`` has no values in ``group_by``.
        KeyError: if a group value is missing from ``category_map``.
    """
    # Determine grid
    group_values = forecast_df[group_by].unique()
    n_groups = len(group_values)
    if n_groups == 0:
        raise ValueError(f"No values found in column '{group_by}'.")
    n_cols = min(n_groups, 5)
    n_rows = math.ceil(n_groups / n_cols)

    # Plot
    fig, ax = plt.subplots(
        n_rows,
        n_cols,
        figsize=(4 * n_cols, 3 * n_rows),
        sharex=True,
        sharey=True,
        # Always get a 2D array back; a 1x1 grid would otherwise be a bare Axes
        # without `.flatten()`.
        squeeze=False,
    )
    ax = ax.flatten()
    for axis, group_value in zip(ax, group_values):
        group_df = forecast_df[forecast_df[group_by] == group_value]
        axis.axhline(0, ls="--", color="gray", alpha=0.75)
        axis.scatter(
            np.array(group_df["forecast"]),
            np.array(group_df["count"] - group_df["forecast"]),
            alpha=0.3,
            label=category_map[group_value],
        )
        axis.legend()

    # x label on the bottom row, y label on the first column.
    for axis in ax[-n_cols:]:
        axis.set(xlabel="y_pred")
    for axis in ax[::n_cols]:
        axis.set(ylabel="y_true - y_pred")

    fig.tight_layout()
    return ax

In [None]:
# Residuals vs. forecast over all rows of the forecast frame.
residuals_plot(forecast_df);

In [None]:
# Same plot, split into one panel per store.
residuals_plot_by_group(forecast_df, "store_id", store_id_map);

### 2.2 Residual distributions

In [None]:
def residuals_dist_plot(forecast_df: pd.DataFrame):
    """Plot a density histogram (100 bins) of the residuals ``count - forecast``.

    Reads the ``count`` and ``forecast`` columns of ``forecast_df`` and returns
    the matplotlib axes of the new figure.
    """
    y_true = np.array(forecast_df["count"])
    y_pred = np.array(forecast_df["forecast"])
    errors = y_true - y_pred

    _, ax = plt.subplots()
    freq, bins = np.histogram(errors, bins=100, density=True)
    ax.bar(bins[:-1], freq, width=np.diff(bins), edgecolor="black", align="edge")
    # The label must match the sign convention of `errors` above.
    ax.set(xlabel="y_true - y_pred", ylabel="Density")
    return ax


def residuals_dist_plot_by_group(forecast_df: pd.DataFrame, group_by: str, category_map: dict[int, str]):
    """Plot a residual density histogram per value of ``group_by``.

    Args:
        forecast_df: frame with ``count``, ``forecast`` and ``group_by`` columns.
        group_by: name of the column whose unique values define the panels.
        category_map: maps each value of ``group_by`` to its legend label.

    Returns:
        Flat array of the axes in the grid (at most 5 panels per row).

    Raises:
        ValueError: if ``forecast_df`` has no values in ``group_by``.
        KeyError: if a group value is missing from ``category_map``.
    """
    # Determine grid
    group_values = forecast_df[group_by].unique()
    n_groups = len(group_values)
    if n_groups == 0:
        raise ValueError(f"No values found in column '{group_by}'.")
    n_cols = min(n_groups, 5)
    n_rows = math.ceil(n_groups / n_cols)

    # Plot
    fig, ax = plt.subplots(
        n_rows,
        n_cols,
        figsize=(4 * n_cols, 3 * n_rows),
        sharex=True,
        # Always get a 2D array back; a 1x1 grid would otherwise be a bare Axes
        # without `.flatten()`.
        squeeze=False,
    )
    ax = ax.flatten()
    for axis, group_value in zip(ax, group_values):
        group_df = forecast_df[forecast_df[group_by] == group_value]
        errors = np.array(group_df["count"]) - np.array(group_df["forecast"])
        freq, bins = np.histogram(errors, bins=100, density=True)
        axis.bar(
            bins[:-1],
            freq,
            width=np.diff(bins),
            edgecolor="black",
            align="edge",
            label=category_map[group_value],
        )
        axis.legend()

    # x label on the bottom row, y label on the first column.
    for axis in ax[-n_cols:]:
        axis.set(xlabel="y_true - y_pred")
    for axis in ax[::n_cols]:
        axis.set(ylabel="Density")

    fig.tight_layout()
    return ax

In [None]:
# Residual distribution over all rows of the forecast frame.
residuals_dist_plot(forecast_df);

In [None]:
# Residual distribution, split into one panel per store.
residuals_dist_plot_by_group(forecast_df, "store_id", store_id_map);

### 2.3 Correlation

## 3. Feature importance

In [None]:
# Take the feature names from the trained model itself rather than from the
# global FEATURES list, so the labels stay correct even if that list is edited
# or re-run after training.
model_feature_names = model.feature_name()

# "split": how often a feature is used in a split.
split_importance_df = pd.DataFrame({
    "importance": model.feature_importance(importance_type="split"),
    "feature": model_feature_names,
})

# "gain": total gain of the splits that use the feature.
gain_importance_df = pd.DataFrame({
    "importance": model.feature_importance(importance_type="gain"),
    "feature": model_feature_names,
})

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15, 7.5), sharey=True)

# One horizontal bar chart per importance type, both on a log-scaled x axis.
importance_panels = (
    (ax[0], split_importance_df, "Split Importance"),
    (ax[1], gain_importance_df, "Gain Importance"),
)
for panel, importance_df, panel_title in importance_panels:
    y_labels = np.array(importance_df["feature"])
    y_positions = np.arange(len(y_labels))
    importance = np.array(importance_df["importance"])
    panel.barh(y_positions, importance, align="center")
    panel.set(yticks=y_positions, yticklabels=y_labels)
    panel.set_xscale("log")
    panel.set_title(panel_title)

fig.tight_layout();