In [None]:
import gc
import json

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from matplotlib import cm
from tqdm import tqdm

In [None]:
# Constants

# Paths
INPUT_BASE_PATH = "/kaggle/input/"
# INPUT_BASE_PATH ends with a slash; strip it before joining so the derived
# paths do not contain a double slash ("/kaggle/input//...").
RAW_DATA_INPUT_PATH = f"{INPUT_BASE_PATH.rstrip('/')}/m5-forecasting-accuracy"
PROCESSED_DATA_INPUT_PATH = f"{INPUT_BASE_PATH.rstrip('/')}/m5-acc"
OUTPUT_BASE_PATH = "/kaggle/working"
# Backward-compatible alias for the original (misspelled) constant name.
OUTPUT_BASE_BATH = OUTPUT_BASE_PATH

# Timestamps
# MAX_TRAIN_TIMESTAMP is compared against the integer "d" column when the
# train / validation split is built.
MAX_TRAIN_TIMESTAMP = 1941
START_TEST_TIMESTAMP = 1942
# NOTE(review): presumably the `wm_yr_wk` calendar value of the first test week — confirm.
START_TEST_WM_YR_WK = 11617

# Data Input

In [None]:
def cast_category_types(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Convert the known categorical columns of a frame to the pandas ``category`` dtype.

    The candidate columns are the item hierarchy ids, the calendar fields, the
    event name / type columns (including their +-1..3 day lags), the SNAP flags
    and the on-sale flag. Candidates that are not present in ``dataset`` are
    skipped silently.

    :param dataset: Frame to convert. It is modified in place.
    :return: The same ``dataset`` object, for call chaining.
    """
    event_lags = (-3, -2, -1, 1, 2, 3)
    candidate_cols = (
        # Item hierarchy
        "dept_id", "cat_id", "store_id", "state_id",
        # Calendar
        "weekday", "month", "quarter", "year",
        # Events (same day and lagged)
        "event_name", "event_type",
        *(
            f"{prefix}_lag_{lag}"
            for lag in event_lags
            for prefix in ("event_name", "event_type")
        ),
        "snap_CA", "snap_TX", "snap_WI",
        # Price
        "item_on_sale",
    )

    for name in candidate_cols:
        if name not in dataset.columns:
            continue
        dataset[name] = dataset[name].astype("category")

    return dataset


def cast_int_types(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Downcast the integer columns ``"d"`` and ``"count"`` to ``int16``.

    Either column may be absent, in which case it is skipped silently.

    :param dataset: Frame to convert. It is modified in place.
    :return: The same ``dataset`` object, for call chaining.
    """
    present_cols = [name for name in ("d", "count") if name in dataset.columns]
    for name in present_cols:
        dataset[name] = dataset[name].astype(np.int16)

    return dataset


def cast_float_types(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Downcast the continuous feature columns to low-precision floats to save memory.

    The lagged / rolling sales features, the price features and the cyclical
    date encodings are cast to ``float16``. ``item_id`` is cast to ``float32``
    instead: ``float16`` represents integers exactly only up to 2048, so larger
    ids would be rounded onto their neighbours (e.g. 2049 -> 2048) and distinct
    items would become indistinguishable.

    Columns that are not present in ``dataset`` are skipped silently.

    :param dataset: Frame to convert. It is modified in place.
    :return: The same ``dataset`` object, for call chaining.
    """
    rolling_windows = (7, 14, 21, 28)
    rolling_stats = ("mean", "std", "kurt")

    float16_cols = [
        # Lagged sales and their rolling statistics
        "count_lag_28", "count_lag_29", "count_lag_30", "count_lag_31",
        *(
            f"count_lag_28_rolling_{stat}_window_{window}"
            for window in rolling_windows
            for stat in rolling_stats
        ),
        # Prices, price differences and rolling price statistics
        "sell_price",
        "sell_price_diff_1", "sell_price_diff_2", "sell_price_diff_3", "sell_price_diff_7",
        *(
            f"sell_price_rolling_{stat}_window_{window}"
            for window in rolling_windows
            for stat in rolling_stats
        ),
        # Cyclical date encodings
        "weekday_sin", "weekday_cos",
        "month_sin", "month_cos",
        "quarter_sin", "quarter_cos",
    ]

    # item_id looks like an integer-encoded identifier (the plotting code
    # compares it against an int), so it needs a dtype that keeps ids exact.
    target_dtypes = {"item_id": np.float32}
    target_dtypes.update(dict.fromkeys(float16_cols, np.float16))

    for float_col, target_dtype in target_dtypes.items():
        if float_col in dataset.columns:
            dataset[float_col] = dataset[float_col].astype(target_dtype)

    return dataset

# Downcast columns to lower-resolution dtypes to save memory
def cast_data_types(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Run the category, integer and float casts on ``dataset``, in that order.

    :param dataset: Frame to convert.
    :return: The frame returned by the last cast step.
    """
    for cast_step in (cast_category_types, cast_int_types, cast_float_types):
        dataset = cast_step(dataset)
    return dataset

In [None]:
def drop_cols(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Remove the columns that are not used as model features.

    Dropped are the raw calendar columns, the event *type* columns (same day
    and lagged), every rolling ``sell_price`` statistic, and the 21- and
    28-day rolling statistics of ``count_lag_28`` (the 7- and 14-day windows
    are kept).

    Each column is dropped independently: a column that is missing from
    ``dataset`` is ignored and does not prevent the others from being removed.

    :param dataset: Frame to prune. It is not modified.
    :return: A new frame without the listed columns.
    """
    rolling_stats = ("mean", "std", "kurt")

    date_cols_to_drop = ["date", "weekday", "month", "quarter"]
    event_cols_to_drop = [
        "event_type",
        "event_type_lag_-3", "event_type_lag_-2", "event_type_lag_-1",
        "event_type_lag_3", "event_type_lag_2", "event_type_lag_1",
    ]
    sell_cols_to_drop = [
        f"sell_price_rolling_{stat}_window_{window}"
        for window in (7, 14, 21, 28)
        for stat in rolling_stats
    ]
    # Add 7 and / or 14 to the window tuple to drop the shorter windows as well.
    count_cols_to_drop = [
        f"count_lag_28_rolling_{stat}_window_{window}"
        for window in (21, 28)
        for stat in rolling_stats
    ]

    cols_to_drop = date_cols_to_drop + event_cols_to_drop + sell_cols_to_drop + count_cols_to_drop
    # errors="ignore" skips absent labels one by one instead of raising for the whole list.
    return dataset.drop(columns=cols_to_drop, errors="ignore")

In [None]:
# Load train data in chunks
train_pq_file = pq.ParquetFile(f"{PROCESSED_DATA_INPUT_PATH}/m5-acc-train.parquet")

# Prune and downcast every batch as it is read, collect the results and
# concatenate once at the end. Concatenating inside the loop re-copies all
# previously loaded rows on every iteration.
train_batches = []
for batch in tqdm(train_pq_file.iter_batches(batch_size=131_072)):
    train_batch_df = batch.to_pandas()
    train_batch_df = drop_cols(train_batch_df)
    train_batch_df = cast_data_types(train_batch_df)
    train_batches.append(train_batch_df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# file yields no batches.
train_data_set = pd.concat(train_batches, ignore_index=True) if train_batches else pd.DataFrame()

del train_batches
_ = gc.collect()

# Re-apply the casts on the combined frame: category columns whose categories
# differ between batches come out of the concat as object dtype.
train_data_set = cast_data_types(train_data_set)

# LightGBM Model
Train the best performing LightGBM model.

In [None]:
# Columns passed to the model as categorical features.
CATEGORICAL_FEATURES = [
    "dept_id",
    "cat_id",
    "store_id",
    "state_id",
    "year",
    "event_name",
#     "event_name_lag_-3",
    "event_name_lag_-2",
    "event_name_lag_-1",
    "event_name_lag_1",
    "event_name_lag_2",
#     "event_name_lag_3",
    "snap_CA",
    "snap_TX",
    "snap_WI",
    "item_on_sale",
]
# Columns passed to the model as numeric features.
CONTINUOUS_FEATURES = [
    "item_id",
    "weekday_sin",
    "weekday_cos",
    "month_sin",
    "month_cos",
    "quarter_sin",
    "quarter_cos",
    "sell_price",
    "sell_price_diff_1",
    "sell_price_diff_2",
    "sell_price_diff_3",
    "sell_price_diff_7",
    "count_lag_28",
    "count_lag_29",
    "count_lag_30",
    "count_lag_31",
    "count_lag_28_rolling_mean_window_7", "count_lag_28_rolling_std_window_7", "count_lag_28_rolling_kurt_window_7",
    "count_lag_28_rolling_mean_window_14", "count_lag_28_rolling_std_window_14", "count_lag_28_rolling_kurt_window_14",
#     "count_lag_28_rolling_mean_window_21", "count_lag_28_rolling_std_window_21", "count_lag_28_rolling_kurt_window_21",
#     "count_lag_28_rolling_mean_window_28", "count_lag_28_rolling_std_window_28", "count_lag_28_rolling_kurt_window_28",
]
# Backward-compatible alias for the original (misspelled) constant name.
CONTINOUS_FEATURES = CONTINUOUS_FEATURES

# Full model input (categorical columns first) and the prediction target.
FEATURES = CATEGORICAL_FEATURES + CONTINUOUS_FEATURES
LABEL = "count"

# Parameters
DATASET_PARAMETERS = dict()
# Passed as the params argument of lgbm.train in the training cell.
TRAIN_PARAMETERS = dict(
    # Objective
    objective="tweedie",
    tweedie_variance_power=1.1,
    # Tree shape / binning (2 ** 7 - 1 == 127)
    learning_rate=0.025,
    num_leaves=127,
    max_bin=127,
    # Row and column subsampling
    bagging_freq=1,
    bagging_fraction=0.8,
    feature_fraction=0.8,
    # Evaluation, reproducibility and memory
    metric="rmse",
    force_col_wise=True,
    seed=1,
    histogram_pool_size=11_000,
)

In [None]:
# Train / validation split: hold out the last `horizon` days of the training range
horizon = 28
# Both bounds of the validation window are inclusive, hence the "+ 1":
# [MAX_TRAIN_TIMESTAMP - horizon + 1, MAX_TRAIN_TIMESTAMP] spans exactly `horizon` days.
start_t = MAX_TRAIN_TIMESTAMP - horizon + 1
end_t = MAX_TRAIN_TIMESTAMP

train_df = train_data_set[train_data_set["d"] < start_t]
# Explicit copy: columns are assigned to valid_df later (forecast cell), which
# would otherwise trigger pandas' SettingWithCopyWarning.
valid_df = train_data_set[(train_data_set["d"] >= start_t) & (train_data_set["d"] <= end_t)].copy()
assert valid_df["d"].nunique() <= horizon, "validation window is longer than the forecast horizon"
print(f"Train idx (start, end): ({train_df['d'].min()}, {train_df['d'].max()})")
print(f"Valid idx (start, end): ({valid_df['d'].min()}, {valid_df['d'].max()})")
print("============")

X_train, y_train = train_df[FEATURES], train_df[LABEL]
X_valid, y_valid = valid_df[FEATURES], valid_df[LABEL]
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_valid shape: {X_valid.shape}, y_valid shape: {y_valid.shape}")

train_data = lgbm.Dataset(data=X_train, label=y_train)
# A validation Dataset should reference the training Dataset so both share the same binning.
valid_data = lgbm.Dataset(data=X_valid, label=y_valid, reference=train_data)

# Only the (large) training slice is released here; valid_df is still needed
# by the forecast and plotting cells further down.
del train_df
_ = gc.collect()

In [None]:
# Train model
# record_evaluation fills this dict in place with the per-iteration metric values.
train_valid_loss = {}
training_callbacks = [
    lgbm.early_stopping(stopping_rounds=5),
    lgbm.log_evaluation(period=10),
    lgbm.record_evaluation(train_valid_loss),
]
model = lgbm.train(
    params=TRAIN_PARAMETERS,
    train_set=train_data,
    num_boost_round=1000,
    valid_sets=[train_data, valid_data],
    callbacks=training_callbacks,
)

In [None]:
# Forecast the validation window and store the predictions next to the ground truth
y_hat = model.predict(data=X_valid)
y_hat = np.clip(y_hat, a_min=0, a_max=np.inf)

# Rebuild the validation frame from the full data set using the row labels of
# X_valid, so this cell does not depend on a `valid_df` left over from the
# split cell. The copy also keeps the column assignments below from writing
# into a slice of train_data_set.
valid_df = train_data_set.loc[X_valid.index].copy()
valid_df["count"] = y_valid
# y_hat is a positional array in X_valid row order, which .loc[X_valid.index] preserves.
valid_df["forecast"] = y_hat

## 1. Train / valid loss curves

In [None]:
def learning_curve(train_loss, valid_loss):
    """
    Plots the training and validation loss per boosting iteration.

    NOTE(review): the original cell held only the signature (without colon or
    body), which is a SyntaxError. This body is a minimal implementation of
    what the section heading describes — confirm it matches the intent.

    Presumably called with the series recorded by ``lgbm.record_evaluation``,
    e.g. ``train_valid_loss["training"]["rmse"]`` — verify the dict keys.

    :param train_loss: Sequence of per-iteration loss values on the training set.
    :param valid_loss: Sequence of per-iteration loss values on the validation set.
    """
    fig, ax = plt.subplots(figsize=(8, 4))

    ax.plot(range(1, len(train_loss) + 1), train_loss, label="train")
    ax.plot(range(1, len(valid_loss) + 1), valid_loss, label="valid", ls="--")
    ax.set(xlabel="Boosting iteration", ylabel="Loss", title="Train / valid loss curves")
    ax.legend()

    fig.tight_layout()

## 2. Residual analysis

In [None]:
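# TODO: residual diagnostics to implement
# - Residuals vs. predicted values (check for heteroscedasticity)
# - Distribution of the residuals
# - Residuals vs. forecast horizon
# - Residuals vs. <TBD: this item was left unfinished>
# - Autocorrelation of the residuals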

## 3. Feature importance

In [None]:
def item_level_validation_plot(
    valid_df: pd.DataFrame,
    item_id: int,
):
    """
    Plots forecast and ground truth sales for a given item id across all stores.

    One subplot is drawn per store: the solid line is the ground truth
    (``count``), the dashed line of the same colour is the forecast.

    :param valid_df: DataFrame containing forecast and ground truth sales data
        by item and store id. Needs the columns ``item_id``, ``store_id``,
        ``count``, ``forecast`` and a time column (``date``, or ``d`` when
        ``date`` is not available).
    :param item_id: The item id to plot sales data for.
    :raises ValueError: If ``valid_df`` contains no rows for ``item_id``.
    """
    valid_for_item = valid_df[valid_df["item_id"] == item_id]
    store_ids = list(valid_for_item["store_id"].unique())
    if not store_ids:
        raise ValueError(f"No rows found for item_id={item_id!r}.")

    # drop_cols removes "date" from the loaded data; fall back to the day index "d".
    time_col = "date" if "date" in valid_for_item.columns else "d"

    # squeeze=False always returns a 2-D array of Axes; without it a single
    # store yields a bare Axes object that cannot be indexed.
    fig, axes = plt.subplots(
        len(store_ids), 1, figsize=(8, 2 * len(store_ids)), sharex=True, squeeze=False
    )
    axes = axes[:, 0]
    colors = cm.viridis(np.linspace(0, 1, len(store_ids)))

    for store_ax, store_id, color in zip(axes, store_ids, colors):
        valid_for_store = valid_for_item[valid_for_item["store_id"] == store_id]
        store_ax.plot(
            valid_for_store[time_col].values,
            valid_for_store["count"].values,
            color=color,
        )
        store_ax.plot(
            valid_for_store[time_col].values,
            valid_for_store["forecast"].values,
            color=color,
            ls="--",
        )

    # TODO: label the axes / title with store and item names once
    # id_store_map / id_item_map are available in the notebook.
    for tick in axes[-1].get_xticklabels():
        tick.set_rotation(45)

    fig.tight_layout()


def aggregate_validation_plot(
    valid_df: pd.DataFrame,
    state: str,  #["CA", "TX", "WI"]
    cat: str, # ["FOODS", "HOBBIES", "HOUSEHOLD"]
):
    """
    Plots summed forecast and ground truth sales per department for one state and category.

    Rows are filtered to the given state and category and summed per store,
    department and time step. One subplot is drawn per store with one colour
    per department: solid lines are the ground truth (``count``), dashed lines
    the forecast.

    NOTE(review): relies on the lookups ``state_id_map``, ``cat_id_map`` and
    ``id_dept_map``, which are not defined in the visible part of the notebook —
    confirm they exist before calling.

    :param valid_df: DataFrame containing forecast and ground truth sales data.
        Needs the columns ``state_id``, ``cat_id``, ``store_id``, ``dept_id``,
        ``count``, ``forecast`` and a time column (``date``, or ``d`` when
        ``date`` is not available).
    :param state: State name, translated through ``state_id_map``.
    :param cat: Category name, translated through ``cat_id_map``.
    :raises ValueError: If ``valid_df`` contains no rows for the state / category.
    """
    state_id = state_id_map[state]
    cat_id = cat_id_map[cat]

    # drop_cols removes "date" from the loaded data; fall back to the day index "d".
    time_col = "date" if "date" in valid_df.columns else "d"

    state_cat_df = valid_df[(valid_df["state_id"] == state_id) & (valid_df["cat_id"] == cat_id)]
    g = state_cat_df.groupby(["store_id", "dept_id", time_col], observed=True)[["count", "forecast"]].sum()

    store_ids = g.index.get_level_values("store_id").unique()
    if len(store_ids) == 0:
        raise ValueError(f"No rows found for state={state!r} and cat={cat!r}.")

    # squeeze=False always returns a 2-D array of Axes; without it a single
    # store yields a bare Axes object that cannot be indexed.
    fig, axes = plt.subplots(
        len(store_ids), 1, figsize=(8, 2 * len(store_ids)), sharex=True, squeeze=False
    )
    axes = axes[:, 0]

    for store_ax, store_id in zip(axes, store_ids):
        # Selecting on the first index level leaves a (dept_id, time) index.
        ts = g.loc[store_id]

        dept_ids = ts.index.get_level_values("dept_id").unique()
        colors = cm.viridis(np.linspace(0, 1, len(dept_ids)))

        for dept_id, color in zip(dept_ids, colors):
            dept_ts = ts.loc[dept_id]
            store_ax.plot(
                dept_ts.index,
                dept_ts["count"].values,
                label=id_dept_map[dept_id],
                color=color,
            )
            store_ax.plot(
                dept_ts.index,
                dept_ts["forecast"].values,
                color=color,
                ls="--",
            )
        # TODO: set ylabel / title with the store name once id_store_map is available.
        store_ax.legend()

    for tick in axes[-1].get_xticklabels():
        tick.set_rotation(45)

    fig.tight_layout()

    
    
def residual_analysis():
    """
    Placeholder for residual diagnostics split by item, store, state and category.

    Not implemented yet: calling it does nothing and returns ``None``.
    """
    return None

In [None]:
# Plot forecast vs. ground truth across stores for the item with id 0
item_id = 0
item_level_validation_plot(valid_df=valid_df, item_id=item_id)