In [1]:
import json
import gc

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from tqdm import tqdm

import lightgbm as lgbm

In [2]:
# Run-mode switch for the notebook. When True, the training cell fits a single
# model on the full training frame and the final cell writes submission.csv.
# When False, the training cell runs the rolling-window cross-validation and
# writes cv_results.json instead; no submission is produced.
SUBMISSION_RUN = True

In [3]:
# Notebook-wide configuration

# --- Filesystem locations ---
INPUT_BASE_PATH = "/kaggle/input/"
# The base path already ends with "/", so the joined paths contain "//".
RAW_DATA_INPUT_PATH = INPUT_BASE_PATH + "/m5-forecasting-accuracy"
PROCESSED_DATA_INPUT_PATH = INPUT_BASE_PATH + "/m5-acc"
# NOTE: "BATH" is a typo for "PATH"; the name is kept because later cells use it.
OUTPUT_BASE_BATH = "/kaggle/working"

# --- Day indices (values of the "d" column) ---
MAX_TRAIN_TIMESTAMP = 1941  # default upper bound for the cross-validation folds
START_TEST_TIMESTAMP = MAX_TRAIN_TIMESTAMP + 1
START_TEST_WM_YR_WK = 11617

# Data Input

In [4]:
# Load raw data

# CALENDAR_DATA = pd.read_csv(f"{RAW_DATA_INPUT_PATH}/calendar.csv")
# SELL_PRICES = pd.read_csv(f"{RAW_DATA_INPUT_PATH}/sell_prices.csv")
# SALES_TRAIN_EVALUATION = pd.read_csv(f"{RAW_DATA_INPUT_PATH}/sales_train_evaluation.csv")

In [5]:
def cast_category_types(dataset: pd.DataFrame) -> pd.DataFrame:
    """Convert the known categorical columns to pandas ``category`` dtype.

    Columns from the list below that are absent from ``dataset`` are skipped
    without error. The frame is modified in place and also returned.
    """
    candidate_columns = (
        # item hierarchy
        "dept_id", "cat_id", "store_id", "state_id",
        # calendar parts
        "weekday", "month", "quarter", "year",
        # events (same day, leads and lags) and SNAP flags
        "event_name", "event_type",
        "event_name_lag_-3", "event_type_lag_-3",
        "event_name_lag_-2", "event_type_lag_-2",
        "event_name_lag_-1", "event_type_lag_-1",
        "event_name_lag_1", "event_type_lag_1",
        "event_name_lag_2", "event_type_lag_2",
        "event_name_lag_3", "event_type_lag_3",
        "snap_CA", "snap_TX", "snap_WI",
        # price promotion flag
        "item_on_sale",
    )

    for name in candidate_columns:
        if name not in dataset.columns:
            continue
        dataset[name] = dataset[name].astype("category")

    return dataset


def cast_int_types(dataset: pd.DataFrame) -> pd.DataFrame:
    """Downcast the ``d`` and ``count`` columns to ``int16`` when present.

    Missing columns are skipped. The frame is modified in place and returned.
    """
    for name in ("d", "count"):
        if name in dataset.columns:
            dataset[name] = dataset[name].astype(np.int16)

    return dataset


def cast_float_types(dataset: pd.DataFrame) -> pd.DataFrame:
    """Downcast the known continuous columns to ``float16`` when present.

    Missing columns are skipped. The frame is modified in place and returned.
    """
    # NOTE(review): float16 represents integers exactly only up to 2048. If
    # "item_id" holds integer codes larger than that, distinct ids will
    # collapse onto the same value -- TODO confirm how item_id is encoded.
    candidate_columns = (
        "item_id",
        # lagged sales and rolling statistics of the 28-day lag
        "count_lag_28", "count_lag_29", "count_lag_30", "count_lag_31",
        "count_lag_28_rolling_mean_window_7", "count_lag_28_rolling_std_window_7", "count_lag_28_rolling_kurt_window_7",
        "count_lag_28_rolling_mean_window_14", "count_lag_28_rolling_std_window_14", "count_lag_28_rolling_kurt_window_14",
        "count_lag_28_rolling_mean_window_21", "count_lag_28_rolling_std_window_21", "count_lag_28_rolling_kurt_window_21",
        "count_lag_28_rolling_mean_window_28", "count_lag_28_rolling_std_window_28", "count_lag_28_rolling_kurt_window_28",
        # price level, differences and rolling statistics
        "sell_price",
        "sell_price_diff_1", "sell_price_diff_2", "sell_price_diff_3", "sell_price_diff_7",
        "sell_price_rolling_mean_window_7", "sell_price_rolling_std_window_7", "sell_price_rolling_kurt_window_7",
        "sell_price_rolling_mean_window_14", "sell_price_rolling_std_window_14", "sell_price_rolling_kurt_window_14",
        "sell_price_rolling_mean_window_21", "sell_price_rolling_std_window_21", "sell_price_rolling_kurt_window_21",
        "sell_price_rolling_mean_window_28", "sell_price_rolling_std_window_28", "sell_price_rolling_kurt_window_28",
        # cyclical calendar encodings
        "weekday_sin", "weekday_cos",
        "month_sin", "month_cos",
        "quarter_sin", "quarter_cos",
    )

    for name in candidate_columns:
        if name not in dataset.columns:
            continue
        dataset[name] = dataset[name].astype(np.float16)

    return dataset

# Cast to lower resolution types to save memory
def cast_data_types(dataset: pd.DataFrame) -> pd.DataFrame:
    """Apply the category, integer and float casts, in that order.

    Returns whatever the last cast returns; the individual casts modify the
    frame they are given in place.
    """
    for cast_step in (cast_category_types, cast_int_types, cast_float_types):
        dataset = cast_step(dataset)
    return dataset

In [6]:
def drop_cols(dataset: pd.DataFrame) -> pd.DataFrame:
    """Remove columns that are not used as model features.

    The columns are dropped group by group (dates, event types, price rolling
    statistics, long-window sales rolling statistics). Labels that are not
    present in ``dataset`` are ignored, so the function can be applied to
    frames that contain only some of the listed columns.

    Args:
        dataset: Frame to reduce. It is not modified; ``DataFrame.drop``
            returns a new frame.

    Returns:
        A frame without any of the listed columns.
    """
    date_cols_to_drop = ["date", "weekday", "month", "quarter"]
    event_cols_to_drop = [
        "event_type",
        "event_type_lag_-3", "event_type_lag_-2", "event_type_lag_-1",
        "event_type_lag_3", "event_type_lag_2", "event_type_lag_1",
    ]
    sell_cols_to_drop = [
        "sell_price_rolling_mean_window_7", "sell_price_rolling_std_window_7", "sell_price_rolling_kurt_window_7",
        "sell_price_rolling_mean_window_14", "sell_price_rolling_std_window_14", "sell_price_rolling_kurt_window_14",
        "sell_price_rolling_mean_window_21", "sell_price_rolling_std_window_21", "sell_price_rolling_kurt_window_21",
        "sell_price_rolling_mean_window_28", "sell_price_rolling_std_window_28", "sell_price_rolling_kurt_window_28",
    ]
    # The 7- and 14-day windows of the sales rolling statistics are kept on
    # purpose; only the 21- and 28-day windows are removed.
    count_cols_to_drop = [
        "count_lag_28_rolling_mean_window_21", "count_lag_28_rolling_std_window_21", "count_lag_28_rolling_kurt_window_21",
        "count_lag_28_rolling_mean_window_28", "count_lag_28_rolling_std_window_28", "count_lag_28_rolling_kurt_window_28",
    ]

    for column_set in (date_cols_to_drop, event_cols_to_drop, sell_cols_to_drop, count_cols_to_drop):
        # errors="ignore" skips only the labels that are absent. Catching
        # KeyError around a plain drop() would leave the *whole* group in
        # place as soon as a single label was missing.
        dataset = dataset.drop(columns=column_set, errors="ignore")

        # Release the frame replaced above before handling the next group.
        _ = gc.collect()
    return dataset

In [None]:
# Load train data in chunks
train_pq_file = pq.ParquetFile(f"{PROCESSED_DATA_INPUT_PATH}/m5-acc-train.parquet")

# Reduce every batch (drop unused columns, downcast) before keeping it, and
# concatenate once at the end. Concatenating onto the growing frame inside the
# loop re-copies all previously loaded rows on every iteration.
train_batches = []
for batch in tqdm(train_pq_file.iter_batches(batch_size=131_072)):
    train_batch_df = batch.to_pandas()
    train_batch_df = drop_cols(train_batch_df)
    train_batch_df = cast_data_types(train_batch_df)
    train_batches.append(train_batch_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
train_data_set = (
    pd.concat(train_batches, ignore_index=True) if train_batches else pd.DataFrame()
)

del train_batches
_ = gc.collect()

# Cast again on the combined frame; the per-batch casts alone are not relied
# on for the final dtypes.
train_data_set = cast_data_types(train_data_set)

# LightGBM Model

In [8]:
# Rolling window validation

def time_series_split(
    df: pd.DataFrame,
    n_folds: int = 5,
    horizon: int = 28,
    overlap: int = 0,
    max_timestamp: int = MAX_TRAIN_TIMESTAMP,
):
    """Yield expanding-window train/validation splits over the ``d`` column.

    Each fold validates on ``horizon`` consecutive day indices and trains on
    every row with a smaller day index. Consecutive validation windows are
    shifted by ``horizon - overlap`` days, and the last window ends exactly on
    ``max_timestamp`` (inclusive).

    Args:
        df: Frame with an integer day-index column named ``d``.
        n_folds: Number of folds to generate.
        horizon: Length of every validation window in days.
        overlap: Number of days shared by consecutive validation windows.
        max_timestamp: Last day index (inclusive) that may be used for
            validation.

    Yields:
        ``(train_data, valid_data)`` tuples of frames, oldest fold first.
    """
    step = horizon - overlap
    # `stop` below is exclusive, so the windows are anchored on
    # max_timestamp + 1; anchoring on max_timestamp itself would leave the
    # last available day out of every validation window.
    first_start = max_timestamp + 1 - horizon - (n_folds - 1) * step

    for fold_idx in range(n_folds):
        start = first_start + fold_idx * step
        stop = start + horizon

        train_data = df[df["d"] < start]
        valid_data = df[(df["d"] >= start) & (df["d"] < stop)]

        print(f"Fold index: {fold_idx}")
        print(f"Train idx (start, end): ({train_data['d'].min()}, {train_data['d'].max()})")
        print(f"Valid idx (start, end): ({valid_data['d'].min()}, {valid_data['d'].max()})")
        print("==================")

        yield train_data, valid_data
        
        
def train_predict_score(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    train_parameters: dict,
    dataset_parameters: dict | None = None,
    num_boost_round: int = 3000,
    stopping_rounds: int = 5,
) -> tuple[float, int]:
    """Fit a LightGBM model with early stopping and score it on held-out data.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features, used for early stopping and for scoring.
        y_test: Held-out labels.
        train_parameters: Parameter dict passed to ``lgbm.train``.
        dataset_parameters: Extra keyword arguments for both ``lgbm.Dataset``
            objects; ``None`` means no extra arguments.
        num_boost_round: Upper bound on the number of boosting rounds.
        stopping_rounds: Patience of the early-stopping callback.

    Returns:
        ``(rmse, best_iteration)``: the RMSE of the predictions (clipped at
        zero) on the held-out data, and the model's best iteration.
    """
    dataset_parameters = dataset_parameters or {}
    train_data = lgbm.Dataset(data=X_train, label=y_train, **dataset_parameters)
    valid_data = lgbm.Dataset(data=X_test, label=y_test, **dataset_parameters)
    model = lgbm.train(
        train_parameters,
        num_boost_round=num_boost_round,
        train_set=train_data,
        # The training set is listed so its metric shows up in the log.
        valid_sets=[train_data, valid_data],
        callbacks=[
            lgbm.early_stopping(stopping_rounds=stopping_rounds),
            lgbm.log_evaluation(period=10),
        ]
    )

    # Forecasts are floored at zero before scoring.
    y_hat = np.clip(model.predict(X_test), a_min=0, a_max=None)
    rmse = float(np.sqrt(np.mean((y_test - y_hat) ** 2)))

    return rmse, model.best_iteration

In [9]:
# Categorical model inputs. The +/-3 day event shifts are deliberately left out.
CATEGORICAL_FEATURES = (
    ["dept_id", "cat_id", "store_id", "state_id"]
    + ["year"]
    + [
        "event_name",
        "event_name_lag_-2", "event_name_lag_-1",
        "event_name_lag_1", "event_name_lag_2",
    ]
    + ["snap_CA", "snap_TX", "snap_WI"]
    + ["item_on_sale"]
)

# Continuous model inputs. Of the sales rolling statistics only the 7- and
# 14-day windows are used; the 21- and 28-day windows are not.
CONTINOUS_FEATURES = (
    ["item_id"]
    + [
        "weekday_sin", "weekday_cos",
        "month_sin", "month_cos",
        "quarter_sin", "quarter_cos",
    ]
    + [
        "sell_price",
        "sell_price_diff_1", "sell_price_diff_2", "sell_price_diff_3", "sell_price_diff_7",
    ]
    + ["count_lag_28", "count_lag_29", "count_lag_30", "count_lag_31"]
    + [
        "count_lag_28_rolling_mean_window_7",
        "count_lag_28_rolling_std_window_7",
        "count_lag_28_rolling_kurt_window_7",
        "count_lag_28_rolling_mean_window_14",
        "count_lag_28_rolling_std_window_14",
        "count_lag_28_rolling_kurt_window_14",
    ]
)

# Full, ordered feature list and the name of the target column.
FEATURES = [*CATEGORICAL_FEATURES, *CONTINOUS_FEATURES]
LABEL = "count"

# Parameters

# Extra keyword arguments for lgbm.Dataset (none at the moment).
DATASET_PARAMETERS = {}

# Parameters handed to lgbm.train.
TRAIN_PARAMETERS = {
    # Loss
    "objective": "tweedie",
    "tweedie_variance_power": 1.1,
    # Tree shape and learning speed
    "learning_rate": 0.025,
    "num_leaves": 127,  # 2 ** 7 - 1
    "max_bin": 127,  # 2 ** 7 - 1
    # Row / column subsampling
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "feature_fraction": 0.8,
    # Evaluation, runtime and reproducibility
    "metric": "rmse",
    "force_col_wise": True,
    "seed": 1,
    "histogram_pool_size": 11000,
}

In [None]:
if SUBMISSION_RUN:
    # Final fit on the complete training frame with a fixed number of rounds.
    X_train = train_data_set[FEATURES]
    y_train = train_data_set[LABEL]
    train_data = lgbm.Dataset(data=X_train, label=y_train, **DATASET_PARAMETERS)
    model = lgbm.train(
        TRAIN_PARAMETERS,
        train_set=train_data,
        num_boost_round=550,
        # Only used to log the training metric every 10 rounds.
        valid_sets=[train_data],
        callbacks=[lgbm.log_evaluation(period=10)],
    )

else:
    # Rolling-window cross-validation: one score and best iteration per fold.
    scores, best_iterations = [], []

    folds = time_series_split(train_data_set, n_folds=5)
    for train, valid in tqdm(folds, total=5):
        X_train = train[FEATURES]
        y_train = train[LABEL]
        X_test = valid[FEATURES]
        y_test = valid[LABEL]

        score, best_iter = train_predict_score(
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test,
            train_parameters=TRAIN_PARAMETERS,
            dataset_parameters=DATASET_PARAMETERS,
        )
        scores.append(score)
        best_iterations.append(best_iter)

    # Summary written next to the other notebook outputs.
    cv_results = {
        "scores": scores,
        "mean_score": np.mean(scores),
        "std_score": np.std(scores),
        "best_iterations": best_iterations,
        "mean_best_iteration": np.mean(best_iterations),
    }

    with open(f"{OUTPUT_BASE_BATH}/cv_results.json", "w") as f:
        json.dump(cv_results, f)

# Forecast & Submit

In [11]:
def pivot_test_df(test_data_set: pd.DataFrame) -> pd.DataFrame:
    """Reshape long forecasts into one row per ``id`` and one column per day.

    Expects the columns ``id``, ``d`` and ``count``. The day columns are
    sorted in ascending order and ``id`` is returned as a regular column.
    """
    wide = test_data_set.pivot(index="id", columns="d", values="count")
    wide = wide.sort_index(axis=1)
    # Remove the "d" label left on the column axis by the pivot.
    wide = wide.rename_axis(None, axis=1)
    return wide.reset_index()

def rename_forecast_columns(forecast_df: pd.DataFrame, forecast_horizon: int = 28) -> pd.DataFrame:
    """Relabel the forecast columns as ``F1`` .. ``F<forecast_horizon>``.

    Every column other than ``id`` is renamed by position, so the frame must
    hold exactly ``forecast_horizon`` such columns, already in day order.
    """
    horizon_labels = [f"F{step}" for step in range(1, forecast_horizon + 1)]
    by_id = forecast_df.set_index("id")
    by_id.columns = horizon_labels
    return by_id.reset_index()

In [12]:
if SUBMISSION_RUN:
    # The sample submission defines the required ids and their order.
    SAMPLE_SUBMISSION = pd.read_csv(f"{RAW_DATA_INPUT_PATH}/sample_submission.csv")
    SUBMISSION_INDEX = SAMPLE_SUBMISSION.set_index("id").index
    is_validation_row = SAMPLE_SUBMISSION["id"].str.contains("validation")
    is_evaluation_row = SAMPLE_SUBMISSION["id"].str.contains("evaluation")
    VAL_SUBMISSION = SAMPLE_SUBMISSION[is_validation_row]
    EVAL_SUBMISSION = SAMPLE_SUBMISSION[is_evaluation_row]

    # Same preprocessing as for the training batches.
    test_data_set = pd.read_parquet(f"{PROCESSED_DATA_INPUT_PATH}/m5-acc-test.parquet")
    test_data_set = cast_data_types(drop_cols(test_data_set))

    # Predict and floor the forecasts at zero.
    y_hat = np.clip(model.predict(test_data_set[FEATURES]), a_min=0, a_max=np.inf)
    test_data_set["count"] = y_hat

    # Long -> wide, then F1..F28 column labels.
    test_data_set = rename_forecast_columns(pivot_test_df(test_data_set))

    # Merge with sample submissions: the "evaluation" rows receive the
    # forecasts, the "validation" rows keep the sample values.
    EVAL_SUBMISSION = EVAL_SUBMISSION[["id"]].merge(test_data_set, on="id", how="left")
    FINAL_SUBMISSIONS = (
        pd.concat([VAL_SUBMISSION, EVAL_SUBMISSION])
        .set_index("id")
        .reindex(SUBMISSION_INDEX)
        .reset_index()
    )

    FINAL_SUBMISSIONS.to_csv(f"{OUTPUT_BASE_BATH}/submission.csv", index=False)