In [1]:
import json
import gc

import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import numpy as np
import pandas as pd
from tqdm import tqdm

import lightgbm as lgbm

In [2]:
SUBMISSION_RUN = False

In [3]:
# Constants

# Paths
INPUT_BASE_PATH = "/kaggle/input/"
RAW_DATA_INPUT_PATH = f"{INPUT_BASE_PATH}/m5-forecasting-accuracy"
PROCESSED_DATA_INPUT_PATH = f"{INPUT_BASE_PATH}/m5-acc"
OUTPUT_BASE_BATH = "/kaggle/working"

# Timestamps
MAX_TRAIN_TIMESTAMP = 1941
START_TEST_TIMESTAMP = 1942
START_TEST_WM_YR_WK = 11617

# Data Input

In [4]:
# Load raw data

# CALENDAR_DATA = pd.read_csv(f"{RAW_DATA_INPUT_PATH}/calendar.csv")
# SELL_PRICES = pd.read_csv(f"{RAW_DATA_INPUT_PATH}/sell_prices.csv")
# SALES_TRAIN_EVALUATION = pd.read_csv(f"{RAW_DATA_INPUT_PATH}/sales_train_evaluation.csv")

SAMPLE_SUBMISSION = pd.read_csv(f"{RAW_DATA_INPUT_PATH}/sample_submission.csv")
SUBMISSION_INDEX = SAMPLE_SUBMISSION.set_index("id").index
VAL_SUBMISSION = SAMPLE_SUBMISSION[SAMPLE_SUBMISSION["id"].str.contains("validation")]
EVAL_SUBMISSION = SAMPLE_SUBMISSION[SAMPLE_SUBMISSION["id"].str.contains("evaluation")]

In [5]:
# Load preprocessed data
train_data_set = pd.read_parquet(f"{PROCESSED_DATA_INPUT_PATH}/m5-acc-train.parquet")

In [6]:
# Drop any columns that we dont need for training
cols_to_drop = ["av_store_dept_sales", "date", "wm_yr_wk"]
try:
    train_data_set = train_data_set.drop(columns=cols_to_drop)
except KeyError:
    print(f"Columns not found in axis. Skipping ...")

_ = gc.collect()

In [7]:
# Cast to lower resolution types to save memory
def cast_data_types(dataset: pd.DataFrame) -> pd.DataFrame:
    item_category_cols = ["dept_id", "cat_id", "store_id", "state_id"]
    date_category_cols = ["wm_yr_wk", "weekday", "month", "year", "event_name", "event_type", "snap_CA", "snap_TX", "snap_WI"]
    price_category_cols = ["item_on_sale"]
    for category_col in item_category_cols + date_category_cols + price_category_cols:
        try:
            dataset[category_col] = dataset[category_col].astype("category")
        except KeyError:
            print(f"Column {category_col} does not exist. Skipping ...")
    
    int_cols = ["d", "count"]
    for int_col in int_cols:
        try:
            dataset[int_col] = pd.to_numeric(dataset[int_col], downcast="integer")
        except KeyError:
            print(f"Column {int_col} does not exist. Skipping ...")
            
    float16_cols = ["item_id", "sell_price", "weekday_sin", "weekday_cos", "month_sin", "month_cos"]
    for float_col in float16_cols:
        try:
            dataset[float_col] = dataset[float_col].astype(np.float16)
        except KeyError:
            print(f"Column {int_col} does not exist. Skipping ...")
    
    return dataset

train_data_set = cast_data_types(train_data_set)

Column wm_yr_wk does not exist. Skipping ...


# LightGBM Model

In [8]:
# Rolling window validation

def time_series_split(
    df: pd.DataFrame,
    n_folds: int = 5,
    horizon: int = 28,
    overlap: int = 0,
    max_timestamp: int = MAX_TRAIN_TIMESTAMP,
):
    min_timestamp = max_timestamp - n_folds * horizon + (n_folds - 1) * overlap
    for fold_idx in range(n_folds):
        start = min_timestamp + fold_idx * (horizon - overlap)
        stop = start + horizon

        train_data = df[df["d"] < start]
        valid_data = df[(df["d"] >= start) & (df["d"] < stop)]
        
        print(f"Fold index: {fold_idx}")
        print(f"Train idx (start, end): ({train_data['d'].min()}, {train_data['d'].max()})")
        print(f"Valid idx (start, end): ({valid_data['d'].min()}, {valid_data['d'].max()})")
        print("==================")
        
        yield train_data, valid_data
        
        
def train_predict_score(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    train_parameters: dict,
    dataset_parameters: dict | None = None,
) -> tuple[float, float]:
    dataset_parameters = dataset_parameters or {}
    train_data = lgbm.Dataset(data=X_train, label=y_train, **dataset_parameters)
    valid_data = lgbm.Dataset(data=X_test, label=y_test, **dataset_parameters)
    model = lgbm.train(
        train_parameters,
        num_boost_round=3000,
        train_set=train_data,
        valid_sets=[valid_data],
        callbacks=[
            lgbm.early_stopping(stopping_rounds=5),
            lgbm.log_evaluation(period=10),
        ]
    )
    
    y_hat = model.predict(X_test)
    y_hat = np.clip(np.round(y_hat), a_min=0, a_max=np.inf)
    y_hat = y_hat.astype(np.int32)
    rmse = np.sqrt(np.mean((y_test - y_hat) ** 2))

    return rmse, model.best_iteration

In [9]:
CATEGORICAL_FEATURES = [
    "dept_id",
    "cat_id",
    "store_id",
    "state_id",
    "year",
    "event_name",
    "event_type",
    "snap_CA",
    "snap_TX",
    "snap_WI",
    "item_on_sale",
]
CONTINOUS_FEATURES = [
    "item_id",
    "weekday_sin",
    "weekday_cos",
    "month_sin",
    "month_cos",
    "sell_price",
    "sell_price_diff_1",
    "sell_price_diff_2",
    "sell_price_diff_3",
    "sell_price_diff_7",
#     "av_item_state_sell_price",
#     "av_dept_state_sell_price",
    "count_lag_28",
    "count_lag_29",
    "count_lag_30",
    "count_lag_31",
    "av_store_dept_sales_lag_28",
    "av_store_dept_sales_lag_29",
    "av_store_dept_sales_lag_30",
    "av_store_dept_sales_lag_31"
]
FEATURES = CATEGORICAL_FEATURES + CONTINOUS_FEATURES
LABEL = "count"

# Parameters
DATASET_PARAMETERS = {}
TRAIN_PARAMETERS = {
    "objective": "tweedie",
    "learning_rate": 0.05,
    "num_leaves": 2 ** 7 - 1,
    "max_bin": 2 ** 7 - 1,
    "metric": "rmse",
    "force_col_wise": True,
}

In [10]:
if not SUBMISSION_RUN:
    scores = []
    best_iterations = []
    
    for train, valid in tqdm(time_series_split(train_data_set, n_folds=5), total=5):
    
        X_train, y_train = train[FEATURES], train[LABEL]
        X_test, y_test = valid[FEATURES], valid[LABEL]
        score, best_iter = train_predict_score(
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test,
            train_parameters=TRAIN_PARAMETERS,
            dataset_parameters=DATASET_PARAMETERS,
        )
        scores.append(score)
        best_iterations.append(best_iter)
    
    cv_results = {
        "scores": scores,
        "mean_score": np.mean(scores),
        "std_score": np.std(scores),
        "best_iterations": best_iterations,
        "mean_best_iteration": np.mean(best_iterations)
    }
    
    with open(f"{OUTPUT_BASE_BATH}/cv_results.json", "w") as f:
        json.dump(cv_results, f)
    
else:
    train_data = lgbm.Dataset(data=data[FEATURES], label=data[LABEL], **DATASET_PARAMETERS)
    model = lgbm.train(
        TRAIN_PARAMETERS,
        num_boost_round=400,
        train_set=train_data,
    )

  0%|          | 0/5 [00:00<?, ?it/s]

Fold index: 0
Train idx (start, end): (1, 1800)
Valid idx (start, end): (1801, 1828)
[LightGBM] [Info] Total Bins 1664
[LightGBM] [Info] Number of data points in the train set: 54882000, number of used features: 29
[LightGBM] [Info] Start training from score 0.106051
Training until validation scores don't improve for 5 rounds
[10]	valid_0's rmse: 3.23551
[20]	valid_0's rmse: 3.05209
[30]	valid_0's rmse: 2.87331
[40]	valid_0's rmse: 2.72877
[50]	valid_0's rmse: 2.63391
[60]	valid_0's rmse: 2.58194
[70]	valid_0's rmse: 2.55349
[80]	valid_0's rmse: 2.54434
[90]	valid_0's rmse: 2.53961
[100]	valid_0's rmse: 2.53472
[110]	valid_0's rmse: 2.53274
[120]	valid_0's rmse: 2.52919
[130]	valid_0's rmse: 2.52682
[140]	valid_0's rmse: 2.52485
[150]	valid_0's rmse: 2.52358
[160]	valid_0's rmse: 2.52163
[170]	valid_0's rmse: 2.51882
[180]	valid_0's rmse: 2.51725
[190]	valid_0's rmse: 2.51556
[200]	valid_0's rmse: 2.51137
[210]	valid_0's rmse: 2.5078
[220]	valid_0's rmse: 2.50636
[230]	valid_0's rmse: 

 20%|██        | 1/5 [59:18<3:57:14, 3558.60s/it]

Fold index: 1
Train idx (start, end): (1, 1828)
Valid idx (start, end): (1829, 1856)
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 55735720, number of used features: 29
[LightGBM] [Info] Start training from score 0.108273
Training until validation scores don't improve for 5 rounds
[10]	valid_0's rmse: 3.52227
[20]	valid_0's rmse: 3.30238
[30]	valid_0's rmse: 3.06667
[40]	valid_0's rmse: 2.8552
[50]	valid_0's rmse: 2.69645
[60]	valid_0's rmse: 2.59224
[70]	valid_0's rmse: 2.52889
[80]	valid_0's rmse: 2.49289
[90]	valid_0's rmse: 2.47332
[100]	valid_0's rmse: 2.46043
[110]	valid_0's rmse: 2.45362
[120]	valid_0's rmse: 2.4481
[130]	valid_0's rmse: 2.44278
[140]	valid_0's rmse: 2.43749
[150]	valid_0's rmse: 2.43322
[160]	valid_0's rmse: 2.43026
[170]	valid_0's rmse: 2.42848
[180]	valid_0's rmse: 2.42648
[190]	valid_0's rmse: 2.42448
[200]	valid_0's rmse: 2.42453
Early stopping, best iteration is:
[195]	valid_0's rmse: 2.42357


 40%|████      | 2/5 [1:25:12<1:58:57, 2379.15s/it]

Fold index: 2
Train idx (start, end): (1, 1856)
Valid idx (start, end): (1857, 1884)
[LightGBM] [Info] Total Bins 1655
[LightGBM] [Info] Number of data points in the train set: 56589440, number of used features: 29
[LightGBM] [Info] Start training from score 0.111823
Training until validation scores don't improve for 5 rounds
[10]	valid_0's rmse: 3.43877
[20]	valid_0's rmse: 3.2076
[30]	valid_0's rmse: 2.95816
[40]	valid_0's rmse: 2.73505
[50]	valid_0's rmse: 2.57387
[60]	valid_0's rmse: 2.47475
[70]	valid_0's rmse: 2.42104
[80]	valid_0's rmse: 2.39369
[90]	valid_0's rmse: 2.38
[100]	valid_0's rmse: 2.37194
[110]	valid_0's rmse: 2.36503
[120]	valid_0's rmse: 2.36135
[130]	valid_0's rmse: 2.35899
[140]	valid_0's rmse: 2.35531
[150]	valid_0's rmse: 2.35338
Early stopping, best iteration is:
[154]	valid_0's rmse: 2.35273


 60%|██████    | 3/5 [1:46:50<1:02:51, 1885.75s/it]

Fold index: 3
Train idx (start, end): (1, 1884)
Valid idx (start, end): (1885, 1912)
[LightGBM] [Info] Total Bins 1667
[LightGBM] [Info] Number of data points in the train set: 57443160, number of used features: 29
[LightGBM] [Info] Start training from score 0.115333
Training until validation scores don't improve for 5 rounds
[10]	valid_0's rmse: 3.43339
[20]	valid_0's rmse: 3.20278
[30]	valid_0's rmse: 2.95198
[40]	valid_0's rmse: 2.7254
[50]	valid_0's rmse: 2.55809
[60]	valid_0's rmse: 2.45083
[70]	valid_0's rmse: 2.39007
[80]	valid_0's rmse: 2.35695
[90]	valid_0's rmse: 2.33834
[100]	valid_0's rmse: 2.32781
[110]	valid_0's rmse: 2.32054
[120]	valid_0's rmse: 2.31565
[130]	valid_0's rmse: 2.31194
[140]	valid_0's rmse: 2.30893
[150]	valid_0's rmse: 2.30636
[160]	valid_0's rmse: 2.30432
[170]	valid_0's rmse: 2.30293
[180]	valid_0's rmse: 2.30175
[190]	valid_0's rmse: 2.30119
[200]	valid_0's rmse: 2.29919
[210]	valid_0's rmse: 2.29683
[220]	valid_0's rmse: 2.2947
[230]	valid_0's rmse: 2

 80%|████████  | 4/5 [2:20:27<32:17, 1937.44s/it]  

Fold index: 4
Train idx (start, end): (1, 1912)
Valid idx (start, end): (1913, 1940)
[LightGBM] [Info] Total Bins 1677
[LightGBM] [Info] Number of data points in the train set: 58296880, number of used features: 29
[LightGBM] [Info] Start training from score 0.118722
Training until validation scores don't improve for 5 rounds
[10]	valid_0's rmse: 3.43512
[20]	valid_0's rmse: 3.20142
[30]	valid_0's rmse: 2.94567
[40]	valid_0's rmse: 2.71377
[50]	valid_0's rmse: 2.5427
[60]	valid_0's rmse: 2.43461
[70]	valid_0's rmse: 2.37436
[80]	valid_0's rmse: 2.34112
[90]	valid_0's rmse: 2.32193
[100]	valid_0's rmse: 2.31274
[110]	valid_0's rmse: 2.30513
[120]	valid_0's rmse: 2.30081
[130]	valid_0's rmse: 2.29789
[140]	valid_0's rmse: 2.29458
[150]	valid_0's rmse: 2.29316
[160]	valid_0's rmse: 2.29198
[170]	valid_0's rmse: 2.29191
Early stopping, best iteration is:
[166]	valid_0's rmse: 2.2915


100%|██████████| 5/5 [2:44:43<00:00, 1976.61s/it]


# Forecast & Submit

In [11]:
def construct_test_df(
    d_start: int,
    d_end: int,
    item_category_df: pd.DataFrame,
    calendar_df: pd.DataFrame,
    price_df: pd.DataFrame,
) -> pd.DataFrame:
    
    d_range = list(range(d_start, d_end + 1))
    d_range_df = pd.DataFrame(d_range, columns=["d"])
    test_df = pd.merge(item_id_categories, d_range_df, how="cross")
    test_df = merge_sales_and_calendar_data(test_df, calendar_df)
    test_df = merge_sales_and_price_data(test_df, price_df)
    return test_df

def map_test_df_columns(test_df: pd.DataFrame) -> pd.DataFrame:
    test_df["item_id"] = test_df["item_id"].map(item_id_map)
    test_df["dept_id"] = test_df["dept_id"].map(dept_id_map)
    test_df["cat_id"] = test_df["cat_id"].map(cat_id_map)
    test_df["store_id"] = test_df["store_id"].map(store_id_map)
    test_df["state_id"] = test_df["state_id"].map(state_id_map)
    test_df["event_name"] = test_df["event_name"].map(event_name_map)
    test_df["event_type"] = test_df["event_type"].map(event_type_map)
    return test_df

def pivot_forecast_df(X_test: pd.DataFrame, y_hat: np.ndarray) -> pd.DataFrame:
    forecast_df = pd.concat([X_test, pd.DataFrame(y_hat, columns=["count"])], axis=1)
    forecast_df = forecast_df.pivot(columns="d", index="id", values="count").sort_index(axis=1)
    forecast_df = forecast_df.rename_axis(None, axis=1).reset_index()
    return forecast_df

def rename_forecast_columns(forecast_df: pd.DataFrame, forecast_horizon: int = 28) -> pd.DataFrame:
    forecast_df = forecast_df.set_index("id")
    forecast_df.columns = [f"F{i}" for i in range(1, forecast_horizon + 1)]
    return forecast_df.reset_index()

In [12]:
if SUBMISSION_RUN:
    test_d_start, test_d_end = 1942, 1969
    test_df = construct_test_df(
        d_start=test_d_start,
        d_end=test_d_end,
        item_category_df=item_id_categories,
        calendar_df=calendar_df,
        price_df=price_df,
    )
    test_df = map_test_df_columns(test_df)
    test_df = cast_types(test_df)

    y_hat = model.predict(test_df[FEATURES])
    y_hat = np.clip(np.round(y_hat), a_min=0, a_max=np.inf)
    y_hat = y_hat.astype(np.int32)

    forecast_df = pivot_forecast_df(test_df, y_hat)
    forecast_df = rename_forecast_columns(forecast_df)
    
    # Merge with sample submissions
    EVAL_SUBMISSION = EVAL_SUBMISSION[["id"]].merge(forecast_df, on="id", how="left")
    FINAL_SUBMISSIONS = pd.concat([VAL_SUBMISSION, EVAL_SUBMISSION])
    FINAL_SUBMISSIONS = FINAL_SUBMISSIONS.set_index("id").reindex(SUBMISSION_INDEX).reset_index()
    
    FINAL_SUBMISSIONS.to_csv(f"{OUTPUT_BASE_BATH}/submission.csv", index=False)
    