In [None]:
import json

import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import numpy as np
import pandas as pd
from tqdm import tqdm

import lightgbm as lgbm

# Load raw datasets

In [None]:
# Base directories for the input tables and for everything this notebook writes.
INPUT_BASE_PATH = "/kaggle/input/m5-forecasting-accuracy"
OUTPUT_BASE_PATH = "/kaggle/working"
# Backward-compatible alias: other cells still refer to the original, misspelled name.
OUTPUT_BASE_BATH = OUTPUT_BASE_PATH

# Raw input tables.
CALENDAR_DATA = pd.read_csv(f"{INPUT_BASE_PATH}/calendar.csv")
SELL_PRICES = pd.read_csv(f"{INPUT_BASE_PATH}/sell_prices.csv")
SALES_TRAIN_EVALUATION = pd.read_csv(f"{INPUT_BASE_PATH}/sales_train_evaluation.csv")

# Sample submission, its "id" index, and the rows whose id contains
# "validation" / "evaluation" respectively.
SAMPLE_SUBMISSION = pd.read_csv(f"{INPUT_BASE_PATH}/sample_submission.csv")
SUBMISSION_INDEX = SAMPLE_SUBMISSION.set_index("id").index
VAL_SUBMISSION = SAMPLE_SUBMISSION[SAMPLE_SUBMISSION["id"].str.contains("validation")]
EVAL_SUBMISSION = SAMPLE_SUBMISSION[SAMPLE_SUBMISSION["id"].str.contains("evaluation")]

In [None]:
# Constants
# Highest day number ("d_<n>" column) unpivoted from the sales table.
MAX_TRAIN_TIMESTAMP = 1941
# The day number immediately after the last training day.
START_TEST_TIMESTAMP = MAX_TRAIN_TIMESTAMP + 1
# NOTE(review): presumably the wm_yr_wk value of the first test week - confirm against calendar.csv.
START_TEST_WM_YR_WK = 11617

# Lag offsets (28, 29, 30, 31) passed to the lagged-feature helpers.
LAG_PERIODS = range(28, 32)

# Utility functions for preprocessing and feature engineering

In [None]:
# Extract categorical mappings
def get_unique_value_id_map(df: pd.DataFrame, col_name: str):
    """Map each distinct value of ``df[col_name]`` to a consecutive integer id.

    Ids start at 0 and follow the order in which ``Series.unique`` yields the
    values.
    """
    value_to_id = {}
    for position, value in enumerate(df[col_name].unique()):
        value_to_id[value] = position
    return value_to_id


def map_category_ids(
    df: pd.DataFrame,
    column_name: str,
    category_id_map: dict | None = None,
    submission_run: bool = False
) -> pd.DataFrame:
    if category_id_map is None:
        category_id_map = get_unique_value_id_map(df, column_name)
    id_category_map = {v: k for (k, v) in category_id_map.items()}
    df[column_name] = df[column_name].map(category_id_map)
    return (category_id_map, None) if submission_run else (category_id_map, id_category_map)

In [None]:
# Process sales data
def unpivot_sales_df(sales_df: pd.DataFrame, timestamp_cols: list[str]) -> pd.DataFrame:
    """Convert the wide sales table into long format, one row per series and day.

    Args:
        sales_df: Wide table with the id columns ``item_id``, ``dept_id``,
            ``cat_id``, ``store_id``, ``state_id`` and one column per day.
        timestamp_cols: Names of the day columns to unpivot, each of the form
            ``"d_<n>"``.

    Returns:
        A new frame with the five id columns, an integer ``d`` column (the
        ``<n>`` part of the day column name) and a ``count`` column, sorted by
        ``store_id``, ``item_id``, ``d`` with a fresh index. Columns of
        ``sales_df`` that are neither id columns nor listed in
        ``timestamp_cols`` are not carried over.
    """
    sales_df = sales_df.melt(
        id_vars=["item_id", "dept_id", "cat_id", "store_id", "state_id"],
        value_vars=timestamp_cols,
        var_name="d",
        value_name="count"
    )
    # Strip the literal "d_" prefix in one vectorized pass. (str.lstrip("d_")
    # removes any leading run of the characters 'd' and '_', not the prefix.)
    sales_df["d"] = sales_df["d"].str.removeprefix("d_").astype("int64")
    sales_df = sales_df.sort_values(by=["store_id", "item_id", "d"]).reset_index(drop=True)
    return sales_df


def cast_sales_data_types(sales_df: pd.DataFrame) -> pd.DataFrame:
    """Shrink the dtypes of the sales columns, modifying ``sales_df`` in place.

    ``dept_id``, ``cat_id``, ``store_id`` and ``state_id`` become categoricals;
    ``item_id`` is downcast as a float and ``d`` / ``count`` as integers via
    ``pd.to_numeric``. A column that is absent is reported with a printed
    message and skipped. The same frame object is returned.
    """
    for name in ("dept_id", "cat_id", "store_id", "state_id"):
        if name not in sales_df.columns:
            print(f"Column {name} does not exist. Skipping.")
            continue
        sales_df[name] = sales_df[name].astype("category")

    numeric_downcasts = {"item_id": "float", "d": "integer", "count": "integer"}
    for name, target_kind in numeric_downcasts.items():
        if name not in sales_df.columns:
            print(f"Column {name} does not exist. Skipping.")
            continue
        sales_df[name] = pd.to_numeric(sales_df[name], downcast=target_kind)

    return sales_df

In [None]:
# Process calendar data
def select_format_calendar_features(calendar_data: pd.DataFrame) -> pd.DataFrame:
    """Select and rename the calendar columns used downstream.

    Drops ``weekday``, ``event_name_2`` and ``event_type_2``, converts ``d``
    from ``"d_<n>"`` strings to integers, and renames ``event_name_1`` ->
    ``event_name``, ``event_type_1`` -> ``event_type`` and ``wday`` ->
    ``weekday``. The input frame is left untouched; a new frame is returned.
    """
    # Drop columns (drop() already returns a new frame, so no explicit copy is needed)
    cols_to_drop = ["weekday", "event_name_2", "event_type_2"]
    calendar_data = calendar_data.drop(columns=cols_to_drop)

    # Format cols: strip the literal "d_" prefix in one vectorized pass.
    # (str.lstrip("d_") removes a leading run of 'd'/'_' characters, not the prefix.)
    calendar_data["d"] = calendar_data["d"].str.removeprefix("d_").astype("int64")
    calendar_data = calendar_data.rename(columns={
        "event_name_1": "event_name",
        "event_type_1": "event_type",
        "wday": "weekday"
    })

    return calendar_data


def cast_calendar_data_types(calendar_df: pd.DataFrame) -> pd.DataFrame:
    """Shrink the dtypes of the calendar columns, modifying ``calendar_df`` in place.

    The calendar attribute columns become categoricals and ``d`` is downcast
    as an integer. A column that is absent is reported with a printed message
    and skipped. The same frame object is returned.
    """
    categorical_names = (
        "weekday",
        "month",
        "year",
        "wm_yr_wk",
        "event_name",
        "event_type",
        "snap_CA",
        "snap_TX",
        "snap_WI",
    )
    for name in categorical_names:
        if name in calendar_df.columns:
            calendar_df[name] = calendar_df[name].astype("category")
        else:
            print(f"Col {name} does not exist. Skipping ...")

    if "d" in calendar_df.columns:
        calendar_df["d"] = pd.to_numeric(calendar_df["d"], downcast="integer")
    else:
        print("Col 'd' does not exist. Skipping ...")

    return calendar_df

In [None]:
# Process sell price data
def cast_price_data_types(price_df: pd.DataFrame) -> pd.DataFrame:
    """Shrink the dtypes of the price columns, modifying ``price_df`` in place.

    ``store_id`` and ``wm_yr_wk`` become categoricals; ``item_id`` and
    ``sell_price`` are downcast as floats. A column that is absent is reported
    with a printed message and skipped. The same frame object is returned.
    """
    for name in ("store_id", "wm_yr_wk"):
        if name in price_df.columns:
            price_df[name] = price_df[name].astype("category")
        else:
            print(f"Col {name} does not exist. Skipping ...")

    for name in ("item_id", "sell_price"):
        if name in price_df.columns:
            price_df[name] = pd.to_numeric(price_df[name], downcast="float")
        else:
            print(f"Col {name} does not exist. Skipping ...")

    return price_df

In [None]:
# Merge
def merge_sales_calendar_price_data(
    sales_df: pd.DataFrame,
    calendar_df: pd.DataFrame,
    price_df: pd.DataFrame,
    chunk_size: int = 500_000,
) -> pd.DataFrame:
    """Left-join calendar and price information onto the sales rows.

    The sales frame is processed in consecutive slices of ``chunk_size`` rows.
    Each slice is joined to ``calendar_df`` on ``d`` and then to ``price_df``
    on ``item_id``, ``store_id`` and ``wm_yr_wk``; the joined slices are
    concatenated with a fresh index.
    """
    def _join_slice(sales_slice: pd.DataFrame) -> pd.DataFrame:
        with_calendar = sales_slice.merge(right=calendar_df, on="d", how="left")
        return with_calendar.merge(
            right=price_df,
            on=["item_id", "store_id", "wm_yr_wk"],
            how="left",
        )

    slice_starts = range(0, len(sales_df), chunk_size)
    joined_slices = [
        _join_slice(sales_df.iloc[start: start + chunk_size])
        for start in tqdm(slice_starts)
    ]
    return pd.concat(joined_slices, ignore_index=True)

In [None]:
# Feature engineering funcs

def fourier_features(data: pd.DataFrame) -> pd.DataFrame:
    """Add sine/cosine encodings of ``weekday`` and ``month`` to ``data`` in place.

    ``weekday`` is encoded with period 7 and ``month`` with period 12, giving
    the columns ``weekday_sin``, ``weekday_cos``, ``month_sin`` and
    ``month_cos``, each downcast as a float. The same frame object is returned.
    """
    encoded_columns = []
    for source, period in (("weekday", 7), ("month", 12)):
        angle = (2 * np.pi * data[source].astype(float)) / period
        data[f"{source}_sin"] = np.sin(angle)
        data[f"{source}_cos"] = np.cos(angle)
        encoded_columns += [f"{source}_sin", f"{source}_cos"]

    for name in encoded_columns:
        data[name] = pd.to_numeric(data[name], downcast="float")

    return data


def av_store_dept_sales(sales_df: pd.DataFrame, chunk_size: int = 500_000) -> pd.DataFrame:
    """Add the mean ``count`` per (``store_id``, ``dept_id``, ``d``) as a column.

    The per-group mean is joined back onto every row in slices of
    ``chunk_size`` rows and stored in ``av_store_dept_sales``. ``sales_df`` is
    modified in place and returned; ``count`` and the new column are downcast
    as floats.

    Args:
        sales_df: Frame with ``store_id``, ``dept_id``, ``d`` and ``count``.
        chunk_size: Number of rows joined per merge.
    """
    group_keys = ["store_id", "dept_id", "d"]
    group_means = (
        sales_df
        .groupby(group_keys, observed=True)
        [["count"]]
        .mean()
        .rename(columns={"count": "av_store_dept_sales"})
        .reset_index()
    )

    # Merge in chunks then concatenate
    key_frame = sales_df[group_keys]
    chunks = []
    for chunk_start in tqdm(range(0, len(key_frame), chunk_size)):
        key_chunk = key_frame.iloc[chunk_start: chunk_start + chunk_size]
        merged_chunk = key_chunk.merge(
            right=group_means,
            on=group_keys,
            how="left"
        )
        chunks.append(merged_chunk["av_store_dept_sales"])
    merged_av_sales = pd.concat(chunks, ignore_index=True)
    # The left merges keep row order, so assign by position; assigning the
    # Series itself would align on index labels and break for any frame whose
    # index is not 0..n-1.
    sales_df["av_store_dept_sales"] = merged_av_sales.to_numpy()

    # Cast col types
    for sales_col in ["count", "av_store_dept_sales"]:
        sales_df[sales_col] = pd.to_numeric(sales_df[sales_col], downcast="float")

    return sales_df


def av_item_state_prices(
    data: pd.DataFrame,
    chunk_size: int = 500_000,
) -> pd.DataFrame:
    """Add the mean ``sell_price`` per (``item_id``, ``state_id``, ``d``) as a column.

    The per-group mean is joined back onto every row in slices of
    ``chunk_size`` rows and stored in ``av_item_state_sell_price``. ``data`` is
    modified in place and returned; ``sell_price`` and the new column are
    downcast as floats.
    """
    group_keys = ["item_id", "state_id", "d"]
    av_price = (
        data
        .groupby(group_keys, observed=True)
        [["sell_price"]]
        .mean()
        .rename(columns={"sell_price": "av_item_state_sell_price"})
        .reset_index()
    )

    # Merge in chunks then concatenate
    item_state_d_df = data[group_keys]
    chunks = []
    for chunk_start in tqdm(range(0, len(item_state_d_df), chunk_size)):
        item_state_d_chunk = item_state_d_df.iloc[chunk_start: chunk_start + chunk_size]
        merged_chunk = item_state_d_chunk.merge(
            right=av_price,
            on=group_keys,
            how="left"
        )
        chunks.append(merged_chunk["av_item_state_sell_price"])
    merged_av_price = pd.concat(chunks, ignore_index=True)
    # The left merges keep row order, so assign by position; assigning the
    # Series itself would align on index labels and break for any frame whose
    # index is not 0..n-1.
    data["av_item_state_sell_price"] = merged_av_price.to_numpy()

    # Cast col types
    for price_col in ["sell_price", "av_item_state_sell_price"]:
        data[price_col] = pd.to_numeric(data[price_col], downcast="float")

    return data


def av_dept_state_price(
    data: pd.DataFrame,
    chunk_size: int = 500_000
) -> pd.DataFrame:
    """Add the mean ``sell_price`` per (``dept_id``, ``state_id``, ``d``) as a column.

    The per-group mean is joined back onto every row in slices of
    ``chunk_size`` rows and stored in ``av_dept_state_sell_price``. ``data`` is
    modified in place and returned; ``sell_price`` and the new column are
    downcast as floats.
    """
    group_keys = ["dept_id", "state_id", "d"]
    av_price = (
        data
        .groupby(group_keys, observed=True)
        [["sell_price"]]
        .mean()
        .rename(columns={"sell_price": "av_dept_state_sell_price"})
        .reset_index()
    )

    # Merge in chunks then concatenate
    dept_state_d_df = data[group_keys]
    chunks = []
    for chunk_start in tqdm(range(0, len(dept_state_d_df), chunk_size)):
        dept_state_d_chunk = dept_state_d_df.iloc[chunk_start: chunk_start + chunk_size]
        merged_chunk = dept_state_d_chunk.merge(
            right=av_price,
            on=group_keys,
            how="left"
        )
        chunks.append(merged_chunk["av_dept_state_sell_price"])
    merged_av_price = pd.concat(chunks, ignore_index=True)
    # The left merges keep row order, so assign by position; assigning the
    # Series itself would align on index labels and break for any frame whose
    # index is not 0..n-1.
    data["av_dept_state_sell_price"] = merged_av_price.to_numpy()

    # Cast col types
    for price_col in ["sell_price", "av_dept_state_sell_price"]:
        data[price_col] = pd.to_numeric(data[price_col], downcast="float")

    return data


def lagged_sales_features(sales_df: pd.DataFrame, periods: list[int]) -> pd.DataFrame:
    """Add ``count_lag_<p>`` columns holding ``count`` shifted by ``p`` rows.

    Rows are processed per (``item_id``, ``store_id``) group, each sorted by
    ``d`` before shifting. The result is the concatenation of the groups with
    a fresh index, so row order generally differs from the input.
    """
    # TODO: Consider applying the same merge + concat approach here
    # rather than storing the whole df in lagged_dfs
    def _with_lags(series_df: pd.DataFrame) -> pd.DataFrame:
        ordered = series_df.sort_values("d")
        for lag in periods:
            shifted = ordered["count"].shift(lag)
            ordered[f"count_lag_{lag}"] = pd.to_numeric(shifted, downcast="float")
        return ordered

    per_series = sales_df.groupby(["item_id", "store_id"], observed=True)
    return pd.concat(
        [_with_lags(series_df) for _, series_df in tqdm(per_series)],
        ignore_index=True,
    )


def cast_lagged_sales_features(sales_df: pd.DataFrame, periods: list[int]) -> pd.DataFrame:
    """Downcast every ``count_lag_<p>`` column as a float, in place.

    One column per entry of ``periods`` is converted; the same frame object is
    returned.
    """
    for lag in periods:
        lag_column = f"count_lag_{lag}"
        sales_df[lag_column] = pd.to_numeric(sales_df[lag_column], downcast="float")
    return sales_df


def lagged_av_store_dept_sales_features(sales_df: pd.DataFrame, periods: list[int]) -> pd.DataFrame:
    """Add ``av_store_dept_sales_lag_<p>`` columns shifted by ``p`` rows.

    Rows are processed per (``item_id``, ``store_id``) group, each sorted by
    ``d`` before ``av_store_dept_sales`` is shifted. The result is the
    concatenation of the groups with a fresh index.
    """
    # TODO: Consider applying the same merge + concat approach here
    # rather than storing the whole df in lagged_dfs
    def _with_lags(series_df: pd.DataFrame) -> pd.DataFrame:
        ordered = series_df.sort_values("d")
        for lag in periods:
            shifted = ordered["av_store_dept_sales"].shift(lag)
            ordered[f"av_store_dept_sales_lag_{lag}"] = pd.to_numeric(shifted, downcast="float")
        return ordered

    per_series = sales_df.groupby(["item_id", "store_id"], observed=True)
    return pd.concat(
        [_with_lags(series_df) for _, series_df in tqdm(per_series)],
        ignore_index=True,
    )


def cast_lagged_av_sales_features(sales_df: pd.DataFrame, periods: list[int]) -> pd.DataFrame:
    """Downcast ``av_store_dept_sales`` and its lag columns as floats, in place.

    Converts ``av_store_dept_sales`` followed by one
    ``av_store_dept_sales_lag_<p>`` column per entry of ``periods``; the same
    frame object is returned.
    """
    targets = ["av_store_dept_sales", *(f"av_store_dept_sales_lag_{lag}" for lag in periods)]
    for target in targets:
        sales_df[target] = pd.to_numeric(sales_df[target], downcast="float")
    return sales_df


def lagged_sell_price_diff_features(
    data: pd.DataFrame,
    price_column: str,
    periods: list[int],
) -> pd.DataFrame:
    """Add ``<price_column>_diff_<p>`` columns with the ``p``-row price difference.

    Rows are processed per (``item_id``, ``store_id``) group, each sorted by
    ``d`` before ``Series.diff(p)`` is taken on ``price_column``. The result is
    the concatenation of the groups with a fresh index.
    """
    # TODO: Consider applying the same merge + concat approach here
    # rather than storing the whole df in lagged_dfs
    def _with_diffs(series_df: pd.DataFrame) -> pd.DataFrame:
        ordered = series_df.sort_values("d")
        for lag in periods:
            change = ordered[price_column].diff(lag)
            ordered[f"{price_column}_diff_{lag}"] = pd.to_numeric(change, downcast="float")
        return ordered

    per_series = data.groupby(["item_id", "store_id"], observed=True)
    return pd.concat(
        [_with_diffs(series_df) for _, series_df in tqdm(per_series)],
        ignore_index=True,
    )


# Training data

## Sales data

In [None]:
# Process sales data

# Map item ids to categories
# map_category_ids overwrites each column of SALES_TRAIN_EVALUATION in place with
# integer ids and returns the forward (value -> id) and reverse (id -> value) maps.
item_id_map, id_item_map = map_category_ids(SALES_TRAIN_EVALUATION, "item_id", submission_run=False)
dept_id_map, id_dept_map = map_category_ids(SALES_TRAIN_EVALUATION, "dept_id", submission_run=False)
cat_id_map, id_cat_map = map_category_ids(SALES_TRAIN_EVALUATION, "cat_id", submission_run=False)
store_id_map, id_store_map = map_category_ids(SALES_TRAIN_EVALUATION, "store_id", submission_run=False)
state_id_map, id_state_map = map_category_ids(SALES_TRAIN_EVALUATION, "state_id", submission_run=False)

# Store category info for each item id
# Taken after the mapping above, so the category columns already hold integer ids.
# This is the frame later passed to construct_test_df.
ITEM_ID_CATEGORIES = SALES_TRAIN_EVALUATION[["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]]

# Convert sales table to long table format
# ("id" is not among the melt id_vars, so it is not carried into sales_df)
timestamp_cols = [f"d_{i}" for i in range(1, MAX_TRAIN_TIMESTAMP + 1)]
sales_df = unpivot_sales_df(SALES_TRAIN_EVALUATION, timestamp_cols)

# The following ops are memory intensive so downcast datatypes to save on mem
sales_df = cast_sales_data_types(sales_df)

## Calendar data

In [None]:
# Process calendar data
# select_format_calendar_features returns a new frame, so CALENDAR_DATA itself is
# not modified by the in-place mapping and casting below.
calendar_df = select_format_calendar_features(CALENDAR_DATA)

# Map event categories
# (event_name / event_type are replaced in place by integer ids; both the forward
# and the reverse maps are kept)
event_name_map, id_event_name_map = map_category_ids(calendar_df, "event_name", submission_run=False)
event_type_map, id_event_type_map = map_category_ids(calendar_df, "event_type", submission_run=False)

# Cast datatypes
calendar_df = cast_calendar_data_types(calendar_df)

## Price data

In [None]:
# Process price data
# Reuse the id maps built from the sales table so item_id / store_id share the same
# integer encoding in both tables. SELL_PRICES is modified in place; values missing
# from a map become NaN.
item_id_map, id_item_map = map_category_ids(SELL_PRICES, "item_id", item_id_map, submission_run=False)
store_id_map, id_store_map = map_category_ids(SELL_PRICES, "store_id", store_id_map, submission_run=False)

# cast_price_data_types returns the frame it was given, so price_df is SELL_PRICES.
price_df = cast_price_data_types(SELL_PRICES)

## Combine and engineer new features

In [None]:
# NOTE(review): no visible cell writes m5-acc-train-intermediate-2.parquet, and `data`
# is reassigned by the merge cell further down - this looks like a leftover checkpoint
# reload; confirm whether it is still needed.
data = pd.read_parquet(f"{OUTPUT_BASE_BATH}/m5-acc-train-intermediate-2.parquet")

In [None]:
# Re-apply the category / downcast dtypes to the loaded frame. Each helper prints a
# message and skips any column it does not find.
data = cast_sales_data_types(data)
data = cast_calendar_data_types(data)
data = cast_price_data_types(data)

In [None]:
# Add the one-step sell price difference per (item_id, store_id) series.
# The helper defined above is lagged_sell_price_diff_features; the previous name
# (lagged_sell_price_features) is not defined in this notebook and raised NameError.
data = lagged_sell_price_diff_features(data, "sell_price", [1])

In [None]:
# Merge
# Left-join the calendar (on "d") and the prices (on item_id / store_id / wm_yr_wk)
# onto the long sales table. This replaces any previous value of `data`.
data = merge_sales_calendar_price_data(sales_df, calendar_df, price_df)

In [None]:
# Feature engineering
# Group means joined back onto every row; each helper adds one column to `data`.
data = av_item_state_prices(data)
data = av_dept_state_price(data)
data = av_store_dept_sales(data)

# Add lagged sales data
# Both helpers rebuild the frame group by group, so rows come back ordered by
# (item_id, store_id) group and then by d, with a fresh index. The second call needs
# the av_store_dept_sales column created above.
data = lagged_sales_features(data, periods=LAG_PERIODS)
data = lagged_av_store_dept_sales_features(data, periods=LAG_PERIODS)

# Fourier features
data = fourier_features(data)

## Save

In [None]:
# Save to parquet
# Writes the fully engineered training frame to the output directory.
data.to_parquet(f"{OUTPUT_BASE_BATH}/m5-acc-train.parquet")

# Test data

In [None]:
def construct_test_df(
    d_start: int,
    d_end: int,
    item_category_ids: pd.DataFrame,
    calendar_df: pd.DataFrame,
    price_df: pd.DataFrame,
) -> pd.DataFrame:
    """Build the test frame: one row per series and per day in ``[d_start, d_end]``.

    Args:
        d_start: First day number (inclusive).
        d_end: Last day number (inclusive).
        item_category_ids: One row per series with its id/category columns;
            cross-joined with the day range.
        calendar_df: Calendar table, joined on ``d``.
        price_df: Price table, joined on ``item_id``, ``store_id``, ``wm_yr_wk``.

    Returns:
        The cross-joined frame after dtype casting and the calendar/price merge.
    """
    d_range_df = pd.DataFrame({"d": range(d_start, d_end + 1)})
    # Use the frame passed by the caller rather than the ITEM_ID_CATEGORIES global.
    test_df = pd.merge(item_category_ids, d_range_df, how="cross")
    test_df = cast_sales_data_types(test_df)
    test_df = merge_sales_calendar_price_data(test_df, calendar_df, price_df)
    return test_df


def add_lagged_sales_features_from_training_set(
    test_df: pd.DataFrame,
    train_df: pd.DataFrame,
    periods: list[int],
) -> pd.DataFrame:
    """Attach ``count_lag_<p>`` columns to the test rows from training sales.

    For a test row on day ``d`` and a lag ``p``, ``count_lag_<p>`` is the
    training ``count`` of the same (``item_id``, ``store_id``) on day
    ``d - p``. This matches ``count.shift(p)`` on a series with one row per
    consecutive day. Reading the training ``count_lag_<p>`` column at day
    ``d - p`` instead would yield the count from day ``d - 2p``.

    Args:
        test_df: Test rows with ``item_id``, ``store_id`` and ``d``. Not
            modified; a new frame is returned.
        train_df: Training rows with ``item_id``, ``store_id``, ``d`` and
            ``count``.
        periods: Lag offsets in days.

    Returns:
        ``test_df`` with one float ``count_lag_<p>`` column per period; rows
        without a matching training day get NaN.
    """
    # Select columns needed for merge
    train_df_slice = train_df[["item_id", "store_id", "d", "count"]]
    train_df_slice = train_df_slice.rename(columns={"d": "d_train"})

    # Merge onto test
    for p in tqdm(periods):
        lag_day_column = f"d_lag_{p}"
        lag_column = f"count_lag_{p}"
        test_df = (
            # assign() returns a copy, so the caller's frame gains no helper column.
            test_df.assign(**{lag_day_column: test_df["d"] - p})
            .merge(
                train_df_slice.rename(columns={"count": lag_column}),
                left_on=["item_id", "store_id", lag_day_column],
                right_on=["item_id", "store_id", "d_train"],
                how="left",
            )
            .drop(columns=[lag_day_column, "d_train"])
        )
        # Same float downcast that the training lag columns receive.
        test_df[lag_column] = pd.to_numeric(
            test_df[lag_column].astype("float64"), downcast="float"
        )
    return test_df


def add_lagged_av_sales_features_from_training_set(
    test_df: pd.DataFrame,
    train_df: pd.DataFrame,
    periods: list[int],
) -> pd.DataFrame:
    """Attach ``av_store_dept_sales_lag_<p>`` columns to the test rows.

    For a test row on day ``d`` and a lag ``p``, the feature is the training
    ``av_store_dept_sales`` of the same (``item_id``, ``store_id``) on day
    ``d - p``. This matches ``av_store_dept_sales.shift(p)`` on a series with
    one row per consecutive day. Reading the training lag column at day
    ``d - p`` instead would yield the value from day ``d - 2p``.

    Args:
        test_df: Test rows with ``item_id``, ``store_id`` and ``d``. Not
            modified; a new frame is returned.
        train_df: Training rows with ``item_id``, ``store_id``, ``d`` and
            ``av_store_dept_sales``.
        periods: Lag offsets in days.

    Returns:
        ``test_df`` with one float ``av_store_dept_sales_lag_<p>`` column per
        period; rows without a matching training day get NaN.
    """
    # Select columns needed for merge
    train_df_slice = train_df[["item_id", "store_id", "d", "av_store_dept_sales"]]
    train_df_slice = train_df_slice.rename(columns={"d": "d_train"})

    # Merge onto test
    for p in tqdm(periods):
        lag_day_column = f"d_lag_{p}"
        lag_column = f"av_store_dept_sales_lag_{p}"
        test_df = (
            # assign() returns a copy, so the caller's frame gains no helper column.
            test_df.assign(**{lag_day_column: test_df["d"] - p})
            .merge(
                train_df_slice.rename(columns={"av_store_dept_sales": lag_column}),
                left_on=["item_id", "store_id", lag_day_column],
                right_on=["item_id", "store_id", "d_train"],
                how="left",
            )
            .drop(columns=[lag_day_column, "d_train"])
        )
        # Same float downcast that the training lag columns receive.
        test_df[lag_column] = pd.to_numeric(
            test_df[lag_column].astype("float64"), downcast="float"
        )
    return test_df

In [None]:
# Day range covered by the test frame (inclusive on both ends).
test_d_start, test_d_end = 1942, 1969
# One row per series and test day, with calendar and price columns joined on.
test_df = construct_test_df(
    d_start=test_d_start,
    d_end=test_d_end,
    item_category_ids=ITEM_ID_CATEGORIES,
    calendar_df=calendar_df,
    price_df=price_df,
)

# Lagged sales features are looked up from the engineered training frame `data`.
test_df = add_lagged_sales_features_from_training_set(
    test_df=test_df,
    train_df=data,
    periods=LAG_PERIODS,
)
test_df = add_lagged_av_sales_features_from_training_set(
    test_df=test_df,
    train_df=data,
    periods=LAG_PERIODS,
)

# Feature engineering
# (av_store_dept_sales is not called here: it needs a "count" column, which the
# test frame does not have)
test_df = av_item_state_prices(test_df)
test_df = av_dept_state_price(test_df)
test_df = fourier_features(test_df)

In [None]:
# Save the engineered test frame next to the training parquet.
test_df.to_parquet(f"{OUTPUT_BASE_BATH}/m5-acc-test.parquet")