In [None]:
import gc
import json

import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm

# Remove the column limit so wide frames are displayed in full.
pd.options.display.max_columns = None

# Load raw datasets

In [None]:
# Directory the raw competition CSVs are read from, and directory the
# engineered datasets and category maps are written to.
INPUT_BASE_PATH = "/kaggle/input/m5-forecasting-accuracy"
OUTPUT_BASE_PATH = "/kaggle/working"
# Misspelled legacy name, kept as an alias because the save cells still use it.
OUTPUT_BASE_BATH = OUTPUT_BASE_PATH

CALENDAR_DATA = pd.read_csv(f"{INPUT_BASE_PATH}/calendar.csv")
SELL_PRICES = pd.read_csv(f"{INPUT_BASE_PATH}/sell_prices.csv")
SALES_TRAIN_EVALUATION = pd.read_csv(f"{INPUT_BASE_PATH}/sales_train_evaluation.csv")

SAMPLE_SUBMISSION = pd.read_csv(f"{INPUT_BASE_PATH}/sample_submission.csv")
SUBMISSION_INDEX = SAMPLE_SUBMISSION.set_index("id").index
# Split the sample submission by the marker contained in each row id.
VAL_SUBMISSION = SAMPLE_SUBMISSION[SAMPLE_SUBMISSION["id"].str.contains("validation")]
EVAL_SUBMISSION = SAMPLE_SUBMISSION[SAMPLE_SUBMISSION["id"].str.contains("evaluation")]

In [None]:
# Constants
# Last day index ("d") present in the training columns; the test rows start
# on the following day.
MAX_TRAIN_TIMESTAMP = 1941
START_TEST_TIMESTAMP = MAX_TRAIN_TIMESTAMP + 1
START_TEST_WM_YR_WK = 11617

# Feature engineering constants
# Sales (target) lags, rolling window lengths and rolling statistics.
SALES_LAG_PERIODS = list(range(28, 32))
SALES_ROLLING_WINDOWS = [7 * weeks for weeks in (1, 2, 3, 4)]
SALES_ROLLING_AGG_FUNCS = ["mean", "std", "kurt"]

# Price difference periods, rolling window lengths and rolling statistics.
PRICE_LAG_PERIODS = [1, 2, 3, 7]
PRICE_ROLLING_WINDOWS = [7 * weeks for weeks in (1, 2, 3, 4)]
PRICE_ROLLING_AGG_FUNCS = ["mean", "std", "kurt"]

# Event shifts of up to three rows in either direction (zero excluded).
EVENT_LAG_PERIODS = [offset for offset in range(-3, 4) if offset != 0]

# Data preprocessing and feature engineering

In [None]:
# Extract categorical mappings
def get_unique_value_id_map(df: pd.DataFrame, col_name: str):
    """
    Number the distinct values of a column with consecutive integers from 0.

    :param df: Dataframe containing the column.
    :param col_name: Column whose distinct values are numbered.
    :return: Dict mapping each distinct value to its integer id, in the order
        returned by ``Series.unique()``.
    """
    distinct_values = df[col_name].unique()
    return dict(zip(distinct_values, range(len(distinct_values))))


def map_category_ids(
    df: pd.DataFrame,
    column_name: str,
    category_id_map: dict | None = None,
    submission_run: bool = False
) -> pd.DataFrame:
    if category_id_map is None:
        category_id_map = get_unique_value_id_map(df, column_name)
    id_category_map = {v: k for (k, v) in category_id_map.items()}
    df[column_name] = df[column_name].map(category_id_map)
    return (category_id_map, None) if submission_run else (category_id_map, id_category_map)

## Preprocess sales data

In [None]:
# Process sales data
def unpivot_sales_df(sales_df: pd.DataFrame, timestamp_cols: list[str]) -> pd.DataFrame:
    """
    Reshape the wide sales table (one column per day) into long format.

    :param sales_df: Wide dataframe with the columns 'item_id', 'dept_id',
        'cat_id', 'store_id', 'state_id' and one column per entry of
        ``timestamp_cols``.
    :param timestamp_cols: Day columns to unpivot, named 'd_<number>'.
    :return: New dataframe with the five id columns, an integer 'd' column and
        a 'count' column, sorted by store, item and day with a fresh index.
        Other columns of ``sales_df`` are not carried over.
    """
    long_df = sales_df.melt(
        id_vars=["item_id", "dept_id", "cat_id", "store_id", "state_id"],
        value_vars=timestamp_cols,
        var_name="d",
        value_name="count"
    )

    # Parse every distinct day label once and broadcast the result instead of
    # calling a Python lambda for each melted row. removeprefix drops the
    # literal "d_" prefix, whereas lstrip("d_") strips a *set* of characters.
    day_number_by_label = {label: int(label.removeprefix("d_")) for label in timestamp_cols}
    long_df["d"] = long_df["d"].map(day_number_by_label)

    return long_df.sort_values(by=["store_id", "item_id", "d"]).reset_index(drop=True)


def cast_sales_data_types(sales_df: pd.DataFrame) -> pd.DataFrame:
    """
    Cast the columns of a sales dataframe to compact dtypes.

    The five id columns become pandas categoricals; 'd' and 'count' become
    int16. A column that is absent is reported on stdout and skipped.

    :param sales_df: Sales dataframe; modified in place.
    :return: The same dataframe object, for chaining.
    """
    target_dtypes = [
        ("item_id", "category"),
        ("dept_id", "category"),
        ("cat_id", "category"),
        ("store_id", "category"),
        ("state_id", "category"),
        ("d", np.int16),
        ("count", np.int16),
    ]
    for column, target_dtype in target_dtypes:
        try:
            converted = sales_df[column].astype(target_dtype)
        except KeyError:
            print(f"Column {column} does not exist. Skipping.")
        else:
            sales_df[column] = converted

    return sales_df

## Preprocess calendar data

In [None]:
# Process calendar data
def select_format_calendar_features(calendar_data: pd.DataFrame) -> pd.DataFrame:
    """
    Keep the calendar columns used downstream and normalise their format.

    Drops 'weekday', 'wday', 'month', 'year' and the secondary event columns,
    converts the 'd_<number>' labels in 'd' to integers and renames the
    primary event columns to 'event_name' / 'event_type'.

    :param calendar_data: Raw calendar dataframe. It is not modified.
    :return: New dataframe with the selected, reformatted columns.
    :raises KeyError: If one of the columns to drop is missing.
    """
    cols_to_drop = ["weekday", "wday", "month", "year", "event_name_2", "event_type_2"]
    # drop() returns a new frame, so no explicit copy is needed to protect the input.
    calendar_data = calendar_data.drop(columns=cols_to_drop)

    # removeprefix drops the literal "d_"; lstrip("d_") would strip a character set.
    calendar_data["d"] = [int(label.removeprefix("d_")) for label in calendar_data["d"]]

    # 'wday' is dropped above, so only the event columns are left to rename.
    calendar_data = calendar_data.rename(columns={
        "event_name_1": "event_name",
        "event_type_1": "event_type",
    })

    return calendar_data


def cast_calendar_data_types(calendar_df: pd.DataFrame) -> pd.DataFrame:
    """
    Cast the columns of a calendar dataframe to compact dtypes.

    The date-part, week, event and SNAP columns become pandas categoricals and
    'd' becomes int16. A column that is absent is reported on stdout and
    skipped.

    :param calendar_df: Calendar dataframe; modified in place.
    :return: The same dataframe object, for chaining.
    """
    categorical_columns = (
        "weekday",
        "month",
        "year",
        "wm_yr_wk",
        "event_name",
        "event_type",
        "snap_CA",
        "snap_TX",
        "snap_WI",
    )
    for column in categorical_columns:
        try:
            as_category = calendar_df[column].astype("category")
        except KeyError:
            print(f"Col {column} does not exist. Skipping ...")
            continue
        calendar_df[column] = as_category

    try:
        day_index = calendar_df["d"].astype(np.int16)
    except KeyError:
        print("Col 'd' does not exist. Skipping ...")
    else:
        calendar_df["d"] = day_index

    return calendar_df

## Preprocess price data

In [None]:
# Process sell price data
def cast_price_data_types(price_df: pd.DataFrame) -> pd.DataFrame:
    """
    Cast the columns of a sell-price dataframe to compact dtypes.

    'item_id', 'store_id' and 'wm_yr_wk' become pandas categoricals (absent
    ones are reported on stdout and skipped); 'sell_price' becomes float16.

    :param price_df: Price dataframe; modified in place.
    :return: The same dataframe object, for chaining.
    :raises KeyError: If 'sell_price' is missing.
    """
    for key_column in ("item_id", "store_id", "wm_yr_wk"):
        try:
            as_category = price_df[key_column].astype("category")
        except KeyError:
            print(f"Col {key_column} does not exist. Skipping ...")
        else:
            price_df[key_column] = as_category

    half_precision_price = price_df["sell_price"].astype(np.float16)
    price_df["sell_price"] = half_precision_price
    return price_df

## Merge raw datasets

In [None]:
# Merge
def merge_sales_calendar_price_data(
    sales_df: pd.DataFrame,
    calendar_df: pd.DataFrame,
    price_df: pd.DataFrame,
    chunk_size: int = 500_000,
) -> pd.DataFrame:
    """
    Left-join calendar and price information onto the sales rows, chunk by chunk.

    :param sales_df: Long sales dataframe with 'd', 'item_id' and 'store_id'.
    :param calendar_df: Calendar dataframe joined on 'd'; it must supply
        'wm_yr_wk' for the price join.
    :param price_df: Price dataframe joined on 'item_id', 'store_id', 'wm_yr_wk'.
    :param chunk_size: Number of sales rows merged per iteration.
    :return: New dataframe with the merged chunks concatenated under a fresh index.
    """
    price_keys = ["item_id", "store_id", "wm_yr_wk"]

    def _enrich(sales_rows: pd.DataFrame) -> pd.DataFrame:
        # Calendar first: it provides the week key needed by the price join.
        with_calendar = sales_rows.merge(right=calendar_df, on="d", how="left")
        return with_calendar.merge(right=price_df, on=price_keys, how="left")

    offsets = range(0, len(sales_df), chunk_size)
    merged_parts = [
        _enrich(sales_df.iloc[offset: offset + chunk_size])
        for offset in tqdm(offsets)
    ]
    return pd.concat(merged_parts, ignore_index=True)

## Feature engineering

### Datetime features

In [None]:
# Feature engineering funcs
def datetime_features(data: pd.DataFrame) -> pd.DataFrame:
    """
    Derive categorical calendar parts from the 'date' column.

    Adds (or overwrites) 'weekday', 'month', 'quarter' and 'year', each taken
    from the parsed date and stored as a pandas categorical.

    :param data: Dataframe with a 'date' column parseable by ``pd.to_datetime``;
        modified in place.
    :return: The same dataframe object, for chaining.
    """
    parsed_dates = pd.to_datetime(data["date"])
    for part in ("weekday", "month", "quarter", "year"):
        data[part] = getattr(parsed_dates.dt, part).astype("category")
    return data


def fourier_features(data: pd.DataFrame) -> pd.DataFrame:
    """
    Encode 'weekday', 'month' and 'quarter' as sine/cosine pairs.

    For each column ``c`` with cycle length ``n`` (7, 12 and 4 respectively)
    the columns ``c_sin`` and ``c_cos`` are set to sin / cos of
    ``2 * pi * c / n`` and stored as float16.

    :param data: Dataframe with 'weekday', 'month' and 'quarter' columns that
        can be cast to float; modified in place.
    :return: The same dataframe object, for chaining.
    """
    cycle_lengths = {"weekday": 7, "month": 12, "quarter": 4}
    for column, cycle_length in cycle_lengths.items():
        angle = (2 * np.pi * data[column].astype(float)) / cycle_length
        data[f"{column}_sin"] = np.sin(angle).astype(np.float16)
        data[f"{column}_cos"] = np.cos(angle).astype(np.float16)

    return data

### Item in stock feature

In [None]:
def item_on_sale_feature(data: pd.DataFrame) -> pd.DataFrame:
    """
    Flag the rows for which a sell price is available.

    'item_on_sale' is 1 where 'sell_price' is present and 0 where it is NaN,
    stored as a pandas categorical.

    :param data: Dataframe with a 'sell_price' column; modified in place.
    :return: The same dataframe object with the 'item_on_sale' column added.
    """
    has_price = data["sell_price"].notnull()
    price_flag = has_price.astype(int)
    data["item_on_sale"] = price_flag.astype("category")
    return data

### Aggregate sales and price features

In [None]:
def merge_in_chunks(
    left: pd.DataFrame,
    right: pd.DataFrame,
    merge_keys: list[str],
    target_col_from_right: str,
    how: str = "left",
    chunk_size: int = 500_000
) -> pd.Series:
    """
    Merge two dataframes in chunks and return the target column from the
    right dataframe.

    :param left: Left, typically larger dataframe to join onto.
    :param right: Right dataframe containing target feature to add to left.
    :param merge_keys: The keys to merge on.
    :param target_col_from_right: The target column from the right dataframe
        to return.
    :param how: How to merge.
    :param chunk_size: Number of rows to merge on each iteration.
    :return: pd.Series. The target column from right. For a left merge that
        yields exactly one row per row of ``left`` the series carries the
        index of ``left``, so it can be inserted directly into left outside
        of the function:
        ```
        left[target_col] = merge_in_chunks(...)
        ```
        Otherwise the series has a fresh 0..n-1 index.
    """
    # Only select columns needed for the merge
    left_slice_df = left[merge_keys]

    if len(left_slice_df) == 0:
        # There would be no chunks and pd.concat([]) raises; a single merge of
        # the empty slice gives an empty result instead.
        return left_slice_df.merge(right=right, on=merge_keys, how=how)[target_col_from_right]

    chunks = []
    for chunk_start in tqdm(range(0, len(left_slice_df), chunk_size)):
        left_chunk = left_slice_df.iloc[chunk_start: chunk_start + chunk_size]
        merged_chunk = left_chunk.merge(right=right, on=merge_keys, how=how)
        chunks.append(merged_chunk[target_col_from_right])

    merged = pd.concat(chunks, ignore_index=True)

    # Column assignment aligns on the index, so reuse the index of left when
    # the rows correspond one-to-one; this keeps `left[col] = result` correct
    # even if left does not have a default RangeIndex.
    if how == "left" and len(merged) == len(left):
        merged.index = left.index

    return merged


def av_store_dept_sales(data: pd.DataFrame, chunk_size: int = 500_000) -> pd.DataFrame:
    """
    Adds the mean sales count per store, department and day.

    :param data: Dataframe containing 'store_id', 'dept_id', 'd', and 'count'
        columns; modified in place.
    :param chunk_size: Dataframe chunk_size to use for merging. Defaults to 500_000
    :return: The same dataframe with the 'av_store_dept_sales' column added
        as float16.
    """
    group_keys = ["store_id", "dept_id", "d"]
    feature_name = "av_store_dept_sales"

    grouped_counts = data.groupby(group_keys, observed=True)[["count"]]
    group_means = grouped_counts.mean().rename(columns={"count": feature_name}).reset_index()

    # Look up the group mean for every row, joining chunk by chunk.
    row_means = merge_in_chunks(
        left=data,
        right=group_means,
        merge_keys=group_keys,
        target_col_from_right=feature_name,
        how="left",
        chunk_size=chunk_size,
    )
    data[feature_name] = row_means.astype(np.float16)

    return data


def av_item_state_prices(data: pd.DataFrame, chunk_size: int = 500_000) -> pd.DataFrame:
    """
    Adds the mean sell price per item, state and day.

    :param data: Dataframe with 'item_id', 'state_id', 'd', and 'sell_price'
        columns; modified in place.
    :param chunk_size: Dataframe chunk_size to use for merging. Defaults to 500_000
    :return: The same dataframe with the 'av_item_state_sell_price' column
        added as float16.
    """
    group_keys = ["item_id", "state_id", "d"]
    feature_name = "av_item_state_sell_price"

    grouped_prices = data.groupby(group_keys, observed=True)[["sell_price"]]
    group_means = grouped_prices.mean().rename(columns={"sell_price": feature_name}).reset_index()

    # Look up the group mean for every row, joining chunk by chunk.
    row_means = merge_in_chunks(
        left=data,
        right=group_means,
        merge_keys=group_keys,
        target_col_from_right=feature_name,
        how="left",
        chunk_size=chunk_size,
    )
    data[feature_name] = row_means.astype(np.float16)

    return data


def av_dept_state_price(data: pd.DataFrame, chunk_size: int = 500_000) -> pd.DataFrame:
    """
    Adds the mean sell price per department, state and day.

    :param data: Dataframe with 'dept_id', 'state_id', 'd' and 'sell_price'
        columns; modified in place.
    :param chunk_size: Dataframe chunk_size to use for merging. Defaults to 500_000.
    :return: The same dataframe with the 'av_dept_state_sell_price' column
        added as float16.
    """
    group_keys = ["dept_id", "state_id", "d"]
    feature_name = "av_dept_state_sell_price"

    grouped_prices = data.groupby(group_keys, observed=True)[["sell_price"]]
    group_means = grouped_prices.mean().rename(columns={"sell_price": feature_name}).reset_index()

    # Look up the group mean for every row, joining chunk by chunk.
    row_means = merge_in_chunks(
        left=data,
        right=group_means,
        merge_keys=group_keys,
        target_col_from_right=feature_name,
        how="left",
        chunk_size=chunk_size,
    )
    data[feature_name] = row_means.astype(np.float16)

    return data

### Lagged and rolling features

In [None]:
def lagged_features(
    data: pd.DataFrame,
    feature_columns: list[str],
    periods: list[int]
) -> pd.DataFrame:
    """
    Add shifted copies of the given columns within each (item, store) series.

    Each series is ordered by 'd' and, for every period ``p`` and column ``c``,
    a column ``c_lag_p`` holding ``c`` shifted by ``p`` rows is added.

    :param data: Dataframe with 'item_id', 'store_id', 'd' and the feature columns.
    :param feature_columns: Columns to shift.
    :param periods: Row offsets passed to ``shift``; negative values look ahead.
    :return: New dataframe with the series concatenated in group order under
        a fresh index.
    """
    def _with_lags(series_df: pd.DataFrame) -> pd.DataFrame:
        ordered = series_df.sort_values("d")
        for period in periods:
            lag_names = [f"{column}_lag_{period}" for column in feature_columns]
            ordered[lag_names] = ordered[feature_columns].shift(period)
        return ordered

    per_series = data.groupby(["item_id", "store_id"], observed=True)
    lagged_parts = [_with_lags(series_df) for _, series_df in tqdm(per_series)]
    return pd.concat(lagged_parts, ignore_index=True)


def rolling_features(
    data: pd.DataFrame,
    feature_column: str,
    windows: list[int],
    agg_funcs: list[str]
) -> pd.DataFrame:
    """
    Add rolling-window statistics of one column within each (item, store) series.

    Each series is ordered by 'd'; for every window ``w`` and aggregation ``f``
    a float16 column ``<feature_column>_rolling_<f>_window_<w>`` is added.

    :param data: Dataframe with 'item_id', 'store_id', 'd' and the feature column.
    :param feature_column: Column the rolling statistics are computed on.
    :param windows: Window lengths in rows.
    :param agg_funcs: Aggregation names understood by pandas rolling ``aggregate``.
    :return: New dataframe with the series concatenated in group order under
        a fresh index.
    """
    def _with_rolling_stats(series_df: pd.DataFrame) -> pd.DataFrame:
        ordered = series_df.sort_values("d")

        stats_per_window = []
        for window in windows:
            stats = ordered[feature_column].rolling(window).aggregate(agg_funcs)
            stats = stats.astype(np.float16)
            stats.columns = [
                f"{feature_column}_rolling_{func}_window_{window}" for func in agg_funcs
            ]
            stats_per_window.append(stats)

        all_stats = pd.concat(stats_per_window, axis=1)
        return pd.concat([ordered, all_stats], axis=1)

    per_series = data.groupby(["item_id", "store_id"], observed=True)
    rolled_parts = [_with_rolling_stats(series_df) for _, series_df in tqdm(per_series)]
    return pd.concat(rolled_parts, axis=0, ignore_index=True)

In [None]:
# Lagged event features

def lagged_event_features(
    data: pd.DataFrame,
    periods: list[int],
    no_event_name_category: int,
    no_event_type_category: int
) -> pd.DataFrame:
    """
    Add shifted 'event_name' / 'event_type' columns per (item, store) series.

    Gaps introduced by the shift are filled with the supplied "no event" ids
    and the resulting columns are stored as pandas categoricals.

    :param data: Dataframe with 'item_id', 'store_id', 'd', 'event_name' and
        'event_type' columns.
    :param periods: Row offsets to shift by.
    :param no_event_name_category: Fill value for the 'event_name_lag_*' columns.
    :param no_event_type_category: Fill value for the 'event_type_lag_*' columns.
    :return: New dataframe with the lagged event columns added.
    """
    data = lagged_features(
        data=data,
        feature_columns=["event_name", "event_type"],
        periods=periods
    )

    # Fill missing values, then cast, one event attribute at a time.
    fill_values = {
        "event_name": no_event_name_category,
        "event_type": no_event_type_category,
    }
    for source_column, fill_value in fill_values.items():
        lag_columns = [f"{source_column}_lag_{p}" for p in periods]
        data[lag_columns] = data[lag_columns].fillna(fill_value)
        data[lag_columns] = data[lag_columns].astype("category")

    return data

In [None]:
# Lagged and rolling target features

def lagged_target_features(data: pd.DataFrame, periods: list[int]) -> pd.DataFrame:
    """
    Add lagged copies of the 'count' target per (item, store) series.

    :param data: Dataframe with 'item_id', 'store_id', 'd' and 'count' columns.
    :param periods: Row offsets to shift 'count' by.
    :return: New dataframe with float16 'count_lag_<p>' columns added.
    """
    with_lags = lagged_features(data=data, feature_columns=["count"], periods=periods)
    lag_columns = [f"count_lag_{period}" for period in periods]
    with_lags[lag_columns] = with_lags[lag_columns].astype(np.float16)
    return with_lags


def lagged_rolling_target_features(
    data: pd.DataFrame,
    windows: list[int],
    agg_funcs: list[str],
    lag: int = 28,
) -> pd.DataFrame:
    """
    Add rolling statistics of an already lagged target column.

    :param data: Dataframe that already contains the 'count_lag_<lag>' column.
    :param windows: Window lengths in rows.
    :param agg_funcs: Aggregation names applied to each window.
    :param lag: Selects which 'count_lag_<lag>' column is rolled over.
    :return: New dataframe with the rolling statistic columns added.
    """
    lagged_target = f"count_lag_{lag}"
    return rolling_features(
        data,
        lagged_target,
        windows=windows,
        agg_funcs=agg_funcs,
    )

In [None]:
# Lagged and rolling price features

def diff_price_features(data: pd.DataFrame, periods: list[int]) -> pd.DataFrame:
    """
    Add sell-price differences per (item, store) series.

    Each series is ordered by 'd'; for every period ``p`` the float16 column
    'sell_price_diff_<p>' holds the price minus the price ``p`` rows earlier.

    :param data: Dataframe with 'item_id', 'store_id', 'd' and 'sell_price'.
    :param periods: Row offsets passed to ``diff``.
    :return: New dataframe with the series concatenated in group order under
        a fresh index.
    """
    def _with_price_diffs(series_df: pd.DataFrame) -> pd.DataFrame:
        ordered = series_df.sort_values("d")
        for period in periods:
            price_change = ordered["sell_price"].diff(period)
            ordered[f"sell_price_diff_{period}"] = price_change.astype(np.float16)
        return ordered

    per_series = data.groupby(["item_id", "store_id"], observed=True)
    diffed_parts = [_with_price_diffs(series_df) for _, series_df in tqdm(per_series)]
    return pd.concat(diffed_parts, ignore_index=True)


def rolling_price_features(data: pd.DataFrame, windows: list[int], agg_funcs: list[str]) -> pd.DataFrame:
    """
    Add rolling statistics of 'sell_price' per (item, store) series.

    :param data: Dataframe with 'item_id', 'store_id', 'd' and 'sell_price'.
    :param windows: Window lengths in rows.
    :param agg_funcs: Aggregation names applied to each window.
    :return: New dataframe with the rolling price columns added.
    """
    return rolling_features(
        data,
        "sell_price",
        windows=windows,
        agg_funcs=agg_funcs,
    )

# Training data

## Sales data

In [None]:
# Process sales data

# Map item ids to categories
# map_category_ids overwrites each column of SALES_TRAIN_EVALUATION in place
# with integer ids and returns the value -> id map and its inverse.
item_id_map, id_item_map = map_category_ids(SALES_TRAIN_EVALUATION, "item_id")
dept_id_map, id_dept_map = map_category_ids(SALES_TRAIN_EVALUATION, "dept_id")
cat_id_map, id_cat_map = map_category_ids(SALES_TRAIN_EVALUATION, "cat_id")
store_id_map, id_store_map = map_category_ids(SALES_TRAIN_EVALUATION, "store_id")
state_id_map, id_state_map = map_category_ids(SALES_TRAIN_EVALUATION, "state_id")

# Store category info for each item id
# Taken after the mapping above, so the five category columns already hold
# integer ids; the 'id' column is not mapped.
# NOTE(review): the raw frame is overwritten, so re-running this cell without
# reloading the CSV would build the maps from the integer ids — confirm the
# cell is only run once per kernel.
ITEM_ID_CATEGORIES = SALES_TRAIN_EVALUATION[["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]]

# Convert sales table to long table format
# Day columns d_1 .. d_<MAX_TRAIN_TIMESTAMP> are unpivoted into 'd' / 'count'.
timestamp_cols = [f"d_{i}" for i in range(1, MAX_TRAIN_TIMESTAMP + 1)]
sales_df = unpivot_sales_df(SALES_TRAIN_EVALUATION, timestamp_cols)

# The following ops are memory intensive so downcast datatypes to save on mem
sales_df = cast_sales_data_types(sales_df)

## Calendar data

In [None]:
# Process calendar data
# Works on a new frame; CALENDAR_DATA itself is left untouched.
calendar_df = select_format_calendar_features(CALENDAR_DATA)

# Map event categories
# The maps are built from Series.unique(), so a missing (NaN) event value
# receives an id of its own; later cells look that id up via np.nan.
event_name_map, id_event_name_map = map_category_ids(calendar_df, "event_name")
event_type_map, id_event_type_map = map_category_ids(calendar_df, "event_type")

# Cast datatypes
# Prints a "does not exist" notice for listed columns that are absent.
calendar_df = cast_calendar_data_types(calendar_df)

## Price data

In [None]:
# Process price data
# Reuse the maps built from the sales table so item and store ids agree across
# both tables. SELL_PRICES is overwritten in place; values missing from a map
# become NaN.
item_id_map, id_item_map = map_category_ids(SELL_PRICES, "item_id", item_id_map)
store_id_map, id_store_map = map_category_ids(SELL_PRICES, "store_id", store_id_map)

# Casts in place, so price_df and SELL_PRICES are the same object.
price_df = cast_price_data_types(SELL_PRICES)

## Combine and engineer new features

In [None]:
# Merge
# Left-join calendar (on 'd') and prices (on item, store, week) onto the sales rows.
data = merge_sales_calendar_price_data(sales_df, calendar_df, price_df)
# 'wm_yr_wk' was the join key for the price table; it is dropped after the merge.
data = data.drop("wm_yr_wk", axis=1)

# Collect garbage after the merge (the return value is discarded).
_ = gc.collect()

In [None]:
# Feature engineering

# Aggregate sales and price features
# NOTE(review): the three aggregate features below are commented out, so this
# cell currently only triggers garbage collection — confirm whether they were
# disabled deliberately.
# data = av_item_state_prices(data)
# data = av_dept_state_price(data)
# data = av_store_dept_sales(data)

_ = gc.collect()

In [None]:
# Add lagged event features
# The fill values are the ids that map_category_ids assigned to missing (NaN)
# event names / types.
# NOTE(review): a dict lookup with np.nan only succeeds if the stored key is
# the np.nan object itself (NaN != NaN) — confirm this holds for the maps
# built from the calendar CSV.
data = lagged_event_features(
    data=data,
    periods=EVENT_LAG_PERIODS,
    no_event_name_category=event_name_map[np.nan],
    no_event_type_category=event_type_map[np.nan],
)

_ = gc.collect()

In [None]:
# Lagged and rolling price features
# Price differences over PRICE_LAG_PERIODS rows within each (item, store) series.
data = diff_price_features(
    data=data,
    periods=PRICE_LAG_PERIODS
)
# Rolling price statistics for every window / aggregation combination.
data = rolling_price_features(
    data=data,
    windows=PRICE_ROLLING_WINDOWS,
    agg_funcs=PRICE_ROLLING_AGG_FUNCS
)

_ = gc.collect()

In [None]:
# Lagged and rolling target features
# Must run before the rolling step: it creates the 'count_lag_<p>' columns.
data = lagged_target_features(
    data=data,
    periods=SALES_LAG_PERIODS
)
# Rolls over 'count_lag_28' (the default lag), which therefore has to be one
# of SALES_LAG_PERIODS.
data = lagged_rolling_target_features(
    data=data,
    windows=SALES_ROLLING_WINDOWS,
    agg_funcs=SALES_ROLLING_AGG_FUNCS
)

_ = gc.collect()

In [None]:
# Item in stock feature
# Adds 'item_on_sale': 1 where a sell price is present, 0 where it is NaN.
data = item_on_sale_feature(data)

# Datetime features
# datetime_features must run first: fourier_features reads the 'weekday',
# 'month' and 'quarter' columns it creates.
data = datetime_features(data)
data = fourier_features(data)

_ = gc.collect()

## Save

In [None]:
# Save to parquet
# OUTPUT_BASE_BATH (sic) is the output directory constant defined with the input paths.
# NOTE(review): several feature columns are float16 — confirm the installed
# parquet engine can serialise half-precision floats.
data.to_parquet(f"{OUTPUT_BASE_BATH}/m5-acc-train.parquet")

In [None]:
# Save category maps
def save_category_map(category_map: dict, name: str) -> None:
    """
    Write a category mapping to ``<OUTPUT_BASE_BATH>/<name>.json``.

    :param category_map: Mapping to serialise with ``json.dump``.
    :param name: File name without the '.json' extension.
    """
    destination = f"{OUTPUT_BASE_BATH}/{name}.json"
    with open(destination, "w") as handle:
        json.dump(category_map, handle)

# Each value -> id map paired with the file name (without extension) it is saved under.
maps_to_names = [
    (item_id_map, "item_id_map"),
    (dept_id_map, "dept_id_map"),
    (cat_id_map, "cat_id_map"),
    (store_id_map, "store_id_map"),
    (state_id_map, "state_id_map"),
    (event_name_map, "event_name_map"),
    (event_type_map, "event_type_map")
]
# Writes one JSON file per map into the output directory.
for category_map, map_name in maps_to_names:
    save_category_map(category_map, map_name)

# Test data

In [None]:
def construct_test_df(
    d_start: int,
    d_end: int,
    item_category_ids: pd.DataFrame,
    calendar_df: pd.DataFrame,
    price_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Build the test frame: one row per item/store entry and day in the range.

    :param d_start: First day index of the test period (inclusive).
    :param d_end: Last day index of the test period (inclusive).
    :param item_category_ids: One row per series with its id columns; every
        row is repeated for each day of the range.
    :param calendar_df: Calendar dataframe to join on 'd'.
    :param price_df: Price dataframe to join on item, store and week.
    :return: New dataframe with calendar and price columns merged in.
    """
    d_range = list(range(d_start, d_end + 1))
    d_range_df = pd.DataFrame(d_range, columns=["d"])
    # Use the frame that was passed in rather than the notebook-level
    # ITEM_ID_CATEGORIES global, so the argument is actually honoured.
    test_df = pd.merge(item_category_ids, d_range_df, how="cross")
    # The test frame has no 'count' column, so the cast helper reports and skips it.
    test_df = cast_sales_data_types(test_df)
    test_df = merge_sales_calendar_price_data(test_df, calendar_df, price_df)
    return test_df

In [None]:
def add_lagged_event_features_from_training_set(
    test_df: pd.DataFrame,
    train_df: pd.DataFrame,
    periods: list[int],
    no_event_name_category: int,
    no_event_type_category: int,
) -> pd.DataFrame:
    """
    Compute lagged event features for the test rows using trailing training rows.

    The last ``max(periods)`` training days before START_TEST_TIMESTAMP are
    appended to the test rows so that shifts can reach back into the training
    period; they are removed again before returning.

    :param test_df: Test dataframe with 'item_id', 'store_id', 'd',
        'event_name' and 'event_type' columns.
    :param train_df: Training dataframe with the same five columns.
    :param periods: Row offsets to shift the event columns by.
    :param no_event_name_category: Fill value for missing lagged event names.
    :param no_event_type_category: Fill value for missing lagged event types.
    :return: Rows with ``d >= START_TEST_TIMESTAMP`` including the lag columns.
    """
    # Keep only the columns and the trailing days needed for the shifts.
    history_columns = ["item_id", "store_id", "d", "event_name", "event_type"]
    first_needed_day = START_TEST_TIMESTAMP - max(periods)
    recent_history = train_df.loc[train_df["d"] >= first_needed_day, history_columns]

    combined = pd.concat([test_df, recent_history])
    combined = lagged_event_features(
        data=combined,
        periods=periods,
        no_event_name_category=no_event_name_category,
        no_event_type_category=no_event_type_category
    )

    # Discard the borrowed training rows again.
    in_test_period = combined["d"] >= START_TEST_TIMESTAMP
    return combined[in_test_period]

In [None]:
def add_lagged_target_features_from_training_set(
    test_df: pd.DataFrame,
    train_df: pd.DataFrame,
    periods: list[int],
) -> pd.DataFrame:
    """
    Attach lagged sales counts from the training data to the test rows.

    For every period ``p`` the column 'count_lag_<p>' receives the training
    'count' of the same item and store on day ``d - p`` (NaN when the training
    data has no such row), stored as float16.

    :param test_df: Test dataframe with 'item_id', 'store_id' and 'd' columns.
        It is not modified.
    :param train_df: Training dataframe with 'item_id', 'store_id', 'd' and
        'count' columns.
    :param periods: Lags in days.
    :return: New dataframe with one 'count_lag_<p>' column per period.
    """
    # Select columns needed for merge
    history = train_df[["item_id", "store_id", "d", "count"]].rename(columns={"d": "d_train"})

    # Merge onto test
    for p in tqdm(periods):
        lag_key = f"d_lag_{p}"
        lag_column = f"count_lag_{p}"
        test_df = (
            test_df
            # assign() returns a new frame, so the caller's dataframe never
            # gains the temporary lookup column.
            .assign(**{lag_key: test_df["d"] - p})
            .merge(
                history.rename(columns={"count": lag_column}),
                left_on=["item_id", "store_id", lag_key],
                right_on=["item_id", "store_id", "d_train"],
                how="left",
            )
            .drop(columns=[lag_key, "d_train"])
        )
        test_df[lag_column] = test_df[lag_column].astype(np.float16)
    return test_df


def add_lagged_rolling_target_features_from_training_set(
    test_df: pd.DataFrame,
    train_df: pd.DataFrame,
    windows: list[int],
    agg_funcs: list[str],
    lag: int = 28,
) -> pd.DataFrame:
    """
    Attach rolling statistics of the lagged target to the test rows.

    Rolling statistics of 'count' are computed on the training data and each
    test row receives the values of day ``d - lag``, under the column names
    'count_lag_<lag>_rolling_<func>_window_<w>'.

    :param test_df: Test dataframe with 'item_id', 'store_id' and 'd' columns.
        It is not modified.
    :param train_df: Training dataframe with 'item_id', 'store_id', 'd' and
        'count' columns.
    :param windows: Window lengths in rows.
    :param agg_funcs: Aggregation names applied to each window.
    :param lag: Lag in days between a test row and the training row it reads.
    :return: New dataframe with the rolling statistic columns added.
    """
    # Select columns needed for merge
    train_df_slice = train_df[["item_id", "store_id", "d", "count"]]

    # Compute rolling features
    # NOTE(review): the statistics are computed over every training row
    # although only the rows at d - lag are joined below.
    rolling_features_df = rolling_features(
        data=train_df_slice,
        feature_column="count",
        windows=windows,
        agg_funcs=agg_funcs,
    )

    # Rename cols to include lag. Only the leading "count" is rewritten, so an
    # aggregation that is itself called "count" keeps its name in the suffix.
    rolling_prefix = "count_rolling_"
    lagged_prefix = f"count_lag_{lag}_rolling_"
    old_new_col_name_map = {
        c: lagged_prefix + c[len(rolling_prefix):]
        for c in rolling_features_df.columns
        if c.startswith(rolling_prefix)
    }
    rolling_features_df = (
        rolling_features_df
        .rename(columns=old_new_col_name_map)
        .rename(columns={"d": "d_train"})
        .drop("count", axis=1)
    )

    # Merge onto test. assign() returns a new frame, so the caller's dataframe
    # never gains the temporary lookup column.
    lag_key = f"d_lag_{lag}"
    merged = test_df.assign(**{lag_key: test_df["d"] - lag}).merge(
        rolling_features_df,
        left_on=["item_id", "store_id", lag_key],
        right_on=["item_id", "store_id", "d_train"],
        how="left",
    )
    return merged.drop(columns=[lag_key, "d_train"])

In [None]:
def add_sale_price_diffs_from_training_set(
    test_df: pd.DataFrame,
    train_df: pd.DataFrame,
    periods: list[int],
) -> pd.DataFrame:
    """
    Compute sell-price differences for the test rows using trailing training rows.

    The last ``max(periods)`` training days before START_TEST_TIMESTAMP are
    appended to the test rows so that the differences can reach back into the
    training period; they are removed again before returning.

    :param test_df: Test dataframe with 'item_id', 'store_id', 'd' and
        'sell_price' columns.
    :param train_df: Training dataframe with the same four columns.
    :param periods: Row offsets for the price differences.
    :return: Rows with ``d >= START_TEST_TIMESTAMP`` including the
        'sell_price_diff_<p>' columns.
    """
    # Select data needed for merge
    history_columns = ["item_id", "store_id", "d", "sell_price"]
    first_needed_day = START_TEST_TIMESTAMP - max(periods)
    recent_history = train_df.loc[train_df["d"] >= first_needed_day, history_columns]
    combined = pd.concat([test_df, recent_history])

    # Cast types: restore the compact dtypes after the concatenation.
    for column, target_dtype in (
        ("item_id", "category"),
        ("store_id", "category"),
        ("d", np.int16),
        ("sell_price", np.float16),
    ):
        combined[column] = combined[column].astype(target_dtype)

    # Compute diffs, then discard the borrowed training rows again.
    combined = diff_price_features(combined, periods=periods)
    in_test_period = combined["d"] >= START_TEST_TIMESTAMP
    return combined[in_test_period]


def add_rolling_price_features_from_training_set(
    test_df: pd.DataFrame,
    train_df: pd.DataFrame,
    windows: list[int],
    agg_funcs: list[str],
) -> pd.DataFrame:
    """
    Compute rolling sell-price statistics for the test rows using trailing training rows.

    The last ``max(windows)`` training days before START_TEST_TIMESTAMP are
    appended to the test rows so that the windows can reach back into the
    training period; they are removed again before returning.

    :param test_df: Test dataframe with 'item_id', 'store_id', 'd' and
        'sell_price' columns.
    :param train_df: Training dataframe with the same four columns.
    :param windows: Window lengths in rows.
    :param agg_funcs: Aggregation names applied to each window.
    :return: Rows with ``d >= START_TEST_TIMESTAMP`` including the rolling
        price columns.
    """
    # Select columns needed for merge. Read them from the train_df argument,
    # not from the notebook-level `data` global.
    train_df_slice = train_df[["item_id", "store_id", "d", "sell_price"]]
    train_df_slice = train_df_slice[train_df_slice["d"] >= START_TEST_TIMESTAMP - max(windows)]
    test_df = pd.concat([test_df, train_df_slice])

    # Restore the compact dtypes after the concatenation.
    test_df["item_id"] = test_df["item_id"].astype("category")
    test_df["store_id"] = test_df["store_id"].astype("category")
    test_df["d"] = test_df["d"].astype(np.int16)
    test_df["sell_price"] = test_df["sell_price"].astype(np.float16)

    # Compute rolling price features, then discard the borrowed training rows.
    test_df = rolling_price_features(test_df, windows=windows, agg_funcs=agg_funcs)
    test_df = test_df[test_df["d"] >= START_TEST_TIMESTAMP]
    return test_df

In [None]:
# First and last day index of the test frame (inclusive, 28 days). The start
# equals START_TEST_TIMESTAMP from the constants cell.
test_d_start, test_d_end = 1942, 1969
test_df = construct_test_df(
    d_start=test_d_start,
    d_end=test_d_end,
    item_category_ids=ITEM_ID_CATEGORIES,
    calendar_df=calendar_df,
    price_df=price_df,
)
# 'wm_yr_wk' was the join key for the price table; it is dropped after the merge.
test_df = test_df.drop("wm_yr_wk", axis=1)

In [None]:
# Lagged events features
# The fill values are the ids that map_category_ids assigned to missing (NaN)
# event names / types.
# NOTE(review): a dict lookup with np.nan only succeeds if the stored key is
# the np.nan object itself — confirm this holds for the calendar maps.
test_df = add_lagged_event_features_from_training_set(
    test_df=test_df,
    train_df=data,
    periods=EVENT_LAG_PERIODS,
    no_event_name_category=event_name_map[np.nan],
    no_event_type_category=event_type_map[np.nan]
)

# Lagged sales features
# Counts are looked up in the training frame at day d - p for each lag p.
test_df = add_lagged_target_features_from_training_set(
    test_df=test_df,
    train_df=data,
    periods=SALES_LAG_PERIODS,
)
# Rolling statistics of the training counts, read at day d - 28.
test_df = add_lagged_rolling_target_features_from_training_set(
    test_df=test_df,
    train_df=data,
    windows=SALES_ROLLING_WINDOWS,
    agg_funcs=SALES_ROLLING_AGG_FUNCS,
    lag=28
)

# Lagged price features
# Trailing training rows are appended temporarily so that differences and
# windows can reach back before the first test day.
test_df = add_sale_price_diffs_from_training_set(
    test_df=test_df,
    train_df=data,
    periods=PRICE_LAG_PERIODS,
)
test_df = add_rolling_price_features_from_training_set(
    test_df=test_df,
    train_df=data,
    windows=PRICE_ROLLING_WINDOWS,
    agg_funcs=PRICE_ROLLING_AGG_FUNCS
)

# Item in stock feature
test_df = item_on_sale_feature(test_df)

# Datetime features
# datetime_features must run first: fourier_features reads the columns it creates.
test_df = datetime_features(test_df)
test_df = fourier_features(test_df)

In [None]:
# Save the engineered test frame next to the training parquet file.
# NOTE(review): several feature columns are float16 — confirm the installed
# parquet engine can serialise half-precision floats.
test_df.to_parquet(f"{OUTPUT_BASE_BATH}/m5-acc-test.parquet")