In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Base directories: where the competition CSVs are read from and where
# notebook outputs are meant to be written.
INPUT_BASE_PATH = "/kaggle/input/m5-forecasting-accuracy"
OUTPUT_BASE_PATH = "/kaggle/working"
# Backward-compatible alias for the original, misspelled constant name.
OUTPUT_BASE_BATH = OUTPUT_BASE_PATH

# Raw competition tables. Treat these as read-only inputs; derived frames get
# their own names further down.
CALENDAR_DATA = pd.read_csv(f"{INPUT_BASE_PATH}/calendar.csv")
# SELL_PRICES = pd.read_csv(f"{INPUT_BASE_PATH}/sell_prices.csv")
# SALES_TRAIN_EVALUATION = pd.read_csv(f"{INPUT_BASE_PATH}/sales_train_evaluation.csv")
SALES_TRAIN_VALIDATION = pd.read_csv(f"{INPUT_BASE_PATH}/sales_train_validation.csv")
SAMPLE_SUBMISSION = pd.read_csv(f"{INPUT_BASE_PATH}/sample_submission.csv")

# PREPROCESSING & FEATURE ENGINEERING

In [None]:
def flatten_sales_df(sales_df: pd.DataFrame, timestamp_cols: list[str]) -> pd.DataFrame:
    """Reshape wide sales data (one column per day) into long format.

    Args:
        sales_df: Wide frame holding the id columns ``item_id``, ``dept_id``,
            ``cat_id``, ``store_id`` and ``state_id`` plus one column per entry
            of ``timestamp_cols``. Each combination of the five id columns is
            expected to occur in exactly one row; a group with several rows
            fails when the two output column names are assigned.
        timestamp_cols: Names of the day columns to unpivot, e.g. ``"d_1000"``.

    Returns:
        A frame with the columns ``d`` (the integer that follows the ``d_``
        prefix), ``count`` (the cell value for that day) and the five id
        columns. Every group contributes ``len(timestamp_cols)`` rows whose
        index restarts at 0. If ``sales_df`` yields no groups, an empty frame
        with those columns is returned.
    """
    id_cols = ["item_id", "dept_id", "cat_id", "store_id", "state_id"]
    long_frames = []

    # Group the frame that was passed in. The original grouped a notebook-level
    # global, which made the result depend on whatever that global held.
    for key, group_df in tqdm(sales_df.groupby(id_cols)):
        (item, dept, cat, store, state) = key

        # Transposing turns the day columns into rows; reset_index exposes the
        # former column names as a regular column.
        long_df = group_df[timestamp_cols].T.reset_index()
        long_df.columns = ["d", "count"]
        long_df = long_df.assign(
            # removeprefix drops exactly the "d_" prefix; lstrip("d_") would
            # strip any leading run of the characters 'd' and '_'.
            d=long_df["d"].apply(lambda label: int(label.removeprefix("d_"))),
            item_id=item,
            dept_id=dept,
            cat_id=cat,
            store_id=store,
            state_id=state,
        )
        long_frames.append(long_df)

    # pd.concat raises on an empty list, so return an empty frame with the
    # expected columns instead.
    if not long_frames:
        return pd.DataFrame(columns=["d", "count", *id_cols])
    return pd.concat(long_frames)


def select_format_calendar_features(calendar_data: pd.DataFrame) -> pd.DataFrame:
    """Return a trimmed, reformatted copy of the calendar table.

    Drops ``wm_yr_wk``, ``weekday``, ``event_name_2`` and ``event_type_2``,
    converts the ``d`` labels (e.g. ``"d_12"``) to integers, and renames
    ``event_name_1`` / ``event_type_1`` to ``event_name`` / ``event_type``.
    The frame passed in is left unmodified.
    """
    unused_cols = ["wm_yr_wk", "weekday", "event_name_2", "event_type_2"]
    new_names = {"event_name_1": "event_name", "event_type_1": "event_type"}

    # drop() returns a new frame, so the caller's table is never touched.
    trimmed = calendar_data.drop(columns=unused_cols)

    # Strip the leading "d_" characters and keep the day number as an int.
    day_numbers = trimmed["d"].apply(lambda d: int(d.lstrip("d_")))

    return trimmed.assign(d=day_numbers).rename(columns=new_names)


def get_calendar_event_mapping(calendar_data: pd.DataFrame):
    """Build integer codes for the ``event_name`` and ``event_type`` columns.

    Each distinct value of a column, as returned by ``Series.unique()``
    (order of first appearance), is assigned a code counting up from 0.

    Returns:
        A pair ``(event_name_map, event_type_map)`` of dicts mapping each
        distinct value to its code.
    """

    def _codes_for(column: str) -> dict:
        # One code per distinct value, numbered in the order unique() yields them.
        distinct = calendar_data[column].unique()
        return dict(zip(distinct, range(len(distinct))))

    return _codes_for("event_name"), _codes_for("event_type")
    

def merge_sales_and_calendar_data(sales_df: pd.DataFrame, calendar_data: pd.DataFrame) -> pd.DataFrame:
    """Left-join the calendar columns onto the sales rows using the ``d`` column.

    Every row of ``sales_df`` is kept; rows whose ``d`` has no match in
    ``calendar_data`` get missing values in the calendar columns.
    """
    merged = pd.merge(sales_df, calendar_data, how="left", on="d")
    return merged

In [None]:
# Day columns to unpivot: d_1000 through d_1913 (the range end is exclusive).
timestamp_cols = [f"d_{day}" for day in range(1000, 1914)]

# Sales: reshape the wide table into one row per id combination and day.
train_df = flatten_sales_df(SALES_TRAIN_VALIDATION, timestamp_cols)

# Calendar: trim and rename the columns, then replace every event label with
# the integer code produced by get_calendar_event_mapping.
calendar_data = select_format_calendar_features(CALENDAR_DATA)
event_name_map, event_type_map = get_calendar_event_mapping(calendar_data)
calendar_data = calendar_data.assign(
    event_name=calendar_data["event_name"].map(event_name_map),
    event_type=calendar_data["event_type"].map(event_type_map),
)

# Attach the calendar columns to every sales row via the day number "d".
train_df = merge_sales_and_calendar_data(train_df, calendar_data)


In [None]:
# Rolling window validation
#
# N_FOLDS validation windows of HORIZON days each. Consecutive windows start
# HORIZON - OVERLAP days apart, so neighbouring windows share OVERLAP days.

MAX_VALIDATION_TIMESTAMP = 1913
HORIZON = 28
OVERLAP = 14
N_FOLDS = 10
# Start of the first window: step back from the maximum by the total number of
# days spanned by all folds together.
MIN_VALIDATION_TIMESTAMP = MAX_VALIDATION_TIMESTAMP - (N_FOLDS * HORIZON - (N_FOLDS - 1) * OVERLAP)

for fold_idx in range(N_FOLDS):
    start = MIN_VALIDATION_TIMESTAMP + (HORIZON - OVERLAP) * fold_idx
    stop = HORIZON + start

    # Train on every day before the window; validate on the half-open window
    # [start, stop).
    # NOTE(review): because `stop` is exclusive, the last window ends at
    # MAX_VALIDATION_TIMESTAMP - 1 -- confirm that day MAX_VALIDATION_TIMESTAMP
    # is meant to be left out of validation.
    train_data = train_df.loc[train_df["d"] < start]
    valid_data = train_df.loc[train_df["d"].ge(start) & train_df["d"].lt(stop)]

    # Stop at the third fold so that its train_data / valid_data stay defined
    # after the loop; the cells below print them.
    # NOTE(review): this looks like a debugging leftover -- confirm before
    # running all folds.
    if fold_idx == 2:
        break

    # Folds that are not kept are released before the next one is built.
    del train_data, valid_data

In [None]:
# First and last day number ("d") of the training and validation splits that
# the fold loop left in scope.
for split in (train_data, valid_data):
    print(split["d"].min(), split["d"].max())

In [None]:
# First and last day number ("d") of the current training and validation splits.
# NOTE(review): same check as the preceding cell; on a top-to-bottom run it
# prints the same values -- confirm whether this copy is still needed.
for split in (train_data, valid_data):
    print(split["d"].min(), split["d"].max())

In [None]:
# First and last day number ("d") of the current training and validation splits.
# NOTE(review): same check as the preceding cells; on a top-to-bottom run it
# prints the same values -- confirm whether this copy is still needed.
for split in (train_data, valid_data):
    print(split["d"].min(), split["d"].max())