In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from tqdm import tqdm

import lightgbm as lgbm

In [2]:
# Run mode switch: False trains on one validation fold and enables the error-analysis plots;
# True trains on all loaded data (longer history, no validation set).
SUBMISSION_RUN = False

In [3]:
# Kaggle input / output locations.
INPUT_BASE_PATH = "/kaggle/input/m5-forecasting-accuracy"
# NOTE(review): "BATH" looks like a typo for "PATH"; the name is kept because the
# final export cell references it under this spelling.
OUTPUT_BASE_BATH = "/kaggle/working"

# Raw competition tables, loaded once and treated as read-only inputs.
CALENDAR_DATA = pd.read_csv(f"{INPUT_BASE_PATH}/calendar.csv")
SELL_PRICES = pd.read_csv(f"{INPUT_BASE_PATH}/sell_prices.csv")
SALES_TRAIN_EVALUATION = pd.read_csv(f"{INPUT_BASE_PATH}/sales_train_evaluation.csv")

# Sample submission: its row order is remembered so the final file can be put back in
# that order, and it is split by whether the row id contains "validation" or "evaluation".
SAMPLE_SUBMISSION = pd.read_csv(f"{INPUT_BASE_PATH}/sample_submission.csv")
SUBMISSION_INDEX = SAMPLE_SUBMISSION.set_index("id").index
VAL_SUBMISSION = SAMPLE_SUBMISSION[SAMPLE_SUBMISSION["id"].str.contains("validation")]
EVAL_SUBMISSION = SAMPLE_SUBMISSION[SAMPLE_SUBMISSION["id"].str.contains("evaluation")]

In [4]:
# Constants
# Last day index (d_1941) selected from the sales table; also the default upper bound
# used when building the rolling validation splits.
MAX_TIMESTAMP = 1941

# PREPROCESSING & FEATURE ENGINEERING

In [5]:
# Extract categorical mappings
def get_unique_value_id_map(df: pd.DataFrame, col_name: str):
    """Map every distinct value of ``df[col_name]`` to an integer code.

    Codes start at 0 and follow the order in which values first appear
    in the column.
    """
    distinct_values = df[col_name].unique()
    return dict(zip(distinct_values, range(len(distinct_values))))


def map_category_ids(sales_df: pd.DataFrame, column_name: str, submission_run: bool):
    """Replace the values of ``sales_df[column_name]`` with integer codes, in place.

    Returns a ``(value -> code, code -> value)`` pair. The inverse mapping is
    replaced by ``None`` when ``submission_run`` is true.
    """
    value_to_code = get_unique_value_id_map(sales_df, column_name)
    # The caller's frame is mutated; only the lookup tables are returned.
    sales_df[column_name] = sales_df[column_name].map(value_to_code)
    if submission_run:
        return value_to_code, None
    code_to_value = {code: value for value, code in value_to_code.items()}
    return value_to_code, code_to_value

In [6]:
# Process sales data
def unpivot_sales_df(sales_df: pd.DataFrame, timestamp_cols: list[str]) -> pd.DataFrame:
    """Reshape the wide sales table into one row per (series, day).

    Parameters
    ----------
    sales_df:
        Wide table with the id columns ``item_id``, ``dept_id``, ``cat_id``,
        ``store_id``, ``state_id`` and one column per day named ``d_<n>``.
    timestamp_cols:
        The ``d_<n>`` columns to keep.

    Returns
    -------
    Long table with the five id columns, an integer day index ``d`` and the
    value column ``count``. The input frame is not modified.
    """
    long_df = sales_df.melt(
        id_vars=["item_id", "dept_id", "cat_id", "store_id", "state_id"],
        value_vars=timestamp_cols,
        var_name="d",
        value_name="count",
    )
    # Remove exactly the "d_" prefix. `str.lstrip("d_")` would strip any leading
    # run of the characters "d" and "_", which only works by accident.
    long_df["d"] = long_df["d"].str.replace(r"^d_", "", regex=True).astype("int64")
    return long_df
    

In [7]:
# Process calendar data
def select_format_calendar_features(calendar_data: pd.DataFrame) -> pd.DataFrame:
    """Select and normalise the calendar columns used as features.

    Drops the textual ``weekday`` column and the secondary event columns,
    converts ``d`` from ``"d_<n>"`` strings to integers, and renames
    ``event_name_1`` / ``event_type_1`` / ``wday`` to ``event_name`` /
    ``event_type`` / ``weekday``.

    Returns a new frame; ``calendar_data`` itself is left untouched.
    """
    # Drop columns
    cols_to_drop = ["weekday", "event_name_2", "event_type_2"]

    # Remove exactly the "d_" prefix. `str.lstrip("d_")` treats its argument as a
    # set of characters, not as a prefix.
    day_index = calendar_data["d"].str.replace(r"^d_", "", regex=True).astype("int64")

    # `drop`, `assign` and `rename` each return a new frame, so no explicit copy is needed.
    return (
        calendar_data
        .drop(columns=cols_to_drop)
        .assign(d=day_index)
        .rename(columns={
            "event_name_1": "event_name",
            "event_type_1": "event_type",
            "wday": "weekday",
        })
    )

In [8]:
# Process sell price data
def extract_state_and_dept_id(sell_prices: pd.DataFrame) -> pd.DataFrame:
    """Return ``sell_prices`` with added ``state_id`` and ``dept_id`` columns.

    ``state_id`` is the leading run of capital letters of ``store_id``;
    ``dept_id`` is the leading ``<LETTERS>_<digits>`` part of ``item_id``.
    The input frame is not modified.
    """
    state_prefix = sell_prices["store_id"].str.extract(r"(^[A-Z]+)", expand=False)
    dept_prefix = sell_prices["item_id"].str.extract(r"(^[A-Z]+_[0-9]+)", expand=False)
    return sell_prices.assign(state_id=state_prefix, dept_id=dept_prefix)


def get_item_state_level_sell_prices(sell_prices: pd.DataFrame) -> pd.DataFrame:
    """Average ``sell_price`` per (item_id, state_id, wm_yr_wk).

    Returns a single-column frame (``av_item_state_sell_price``) whose index
    holds the three grouping keys.
    """
    group_keys = ["item_id", "state_id", "wm_yr_wk"]
    mean_price = sell_prices.groupby(group_keys)["sell_price"].mean()
    return mean_price.to_frame(name="av_item_state_sell_price")


def get_dept_state_level_sell_prices(sell_prices: pd.DataFrame) -> pd.DataFrame:
    """Average ``sell_price`` per (dept_id, state_id, wm_yr_wk).

    Returns a flat frame with the three grouping keys as ordinary columns
    plus ``av_dept_state_sell_price``.
    """
    group_keys = ["dept_id", "state_id", "wm_yr_wk"]
    mean_price = sell_prices.groupby(group_keys)["sell_price"].mean()
    return mean_price.rename("av_dept_state_sell_price").reset_index()


def add_average_item_dept_sell_prices(sell_prices: pd.DataFrame) -> pd.DataFrame:
    """Attach the item/state and dept/state average prices to every price row.

    Both averages are computed from the incoming ``sell_prices`` and then
    left-joined back onto it, adding ``av_item_state_sell_price`` and
    ``av_dept_state_sell_price``.
    """
    item_level = get_item_state_level_sell_prices(sell_prices)
    dept_level = get_dept_state_level_sell_prices(sell_prices)

    with_item_average = sell_prices.merge(
        right=item_level,
        on=["item_id", "state_id", "wm_yr_wk"],
        how="left",
    )
    with_both_averages = with_item_average.merge(
        right=dept_level,
        on=["dept_id", "state_id", "wm_yr_wk"],
        how="left",
    )
    return with_both_averages


In [9]:
# Merge
def merge_sales_and_calendar_data(sales_df: pd.DataFrame, calendar_data: pd.DataFrame) -> pd.DataFrame:
    """Left-join the calendar columns onto the sales rows using the day index ``d``."""
    joined = pd.merge(left=sales_df, right=calendar_data, how="left", on="d")
    return joined

    
def merge_sales_and_price_data(sales_df: pd.DataFrame, sell_prices: pd.DataFrame) -> pd.DataFrame:
    """Left-join store-level prices and the two average-price columns onto ``sales_df``.

    Three joins are applied in order:
    1. ``sell_price`` on (item_id, store_id, wm_yr_wk);
    2. ``av_item_state_sell_price`` on (item_id, state_id, wm_yr_wk);
    3. ``av_dept_state_sell_price`` on (dept_id, state_id, wm_yr_wk).

    The average tables are de-duplicated on their keys first so the joins
    cannot multiply sales rows.
    """
    store_keys = ["item_id", "store_id", "wm_yr_wk"]
    item_keys = ["item_id", "state_id", "wm_yr_wk"]
    dept_keys = ["dept_id", "state_id", "wm_yr_wk"]

    store_prices = sell_prices[store_keys + ["sell_price"]]
    item_state_prices = (
        sell_prices[item_keys + ["av_item_state_sell_price"]]
        .drop_duplicates(subset=item_keys)
    )
    dept_state_prices = (
        sell_prices[dept_keys + ["av_dept_state_sell_price"]]
        .drop_duplicates(subset=dept_keys)
    )

    joined = sales_df
    for price_table, join_keys in (
        (store_prices, store_keys),
        (item_state_prices, item_keys),
        (dept_state_prices, dept_keys),
    ):
        joined = joined.merge(right=price_table, on=join_keys, how="left")
    return joined


def fill_missing_sell_prices(df: pd.DataFrame) -> pd.DataFrame:
    """Fill gaps in the price columns from coarser averages, in place.

    ``sell_price`` falls back to the item/state average and then to the
    dept/state average; the item/state average itself falls back to the
    dept/state average. The same (mutated) frame is returned.
    """
    item_average = df["av_item_state_sell_price"]
    dept_average = df["av_dept_state_sell_price"]

    # sell_price is filled from the not-yet-filled item average first.
    df["sell_price"] = df["sell_price"].fillna(item_average).fillna(dept_average)
    df["av_item_state_sell_price"] = item_average.fillna(dept_average)
    return df
    
    
def cast_types(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the model feature columns cast to their dtypes.

    The id-like, calendar and event columns become ``category``; ``item_id``
    and the three price columns become ``float``. Other columns are unchanged.
    """
    categorical_columns = (
        "dept_id",
        "cat_id",
        "store_id",
        "state_id",
        "weekday",
        "month",
        "year",
        "event_name",
        "event_type",
    )
    float_columns = (
        "item_id",
        "sell_price",
        "av_item_state_sell_price",
        "av_dept_state_sell_price",
    )
    dtype_by_column = {name: "category" for name in categorical_columns}
    dtype_by_column.update({name: "float" for name in float_columns})
    return df.astype(dtype_by_column)
    

# EVALUATION DATASET -- PRIVATE LEADERBOARD

In [10]:
# Store category info for each item id
# Taken from the raw sales table, i.e. before any integer encoding is applied to
# the working frame; reused later to build the forecast rows.
item_id_categories = SALES_TRAIN_EVALUATION[["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]]

In [11]:
# First day column to load: submission runs start at d_500, validation runs at d_1000.
start_t = 500 if SUBMISSION_RUN else 1000
timestamp_cols = [f"d_{i}" for i in range(start_t, MAX_TIMESTAMP + 1)]

# Process sales data
sales_df = unpivot_sales_df(SALES_TRAIN_EVALUATION, timestamp_cols)

# Process calendar data
calendar_df = select_format_calendar_features(CALENDAR_DATA)

# Process price data
price_df = extract_state_and_dept_id(SELL_PRICES)
price_df = add_average_item_dept_sell_prices(price_df)

# Merge
# Calendar first: it supplies `wm_yr_wk`, which the price merge joins on.
data = merge_sales_and_calendar_data(sales_df, calendar_df)
data = merge_sales_and_price_data(data, price_df)

# Fill missing price data
data = fill_missing_sell_prices(data)

In [None]:
# Map categories to integers
# Each call encodes the column of `data` in place and returns (value -> code, code -> value);
# the second element is None when SUBMISSION_RUN is True.
item_id_map, id_item_map = map_category_ids(data, "item_id", SUBMISSION_RUN)
dept_id_map, id_dept_map = map_category_ids(data, "dept_id", SUBMISSION_RUN)
cat_id_map, id_cat_map = map_category_ids(data, "cat_id", SUBMISSION_RUN)
store_id_map, id_store_map = map_category_ids(data, "store_id", SUBMISSION_RUN)
state_id_map, id_state_map = map_category_ids(data, "state_id", SUBMISSION_RUN)
event_name_map, id_event_name_map = map_category_ids(data, "event_name", SUBMISSION_RUN)
event_type_map, id_event_type_map = map_category_ids(data, "event_type", SUBMISSION_RUN)

# Set types
data = cast_types(data)

## LightGBM Model

In [None]:
# Rolling window validation

def time_series_split(
    df: pd.DataFrame,
    n_folds: int = 5,
    horizon: int = 28,
    overlap: int = 14,
    max_timestamp: int = MAX_TIMESTAMP,
):
    """Yield ``(train, valid)`` frames for a rolling-window validation.

    Each validation window covers ``horizon`` consecutive days of the ``d``
    column, and consecutive windows share ``overlap`` days. The windows are
    laid out so that the last one ends on ``max_timestamp`` (inclusive). The
    training frame of a fold is every row strictly before that fold's window.

    Parameters
    ----------
    df:
        Frame with an integer day column ``d``.
    n_folds:
        Number of folds to generate.
    horizon:
        Length of each validation window in days.
    overlap:
        Number of days shared by two consecutive validation windows.
    max_timestamp:
        Last day (inclusive) that may appear in a validation window.

    Yields
    ------
    ``(train_data, valid_data)`` as independent copies, so callers can add
    columns to them without touching ``df``.
    """
    # Validation windows advance by this many days from one fold to the next.
    step = horizon - overlap
    # `max_timestamp` is inclusive, so the exclusive end of the last window is
    # max_timestamp + 1. Without the +1 the final day is never validated.
    first_start = (max_timestamp + 1) - horizon - (n_folds - 1) * step

    for fold_idx in range(n_folds):
        start = first_start + fold_idx * step
        stop = start + horizon  # exclusive

        train_data = df[df["d"] < start].copy()
        valid_data = df[(df["d"] >= start) & (df["d"] < stop)].copy()

        print(f"Fold index: {fold_idx}")
        print(f"Train idx (start, end): ({train_data['d'].min()}, {train_data['d'].max()})")
        print(f"Valid idx (start, end): ({valid_data['d'].min()}, {valid_data['d'].max()})")
        print("==================")

        yield train_data, valid_data
        
        
def train_predict_score(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
) -> float:
    """Placeholder: not implemented yet.

    The body is a bare ``...``, so calling this currently returns ``None``
    rather than the annotated ``float``.
    """
    ...
        

In [None]:
def item_level_validation_plot(
    valid_df: pd.DataFrame,
    item_id: int,
):
    """Plot actual vs. forecast counts for one item, one subplot per store.

    Parameters
    ----------
    valid_df:
        Validation frame with ``item_id``, ``store_id``, ``date``, ``count``
        and ``forecast`` columns.
    item_id:
        Integer code of the item to plot.

    Raises
    ------
    ValueError
        If ``valid_df`` has no rows for ``item_id``.

    Solid lines are actual counts, dashed lines are forecasts. Store and item
    names are looked up in the module-level ``id_store_map`` / ``id_item_map``.
    """
    # Filter the frame that was passed in, not a same-named notebook global.
    valid_for_item = valid_df[valid_df["item_id"] == item_id]
    store_ids = list(valid_for_item["store_id"].unique())
    if not store_ids:
        raise ValueError(f"No validation rows found for item_id={item_id}")

    # squeeze=False keeps the axes array two-dimensional even with a single store,
    # so indexing below works for any number of subplots.
    fig, axes = plt.subplots(
        len(store_ids), 1, figsize=(8, 2 * len(store_ids)), sharex=True, squeeze=False
    )
    axes = axes[:, 0]
    colors = cm.viridis(np.linspace(0, 1, len(store_ids)))

    for i, store_id in enumerate(store_ids):
        valid_for_store = valid_for_item[valid_for_item["store_id"] == store_id]
        axes[i].plot(
            valid_for_store["date"].values,
            valid_for_store["count"].values,
            label=id_store_map[store_id],
            color=colors[i]
        )
        axes[i].plot(
            valid_for_store["date"].values,
            valid_for_store["forecast"].values,
            color=colors[i],
            ls="--",
        )
        axes[i].set(ylabel=f"Store {id_store_map[store_id]}")
    axes[0].set(title=f"Item {id_item_map[item_id]}")
    for tick in axes[-1].get_xticklabels():
        tick.set_rotation(45)

    fig.tight_layout()


def aggregate_validation_plot(
    valid_df: pd.DataFrame,
    state: str,  #["CA", "TX", "WI"]
    cat: str, # ["FOODS", "HOBBIES", "HOUSEHOLD"]
):
    """Plot department-level actual vs. forecast totals for one state and category.

    Rows of ``valid_df`` matching the state and category are summed per
    (store, department, date). One subplot is drawn per store, with one colour
    per department; solid lines are actual counts, dashed lines are forecasts.

    Parameters
    ----------
    valid_df:
        Validation frame with ``state_id``, ``cat_id``, ``store_id``,
        ``dept_id``, ``date``, ``count`` and ``forecast`` columns.
    state, cat:
        Original (string) state and category names; translated to integer
        codes through the module-level ``state_id_map`` / ``cat_id_map``.

    Raises
    ------
    ValueError
        If no rows match the requested state and category.
    """
    state_id = state_id_map[state]
    cat_id = cat_id_map[cat]

    state_cat_df = valid_df[(valid_df["state_id"] == state_id) & (valid_df["cat_id"] == cat_id)]
    g = state_cat_df.groupby(["store_id", "dept_id", "date"], observed=True)[["count", "forecast"]].sum()

    store_ids = g.index.get_level_values("store_id").unique()
    if len(store_ids) == 0:
        raise ValueError(f"No validation rows found for state={state!r}, cat={cat!r}")

    # squeeze=False keeps the axes array two-dimensional even with a single store.
    fig, axes = plt.subplots(
        len(store_ids), 1, figsize=(8, 2 * len(store_ids)), sharex=True, squeeze=False
    )
    axes = axes[:, 0]

    for i, store_id in enumerate(store_ids):
        ts = g.loc[store_id]

        dept_ids = ts.index.get_level_values("dept_id").unique()
        colors = cm.viridis(np.linspace(0, 1, len(dept_ids)))

        for j, dept_id in enumerate(dept_ids):
            # Look the department slice up once and reuse it for both lines.
            dept_ts = ts.loc[dept_id]
            axes[i].plot(
                dept_ts.index,
                dept_ts["count"].values,
                label=id_dept_map[dept_id],
                color=colors[j]
            )
            axes[i].plot(
                dept_ts.index,
                dept_ts["forecast"].values,
                color=colors[j],
                ls="--"
            )
        axes[i].set(ylabel="count", title=id_store_map[store_id])
        axes[i].legend()

    for tick in axes[-1].get_xticklabels():
        tick.set_rotation(45)

    fig.tight_layout()

    
    
def residual_analysis():
    """Placeholder for a residual analysis; not implemented yet."""
    # Residuals for different items / stores / states / cats
    ...

In [None]:
# Columns passed to the model: encoded ids, calendar/event fields, SNAP flags and prices.
FEATURES = [
    "item_id",
    "dept_id",
    "cat_id",
    "store_id",
    "state_id",
    "weekday",
    "month",
    "year",
    "event_name",
    "event_type",
    "snap_CA",
    "snap_TX",
    "snap_WI",
    "sell_price",
    "av_item_state_sell_price",
    "av_dept_state_sell_price",
]
# Target column: the unit sales produced by the unpivot step.
LABEL = "count"

# Parameters handed unchanged to `lgbm.train` in both run modes.
PARAMETERS = {
    "objective": "poisson",
    "learning_rate": 0.2,
    "metric": "rmse",
}

In [None]:
if not SUBMISSION_RUN:
    # Only the first fold of the rolling split is used for validation.
    for train, valid in time_series_split(data):
        break
    
    train_data = lgbm.Dataset(data=train[FEATURES], label=train[LABEL])
    valid_data = lgbm.Dataset(data=valid[FEATURES], label=valid[LABEL])
    # Up to 600 rounds, stopped early once the validation metric has not improved for 5 rounds.
    model = lgbm.train(
        PARAMETERS,
        num_boost_round=600,
        train_set=train_data,
        valid_sets=[valid_data],
        callbacks=[
            lgbm.early_stopping(stopping_rounds=5),
            lgbm.log_evaluation(period=10)
        ]
    )
    
    # Round predictions to non-negative integers and keep them next to the
    # actuals for the error-analysis plots.
    y_hat = model.predict(valid[FEATURES])
    y_hat = np.clip(np.round(y_hat), a_min=0, a_max=np.inf)
    y_hat = y_hat.astype(np.int32)
    valid["forecast"] = y_hat
    

else:
    # Submission run: fit on all loaded data for a fixed number of rounds
    # (no validation set, hence no early stopping).
    train_data = lgbm.Dataset(data=data[FEATURES], label=data[LABEL])
    model = lgbm.train(
        PARAMETERS,
        num_boost_round=350,
        train_set=train_data,
    )

## Error Analysis

In [None]:
# `valid` only exists in validation runs, so the plot is skipped for submission runs.
if not SUBMISSION_RUN:
    aggregate_validation_plot(valid, "WI", "HOUSEHOLD")

In [None]:
# Per-store actual vs. forecast for the item whose integer code is 100 (validation runs only).
if not SUBMISSION_RUN:
    item_level_validation_plot(valid, 100)

# FORECAST

In [None]:
def construct_test_df(
    d_start: int,
    d_end: int,
    item_category_df: pd.DataFrame,
    calendar_df: pd.DataFrame,
    price_df: pd.DataFrame,
) -> pd.DataFrame:
    """Build the feature frame for the days ``d_start`` .. ``d_end`` (inclusive).

    Every row of ``item_category_df`` is paired with every day in the range
    (cross join), then calendar and price columns are left-joined on using the
    same merge helpers as the training data.

    Parameters
    ----------
    d_start, d_end:
        First and last day index to generate, both inclusive.
    item_category_df:
        One row per series with its id and category columns.
    calendar_df:
        Formatted calendar table with an integer ``d`` column.
    price_df:
        Price table including the average-price columns.
    """
    d_range_df = pd.DataFrame({"d": range(d_start, d_end + 1)})
    # Use the frame that was passed in rather than a notebook-level global.
    test_df = pd.merge(item_category_df, d_range_df, how="cross")
    test_df = merge_sales_and_calendar_data(test_df, calendar_df)
    test_df = merge_sales_and_price_data(test_df, price_df)
    return test_df

def map_test_df_columns(test_df: pd.DataFrame) -> pd.DataFrame:
    """Encode the categorical columns of ``test_df`` with the training-time maps.

    The columns are overwritten in place using the module-level ``*_map``
    dictionaries, and the same frame is returned.
    """
    column_mappings = (
        ("item_id", item_id_map),
        ("dept_id", dept_id_map),
        ("cat_id", cat_id_map),
        ("store_id", store_id_map),
        ("state_id", state_id_map),
        ("event_name", event_name_map),
        ("event_type", event_type_map),
    )
    for column, value_to_code in column_mappings:
        test_df[column] = test_df[column].map(value_to_code)
    return test_df

def pivot_forecast_df(X_test: pd.DataFrame, y_hat: np.ndarray) -> pd.DataFrame:
    """Reshape row-wise predictions into one row per ``id`` and one column per day.

    Parameters
    ----------
    X_test:
        Frame with at least the ``id`` and ``d`` columns, one row per prediction.
    y_hat:
        Predictions in the same row order as ``X_test``.

    Returns
    -------
    Frame with an ``id`` column followed by one column per day, sorted by day.
    """
    # Attach predictions by position. Concatenating a fresh DataFrame along axis=1
    # would align on index labels and misplace values whenever X_test does not
    # carry a default 0..n-1 index.
    predictions = np.asarray(y_hat).reshape(-1)
    forecast_df = X_test[["id", "d"]].assign(count=predictions)
    forecast_df = forecast_df.pivot(columns="d", index="id", values="count").sort_index(axis=1)
    forecast_df = forecast_df.rename_axis(None, axis=1).reset_index()
    return forecast_df

def rename_forecast_columns(forecast_df: pd.DataFrame, forecast_horizon: int = 28) -> pd.DataFrame:
    """Relabel the day columns of ``forecast_df`` as ``F1`` .. ``F<forecast_horizon>``.

    Every column other than ``id`` is renamed by position, so the frame must
    hold exactly ``forecast_horizon`` day columns.
    """
    horizon_labels = []
    for step in range(1, forecast_horizon + 1):
        horizon_labels.append(f"F{step}")

    indexed = forecast_df.set_index("id")
    indexed.columns = horizon_labels
    return indexed.reset_index()

In [None]:
# Forecast window: the 28 days following the last training day (d_1941).
test_d_start, test_d_end = 1942, 1969
test_df = construct_test_df(
    d_start=test_d_start,
    d_end=test_d_end,
    item_category_df=item_id_categories,
    calendar_df=calendar_df,
    price_df=price_df,
)

# Apply the same price imputation as the training frame, so the price features
# have the same meaning at inference time as they did during fitting.
test_df = fill_missing_sell_prices(test_df)

test_df = map_test_df_columns(test_df)
test_df = cast_types(test_df)


# Predict, then round to non-negative integer unit counts.
y_hat = model.predict(test_df[FEATURES])
y_hat = np.clip(np.round(y_hat), a_min=0, a_max=np.inf)
y_hat = y_hat.astype(np.int32)

# Reshape to the submission layout: one row per id, columns F1..F28.
forecast_df = pivot_forecast_df(test_df, y_hat)
forecast_df = rename_forecast_columns(forecast_df)

In [None]:
# Merge with sample submissions
# Only the `id` column of the evaluation rows is kept; the F* columns come from the forecast.
EVAL_SUBMISSION = EVAL_SUBMISSION[["id"]].merge(forecast_df, on="id", how="left")

# SUBMISSION

In [None]:
# Validation rows are passed through unchanged from the sample submission;
# evaluation rows carry the model forecast.
FINAL_SUBMISSIONS = pd.concat([VAL_SUBMISSION, EVAL_SUBMISSION])
# Restore the row order of the sample submission.
FINAL_SUBMISSIONS = FINAL_SUBMISSIONS.set_index("id").reindex(SUBMISSION_INDEX).reset_index()

In [None]:
# Write the submission file into the working directory, without the positional index.
FINAL_SUBMISSIONS.to_csv(f"{OUTPUT_BASE_BATH}/submission.csv", index=False)