In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import seaborn as sns
from tqdm import tqdm

import lightgbm as lgbm

In [None]:
# Dataset location (read-only) and scratch directory for saved figures.
INPUT_BASE_PATH = "/kaggle/input/m5-forecasting-accuracy"
OUTPUT_BASE_BATH = "/kaggle/working"

# Load the three raw tables once; later cells treat them as read-only inputs.
CALENDAR_DATA, SELL_PRICES, SALES_TRAIN_EVALUATION = map(
    pd.read_csv,
    (
        f"{INPUT_BASE_PATH}/calendar.csv",
        f"{INPUT_BASE_PATH}/sell_prices.csv",
        f"{INPUT_BASE_PATH}/sales_train_evaluation.csv",
    ),
)

In [None]:
# Extract categorical mappings
def get_unique_value_id_map(df: pd.DataFrame, col_name: str):
    """Assign a consecutive integer id to every distinct value of a column.

    Args:
        df: Frame holding the column.
        col_name: Name of the column whose values are enumerated.

    Returns:
        dict mapping each distinct value to 0, 1, 2, ... in order of first
        appearance in ``df[col_name]``.
    """
    distinct_values = df[col_name].unique()
    return dict(zip(distinct_values, range(len(distinct_values))))


def map_category_ids(sales_df: pd.DataFrame, column_name: str, submission_run: bool):
    """Replace a column's values with integer ids, in place.

    Note that ``sales_df`` itself is modified: ``sales_df[column_name]`` is
    overwritten with the ids produced by ``get_unique_value_id_map``.

    Args:
        sales_df: Frame whose column is encoded (mutated).
        column_name: Column to encode.
        submission_run: When true, the reverse lookup is not returned.

    Returns:
        ``(value -> id, id -> value)``; the second element is ``None`` when
        ``submission_run`` is true.
    """
    value_to_id = get_unique_value_id_map(sales_df, column_name)
    sales_df[column_name] = sales_df[column_name].map(value_to_id)

    if submission_run:
        return value_to_id, None

    id_to_value = dict(zip(value_to_id.values(), value_to_id.keys()))
    return value_to_id, id_to_value

In [None]:
# Process sales data
def unpivot_sales_df(sales_df: pd.DataFrame, timestamp_cols: list[str]) -> pd.DataFrame:
    """Melt the wide sales table into long format with an integer day column.

    Args:
        sales_df: Wide frame with the id columns ``item_id``, ``dept_id``,
            ``cat_id``, ``store_id``, ``state_id`` and one column per day.
        timestamp_cols: Day columns to keep, labelled ``"d_<number>"``.

    Returns:
        A new frame with the five id columns, ``d`` (the day number as an
        integer) and ``count`` (the value of the melted day column).
        ``sales_df`` is not modified.

    Raises:
        ValueError: If what follows the ``"d_"`` prefix of a label in
            ``timestamp_cols`` is not an integer.
    """
    long_df = sales_df.melt(
        id_vars=["item_id", "dept_id", "cat_id", "store_id", "state_id"],
        value_vars=timestamp_cols,
        var_name="d",
        value_name="count",
    )
    # Parse every distinct label once and map it, instead of running a Python
    # lambda per melted row. `removeprefix` drops exactly the "d_" prefix,
    # whereas `lstrip("d_")` strips any leading run of the characters 'd'/'_'.
    day_number_by_label = {label: int(label.removeprefix("d_")) for label in timestamp_cols}
    long_df["d"] = long_df["d"].map(day_number_by_label)
    return long_df
    

In [None]:
# Process calendar data
def select_format_calendar_features(calendar_data: pd.DataFrame) -> pd.DataFrame:
    """Select and normalise the calendar columns used downstream.

    Steps:
      * drop ``weekday``, ``event_name_2`` and ``event_type_2``;
      * convert ``d`` from ``"d_<number>"`` labels to integers;
      * rename ``event_name_1`` -> ``event_name``, ``event_type_1`` ->
        ``event_type`` and ``wday`` -> ``weekday``.

    Args:
        calendar_data: Raw calendar frame; it is not modified.

    Returns:
        A new frame with the selected and renamed columns.

    Raises:
        ValueError: If a ``d`` label is not ``"d_"`` followed by an integer.
    """
    cols_to_drop = ["weekday", "event_name_2", "event_type_2"]
    return (
        calendar_data
        # `drop` already returns a new frame, so no explicit copy is needed.
        .drop(columns=cols_to_drop)
        .assign(
            # Remove exactly one leading "d_"; `lstrip("d_")` would instead strip
            # any leading run of the characters 'd' and '_'.
            d=lambda frame: frame["d"].str.replace(r"^d_", "", regex=True).astype("int64")
        )
        .rename(columns={
            "event_name_1": "event_name",
            "event_type_1": "event_type",
            "wday": "weekday",
        })
    )


# Merge
def merge_sales_and_calendar_data(sales_df: pd.DataFrame, calendar_data: pd.DataFrame) -> pd.DataFrame:
    """Left-join calendar columns onto the sales rows using the ``d`` key.

    Every row of ``sales_df`` is kept; rows whose ``d`` has no match in
    ``calendar_data`` get missing values in the calendar columns.
    """
    merged = pd.merge(sales_df, calendar_data, how="left", on="d")
    return merged

In [None]:
# Analysis window: day columns d_<start_t> .. d_<end_t>, both ends included.
start_t, end_t = 1000, 1941
timestamp_cols = [f"d_{i}" for i in range(start_t, end_t + 1)]

# Calendar features keyed by the integer day number `d`.
calendar_df = select_format_calendar_features(CALENDAR_DATA)

# Wide sales table -> long format restricted to the analysis window.
sales_df = unpivot_sales_df(SALES_TRAIN_EVALUATION, timestamp_cols)

# Attach the calendar columns to every sales row; `data` feeds all plots below.
data = merge_sales_and_calendar_data(sales_df, calendar_df)

# SALES BY STORE

In [None]:
# Aggregate sales by store: total units sold per (store, date).
sales_by_store = data.groupby(["store_id", "date"])[["count"]].sum()
store_ids = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
n_samples = 500  # show at most this many of the most recent days

# One panel per store on a 5 x 2 grid with shared axes.
fig, ax = plt.subplots(5, 2, sharex=True, sharey=True, figsize=(15, 10))
ax = ax.flatten()
for i, store_id in enumerate(store_ids):
    store_df = sales_by_store.loc[store_id].reset_index().iloc[-n_samples:]
    # Size the x axis from the rows actually available: a window shorter than
    # n_samples days would otherwise raise an x/y length mismatch.
    n_points = len(store_df)
    ax[i].plot(np.arange(n_points), store_df["count"].values, label=store_id)

    # Label every 100th day with its date; ticks and labels share the same stride.
    date_labels = store_df["date"].values[::100]
    x_ticks = np.arange(0, n_points, 100)
    ax[i].set_xticks(x_ticks)
    ax[i].set_xticklabels(date_labels, rotation=45)
    if i % 2 == 0:
        ax[i].set_ylabel("count")  # left-hand column only
    ax[i].legend()

fig.tight_layout();

In [None]:
# Compare daily seasonalities across stores
# Total units sold per (date, weekday, store).
total_sales_by_store = data.groupby(["date", "weekday", "store_id"])[["count"]].sum()

# Mean and standard deviation of those daily totals as weekday x store tables.
weekday_store_totals = total_sales_by_store.groupby(["weekday", "store_id"])["count"]
av_weekday_sales = weekday_store_totals.mean().unstack(level=1)
std_weekday_sales = weekday_store_totals.std().unstack(level=1)


# One panel per state, one line per store in that state.
store_groups = [
    ["CA_1", "CA_2", "CA_3", "CA_4"],
    ["TX_1", "TX_2", "TX_3"],
    ["WI_1", "WI_2", "WI_3"],
]
fig, ax = plt.subplots(1, 3, figsize=(15, 3.5),)
weekdays = av_weekday_sales.index
for panel, group in zip(ax, store_groups):
    for store in group:
        mean_sales = av_weekday_sales[store].values
        spread = std_weekday_sales[store].values
        panel.plot(weekdays, mean_sales, label=store)
        # Shaded band: mean +/- one standard deviation.
        panel.fill_between(weekdays, mean_sales + spread, mean_sales - spread, alpha=0.25)
    panel.set(xlabel="weekday", ylabel="count")
    panel.legend(ncols=len(group), fontsize="small", loc=1)

fig.tight_layout();

# SALES BY CATEGORY

In [None]:
# Total units sold per (category, date).
sales_by_category = data.groupby(["cat_id", "date"])[["count"]].sum()
category_ids = ["FOODS", "HOBBIES", "HOUSEHOLD"]

n_samples = 500  # show at most this many of the most recent days
fig, ax = plt.subplots(3, 1, figsize=(8, 6), sharex=True)
for i, cat_id in enumerate(category_ids):
    cat_df = sales_by_category.loc[cat_id].reset_index().iloc[-n_samples:]
    # Size the x axis from the rows actually available: a window shorter than
    # n_samples days would otherwise raise an x/y length mismatch.
    n_points = len(cat_df)
    ax[i].plot(np.arange(n_points), cat_df["count"].values, label=cat_id)

    # Label every 100th day with its date; ticks and labels share the same stride.
    date_labels = cat_df["date"].values[::100]
    x_ticks = np.arange(0, n_points, 100)
    ax[i].set_xticks(x_ticks)
    ax[i].set_xticklabels(date_labels, rotation=45)

    ax[i].legend()
fig.tight_layout();

# SALES BY CATEGORY & STORE

In [None]:
# Total units sold per (store, category, date).
sales_by_store_and_cat = data.groupby(["store_id", "cat_id", "date"])[["count"]].sum()
store_ids = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
category_ids = ["FOODS", "HOBBIES", "HOUSEHOLD"]
n_samples = 500  # show at most this many of the most recent days

# One panel per store, one line per category.
fig, ax = plt.subplots(5, 2, sharex=True, sharey=True, figsize=(15, 10))
ax = ax.flatten()
for i, store_id in enumerate(store_ids):
    store_df = sales_by_store_and_cat.loc[store_id]
    for cat_id in category_ids:
        cat_df = store_df.loc[cat_id].reset_index().iloc[-n_samples:]
        # Use the actual row count so a window shorter than n_samples days
        # does not raise an x/y length mismatch.
        ax[i].plot(np.arange(len(cat_df)), cat_df["count"].values, label=(store_id, cat_id))

    ax[i].legend(fontsize="small", ncols=3)

    # Tick labels come from the last category plotted; this presumes every
    # category of a store covers the same dates.
    date_labels = cat_df["date"].values[::100]
    x_ticks = np.arange(0, len(cat_df), 100)
    ax[i].set_xticks(x_ticks)
    ax[i].set_xticklabels(date_labels, rotation=45)

fig.tight_layout();

# SALES BY STORE & DEPARTMENT

In [None]:
# Total units sold per (store, department, date).
sales_by_store_and_dept = data.groupby(["store_id", "dept_id", "date"])[["count"]].sum()
store_ids = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']

n_samples = 500

# One panel per store; each department becomes one line.
fig, ax = plt.subplots(5, 2, sharex=True, sharey=True, figsize=(15, 10))
ax = ax.flatten()
for panel, store_id in zip(ax, store_ids):
    # Pivot to a date x department table and keep the last n_samples dates.
    store_df = (
        sales_by_store_and_dept
        .loc[store_id]["count"]
        .unstack(level=0)
        .iloc[-n_samples:]
    )
    store_df.plot(ax=panel)

    panel.legend(fontsize="x-small", ncols=4)
    panel.set_title(store_id)

fig.tight_layout();

# SALES BY ITEM ID & STORE

In [None]:
# Distinct departments and stores, in order of first appearance in `data`.
dept_ids = list(data["dept_id"].unique())
store_ids = list(data["store_id"].unique())

# Department id -> list of the distinct item ids that occur in it.
item_ids_by_dept = {
    dept_id: list(data.loc[data["dept_id"] == dept_id, "item_id"].unique())
    for dept_id in dept_ids
}

In [None]:
# Plot sales for different items
# Grid: one row per department, n_items panels per row; every panel overlays
# the count distributions of n_draws randomly drawn items of that department.
n_items = 10
n_draws = 5
rng = np.random.default_rng(seed=0)  # fixed seed so the saved figure is reproducible

# Group once up front instead of scanning the full frame for every drawn item.
counts_by_item = data.groupby("item_id", sort=False)["count"]

fig, ax = plt.subplots(len(dept_ids), n_items, figsize=(len(dept_ids) * 3.5, n_items * 1.5), sharey=True)
for d, dept_id in enumerate(dept_ids):
    print(f"Plotting count distributions for {dept_id}")
    for _ in tqdm(range(n_draws)):
        item_ids = rng.choice(item_ids_by_dept[dept_id], size=n_items).tolist()
        for i, item_id in enumerate(item_ids):
            item_counts = counts_by_item.get_group(item_id).to_numpy()
            # Unit-width bins with edges 0, 1, ..., max + 1. Passing bins=max
            # fails for an item that never sold (max == 0) and merges the two
            # largest counts into the final bin.
            edges = np.arange(item_counts.max() + 2)
            counts, bins = np.histogram(item_counts, bins=edges)
            ax[d, i].stairs(
                counts / len(item_counts),  # fraction of days with each count
                bins,
                label=item_id
            )

    for i in range(n_items):
        ax[d, i].legend(fontsize='x-small')
        ax[d, i].set(xlim=(-1, 20))

    ax[d, 0].set(ylabel="density")

# x labels on the bottom row only
for i in range(n_items):
    ax[-1, i].set(xlabel="count")

fig.tight_layout();
fig.savefig(f"{OUTPUT_BASE_BATH}/count_dist_for_item_cross_section.png")

In [None]:
# Plot sales for items by store
# Grid: one row per department, one panel per sampled item; every panel
# overlays that item's count distribution in each store.
n_items = 10
rng = np.random.default_rng(seed=0)  # fixed seed so the saved figure is reproducible

fig, ax = plt.subplots(len(dept_ids), n_items, figsize=(len(dept_ids) * 3.5, n_items * 1.5), sharey=True)
for d, dept_id in enumerate(dept_ids):
    print(f"Plotting count distributions for {dept_id}")
    # Draw the items ONCE per department. Re-sampling inside the store loop
    # made each panel mix a different random item per store while the title
    # named only the items of the last draw.
    item_ids = rng.choice(item_ids_by_dept[dept_id], size=n_items).tolist()
    for i, item_id in enumerate(tqdm(item_ids)):
        # One scan of the full frame per item; the per-store slices below
        # only touch this item's rows.
        item_df = data.loc[data["item_id"] == item_id, ["store_id", "count"]]
        for store_id in store_ids:
            store_counts = item_df.loc[item_df["store_id"] == store_id, "count"].to_numpy()
            if store_counts.size == 0:
                continue  # no rows for this item in this store
            # Unit-width bins with edges 0, 1, ..., max + 1. Passing bins=max
            # fails when max == 0 and merges the two largest counts.
            edges = np.arange(store_counts.max() + 2)
            counts, bins = np.histogram(store_counts, bins=edges)
            ax[d, i].stairs(
                counts / len(store_counts),  # fraction of days with each count
                bins,
                label=store_id
            )

        # Panel properties: the title names the item shown in this panel
        ax[d, i].set_title(item_id, fontsize='small')
        ax[d, i].set_xlim((-1, 20))

    # ylabel for first column plots
    ax[d, 0].set(ylabel="density")

# xlabels for last row plots
for i in range(n_items):
    ax[-1, i].set(xlabel="count")

# A single figure-level legend, taken from the first panel's store entries
handles, labels = ax[0,0].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', ncols=len(store_ids), bbox_to_anchor=[0.5, 1.02])

fig.tight_layout();
fig.savefig(f"{OUTPUT_BASE_BATH}/count_dist_for_item_cross_section_by_store.png")

In [None]:
# One panel per department, overlaying the count distribution of every item in it.
# Group once up front: filtering the full frame separately for every item
# costs one complete scan per item.
counts_by_item = data.groupby("item_id", sort=False)["count"]

fig, ax = plt.subplots(1, len(dept_ids), figsize=(len(dept_ids) * 3.5, 3.5), sharey=True)
for d, dept_id in enumerate(dept_ids):
    print(f"Plotting count distributions for {dept_id}")
    # Passing the list itself gives tqdm a total for its progress bar.
    for item_id in tqdm(item_ids_by_dept[dept_id]):
        item_counts = counts_by_item.get_group(item_id).to_numpy()
        # Unit-width bins with edges 0, 1, ..., max + 1. Passing bins=max
        # fails for an item that never sold (max == 0) and merges the two
        # largest counts into the final bin.
        edges = np.arange(item_counts.max() + 2)
        counts, bins = np.histogram(item_counts, bins=edges)
        ax[d].stairs(
            counts / len(item_counts),  # fraction of days with each count
            bins,
            label=item_id,
            color="gray",
            alpha=0.5,
            lw=0.3,
        )
    ax[d].set(title=dept_id, xlabel="count", xlim=(-1, 40))

ax[0].set(ylabel="density")

fig.tight_layout();
fig.savefig(f"{OUTPUT_BASE_BATH}/count_dist_for_items_by_dept.png")