In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Base directories for the competition input files and for notebook outputs.
INPUT_BASE_PATH = "/kaggle/input/m5-forecasting-accuracy"
OUTPUT_BASE_PATH = "/kaggle/working"
# Misspelled original name, kept as an alias so any cell that still refers to
# it keeps working.
OUTPUT_BASE_BATH = OUTPUT_BASE_PATH

# Tables read by the cells below.
CALENDAR_DATA = pd.read_csv(f"{INPUT_BASE_PATH}/calendar.csv")
# The price and evaluation tables are deliberately left unloaded here;
# uncomment the lines below if a later cell needs them.
# SELL_PRICES = pd.read_csv(f"{INPUT_BASE_PATH}/sell_prices.csv")
# SALES_TRAIN_EVALUATION = pd.read_csv(f"{INPUT_BASE_PATH}/sales_train_evaluation.csv")
SALES_TRAIN_VALIDATION = pd.read_csv(f"{INPUT_BASE_PATH}/sales_train_validation.csv")
SAMPLE_SUBMISSION = pd.read_csv(f"{INPUT_BASE_PATH}/sample_submission.csv")

In [None]:
def get_product_id_mappings(df: pd.DataFrame):
    """Build a two-way lookup between the values of ``df["id"]`` and integers.

    Each distinct value of the ``id`` column is numbered in order of first
    appearance, starting at 1.

    Args:
        df: Frame that has an ``id`` column.

    Returns:
        A pair ``(id_to_int, int_to_id)``: the first dict maps every distinct
        id to its number, the second is the inverse mapping.
    """
    id_to_int = {}
    int_to_id = {}
    # `unique()` preserves first-appearance order, so numbering is deterministic.
    for number, product_id in enumerate(df["id"].unique(), start=1):
        id_to_int[product_id] = number
        int_to_id[number] = product_id
    return id_to_int, int_to_id


def flatten_sales_df(sales_df: pd.DataFrame, timestamp_cols: list[str]) -> pd.DataFrame:
    """Reshape a wide sales table into long format, one row per key and day column.

    Args:
        sales_df: Wide table with the key columns ``item_id``, ``dept_id``,
            ``cat_id``, ``store_id`` and ``state_id`` plus one column for each
            name in ``timestamp_cols``.
        timestamp_cols: Names of the per-day columns to unpivot. Their order
            is the row order inside each group of the result.

    Returns:
        A frame with the columns ``d`` (name of the original day column),
        ``count`` (the value found in that column) and the five key columns.
        Groups follow the sorted key order produced by ``groupby``, and every
        group keeps its own 0-based index, so the resulting index is not
        unique. An input without rows yields an empty frame with the same
        columns.

    Note:
        Each key combination is expected to identify exactly one input row;
        with more rows the transposed slice has more than two columns and the
        column renaming raises ``ValueError``.
    """
    key_cols = ["item_id", "dept_id", "cat_id", "store_id", "state_id"]
    # Group the frame that was passed in rather than a notebook global, so the
    # function works for any caller and on a fresh kernel.
    grouper = sales_df.groupby(key_cols)

    long_frames = []
    for key, group in tqdm(grouper):
        (item, dept, cat, store, state) = key
        # Transpose so that every day column becomes one (day name, value) row.
        long_df = group[timestamp_cols].T.reset_index()
        long_df.columns = ["d", "count"]
        long_frames.append(
            long_df.assign(
                item_id=item,
                dept_id=dept,
                cat_id=cat,
                store_id=store,
                state_id=state,
            )
        )

    if not long_frames:
        # pd.concat refuses an empty list; return an empty frame of the same shape.
        return pd.DataFrame(columns=["d", "count", *key_cols])
    return pd.concat(long_frames)
        

def merge_sales_and_calendar_data(sales_df: pd.DataFrame, calendar_data: pd.DataFrame) -> pd.DataFrame:
    """Attach the calendar columns to every sales row.

    Performs a left join on the shared ``d`` column, so every row of
    ``sales_df`` is kept; rows whose ``d`` has no match in ``calendar_data``
    get missing values in the calendar columns.

    Args:
        sales_df: Left table; must contain a ``d`` column.
        calendar_data: Right table; must contain a ``d`` column.

    Returns:
        The joined frame.
    """
    joined = pd.merge(sales_df, calendar_data, how="left", on="d")
    return joined

In [None]:
# Names of the per-day columns to unpivot: "d_1" through "d_1913".
timestamp_cols = [f"d_{i}" for i in range(1, 1914)]

# Build the long-format training table. `train_df` is rebound at every step
# (wide copy -> long format -> long format with calendar columns), so keep the
# statements in this order.
train_df = SALES_TRAIN_VALIDATION.copy(deep=True)
train_df = flatten_sales_df(train_df, timestamp_cols)
# Left-join the calendar rows onto the sales rows via the shared "d" column.
train_df = merge_sales_and_calendar_data(train_df, CALENDAR_DATA)

# SALES BY STORE

In [None]:
# Total `count` per (store, date), summed over every row of the long table.
sales_by_store = train_df.groupby(["store_id", "date"])[["count"]].sum()
store_ids = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
n_samples = 500

# One panel per store on a 5x2 grid with shared axes.
fig, ax = plt.subplots(5, 2, sharex=True, sharey=True, figsize=(15, 10))
ax = ax.flatten()

# The x positions and tick positions are the same for every panel.
x_positions = np.arange(n_samples)
x_ticks = np.arange(0, n_samples, 100)

for panel, store_id in zip(ax, store_ids):
    # Keep only the last `n_samples` dates of this store.
    store_df = sales_by_store.loc[(store_id, slice(None))].reset_index().tail(n_samples)
    panel.plot(x_positions, store_df["count"].to_numpy(), label=store_id)

    # Label every 100th position with its date.
    date_labels = store_df["date"].to_numpy()[::100]
    panel.set_xticks(x_ticks)
    panel.set_xticklabels(date_labels, rotation=45)
    panel.legend()

fig.tight_layout();

# SALES BY CATEGORY

In [None]:
# Total `count` per (category, date), summed over every row of the long table.
sales_by_category = train_df.groupby(["cat_id", "date"])[["count"]].sum()
category_ids = ["FOODS", "HOBBIES", "HOUSEHOLD"]

n_samples = 500
# One stacked panel per category, sharing the x axis.
fig, ax = plt.subplots(3, 1, figsize=(8, 6), sharex=True)

# The x positions and tick positions are the same for every panel.
x_positions = np.arange(n_samples)
x_ticks = np.arange(0, n_samples, 100)

for panel, cat_id in zip(ax, category_ids):
    # Keep only the last `n_samples` dates of this category.
    cat_df = sales_by_category.loc[(cat_id, slice(None))].reset_index().tail(n_samples)
    panel.plot(x_positions, cat_df["count"].to_numpy(), label=cat_id)

    # Label every 100th position with its date.
    date_labels = cat_df["date"].to_numpy()[::100]
    panel.set_xticks(x_ticks)
    panel.set_xticklabels(date_labels, rotation=45)

    panel.legend()
fig.tight_layout();

# SALES BY CATEGORY & STORE

In [None]:
# Total `count` per (store, category, date), summed over the long table.
sales_by_store_and_cat = train_df.groupby(["store_id", "cat_id", "date"])[["count"]].sum()
store_ids = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
category_ids = ["FOODS", "HOBBIES", "HOUSEHOLD"]
n_samples = 500

# One panel per store on a 5x2 grid with shared axes; one line per category.
fig, ax = plt.subplots(5, 2, sharex=True, sharey=True, figsize=(15, 10))
ax = ax.flatten()
for i, store_id in enumerate(store_ids):
    # Pivot to one column per category indexed by date, so all lines of a
    # panel are aligned on the same dates and the tick labels do not depend
    # on a variable left over from the inner loop.
    store_df = sales_by_store_and_cat.loc[store_id]["count"].unstack(level="cat_id")
    store_df = store_df.iloc[-n_samples:]

    x_positions = np.arange(len(store_df))
    for cat_id in category_ids:
        ax[i].plot(x_positions, store_df[cat_id].values, label=cat_id)

    # The store goes into the panel title, leaving the legend for categories.
    ax[i].set_title(store_id)
    ax[i].legend(fontsize="small", ncols=3)

    # Label every 100th position with its date.
    date_labels = store_df.index.values[::100]
    x_ticks = np.arange(0, len(store_df), 100)
    ax[i].set_xticks(x_ticks)
    ax[i].set_xticklabels(date_labels, rotation=45)

fig.tight_layout();

# SALES BY STORE & DEPARTMENT

In [None]:
# Total `count` per (store, department, date), summed over the long table.
sales_by_store_and_dept = train_df.groupby(["store_id", "dept_id", "date"])[["count"]].sum()
store_ids = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']

n_samples = 500

# One panel per store on a 5x2 grid with shared axes; one line per department.
fig, ax = plt.subplots(5, 2, sharex=True, sharey=True, figsize=(15, 10))
ax = ax.flatten()
for panel, store_id in zip(ax, store_ids):
    # Pivot this store's rows to one column per department, indexed by date,
    # and keep the last `n_samples` dates.
    store_df = (
        sales_by_store_and_dept.loc[store_id]["count"]
        .unstack(level="dept_id")
        .tail(n_samples)
    )
    store_df.plot(ax=panel)

    panel.legend(fontsize="x-small", ncols=4)
    panel.set_title(store_id)

fig.tight_layout();