In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
INPUT_BASE_PATH = "/kaggle/input/m5-forecasting-accuracy"
OUTPUT_BASE_BATH = "/kaggle/working"

CALENDAR_DATA = pd.read_csv(f"{INPUT_BASE_PATH}/calendar.csv")
SELL_PRICES = pd.read_csv(f"{INPUT_BASE_PATH}/sell_prices.csv")
SALES_TRAIN_EVALUATION = pd.read_csv(f"{INPUT_BASE_PATH}/sales_train_evaluation.csv")
SALES_TRAIN_VALIDATION = pd.read_csv(f"{INPUT_BASE_PATH}/sales_train_validation.csv")
SAMPLE_SUBMISSION = pd.read_csv(f"{INPUT_BASE_PATH}/sample_submission.csv")

## FEATURE ENGINEERING FUNCTIONS

In [None]:
def mean_sales(df: pd.DataFrame) -> pd.DataFrame:
    df = df.groupby(["date", "d"])[["count"]].mean().reset_index()
    return df
    

def temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    df["weekday"] = df["date"].dt.weekday
    df["week"] = df["date"].dt.isocalendar().week
    df["year"] = df["date"].dt.year
    
    return df

def lagged_features(df: pd.DataFrame, lags: list[int]) -> pd.DataFrame:
    lag_dfs = []
    for lag in lags:
        lagged_sales_df = pd.DataFrame(
            {
                "date": df["date"],
                f"date_lag_{lag}": df["date"] - pd.Timedelta(days=lag)
            }
        )
        lagged_sales_df = pd.merge(
            left=lagged_sales_df,
            right=df[["date", "count"]],
            how="left",
            left_on=f"date_lag_{lag}",
            right_on="date",
        )
        lagged_sales_df = lagged_sales_df[["date_x", f"date_lag_{lag}", "count"]]
        lagged_sales_df = lagged_sales_df.rename(columns={"date_x": "date", "count": f"count_lag_{lag}"})
        lag_dfs.append(lagged_sales_df)
    
    # Merge everything
    for lag_df in lag_dfs:
        df = pd.merge(left=df, right=lag_df, on="date")
    return df

In [None]:
## Feature engineering funcs for forecasting

def lagged_features_forecast(train_df: pd.DataFrame, test_df: pd.DataFrame, lags: list[int]) -> pd.DataFrame:
    lagged_dfs = []
    for lag in lags:
        lagged_df = pd.DataFrame(
            {
                "date": test_df["date"],
                f"date_lag_{lag}": test_df["date"] - pd.Timedelta(days=lag)
            }
        )
        lagged_df = pd.merge(
            left=lagged_df,
            right=train_df[["date", "count"]],
            how="left",
            left_on=f"date_lag_{lag}",
            right_on="date",
        )
        lagged_df = lagged_df[["date_x", f"date_lag_{lag}", "count"]]
        lagged_df = lagged_df.rename(columns={"date_x": "date", "count": f"count_lag_{lag}"})
        lagged_dfs.append(lagged_df)
    
    # Merge everything
    for lag_df in lagged_dfs:
        test_df = pd.merge(left=test_df, right=lag_df, on="date")
    return test_df


def construct_test_df(train_df: pd.DataFrame, lags: list[int]) -> pd.DataFrame:
    forecast_dates = pd.date_range(
        start=train_df["date"].max(),
        end=train_df["date"].max() + pd.Timedelta(days=28),
        freq="d",
        inclusive="right",
    )
    forecast_df = pd.DataFrame({"date": forecast_dates})
    foreacst_df = temporal_features(forecast_df)
    foreacst_df = lagged_features_forecast(train_df, forecast_df, lags)
    return forecast_df

## VALIDATION DATASET -- PUBLIC LEADERBOARD

In [None]:
# Utils for working with datasets

validation_product_id_to_int = {
    product_id: idx
    for (idx, product_id) in enumerate(SALES_TRAIN_VALIDATION["id"].unique(), start=1)
}

validation_int_to_product_id = {
    idx: product_id
    for (product_id, idx) in validation_product_id_to_int.items()
}

timestamp_ids = [f"d_{i}" for i in range(1, 1914)]

In [None]:
# Unpivot dataset

train_df_validation = SALES_TRAIN_VALIDATION.copy(deep=True)
train_df_validation["product_id"] = train_df_validation["id"].map(validation_product_id_to_int)

In [None]:
product_sale_dfs = []
product_ids = train_df_validation["product_id"].unique()
for p_id in product_ids:
    product_sale_counts = train_df_validation[train_df_validation["product_id"] == p_id]
    product_sale_counts = product_sale_counts[timestamp_ids].T.reset_index()
    product_sale_counts.columns = ["timestamp_id", "count"]
    product_sale_counts["product_id"] = p_id
    product_sale_dfs.append(product_sale_counts)

product_sales_df = pd.concat(product_sale_dfs)

In [None]:
validation_product_sales = pd.merge(
    left=product_sales_df,
    right=CALENDAR_DATA[["date", "d", "event_name_1", "event_type_1", "event_name_2", "event_type_2", "snap_CA", "snap_TX", "snap_WI"]],
    how="left",
    left_on=["timestamp_id"],
    right_on=["d"],
    indicator=True,
)

In [None]:
# As a start only predict the mean sales
mean_validation_sales = mean_sales(validation_product_sales)

In [11]:
# Temporal features
mean_validation_sales["date"] = pd.to_datetime(mean_validation_sales["date"], yearfirst=True)
mean_validation_sales = temporal_features(mean_validation_sales)

In [13]:
# Lagged features
LAGS = [1, 2, 3, 7]
mean_validation_sales = lagged_features(mean_validation_sales, LAGS)

In [49]:
# Construct the test df
construct_test_df(mean_validation_sales, LAGS)

Unnamed: 0,date,weekday,week,year
0,2016-04-25,0,17,2016
1,2016-04-26,1,17,2016
2,2016-04-27,2,17,2016
3,2016-04-28,3,17,2016
4,2016-04-29,4,17,2016
5,2016-04-30,5,17,2016
6,2016-05-01,6,17,2016
7,2016-05-02,0,18,2016
8,2016-05-03,1,18,2016
9,2016-05-04,2,18,2016


In [48]:
lagged_features_forecast(mean_validation_sales, forecast_df, LAGS)

Unnamed: 0,date,weekday,week,year,date_lag_1,count_lag_1,date_lag_2,count_lag_2,date_lag_3,count_lag_3,date_lag_7,count_lag_7
0,2016-04-25,0,17,2016,2016-04-24,1.633158,2016-04-23,1.605838,2016-04-22,1.328862,2016-04-18,1.248245
1,2016-04-26,1,17,2016,2016-04-25,,2016-04-24,1.633158,2016-04-23,1.605838,2016-04-19,1.232207
2,2016-04-27,2,17,2016,2016-04-26,,2016-04-25,,2016-04-24,1.633158,2016-04-20,1.159167
3,2016-04-28,3,17,2016,2016-04-27,,2016-04-26,,2016-04-25,,2016-04-21,1.149
4,2016-04-29,4,17,2016,2016-04-28,,2016-04-27,,2016-04-26,,2016-04-22,1.328862
5,2016-04-30,5,17,2016,2016-04-29,,2016-04-28,,2016-04-27,,2016-04-23,1.605838
6,2016-05-01,6,17,2016,2016-04-30,,2016-04-29,,2016-04-28,,2016-04-24,1.633158
7,2016-05-02,0,18,2016,2016-05-01,,2016-04-30,,2016-04-29,,2016-04-25,
8,2016-05-03,1,18,2016,2016-05-02,,2016-05-01,,2016-04-30,,2016-04-26,
9,2016-05-04,2,18,2016,2016-05-03,,2016-05-02,,2016-05-01,,2016-04-27,


In [36]:
lagged_df

Unnamed: 0,date,date_lag_1,count_lag_1
0,2016-04-25,2016-04-24,1.633158
1,2016-04-26,2016-04-25,
2,2016-04-27,2016-04-26,
3,2016-04-28,2016-04-27,
4,2016-04-29,2016-04-28,
5,2016-04-30,2016-04-29,
6,2016-05-01,2016-04-30,
7,2016-05-02,2016-05-01,
8,2016-05-03,2016-05-02,
9,2016-05-04,2016-05-03,


DatetimeIndex(['2016-04-25', '2016-04-26', '2016-04-27', '2016-04-28',
               '2016-04-29', '2016-04-30', '2016-05-01', '2016-05-02',
               '2016-05-03', '2016-05-04', '2016-05-05', '2016-05-06',
               '2016-05-07', '2016-05-08', '2016-05-09', '2016-05-10',
               '2016-05-11', '2016-05-12', '2016-05-13', '2016-05-14',
               '2016-05-15', '2016-05-16', '2016-05-17', '2016-05-18',
               '2016-05-19', '2016-05-20', '2016-05-21', '2016-05-22'],
              dtype='datetime64[ns]', freq='D')

In [23]:
mean_validation_sales

Unnamed: 0,date,d,count,weekday,week,year,date_lag_1,count_lag_1,date_lag_2,count_lag_2,date_lag_3,count_lag_3,date_lag_7,count_lag_7
0,2011-01-29,d_1,1.070220,5,4,2011,2011-01-28,,2011-01-27,,2011-01-26,,2011-01-22,
1,2011-01-30,d_2,1.041292,6,4,2011,2011-01-29,1.070220,2011-01-28,,2011-01-27,,2011-01-23,
2,2011-01-31,d_3,0.780026,0,5,2011,2011-01-30,1.041292,2011-01-29,1.070220,2011-01-28,,2011-01-24,
3,2011-02-01,d_4,0.833454,1,5,2011,2011-01-31,0.780026,2011-01-30,1.041292,2011-01-29,1.070220,2011-01-25,
4,2011-02-02,d_5,0.627944,2,5,2011,2011-02-01,0.833454,2011-01-31,0.780026,2011-01-30,1.041292,2011-01-26,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1908,2016-04-20,d_1909,1.159167,2,16,2016,2016-04-19,1.232207,2016-04-18,1.248245,2016-04-17,1.693670,2016-04-13,1.154247
1909,2016-04-21,d_1910,1.149000,3,16,2016,2016-04-20,1.159167,2016-04-19,1.232207,2016-04-18,1.248245,2016-04-14,1.230863
1910,2016-04-22,d_1911,1.328862,4,16,2016,2016-04-21,1.149000,2016-04-20,1.159167,2016-04-19,1.232207,2016-04-15,1.370581
1911,2016-04-23,d_1912,1.605838,5,16,2016,2016-04-22,1.328862,2016-04-21,1.149000,2016-04-20,1.159167,2016-04-16,1.586159


## EVALUATION DATASET -- PRIVATE LEADERBOARD

In [None]:
SALES_TRAIN_EVALUATION

In [20]:
SAMPLE_SUBMISSION

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,FOODS_3_823_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60976,FOODS_3_824_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60977,FOODS_3_825_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60978,FOODS_3_826_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Submission

In [25]:
SAMPLE_SUBMISSION

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,FOODS_3_823_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60976,FOODS_3_824_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60977,FOODS_3_825_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60978,FOODS_3_826_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
SAMPLE_SUBMISSION.to_csv(f"{OUTPUT_BASE_BATH}/submission.csv", index=False)