In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
# NOTE(review): this relative chdir is not idempotent — every re-run of the cell
# moves the working directory one level further up. Later relative paths such as
# ".data/data.parquet" are resolved against the directory set here.
os.chdir('..')

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn import preprocessing, metrics
from typing import Callable
from scipy.sparse import hstack
from itertools import product

from sales_forecasting.utils import timeseries_split, build_submission_df

In [None]:
df = pd.read_parquet(".data/data.parquet")

In [None]:
# Keep only the columns listed below; every other column of the loaded frame is dropped.
cols = [
    'date_block_num',
    'shop_id',
    'city_id',
    'item_id',
    'item_category_id',
    'general_item_category_id',
    'item_price',
    'date_month',
    'item_cnt_day',
]
df = df.loc[:, cols]

In [None]:
# Filter outliers
# A row is an outlier when its price exceeds 100000 or its daily count exceeds 1000.
shp = df.shape[0]
outliers_item_price_index = df['item_price'] > 100000
outliers_item_cnt_day_index = df['item_cnt_day'] > 1000

# Show what is about to be removed.
display(df[outliers_item_price_index])
display(df[outliers_item_cnt_day_index])

# Remove both kinds of outliers in one step with a combined boolean mask.
# Both masks were computed on the same frame, so no mask is ever applied to an
# already-filtered frame, and rows are selected positionally rather than by
# index label (safe even if the index is not unique).
df = df[~(outliers_item_price_index | outliers_item_cnt_day_index)]
print(f"Filtered {shp - df.shape[0]} outliers")

In [None]:
# Impute missing/wrong values: every non-positive price is replaced by the mean
# positive price of shop 32 / item 2973 in month 4.
missing_item_price_index = df['item_price'] <= 0
reference_price_rows = (
    (df.shop_id == 32)
    & (df.item_id == 2973)
    & (df.date_block_num == 4)
    & (df.item_price > 0)
)
mean_imputation = df.loc[reference_price_rows, 'item_price'].mean()
df.loc[missing_item_price_index, 'item_price'] = mean_imputation
# Show the rows that were just imputed.
display(df[missing_item_price_index])

In [None]:
# Collapse repeating shops: in each (orig, rep) pair, rows of shop `rep` are relabelled to `orig`.
repeating_shops_pairs = [(0, 57), (1, 58), (10, 11)]

duplicate_to_original = {rep: orig for orig, rep in repeating_shops_pairs}
df['shop_id'] = df['shop_id'].replace(duplicate_to_original)

In [None]:
# # Set dtypes
# dtypes = {
#     'date_block_num': 'uint8',
#     'shop_id': 'uint8',
#     'city_id': 'uint8',
#     'item_id': 'uint16',
#     'item_category_id': 'uint8',
#     'general_item_category_id': 'uint8',
#     'item_price': 'float32',
#     'date_month': 'uint8',
#     'item_cnt_day': 'int32'
# }

# for column, dtype in dtypes.items():
#     df[column] = df[column].astype(dtype) # type: ignore

# print(df.dtypes)

In [None]:
# Aggregate data monthly: sum the daily counts per (month, shop, item), treating
# missing daily counts as 0, then re-attach the descriptive columns.
base_cols = ['date_block_num', 'shop_id', 'item_id']
cols = base_cols + ['city_id', 'item_category_id', 'general_item_category_id', 'date_month']

df_agg_monthly = (
    df.assign(item_cnt_day=df['item_cnt_day'].fillna(0))
    .groupby(base_cols)
    .agg({"item_cnt_day": "sum"})
    .reset_index()
    .rename(columns={"item_cnt_day": "item_cnt_month"})
    .merge(df[cols].drop_duplicates(), on=base_cols, how='left')
)

In [None]:
np.union1d(np.array([1,2,3]), np.array([2,3,4]))

In [None]:
# Oversample for month x shop x item where item_cnt_month == 0 (only train data, as test data already covers such combinations)
# For every month 0..33 the grid holds each shop x item pair built from the shops and
# items seen in that month or any earlier one (the caches are cumulative), so pairs
# without sales get explicit rows that are later filled with 0.
cols = ['date_block_num','shop_id','item_id']

# Integer seeds: an untyped np.array([]) is float64 and would turn every id column
# of the grid into floats through np.union1d.
matrix = []
shops_cache = np.array([], dtype=np.int64)
items_cache = np.array([], dtype=np.int64)
for i in range(34):
    df_train_month = df[df.date_block_num==i]
    shops_cache = np.union1d(shops_cache, df_train_month.shop_id.unique())
    items_cache = np.union1d(items_cache, df_train_month.item_id.unique())
    # Vectorised cartesian product (shop-major, item-minor — the same row order that
    # itertools.product gave) without materialising millions of Python tuples.
    matrix.append(np.column_stack([
        np.full(shops_cache.size * items_cache.size, i, dtype=np.int64),
        np.repeat(shops_cache, items_cache.size),
        np.tile(items_cache, shops_cache.size),
    ]))

matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
# Month 34 rows are taken as-is from df instead of being expanded.
matrix = pd.concat([matrix, df[df.date_block_num==34][cols]], ignore_index=True, sort=False)
matrix = matrix.sort_values(cols).reset_index(drop=True)

# Attach the monthly totals; grid rows without a matching total become 0.
df_agg_monthly_oversampled = pd.merge(matrix, df_agg_monthly[cols + ['item_cnt_month']], on=cols, how='left').fillna(0)
# Re-attach shop-, item- and month-level attributes looked up from the daily frame.
df_agg_monthly_oversampled = (
    df_agg_monthly_oversampled
    .merge(df[['shop_id', 'city_id']].drop_duplicates(), on='shop_id', how='left')
    .merge(df[['item_id', 'item_category_id', 'general_item_category_id']].drop_duplicates(), on='item_id', how='left')
    .merge(df[['date_block_num', 'date_month']].drop_duplicates(), on='date_block_num', how='left')
)

In [None]:
df_agg_monthly_oversampled

In [None]:
df_agg_monthly_oversampled[(df_agg_monthly_oversampled.shop_id == 2) & (df_agg_monthly_oversampled.item_id.isin([30, 31, 32]))].groupby(["shop_id", "item_id"]).apply(display)

In [None]:
def plot_timeseries(df: pd.DataFrame, plt_rows: int = 10, plt_cols: int = 2) -> None:
    """Plot monthly sales for the first ``plt_rows * plt_cols`` (shop_id, item_id) groups.

    Args:
        df: Frame with ``date_block_num``, ``shop_id``, ``item_id`` and
            ``item_cnt_month`` columns.
        plt_rows: Number of subplot rows.
        plt_cols: Number of subplot columns.

    Only rows with ``date_block_num < 34`` are drawn. Months with non-zero sales
    are additionally marked with a scatter point.
    """
    # squeeze=False always yields a 2-D axes array, so ax[row, col] also works
    # when plt_rows or plt_cols is 1.
    fig, ax = plt.subplots(plt_rows, plt_cols, figsize=(20, 30), squeeze=False)

    group = df[['date_block_num', 'shop_id', 'item_id', 'item_cnt_month']].groupby(["shop_id", "item_id"])
    group_iter = iter(group)

    for i in range(plt_rows * plt_cols):
        (_, df_group) = next(group_iter, (None, None))

        if df_group is None:
            # Fewer groups than panels: leave the remaining axes empty.
            break

        df_group = df_group[df_group['date_block_num'] < 34]

        row, col = divmod(i, plt_cols)
        panel = ax[row, col]

        if df_group.empty:
            # The pair only has rows at month >= 34: nothing to draw, and max() /
            # iloc[0] below would raise on an empty frame.
            panel.set_visible(False)
            continue

        sold = df_group[df_group.item_cnt_month != 0]
        panel.plot(df_group['date_block_num'], df_group['item_cnt_month'], label="Sales")
        panel.scatter(sold['date_block_num'], sold['item_cnt_month'], color='blue')
        panel.set_ylim(-.1, max(df_group['item_cnt_month']) + 1)
        panel.set_title(f"Shop ID: {df_group.shop_id.iloc[0]}, Item ID: {df_group.item_id.iloc[0]}")
        panel.set_xlabel('Month')
        panel.set_ylabel('Sales')
        panel.set_xticks(range(0, 34))
        panel.legend()
        panel.grid()
    plt.tight_layout()
    plt.show()

plot_timeseries(df_agg_monthly_oversampled, plt_rows=10, plt_cols=2)

In [None]:
def col_name(prefix: str, iter: list[int]):
    """Return one ``"<prefix>_<value>"`` name per element of ``iter``, in order."""
    names = []
    for value in iter:
        names.append(f"{prefix}_{value}")
    return names

def merge_with_oversampled_index(df: pd.DataFrame, oversampled_index: pd.DataFrame) -> pd.DataFrame:
    """Outer-join ``df`` with ``oversampled_index`` on (shop_id, item_id, date_block_num).

    The index of ``oversampled_index`` is first turned into a column. Overlapping
    columns coming from the right side get a ``_y`` suffix, and a ``_merge``
    indicator column records which side each row came from.
    """
    join_keys = ["shop_id", "item_id", "date_block_num"]
    grid = oversampled_index.reset_index()
    return df.merge(grid, on=join_keys, how='outer', suffixes=(None, '_y'), indicator=True)

def build_month_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add cyclical ``month_sin`` / ``month_cos`` encodings of ``date_month``.

    The month is mapped onto a full circle with a 12-month period
    (angle = 2*pi*month/12), so months twelve apart get identical encodings and
    December ends up next to January. The previous pi/12 factor only covered half
    a circle, which put those two months at opposite ends.

    Args:
        df: Frame with a numeric ``date_month`` column.

    Returns:
        A copy of ``df`` with the two extra columns; ``df`` itself is not modified.
    """
    df_features = df.copy()

    angle = 2 * np.pi * df['date_month'] / 12
    df_features['month_sin'] = np.sin(angle)
    df_features['month_cos'] = np.cos(angle)

    return df_features

def build_lagged_features(df: pd.DataFrame, lags: list[int], fill_value: float = 0.0) -> pd.DataFrame:
    """Add ``lagged_<k>`` columns holding ``item_cnt_month`` shifted by ``k`` rows per series.

    Rows are sorted by (shop_id, item_id, date_block_num) and the shift is done
    inside each (shop_id, item_id) group. Positions without enough history are
    filled with ``fill_value``.

    A global back-fill is deliberately not used here: it would copy the value of
    the *next* row into the gap, which for ``lagged_1`` is the current row's own
    target, and at a group boundary it would pull the value from a different
    shop/item series.

    Args:
        df: Frame with ``shop_id``, ``item_id``, ``date_block_num`` and
            ``item_cnt_month`` columns.
        lags: Shift sizes, one output column per entry.
        fill_value: Value used where no lagged observation exists.

    Returns:
        A sorted copy of ``df`` with the lag columns appended.
    """
    series_keys = ["shop_id", "item_id"]
    # sort_values returns a new frame, so the caller's frame is never modified.
    df_features = df.sort_values([*series_keys, "date_block_num"])

    for lag in lags:
        # Same "<prefix>_<lag>" naming scheme as the col_name helper.
        shifted = df_features.groupby(series_keys)['item_cnt_month'].shift(lag)
        df_features[f"lagged_{lag}"] = shifted.fillna(fill_value)

    return df_features

def build_rolling_features(df: pd.DataFrame, rolling: list[int]) -> pd.DataFrame:
    """Add ``rolling_<w>`` columns: per-series rolling mean of ``lagged_1`` over ``w`` rows.

    The window runs inside each (shop_id, item_id) group after sorting by
    date_block_num. ``min_periods=1`` makes the first rows of a series use the
    shorter history that is available. The earlier global back-fill instead copied
    later rolling means (or means of the next shop/item series) into those rows.

    Args:
        df: Frame containing ``shop_id``, ``item_id``, ``date_block_num`` and
            ``lagged_1``. The row index must be unique, because the rolling
            result is aligned back on it.
        rolling: Window sizes, one output column per entry.

    Returns:
        A sorted copy of ``df`` with the rolling columns appended.

    Raises:
        AssertionError: if ``lagged_1`` is missing.
    """
    df_features = df.copy()

    assert "lagged_1" in df_features.columns, "lagged_1 column must be present in the dataframe to create roll for past months"

    series_keys = ["shop_id", "item_id"]
    df_features = df_features.sort_values([*series_keys, "date_block_num"])
    for roll in rolling:
        windowed_mean = (
            df_features.groupby(series_keys)['lagged_1']
            .rolling(roll, min_periods=1)
            .mean()
            # Drop the two group-key levels so only the original row index remains.
            .reset_index(level=[0, 1], drop=True)
        )
        # Same "<prefix>_<window>" naming scheme as the col_name helper.
        df_features[f"rolling_{roll}"] = windowed_mean

    return df_features

def drop_merged(df: pd.DataFrame) -> pd.DataFrame:
    """Remove rows flagged ``right_only`` in ``_merge`` and drop the merge helper columns.

    Rows are removed by index label, then the ``_merge`` and ``item_cnt_month_y``
    columns are dropped. The input frame is left untouched.
    """
    right_only_labels = df.index[df['_merge'] == "right_only"]
    keep_rows = ~df.index.isin(right_only_labels)
    return df.loc[keep_rows].drop(columns=['_merge', 'item_cnt_month_y'])

def build_features(df: pd.DataFrame, lagged_features: list[int], rolling_features: list[int]) -> pd.DataFrame:
    """Run the month, lag and rolling feature builders in that order.

    Args:
        df: Input frame handed to ``build_month_features``.
        lagged_features: Lags forwarded to ``build_lagged_features``.
        rolling_features: Windows forwarded to ``build_rolling_features``.

    Returns:
        The featurized frame; it must have as many rows as ``df``.
    """
    with_month = build_month_features(df)
    with_lags = build_lagged_features(with_month, lags=lagged_features)
    df_featurized = build_rolling_features(with_lags, rolling=rolling_features)

    # None of the builders may add or drop rows.
    assert df_featurized.shape[0] == df.shape[0]

    return df_featurized

In [None]:
# Lags (in rows/months) and rolling-window sizes used by the feature builders.
lagged_features = [1, 2, 3]
rolling_features = [3, 6]

# Column roles for modelling: categorical inputs, numeric inputs and the target.
cols = {
    'cat': ['shop_id', 'item_category_id', 'general_item_category_id', 'city_id'],
    'num': [
        'month_sin',
        'month_cos',
        *col_name("lagged", lagged_features),
        *col_name("rolling", rolling_features),
        # + col_name("item_name_tfidf", list(range(0, 1000)))
    ],
    'target': "item_cnt_month",
}

In [None]:
dtm = df_agg_monthly_oversampled[(df_agg_monthly_oversampled.shop_id == 0) & (df_agg_monthly_oversampled.item_id == 30)]

In [None]:
dt = df[(df.shop_id == 0) & (df.item_id == 30)]

In [None]:
dt.groupby(["date_block_num", "shop_id", "item_id"]).apply(display)

In [None]:
dtm

In [None]:
df_agg_monthly_oversampled

In [None]:
def aggregate_historical_features_and_merge(
    df_monthly: pd.DataFrame,
    df_daily: pd.DataFrame,
    index_cols: list[str],
    agg_col: str
) -> pd.DataFrame:
    """Attach, per row of ``df_monthly``, the last known monthly mean of ``agg_col``.

    The daily frame is averaged per ``index_cols`` and joined onto ``df_monthly``.
    Within each series (all of ``index_cols`` except the first) the averages are
    carried forward in time and shifted by one row, so every row receives the most
    recent average observed *strictly before* it. Rows with no earlier observation
    keep NaN.

    All filling is done per series and only on the new column. A frame-wide
    back-/forward-fill would copy values between unrelated series (rows are
    ordered by date first) and would also overwrite NaNs in every other column.

    Args:
        df_monthly: Monthly frame containing ``index_cols``.
        df_daily: Daily frame containing ``index_cols`` and ``agg_col``.
        index_cols: Join keys. The first entry is the time column
            (e.g. ``date_block_num``); the remaining ones identify a series and
            there must be at least one of them.
        agg_col: Daily column to average.

    Returns:
        ``df_monthly`` (same rows, same order, fresh RangeIndex) plus one column
        named ``avg_<keys>_<agg_col>_lag_1``, where ``<keys>`` are the series keys
        with the ``_id`` suffix removed, joined by ``_``.
    """
    date_col = index_cols[0]
    cross_cols = index_cols[1:]  # series keys: everything except the time column
    new_column_name = f"avg_{'_'.join([x.split('_id')[0] for x in cross_cols])}_{agg_col}"
    lag_column_name = f'{new_column_name}_lag_1'

    monthly_avg = (
        df_daily.groupby(index_cols)
        .agg({agg_col: "mean"})
        .reset_index()
        .rename(columns={agg_col: lag_column_name})
    )
    merged = df_monthly.merge(monthly_avg, on=index_cols, how='left')

    # Work on a chronologically ordered view of each series; the results are
    # aligned back to `merged` through its (unique) RangeIndex.
    ordered = merged.sort_values(cross_cols + [date_col], kind="stable")
    series_keys = [ordered[key] for key in cross_cols]
    last_known = ordered[lag_column_name].groupby(series_keys, sort=False).ffill()
    merged[lag_column_name] = last_known.groupby(series_keys, sort=False).shift(1)

    return merged

In [None]:
# Attach the shop x item history of the daily price and of the daily count, in that order.
shop_item_index_cols = ['date_block_num', 'shop_id', 'item_id']
for daily_col in ('item_price', 'item_cnt_day'):
    df_agg_monthly_oversampled = aggregate_historical_features_and_merge(
        df_agg_monthly_oversampled,
        df_daily=df,
        index_cols=shop_item_index_cols,
        agg_col=daily_col,
    )

In [None]:
df_agg_monthly_oversampled

In [None]:
# Mean item price per (month, shop, item), computed from the daily frame.
avg_shop_item_price_index = df.groupby(["date_block_num", "shop_id", "item_id"]).agg({"item_price": "mean"}).reset_index().rename(columns={"item_price": "avg_shop_item_price"})
# Previous observed mean per (shop, item).
# NOTE(review): .bfill() runs over the whole column, not per group, so a series'
# first (NaN) entry is filled from whatever row follows it — the frame is ordered
# by month first, so that row may belong to another shop/item.
avg_shop_item_price_index['avg_shop_item_price_lag_1'] = avg_shop_item_price_index.groupby(["shop_id", "item_id"])['avg_shop_item_price'].shift(1).bfill()
# Keep only the lagged column.
avg_shop_item_price_index = avg_shop_item_price_index.drop(columns=['avg_shop_item_price'])

In [None]:
df_agg_monthly_oversampled.merge(avg_shop_item_price_index, on=["date_block_num", "shop_id", "item_id"], how='left').ffill()

In [None]:
avg_item_price_index = dt.groupby(["date_block_num", "item_id"]).agg({"item_price": "mean"}).reset_index().rename(columns={"item_price": "avg_item_price"})
avg_item_price_index['avg_item_price_lag_1'] = avg_item_price_index.groupby(["item_id"])['avg_item_price'].shift(1).bfill()

In [None]:
avg_shop_category_price_index = dt.groupby(["date_block_num", "shop_id", "item_category_id"]).agg({"item_price": "mean"}).reset_index().rename(columns={"item_price": "avg_shop_category_price"})
avg_shop_category_price_index['avg_shop_category_lag_1'] = avg_shop_category_price_index.groupby(["shop_id", "item_category_id"])['avg_shop_category_price'].shift(1).bfill()

In [None]:
avg_category_price_index = dt.groupby(["date_block_num", "item_category_id"]).agg({"item_price": "mean"}).reset_index().rename(columns={"item_price": "avg_category_price"})
avg_category_price_index['avg_category_lag_1'] = avg_category_price_index.groupby(["item_category_id"])['avg_category_price'].shift(1).bfill()

In [None]:
avg_shop_item_cnt_index = dt.groupby(["date_block_num", "shop_id", "item_id"]).agg({"item_cnt_day": "mean"}).reset_index().rename(columns={"item_cnt_day": "avg_shop_item_cnt"})
avg_shop_item_cnt_index['avg_shop_item_cnt_lag_1'] = avg_shop_item_cnt_index.groupby(["shop_id", "item_id"])['avg_shop_item_cnt'].shift(1).bfill()

In [None]:
avg_item_cnt_index = dt.groupby(["date_block_num", "item_id"]).agg({"item_cnt_day": "mean"}).reset_index().rename(columns={"item_cnt_day": "avg_item_cnt"})
avg_item_cnt_index['avg_item_cnt_lag_1'] = avg_item_cnt_index.groupby(["item_id"])['avg_item_cnt'].shift(1).bfill()

In [None]:
avg_shop_category_cnt_index = dt.groupby(["date_block_num", "shop_id", "item_category_id"]).agg({"item_cnt_day": "mean"}).reset_index().rename(columns={"item_cnt_day": "avg_shop_category_cnt"})
avg_shop_category_cnt_index['avg_shop_category_cnt_lag_1'] = avg_shop_category_cnt_index.groupby(["shop_id", "item_category_id"])['avg_shop_category_cnt'].shift(1).bfill()

In [None]:
avg_category_cnt_index = dt.groupby(["date_block_num", "item_category_id"]).agg({"item_cnt_day": "mean"}).reset_index().rename(columns={"item_cnt_day": "avg_category_cnt"})
avg_category_cnt_index['avg_category_cnt_lag_1'] = avg_category_cnt_index.groupby(["item_category_id"])['avg_category_cnt'].shift(1).bfill()

In [None]:
df_agg_monthly_oversampled

In [None]:
def p(df):
    """Return, for each row in order, the number of rows since the last row with a sale.

    The counter starts at 0 on the first row, grows by one for every row without
    a sale (``item_cnt_month`` not greater than 0) and restarts at 0 on the row
    that follows a sale.
    """
    months_since_last_buy = []
    elapsed = 0
    for record in df.to_dict("records"):
        months_since_last_buy.append(elapsed)
        # A sale in this row resets the counter for the next one.
        elapsed = 0 if record['item_cnt_month'] > 0 else elapsed + 1
    return months_since_last_buy

In [None]:
# NOTE(review): dtm2 is only created further down (dtm2 = dtm.copy()), so on a
# fresh kernel this cell raises NameError unless that later cell is run first.
dtm2['months_since_last_buy_shop_item3'] = p(dtm)

In [None]:
df_agg_monthly_oversampled.groupby(["shop_id", "item_id"]).apply(p)

In [None]:
dtm2[['date_block_num', 'item_cnt_month', 'months_since_last_buy_shop_item3']]

In [None]:
dtm2 = dtm.copy()
# For rows with sales: number of months between this sale and the previous one in
# the same (shop, item) series, minus one. Rows without sales are left NaN by the
# index alignment of the assignment.
dtm2['months_since_last_buy_shop_item'] = dtm2[dtm2.item_cnt_month != 0].groupby(["shop_id", "item_id"])['date_block_num'].diff() - 1
# The former third statement assigned a whole multi-column DataFrame to the single
# column 'months_since_last_buy_shop_item2', which pandas rejects with ValueError.
# It has been removed.

In [None]:
dtm2['months_since_last_buy_shop_item2'] = dtm2.months_since_last_buy_shop_item.ffill()

In [None]:
dtm2

In [None]:
#dt.pipe(build_lagged_features, lags=[1, 2]).pipe(build_rolling_features, rolling=[1,2,3])

In [None]:
build_features(df_agg_monthly_oversampled, lagged_features, rolling_features).sort_values(by=['shop_id', 'item_id', 'date_block_num'])

In [None]:
#train_split, test_split = timeseries_split(df_agg_monthly_oversampled, max_month=33, col='date_block_num', continuous=False)

In [None]:
# Correlation between the numeric features and the target on the featurized training split.
# (Removed: a rolling-mean expression whose result was discarded, and a second
# `import seaborn as sns` — seaborn is already imported in the imports cell.)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(train_split_featurized[cols['num'] + [cols['target']]].corr(), annot=True, fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
target_col = cols['target']
#train_target, test_target = train_split_featurized[target_col].clip(0, 20), test_split_featurized[target_col].clip(0, 20)
train_target, test_target = train_split_featurized[target_col], test_split_featurized[target_col]

In [None]:
if False:
    train_split_featurized.to_parquet(".data/train_split_featurized.parquet")
    test_split_featurized.to_parquet(".data/test_split_featurized.parquet")

In [None]:
ohe_cols = cols['cat']
ohe = preprocessing.OneHotEncoder(handle_unknown='ignore')
ohe.fit(train_split_featurized[ohe_cols])

X_train_cat, X_test_cat = ohe.transform(train_split_featurized[ohe_cols]), ohe.transform(test_split_featurized[ohe_cols])

In [None]:
num_cols = cols['num']

X_train_num, X_test_num = train_split_featurized[num_cols], test_split_featurized[num_cols]

In [None]:
X_train = hstack([X_train_cat, X_train_num]).tocsr()
X_test = hstack([X_test_cat, X_test_num]).tocsr()

In [None]:
# def baseline_naive_mean_model(X):
#     return train_target.mean().repeat(X.shape[0])

# y_pred = baseline_naive_mean_model(X_test)
# rmse = metrics.root_mean_squared_error(test_target.values, y_pred)
# print("Baseline model Test RMSE: ", rmse)

In [None]:
params = {
    'num_leaves': 91,
    'max_depth': 37,
    'learning_rate': 0.033470401293385826,
    'n_estimators': 1748,
    'reg_alpha': 0.6471314252482143,
    'reg_lambda': 2.9415585687282055,
    'colsample_bytree': 0.3,
    'subsample': 0.8,
    'min_child_samples': 62,
    'random_state': 42
}
model = lgb.LGBMRegressor(**params)
model.fit(X_train, train_target)

y_pred = model.predict(X_test)
rmse = metrics.root_mean_squared_error(test_target.values, y_pred)
print("LGBM model Test RMSE: ", rmse)

In [None]:
# NOTE(review): df_full is not assigned in any cell visible here — confirm where it
# comes from; on a fresh kernel this line would raise NameError.
train_split, test_split = timeseries_split(df_full, max_month=34, col='date_block_num', continuous=False)

In [None]:
train_split_featurized, test_split_featurized = build_features(train_split, lagged_features, rolling_features), build_features(test_split, lagged_features, rolling_features)

In [None]:
target_col = cols['target']
# train_target, test_target = train_split_featurized[target_col].clip(0, 20), test_split_featurized[target_col].clip(0, 20)
train_target, test_target = train_split_featurized[target_col], test_split_featurized[target_col]

In [None]:
ohe_cols = cols['cat']
ohe = preprocessing.OneHotEncoder(handle_unknown='ignore')
ohe.fit(train_split_featurized[ohe_cols])

X_train_cat, X_test_cat = ohe.transform(train_split_featurized[ohe_cols]), ohe.transform(test_split_featurized[ohe_cols])

In [None]:
num_cols = cols['num']

X_train_num, X_test_num = train_split_featurized[num_cols], test_split_featurized[num_cols]

In [None]:
X_train = hstack([X_train_cat, X_train_num]).tocsr()
X_test = hstack([X_test_cat, X_test_num]).tocsr()

In [None]:
# def baseline_naive_mean_model(X):
#     return train_target.mean().repeat(X.shape[0])

# y_pred = baseline_naive_mean_model(X_test)

# evaluation_dataset = test_split[['shop_id', 'item_id']]
# evaluation_dataset = evaluation_dataset.assign(item_cnt_month=y_pred)
# build_submission_df(evaluation_dataset, save_path=".data/submission_naive_mean.csv")

In [None]:
model = lgb.LGBMRegressor(**params)
model.fit(X_train, train_target)

y_pred = model.predict(X_test)

In [None]:
metrics.root_mean_squared_error(train_target.values, model.predict(X_train))

In [None]:
evaluation_dataset = test_split[['shop_id', 'item_id']]
evaluation_dataset = evaluation_dataset.assign(item_cnt_month=y_pred.clip(0, 20))
build_submission_df(evaluation_dataset, save_path=".data/submission_xgb4.csv")

In [None]:
import optuna

def objective(trial):
    """Optuna objective: fit a LightGBM regressor with sampled parameters and return its RMSE.

    Uses the notebook-level ``X_train`` / ``X_test`` matrices together with
    ``train_target`` / ``test_target``. Those targets are taken from the same
    featurized (re-sorted) frames the matrices were built from, so rows and labels
    stay aligned; the row order of ``train_split`` / ``test_split`` is not
    guaranteed to match.

    Args:
        trial: The ``optuna`` trial used to sample hyper-parameters.

    Returns:
        RMSE of the predictions on ``X_test`` (lower is better).
    """
    param = {
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', -1, 50),
        # The range spans five orders of magnitude, so sample it on a log scale.
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 200, 2000),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # The trial name matches the LightGBM keyword so study.best_params can be
        # passed straight back to LGBMRegressor.
        'cat_smooth': trial.suggest_int('cat_smooth', 1, 100),
        'random_state': 42,
        'verbose': -1
    }

    model = lgb.LGBMRegressor(**param)
    model.fit(X_train, train_target.values)

    y_pred = model.predict(X_test)
    rmse = metrics.root_mean_squared_error(test_target.values, y_pred)
    return rmse

# Create a study and optimize the hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Best parameters and score
print("Best Parameters:", study.best_params)
print("Best Score:", study.best_value)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

def train_sklearn_model(df, model, cols, target_col='item_cnt_month_clipped', max_month=33):
    """Fit ``model`` on a time-based split of ``df`` and print train/test RMSE.

    Categorical columns are one-hot encoded and stacked with the numeric columns
    into a sparse matrix. The encoder is fitted on the training split only, so no
    information from the held-out rows reaches preprocessing; categories that only
    appear in the test split are encoded as all zeros
    (``handle_unknown='ignore'``).

    Args:
        df: Featurized frame containing ``date_block_num``, the columns named in
            ``cols`` and ``target_col``.
        model: Estimator exposing ``fit`` / ``predict``.
        cols: Dict with ``'cat'`` and ``'num'`` lists of column names.
        target_col: Name of the target column.
        max_month: Month passed to ``timeseries_split`` to separate train from test.

    Returns:
        The fitted ``model``.
    """
    train_split, test_split = timeseries_split(df, max_month, col='date_block_num', continuous=False)

    ohe_cols = cols['cat']
    ohe = preprocessing.OneHotEncoder(handle_unknown='ignore')
    ohe.fit(train_split[ohe_cols])

    X_train_cat = ohe.transform(train_split[ohe_cols])
    X_test_cat = ohe.transform(test_split[ohe_cols])

    num_cols = cols['num']
    X_train_num = train_split[num_cols]
    X_test_num = test_split[num_cols]

    # CSR layout for efficient row access by the estimators.
    X_train = hstack([X_train_cat, X_train_num]).tocsr()
    X_test = hstack([X_test_cat, X_test_num]).tocsr()

    y_train = train_split[target_col]
    y_test = test_split[target_col]

    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    train_rmse = metrics.root_mean_squared_error(y_train, y_pred_train)
    y_pred_test = model.predict(X_test)
    test_rmse = metrics.root_mean_squared_error(y_test, y_pred_test)
    print(f'Train RMSE: {train_rmse} \nTest RMSE: {test_rmse}')

    return model

# Column roles for the sklearn models: two categorical ids, the cyclical month
# encoding, lags 1..7 and rolling windows 3/6/12/24.
feature_columns = {
    'cat': ['shop_id', 'item_category_id'],
    'num': (
        ['month_sin', 'month_cos']
        + [f'lagged_{lag}' for lag in range(1, 8)]
        + [f'rolling_{window}' for window in (3, 6, 12, 24)]
    ),
}

In [None]:
linear_model = train_sklearn_model(df, model=LinearRegression(), cols=feature_columns)

In [None]:
nn_model = train_sklearn_model(df, model=MLPRegressor(max_iter=100, hidden_layer_sizes=[256, ], verbose=True), cols=feature_columns)

In [None]:
# NOTE(review): df_test and gbm are not defined in any cell visible here, and the
# `ohe` fitted in the visible cells is fitted on cols['cat'], which is a different
# column set from the one passed below — confirm this cell is still meant to run.
df_test_transformed = ohe.transform(df_test[['shop_id', 'item_id', 'item_category_id']])
test_data = lgb.Dataset(df_test_transformed)
df_test_predictions = df_test.assign(item_cnt_month=gbm.predict(df_test_transformed, num_iteration=gbm.best_iteration))

In [None]:
def mean_n_last(x: np.ndarray, n: int = 3) -> float:
    """Mean of the last ``n`` elements of ``x`` (of all elements if ``x`` is shorter).

    Args:
        x: One-dimensional array-like exposing ``.size``, slicing and ``.mean()``.
        n: How many trailing elements to average; must be at least 1.

    Returns:
        The mean of the selected tail, or NaN for empty input.

    Raises:
        ValueError: if ``n`` is smaller than 1. With ``n == 0`` the slice
            ``x[-0:]`` would silently select the whole array.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    if x.size == 0:
        # Nothing to average: return NaN explicitly.
        return float("nan")
    return x[-min(n, x.size):].mean()

def aggregate_monthly(df: pd.DataFrame, agg_fn: Callable, agg_scope_name: str) -> pd.DataFrame:
    """For each row, aggregate ``item_price`` over all rows of strictly earlier months.

    Args:
        df: Frame with ``date_block_num`` and ``item_price`` columns (typically one
            group of a groupby). It is not modified.
        agg_fn: Reduction applied to the earlier prices (e.g. ``np.mean``).
        agg_scope_name: Suffix used in the names of the two new columns.

    Returns:
        A frame with a fresh RangeIndex and three columns, in input row order:
        ``date_block_num``, ``prev_item_price_agg__<scope>`` (NaN when there is no
        earlier month) and ``is_prev_item_price__<scope>``, which is 1 where the
        aggregate is NaN and 0 otherwise.
    """
    new_cols = [f'prev_item_price_agg__{agg_scope_name}', f'is_prev_item_price__{agg_scope_name}']

    # "Earlier" is decided by comparing month numbers, never by row position, so
    # unsorted input is handled and an empty frame simply yields an empty mapping.
    mapping = {}
    for month in np.sort(df['date_block_num'].unique()):
        earlier_prices = df.loc[df['date_block_num'] < month, 'item_price']
        mapping[month] = agg_fn(earlier_prices) if len(earlier_prices) else np.nan

    # Build the result on a new frame instead of adding columns to the caller's data.
    result = df[["date_block_num"]].reset_index(drop=True)
    result[new_cols[0]] = result['date_block_num'].map(mapping)
    result[new_cols[1]] = result['date_block_num'].map({k: int(np.isnan(v)) for k, v in mapping.items()})

    return result

def build_monthly_item_price_features(df: pd.DataFrame, agg_cols: list[str], agg_fun: Callable) -> pd.DataFrame:
    """Compute previous-month price aggregates per ``agg_cols`` group and join them onto ``df``.

    The scope suffix of the new columns is chosen from the first grouping column:
    ``"local"`` for ``shop_id`` and ``"global"`` for ``item_id`` (any other first
    column raises KeyError).

    Args:
        df: Frame with the grouping columns, ``date_block_num`` and ``item_price``.
        agg_cols: Columns to group by.
        agg_fun: Reduction forwarded to ``aggregate_monthly``.

    Returns:
        ``df`` left-joined with the per-group, per-month features.
    """
    scope_by_leading_key = {"shop_id": "local", "item_id": "global"}
    agg_scope_name = scope_by_leading_key[agg_cols[0]]

    per_group = df.groupby(agg_cols).apply(aggregate_monthly, agg_fn=agg_fun, agg_scope_name=agg_scope_name)
    # Drop the innermost (per-group row) index level, then turn the group keys into columns.
    new_features = per_group.reset_index(level=-1, drop=True).reset_index()
    new_features = new_features.drop_duplicates()

    merge_keys = [*agg_cols, "date_block_num"]
    return pd.merge(df, new_features, on=merge_keys, how='left')

In [None]:
# df_shop_5 = df_train[df_train.shop_id < 10]
# df_shop_5 = build_monthly_item_price_features(df_shop_5, agg_cols=["shop_id", "item_id"], agg_fun=np.mean)
# df_shop_5 = build_monthly_item_price_features(df_shop_5, agg_cols=["item_id"], agg_fun=np.mean)

In [None]:
df_shop_price_feat = build_monthly_item_price_features(df_train, agg_cols=["shop_id", "item_id"], agg_fun=np.mean)
df_shop_price_feat = build_monthly_item_price_features(df_shop_price_feat, agg_cols=["item_id"], agg_fun=np.mean)

In [None]:
feature_store_previous_price = df_shop_price_feat.copy()

In [None]:
# Reduce the feature store to the key columns plus the four previous-price features,
# one row per distinct combination, ordered by key, with missing values set to 0.
previous_price_store_columns = [
    'date_block_num',
    'shop_id',
    'item_id',
    'prev_item_price_agg__local',
    'is_prev_item_price__local',
    'prev_item_price_agg__global',
    'is_prev_item_price__global',
]
feature_store_previous_price = (
    feature_store_previous_price[previous_price_store_columns]
    .drop_duplicates()
    .sort_values(['date_block_num', 'shop_id', 'item_id'])
    .fillna(0)
)

In [None]:
df

In [None]:
df.merge(feature_store_previous_price, on=['date_block_num', 'shop_id', 'item_id'])

In [None]:
feature_store_previous_price

In [None]:
feature_store_previous_price[feature_store_previous_price.is_prev_item_price__local == 0]['prev_item_price_agg__local'].describe()