In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
# Step up one directory; the relative ".data/..." paths read below are resolved
# against the resulting working directory.
# NOTE(review): not idempotent — re-running this cell moves up one more level each time.
os.chdir('..')

In [3]:
from typing import Callable

import lightgbm as lgb
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.sparse import hstack
from sklearn import preprocessing, pipeline, compose, model_selection, metrics
from sklearn.pipeline import Pipeline

from sales_forecasting.utils import timeseries_split, kfold_timeseries_split

In [4]:
# Load the train, test and full frames from parquet and the item table from CSV.
# All paths are relative to the current working directory.
df_train = pd.read_parquet(".data/df_train.parquet")
df_test = pd.read_parquet(".data/df_test.parquet")
df_full = pd.read_parquet(".data/df_full.parquet")
df_items = pd.read_csv(".data/items.csv")

In [None]:
# Map every date_block_num to its zero-based calendar month (January == 0).
date_block_num_date_month_map = (
    df_train[['date_block_num', 'date']]
    .assign(date_month=df_train['date'].dt.month - 1)
    .drop_duplicates(subset=["date_block_num", "date_month"])
    .set_index('date_block_num')['date_month']
    .to_dict()
)

# Monthly sales per (shop, item, month). The built-in grouped sum is used instead of a
# per-group Python lambda: same result, but vectorised.
df_train_monthly_sum_index = (
    df_train
    .groupby(["shop_id", "item_id", "date_block_num"])["item_cnt_day"]
    .sum()
    .reset_index(name="item_cnt_month")
)

# Whatever columns remain after dropping the daily fields, de-duplicated.
# NOTE(review): if any remaining column varies within one (shop, item, month) key, the
# merge below yields more than one row for that key — confirm these columns are constant per key.
df_train_monthly_features = df_train.drop(columns=["date", "item_price", "item_cnt_day", "item_cnt_day_clipped"]).drop_duplicates()
df_train_monthly_sum = df_train_monthly_sum_index.merge(df_train_monthly_features, on=['shop_id', 'item_id', 'date_block_num'])

# Add the clipped target (range [0, 20]) and the calendar month without mutating in place.
df_train_monthly_sum = df_train_monthly_sum.assign(
    item_cnt_month_clipped=df_train_monthly_sum["item_cnt_month"].clip(0, 20),
    date_month=df_train_monthly_sum['date_block_num'].map(date_block_num_date_month_map),
)

In [6]:
# Order both frames by month, then shop, then item, and renumber the rows from 0.
df = (
    df_train_monthly_sum
    .sort_values(by=["date_block_num", "shop_id", "item_id"])
    .reset_index(drop=True)
)
df_test = (
    df_test
    .sort_values(by=["date_block_num", "shop_id", "item_id"])
    .reset_index(drop=True)
)

In [None]:
def oversample_item_cnt_month(df, max_month):
    """Spread one sales history over a dense month grid ``0..max_month``.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide an integer ``date_block_num`` column and a numeric
        ``item_cnt_month`` column. If a month occurs more than once, the last
        occurrence wins.
    max_month : int
        Last month index (inclusive) of the produced grid.

    Returns
    -------
    pd.DataFrame
        ``max_month + 1`` rows with columns ``date_block_num`` and
        ``item_cnt_month``; months missing from ``df`` carry a count of 0.0.
        Both columns are float64 because they are stacked into one array.

    Raises
    ------
    IndexError
        If any ``date_block_num`` lies outside ``[0, max_month]``. Negative
        values are rejected explicitly; otherwise numpy would silently wrap
        them to the end of the grid.
    """
    columns = ['date_block_num', 'item_cnt_month']

    month_grid = np.arange(0, max_month + 1, dtype=int)
    counts = np.zeros_like(month_grid, dtype=float)

    observed_months = df['date_block_num'].values
    if observed_months.size and (observed_months.min() < 0 or observed_months.max() > max_month):
        raise IndexError(
            f"date_block_num values must lie within [0, {max_month}]"
        )
    counts[observed_months] = df['item_cnt_month'].values

    return pd.DataFrame(np.column_stack([month_grid, counts]), columns=columns)

# Densify every (shop, item) series: one row per month from 0 up to the latest month in the data.
df_oversample = (
    df[['date_block_num', 'shop_id', 'item_id', 'item_cnt_month']]
    .sort_values(by=["shop_id", "item_id", "date_block_num"])
)
df_oversample = (
    df_oversample
    .groupby(["shop_id", "item_id"])
    .apply(oversample_item_cnt_month, max_month=df_oversample['date_block_num'].max())
    .reset_index(level=-1, drop=True)  # discard the innermost (per-group row) index level
    .reset_index()                     # turn the group keys back into columns
    .astype({"date_block_num": np.int64, "item_cnt_month": np.float64})
)
# df_oversample.to_parquet("df_oversampled.parquet")

In [None]:
# df_valid = df_oversample[(df_oversample.date_block_num == 33) & (df_oversample.item_cnt_month != 0.0)]

# for i in df_valid.sort_values(["shop_id", "item_id", "date_block_num"]).head(40).iterrows():
#     id, row = i

#     display(df_oversample[(df_oversample.shop_id == row.shop_id) & (df_oversample.item_id == row.item_id)])
#     display(df_valid[(df_valid.shop_id == row.shop_id) & (df_valid.item_id == row.item_id)])

In [None]:
# for i in df_test.sort_values(["shop_id", "item_id", "date_block_num"]).head(40).iterrows():
#     id, row = i

#     display(df_oversample[(df_oversample.shop_id == row.shop_id) & (df_oversample.item_id == row.item_id)])
#     display(df_test[(df_test.shop_id == row.shop_id) & (df_test.item_id == row.item_id)])

In [47]:
# Attach the item category of every (shop, item) pair seen in the training data.
# NOTE(review): re-running this cell merges `item_category_id` a second time (suffixed columns).
df_oversample = df_oversample.merge(
    df_train[['shop_id', "item_id", "item_category_id"]].drop_duplicates(),
    on=["shop_id", "item_id"],
    how='left',
)
df_oversample['date_month'] = df_oversample['date_block_num'].map(date_block_num_date_month_map)

# Cyclical month encoding: one full turn (2*pi) per 12 months, so month 11 lands next to
# month 0. A pi/12 step would span only half a circle and leave them far apart.
df_oversample['month_sin'] = np.sin(2 * np.pi / 12 * df_oversample['date_month'])
df_oversample['month_cos'] = np.cos(2 * np.pi / 12 * df_oversample['date_month'])

In [50]:
# Per (shop, item) series, add `date_block_num` shifted down by 1 and by 2 rows.
# NOTE(review): the shifted column is the month index itself, not a sales count —
# confirm that `item_cnt_month` was not the intended column.
lagged_features_list = [1, 2]
df_oversample = df_oversample.assign(**{
    f"lagged_date_block_num_{i}": df_oversample.groupby(["shop_id", "item_id"])["date_block_num"].shift(i)
    for i in lagged_features_list
})

In [None]:
# Display the densified series of one (shop, item) pair.
# NOTE(review): requiring shop_id == 0 and shop_id == 30 at once can never match, so the
# second condition is assumed to be meant for item_id — confirm the intended pair.
df_oversample[(df_oversample.shop_id == 0) & (df_oversample.item_id == 30)].groupby(["shop_id", "item_id"]).apply(display)

In [26]:
# One-hot encode the identifier columns of the test frame with the module-level `ohe`.
# NOTE(review): `ohe` is only created in a cell further down this notebook, so this cell
# fails on a fresh top-to-bottom run.
df_test_transformed = ohe.transform(df_test[['shop_id', 'item_id', 'item_category_id']])

# Wrap the encoded matrix as a LightGBM Dataset (no label attached).
test_data = lgb.Dataset(df_test_transformed)

In [None]:
test_data.data

In [None]:
# Predict on the encoded test rows, using the booster's `best_iteration`.
# NOTE(review): `gbm` is assigned only in later cells of this notebook.
y_test_pred = gbm.predict(df_test_transformed, num_iteration=gbm.best_iteration)
y_test_pred

In [None]:
df_test

In [39]:
# Read test.csv and build a copy of `df_test` carrying the predictions as `item_cnt_month`.
# NOTE(review): assumes `y_test_pred` is in the same row order as `df_test` — confirm.
df_test_raw = pd.read_csv(".data/test.csv")
df_test_predictions = df_test.assign(item_cnt_month=y_test_pred)

In [None]:
df_test_raw

In [53]:
# Join the predictions onto the raw test rows by (shop, item).
# A left join keeps every test row. `validate` makes pandas raise if either side has a
# duplicated key, so the submission can neither lose nor multiply rows unnoticed.
submission = df_test_raw.merge(
    df_test_predictions[["shop_id", "item_id", "item_cnt_month"]],
    on=["shop_id", "item_id"],
    how="left",
    validate="one_to_one",
)[['ID', 'item_cnt_month']]

# Every test row must have received a prediction before the file is written.
assert submission['item_cnt_month'].notna().all(), "some test rows have no prediction"
submission.to_csv(".data/submission.csv", index=False)


In [None]:
submission[['item_cnt_month']].describe()

In [None]:
# Fit a one-hot encoder on the three identifier columns of `df`.
# handle_unknown='ignore': categories not seen during fit do not raise at transform time.
ohe_cols = ['shop_id', 'item_id', 'item_category_id']
ohe = preprocessing.OneHotEncoder(handle_unknown='ignore')
ohe.fit(df[ohe_cols])

In [None]:
ohe.transform(df[ohe_cols])

In [None]:
np.random.randn(1609124, 2)

In [40]:

# Assuming `encoded_data` is your sparse one-hot encoded data,
# and `numerical_data` is a numpy array with shape (n_samples, 2)

# Combine sparse one-hot encoded data with dense numerical data



In [126]:
def train_lgb(df, params, k_min=1, k_max=33, num_boost_round=100):
    """Train a LightGBM model once per fold and return the booster of the last fold.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``shop_id``, ``item_id``, ``item_category_id``,
        ``date_block_num``, ``date_month``, ``item_cnt_month_clipped`` and the
        four ``prev_item_price_*`` / ``is_prev_item_price_*`` columns.
        The frame is not modified.
    params : dict
        Passed unchanged to ``lgb.train``.
    k_min, k_max : int
        Forwarded to ``kfold_timeseries_split`` (fold range on ``date_block_num``).
    num_boost_round : int, default 100
        Number of boosting rounds per fold.

    Returns
    -------
    lightgbm.Booster
        The model trained on the last fold produced by the splitter.

    Raises
    ------
    ValueError
        If the splitter yields no fold at all.
    """
    ohe_cols = ['shop_id', 'item_id', 'item_category_id']
    dense_cols = [
        'month_sin', 'month_cos',
        "prev_item_price_agg__local", "is_prev_item_price__local",
        "prev_item_price_agg__global", "is_prev_item_price__global",
    ]

    # Build the month features on a new frame: callers often pass a column slice, and
    # writing into it would mutate their data (and trigger SettingWithCopyWarning).
    # The angle covers a full turn (2*pi) per 12 months so month 11 is adjacent to month 0.
    month_angle = 2 * np.pi / 12 * df['date_month']
    df = df.assign(month_sin=np.sin(month_angle), month_cos=np.cos(month_angle))

    # The encoder is fitted on all rows, so train and validation share one feature space.
    ohe = preprocessing.OneHotEncoder(handle_unknown='ignore')
    ohe.fit(df[ohe_cols])

    gbm = None
    folds = kfold_timeseries_split(df, col='date_block_num', k_min=k_min, k_max=k_max)
    for i, (train_split, test_split) in enumerate(folds):
        print(f"\n\nFold: {i}")

        # Sparse one-hot block followed by the dense numeric block, as one CSR matrix.
        train_combined = hstack([ohe.transform(train_split[ohe_cols]), train_split[dense_cols].to_numpy()]).tocsr()
        test_combined = hstack([ohe.transform(test_split[ohe_cols]), test_split[dense_cols].to_numpy()]).tocsr()

        train_data = lgb.Dataset(train_combined, label=train_split['item_cnt_month_clipped'])
        test_data = lgb.Dataset(test_combined, label=test_split['item_cnt_month_clipped'], reference=train_data)

        gbm = lgb.train(params, train_data, num_boost_round=num_boost_round, valid_sets=[train_data, test_data])

        print(gbm.best_score)

    if gbm is None:
        raise ValueError(f"kfold_timeseries_split produced no folds for k_min={k_min}, k_max={k_max}")
    return gbm

In [124]:
# Join the previous-price features onto the monthly frame by (month, shop, item);
# `merge` defaults to an inner join, so rows without a match on either side are dropped.
# NOTE(review): `feature_store_previous_price` is built in cells further down this notebook.
df_price_features_merged = df.merge(feature_store_previous_price, on=['date_block_num', 'shop_id', 'item_id'])

In [None]:
df_price_features_merged

In [138]:
# Lag features: for each (shop, item) series ordered by month, the clipped monthly count
# from 1, 2 and 3 rows earlier.
# NOTE(review): shift() steps by rows; that equals calendar months only if every pair
# has a row for every month — confirm for this frame.
sales_df = df_price_features_merged.sort_values(by=['shop_id', 'item_id', 'date_block_num'])
sales_df = sales_df.assign(**{
    f'sales_lag_{lag}': sales_df.groupby(['shop_id', 'item_id'])['item_cnt_month_clipped'].shift(lag)
    for lag in range(1, 4)
})


In [None]:
df_price_features_merged.groupby(['shop_id', 'item_id'])['date_block_num'].count().sort_values(ascending=False)

In [150]:
# Keep only the rows of shop 52 / item 1905.
# NOTE(review): this rebinds `sales_df`, replacing the lag-feature frame built in an earlier cell.
sales_df = df_price_features_merged[(df_price_features_merged.shop_id == 52) & (df_price_features_merged.item_id == 1905)]

In [None]:
sales_df.groupby(["shop_id", "item_id"])['item_cnt_month'].rolling(window=3).mean()

In [None]:
sales_df.head(50)

In [None]:
# LightGBM settings for the price-feature experiment.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=127,
    max_depth=40,
    learning_rate=0.05,
    feature_fraction=0.9,
)

# Identifier / target columns followed by the previous-price features.
model_columns = [
    'date_block_num', 'shop_id', 'item_id', 'item_category_id', 'date_month', 'item_cnt_month_clipped',
    "prev_item_price_agg__local", "is_prev_item_price__local",
    "prev_item_price_agg__global", "is_prev_item_price__global",
]

# k_min == k_max == 33 is forwarded to the fold splitter inside train_lgb.
gbm = train_lgb(df_price_features_merged[model_columns], params, k_min=33, k_max=33)

In [77]:
gbm.pandas_categorical

In [None]:
df_train.dtypes

In [85]:
def mean_n_last(x: np.ndarray, n: int = 3) -> float:
    """Return the mean of the trailing ``n`` entries of ``x``.

    When ``x`` holds fewer than ``n`` entries, all of them are averaged.
    """
    window = min(n, x.size)
    tail = x[-window:]
    return tail.mean()

def aggregate_monthly(df: pd.DataFrame, agg_fn: Callable, agg_scope_name: str) -> pd.DataFrame:
    """For every row, aggregate the prices of all strictly earlier months.

    Parameters
    ----------
    df : pd.DataFrame
        Needs ``date_block_num`` and ``item_price`` columns. Rows may be in any
        order. The frame is not modified.
    agg_fn : Callable
        Receives the ``item_price`` Series of all rows whose ``date_block_num``
        is smaller than the month being computed, and returns a scalar.
    agg_scope_name : str
        Suffix used in the two generated column names.

    Returns
    -------
    pd.DataFrame
        One row per input row (same order, fresh 0..n-1 index) with columns
        ``date_block_num``, ``prev_item_price_agg__<scope>`` and
        ``is_prev_item_price__<scope>``. The earliest month has no history, so its
        aggregate is NaN. The flag column is 1 where the aggregate is NaN (no
        previous price available) and 0 where a value exists.
    """
    agg_col = f'prev_item_price_agg__{agg_scope_name}'
    flag_col = f'is_prev_item_price__{agg_scope_name}'

    # Sort the months so "the first one" really is the earliest, regardless of row order.
    months = np.sort(df['date_block_num'].unique())

    mapping = {}
    for position, month in enumerate(months):
        if position == 0:
            mapping[month] = np.nan  # nothing precedes the earliest month
        else:
            mapping[month] = agg_fn(df.loc[df['date_block_num'] < month, 'item_price'])

    missing_flag = {month: int(np.isnan(value)) for month, value in mapping.items()}

    # Assemble a new frame rather than writing columns into the (possibly sliced) input.
    features = pd.DataFrame({
        "date_block_num": df['date_block_num'],
        agg_col: df['date_block_num'].map(mapping),
        flag_col: df['date_block_num'].map(missing_flag),
    })
    return features.reset_index(drop=True)

def build_monthly_item_price_features(df: pd.DataFrame, agg_cols: list[str], agg_fun: Callable) -> pd.DataFrame:
    """Append previous-month price aggregates, computed per ``agg_cols`` group, to ``df``.

    The first entry of ``agg_cols`` selects the column suffix: ``"shop_id"`` gives
    ``local`` and ``"item_id"`` gives ``global``; any other value raises ``KeyError``.
    Each group is processed by ``aggregate_monthly`` and the de-duplicated result is
    left-joined back onto ``df`` on the group keys plus ``date_block_num``.
    """
    scope_by_leading_key = {"shop_id": "local", "item_id": "global"}
    agg_scope_name = scope_by_leading_key[agg_cols[0]]

    per_group = df.groupby(agg_cols).apply(
        aggregate_monthly, agg_fn=agg_fun, agg_scope_name=agg_scope_name
    )
    new_features = (
        per_group
        .reset_index(level=-1, drop=True)  # drop the per-group row counter
        .reset_index()                     # group keys back to regular columns
        .drop_duplicates()
    )

    join_keys = [*agg_cols, "date_block_num"]
    return pd.merge(df, new_features, on=join_keys, how='left')

In [None]:
df_train

In [100]:
# df_shop_5 = df_train[df_train.shop_id < 10]
# df_shop_5 = build_monthly_item_price_features(df_shop_5, agg_cols=["shop_id", "item_id"], agg_fun=np.mean)
# df_shop_5 = build_monthly_item_price_features(df_shop_5, agg_cols=["item_id"], agg_fun=np.mean)

In [None]:
# Build the previous-price features in two passes: first grouped by (shop, item), which
# produces the "local" columns, then grouped by item alone on top of that result, which
# produces the "global" columns. np.mean is the aggregate in both passes.
df_shop_price_feat = build_monthly_item_price_features(df_train, agg_cols=["shop_id", "item_id"], agg_fun=np.mean)
df_shop_price_feat = build_monthly_item_price_features(df_shop_price_feat, agg_cols=["item_id"], agg_fun=np.mean)

In [107]:
# Start the feature store from a copy, leaving `df_shop_price_feat` itself untouched.
feature_store_previous_price = df_shop_price_feat.copy()

In [108]:
# Reduce the store to the join keys plus the four price features, one row per distinct
# combination, ordered by month / shop / item; missing values become 0.
feature_store_previous_price = (
    feature_store_previous_price[[
        'date_block_num', 'shop_id', 'item_id',
        'prev_item_price_agg__local', 'is_prev_item_price__local',
        'prev_item_price_agg__global', 'is_prev_item_price__global',
    ]]
    .drop_duplicates()
    .sort_values(by=['date_block_num', 'shop_id', 'item_id'])
    .fillna(0)
)

In [None]:
df

In [None]:
df.merge(feature_store_previous_price, on=['date_block_num', 'shop_id', 'item_id'])

In [None]:
feature_store_previous_price

In [None]:
feature_store_previous_price[feature_store_previous_price.is_prev_item_price__local == 0]['prev_item_price_agg__local'].describe()

In [7]:
def build_month_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with cyclical month features ``month_sin`` / ``month_cos``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a datetime-like ``date`` column (accessed through ``.dt``).
        The input frame is left unchanged.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` plus the sine and cosine of ``2*pi * (month - 1) / 12``.
        The angle covers one full turn per year, so December and January end up
        adjacent on the unit circle. A ``pi/12`` step would span only half a
        circle and place them far apart.
    """
    df_features = df.copy()

    month_angle = 2 * np.pi / 12 * (df['date'].dt.month - 1)
    df_features['month_sin'] = np.sin(month_angle)
    df_features['month_cos'] = np.cos(month_angle)

    return df_features

In [8]:
# Add month_sin / month_cos. build_month_features reads `df['date']` through `.dt`,
# so `df` must carry a datetime-like `date` column at this point.
df = build_month_features(df)

In [5]:
# Clip the daily counts to [0, 20] (written into `df` in place), then keep only the
# month index, the three identifier columns and the clipped count.
df['item_cnt_day'] = df['item_cnt_day'].clip(0, 20)
df = df[['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'item_cnt_day']]

In [6]:
# Split into features and target; the double brackets keep `y` as a one-column DataFrame.
X = df.drop(columns=["item_cnt_day"])
y = df[['item_cnt_day']]

In [None]:
# No numeric columns are scaled at the moment; the three identifiers are one-hot encoded.
num_columns = []
cat_columns = ['shop_id', 'item_id', 'item_category_id']

preprocessor = compose.ColumnTransformer(
    transformers=[
        ('num', preprocessing.StandardScaler(), num_columns),
        # handle_unknown='ignore' matches the other encoders in this notebook: categories
        # absent from the training split must not make transform() raise.
        ('cat', preprocessing.OneHotEncoder(handle_unknown='ignore'), cat_columns)
    ])

# The variable keeps the name `pipeline` because later cells call pipeline.fit / transform.
# That assignment rebinds the imported `sklearn.pipeline` module name, so the class is
# taken from the direct `Pipeline` import. `pipeline.Pipeline` would fail on a re-run.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
])

In [12]:
# Time-based split of the features; the targets are selected by the same index labels.
# `.loc` is used because `X_*.index` holds labels. `.iloc` would read them as positions,
# which is only correct while the index happens to be 0..n-1.
X_train, X_valid, X_test = timeseries_split(X, col="date_block_num", continuous=False)
y_train, y_valid, y_test = y.loc[X_train.index], y.loc[X_valid.index], y.loc[X_test.index]

In [None]:
# Fit the preprocessing pipeline on the training split.
pipeline.fit(X_train, y_train)

In [None]:
X_train

In [None]:
X_valid

In [None]:
# Apply the fitted pipeline to the training and validation splits.
X_train_transformed = pipeline.transform(X_train)
X_valid_transformed = pipeline.transform(X_valid)

In [28]:
# NOTE(review): `enc` is not defined in any visible cell of this notebook — presumably a
# fitted encoder left over from an earlier session; confirm. `.toarray()` materialises
# the encoded matrix densely before it is wrapped in a DataFrame.
X_transformed = pd.DataFrame(data=enc.transform(X).toarray(), columns=enc.get_feature_names_out())

In [10]:
# 80/20 split with a fixed seed. This rebinds X_train / X_test / y_train / y_test.
# NOTE(review): train_test_split shuffles rows by default, so this split does not respect
# the month order of the data.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Encode both splits with `enc`.
# NOTE(review): `enc` is not defined in any visible cell of this notebook.
X_train_transformed = enc.transform(X_train)
X_test_transformed = enc.transform(X_test)

In [16]:
# Wrap the encoded splits as LightGBM datasets; the second one is tied to the training
# dataset through `reference=train_data`.
train_data = lgb.Dataset(X_train_transformed, label=y_train)
test_data = lgb.Dataset(X_test_transformed, label=y_test, reference=train_data)

In [None]:
# LightGBM settings for the baseline run.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)

# Fit for 100 boosting rounds, evaluating on both the training and the held-out dataset.
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
)


In [21]:
# Predict on the training matrix (not the held-out split), using the best iteration.
y_pred = gbm.predict(X_train_transformed, num_iteration=gbm.best_iteration)

In [25]:
# Mean squared error of the predictions against `y_train`, i.e. an error on training data.
mse = metrics.mean_squared_error(y_train, y_pred)

In [None]:
y_pred

In [None]:
# Show the training targets (`y_trains` was never defined; `y_train` is the existing name).
y_train