In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.chdir('..')

In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import lightgbm as lgb
from sklearn import preprocessing, pipeline, compose, model_selection, metrics
from typing import Callable

from sales_forecasting.utils import timeseries_split

In [4]:
df = pd.read_parquet(".data/df_full.parquet")
df_items = pd.read_csv(".data/items.csv")

In [5]:
def mean_n_last(x: np.ndarray, n: int = 3) -> float:
    return x[-min(n, x.size):].mean()

def aggregate_monthly(df: pd.DataFrame, agg_fn: Callable, agg_scope_name: str) -> pd.DataFrame:
    new_cols = [f'prev_item_price_agg__{agg_scope_name}', f'is_prev_item_price__{agg_scope_name}']
    #display(df)
    months_sales = df['date_block_num'].unique()
    #print(months_sales)
    mapping = {months_sales[0]: np.nan}
    for i in range(1, len(months_sales)):
        i_date_block_num = months_sales[i]
        #display(df[df['date_block_num'] < i_date_block_num])
        mapping[i_date_block_num] = agg_fn(df[df['date_block_num'] < i_date_block_num]['item_price'])

    df[new_cols[0]] = df['date_block_num'].map(mapping)
    df[new_cols[1]] = df['date_block_num'].map({k: int(np.isnan(v)) for k,v in mapping.items()})

    display()

    return df[["date_block_num", *new_cols]].reset_index(drop=True)

def build_monthly_item_price_features(df: pd.DataFrame, agg_cols: list[str], agg_fun: Callable) -> pd.DataFrame:
    agg_scope_name = {"shop_id": "local", "item_id": "global"}[agg_cols[0]]
    new_features = df.groupby(agg_cols) \
        .apply(aggregate_monthly, agg_fn=agg_fun, agg_scope_name=agg_scope_name) \
        .reset_index(level=-1, drop=True).reset_index() \
        .drop_duplicates()
    return pd.merge(df, new_features, on=[*agg_cols, "date_block_num"], how='left')

In [6]:
df_shop_5 = df[df.shop_id < 5]
df_shop_5 = build_monthly_item_price_features(df_shop_5, agg_cols=["shop_id", "item_id"], agg_fun=np.mean)
df_shop_5 = build_monthly_item_price_features(df_shop_5, agg_cols=["item_id"], agg_fun=np.mean)

  new_features = df.groupby(agg_cols) \
  new_features = df.groupby(agg_cols) \


In [7]:
def build_month_features(df: pd.DataFrame) -> pd.DataFrame:
    df_features = df.copy()

    df_features['month_sin'] = np.sin(np.pi / 12 * (df['date'].dt.month - 1))
    df_features['month_cos'] = np.cos(np.pi / 12 * (df['date'].dt.month - 1))
    
    return df_features

In [8]:
df = build_month_features(df)

In [5]:
df['item_cnt_day'] = df['item_cnt_day'].clip(0, 20)
df = df[['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'item_cnt_day']]

In [6]:
X = df.drop(columns=["item_cnt_day"])
y = df[['item_cnt_day']]

In [None]:
num_columns = []
cat_columns = ['shop_id', 'item_id', 'item_category_id']

preprocessor = compose.ColumnTransformer(
    transformers=[
        ('num', preprocessing.StandardScaler(), num_columns),
        ('cat', preprocessing.OneHotEncoder(), cat_columns)
    ])

pipeline = pipeline.Pipeline(steps=[
    ('preprocessor', preprocessor),
])

In [12]:
X_train, X_valid, X_test = timeseries_split(X, col="date_block_num", continuous=False)
y_train, y_valid, y_test = y.iloc[X_train.index], y.iloc[X_valid.index], y.iloc[X_test.index]

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
X_train

In [None]:
X_valid

In [None]:
X_train_transformed = pipeline.transform(X_train)
X_valid_transformed = pipeline.transform(X_valid)

In [28]:
X_transformed = pd.DataFrame(data=enc.transform(X).toarray(), columns=enc.get_feature_names_out())

In [10]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
X_train_transformed = enc.transform(X_train)
X_test_transformed = enc.transform(X_test)

In [16]:
train_data = lgb.Dataset(X_train_transformed, label=y_train)
test_data = lgb.Dataset(X_test_transformed, label=y_test, reference=train_data)

In [None]:
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train the model
gbm = lgb.train(params, train_data, num_boost_round=100, valid_sets=[train_data, test_data])


In [21]:
y_pred = gbm.predict(X_train_transformed, num_iteration=gbm.best_iteration)

In [25]:
mse = metrics.mean_squared_error(y_train, y_pred)

In [None]:
y_pred

In [None]:
y_trains