In [1]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

pd.options.plotting.backend = "plotly"

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sktime.performance_metrics.forecasting import (
    mean_absolute_scaled_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sktime.transformations.series.detrend import Deseasonalizer, Detrender
from sklearn.model_selection import train_test_split


In [2]:
def preprocessing(y, X, horizon, rolling_shift=0):
    """Build the modelling frame for one daily series.

    Steps: hold out the last ``horizon`` observations, fit a
    ``Deseasonalizer(sp=7)`` + ``Detrender()`` pipeline on the remaining
    observations only, transform both parts with it, join the transformed
    target onto ``X``, then add calendar features and rolling means of the
    target.

    Parameters
    ----------
    y : pandas.Series
        Target series. Its ``name`` becomes the target column of the result
        (the callers in this notebook pass a series named ``'sales'``). The
        index must be convertible to a daily ``DatetimeIndex``. The series
        is not modified.
    X : pandas.DataFrame
        Exogenous variables, joined with the transformed target on the index.
    horizon : int
        Number of trailing observations excluded when fitting the transformer.
    rolling_shift : int, default 0
        Number of rows the target is shifted back before the rolling means
        are computed. With the default ``0`` every ``rolling_mean_k`` value
        includes the target of the same row, i.e. information that is not
        available when that row is forecast.
        NOTE(review): this looks like target leakage; the default is kept so
        existing results do not change silently. Consider passing a shift at
        least as large as the forecast horizon.

    Returns
    -------
    (pandas.DataFrame, transformer)
        The feature frame (rows with any missing value dropped) and the
        fitted transformer pipeline, needed later to invert the transform.

    Raises
    ------
    ValueError
        If ``rolling_shift`` is negative, or if the index of ``y`` does not
        conform to a daily frequency.
    """
    if rolling_shift < 0:
        # A negative shift would pull future target values into the features.
        raise ValueError("rolling_shift must be >= 0")

    # Work on a private copy with its own index object: assigning ``freq`` on
    # the caller's index would mutate an object shared with other frames.
    y = y.copy()
    y.index = pd.DatetimeIndex(y.index, freq='D')
    target = y.name

    # split
    y_train, y_test = train_test_split(y, test_size=horizon, shuffle=False)

    # transform (deseasonalize + detrend), fitted on the training part only
    transformer = make_pipeline(Deseasonalizer(sp=7), Detrender())
    y_train_trans = transformer.fit_transform(y_train)
    y_train_trans.name = target
    y_test_trans = transformer.transform(y_test)
    y_test_trans.name = target
    y_trans = pd.concat([y_train_trans, y_test_trans])

    # join
    df = X.join(y_trans).dropna()

    # extract exo. variables from date index
    df['dayofweek'] = df.index.dayofweek
    df['dayofmonth'] = df.index.day
    df['dayofyear'] = df.index.dayofyear
    # isocalendar() yields a UInt32 extension dtype; cast to a plain integer
    # dtype like the other calendar columns.
    df['weekofyear'] = df.index.isocalendar()['week'].astype('int64')
    df['month'] = df.index.month
    df['quarter'] = df.index.quarter
    df['year'] = df.index.year

    # rolling means of the (optionally shifted) target over 2..7 rows
    history = df[target].shift(rolling_shift)
    for window in range(2, 8):
        df[f'rolling_mean_{window}'] = history.rolling(window).mean()

    # the first rows have incomplete rolling windows
    df = df.dropna()

    return df, transformer


# CV 
def cross_validation_result(data, tuned_model, model_name, transformer, horizon, rolls):
    """Score a forecaster on the trailing ``rolls * horizon`` rows of ``data``.

    The last ``rolls * horizon`` rows are cut into ``rolls`` consecutive test
    windows of ``horizon`` rows. For each window the model is refit on every
    row before the window (expanding training set), a ``horizon``-step
    forecast is made, and actuals, forecast and training target are passed
    through ``transformer.inverse_transform`` before scoring.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'sales'`` target column; every other column is passed
        to the model as exogenous input.
    tuned_model : object
        Forecaster exposing ``fit(y=, exog=)`` and ``predict(steps=, exog=)``.
        It is refit in place on every fold (no copy is made).
    model_name : str
        Label stored under the ``'store'`` key of the result.
    transformer : object
        Object exposing ``inverse_transform`` for a target series.
    horizon : int
        Rows per test window / forecast steps.
    rolls : int
        Number of test windows.

    Returns
    -------
    dict
        ``'store'``, the fold-averaged ``'mae_XGB'``, ``'rmse_XGB'``,
        ``'mape_XGB'``, ``'mase_XGB'`` (each fold score is rounded to three
        decimals before averaging) and ``'fc_XGB'``, the inverse-transformed
        forecast of the last fold only.

    Raises
    ------
    ValueError
        If ``horizon`` or ``rolls`` is smaller than 1, or if ``data`` has too
        few rows to leave a non-empty training set for the first fold.
    """
    if horizon < 1 or rolls < 1:
        raise ValueError("horizon and rolls must both be at least 1")
    n_obs = len(data)
    if n_obs <= rolls * horizon:
        raise ValueError(
            f"need more than rolls * horizon = {rolls * horizon} rows, got {n_obs}"
        )

    # Exogenous columns are the same for every fold.
    exog_cols = data.columns.difference(['sales'])

    mae_CVs = []
    rmse_CVs = []
    mape_CVs = []
    mase_CVs = []
    for i in range(rolls):
        # split data: the test window starts (rolls - i) * horizon rows
        # before the end; everything earlier is training data
        test_start = n_obs - (rolls - i) * horizon
        fold_train = data.iloc[:test_start]
        fold_test = data.iloc[test_start:test_start + horizon]

        # fit model
        model = tuned_model
        model.fit(
            y    = fold_train['sales'],
            exog = fold_train[exog_cols]
        )

        # make forecast
        y_hat = model.predict(
                        steps = horizon,
                        exog = fold_test[exog_cols]
                    )
        # Attach the test index positionally. Passing a Series together with
        # ``index=`` would reindex by label and turn the forecast into NaN
        # whenever the predicted index differs from the test index.
        y_hat = pd.Series(
            np.asarray(y_hat).ravel(),
            index=fold_test.index,
            name=getattr(y_hat, 'name', None),
        )

        # inverse
        y_train = transformer.inverse_transform(fold_train['sales'])
        y_test = transformer.inverse_transform(fold_test['sales'])
        y_hat = transformer.inverse_transform(y_hat)

        # score
        mae_CVs.append(round(mean_absolute_error(y_test, y_hat), 3))
        rmse_CVs.append(round(mean_squared_error(y_test, y_hat, square_root=True), 3))
        mape_CVs.append(round(mean_absolute_percentage_error(y_test, y_hat), 3))
        mase_CVs.append(round(mean_absolute_scaled_error(y_test, y_hat, y_train=y_train), 3))

    return {'store':model_name,
        'mae_XGB':np.mean(mae_CVs),
        'rmse_XGB':np.mean(rmse_CVs),
        'mape_XGB':np.mean(mape_CVs),
        'mase_XGB':np.mean(mase_CVs),
        'fc_XGB':y_hat,
        }


In [3]:
# Per-store daily data; sales are rescaled by a factor of 1e6.
df_store = pd.read_pickle("data/df_daily.pkl")
df_store['sales'] = df_store['sales']/1e6
# Exogenous variables joined to each series during preprocessing.
df_exog = pd.read_pickle("data/df_exog.pkl")
# Company-level series: total sales per date. Select the column before
# aggregating so the remaining columns of df_store are not summed as well.
ts_company = df_store.groupby("date")["sales"].sum()


In [4]:
def grouping_pipe(ts, col_name, horizon=7, rolls=4, exog=None):
    """Tune one XGBoost forecaster per group and score it on each member store.

    For every distinct, non-missing value of ``ts[col_name]``:

    1. the group's sales are summed per date and preprocessed;
    2. an XGBoost forecaster is grid-searched on that aggregated series
       (``return_best=True`` leaves the forecaster configured with the best
       parameters);
    3. every store of the group is preprocessed on its own and scored with
       ``cross_validation_result``, refitting the tuned forecaster on the
       store's data.

    Parameters
    ----------
    ts : pandas.DataFrame
        Long-format data; the columns read here are ``col_name``, ``'date'``,
        ``'sales'`` and ``'store_id'``.
    col_name : str
        Column whose values define the groups.
    horizon : int, default 7
        Forecast steps; also the size of each cross-validation window.
    rolls : int, default 4
        Number of cross-validation windows per store.
    exog : pandas.DataFrame, optional
        Exogenous variables passed to ``preprocessing``. Defaults to the
        notebook-level ``df_exog``.

    Returns
    -------
    pandas.DataFrame
        One row per (group, store) with columns ``col_name``, ``'store'`` and
        ``'mape'`` (the fold-averaged MAPE of the store).
    """
    result_columns = [col_name, 'store', 'mape']
    if exog is None:
        exog = df_exog

    group_frames = []
    # Missing group labels never match an equality filter, so skip them.
    for group in ts[col_name].dropna().unique():
        ts_in_group = ts[ts[col_name] == group]

        # get aggregated data for each group; select 'sales' before summing
        # so the other columns are not aggregated as well
        ts_group = ts_in_group.groupby('date')['sales'].sum()
        # Prepare data (the group-level transformer is not needed afterwards)
        df, _ = preprocessing(ts_group, exog, horizon)

        # Grid search hyperparameter and lags
        # ==============================================================================
        # NOTE(review): Normalizer rescales each row to unit norm rather than
        # scaling each feature; confirm this is intended.
        pipe = make_pipeline(Normalizer(), XGBRegressor(random_state=123))

        forecaster = ForecasterAutoreg(
            regressor=pipe, lags=10  # This value will be replaced in the grid search
        )

        param_grid = {
            "xgbregressor__max_depth": [3, 10, 20],
            "xgbregressor__learning_rate": [0.01, 0.1, 0.3],
            "xgbregressor__subsample": [0.5, 0.7, 1.0],
            "xgbregressor__colsample_bytree": [0.5, 0.7, 1.0],
            "xgbregressor__colsample_bylevel": [0.5, 0.7, 1.0],
            "xgbregressor__n_estimators": [100, 500, 1000],
        }

        # Lags used as predictors: horizon .. 2*horizon - 1
        lags_grid = [list(range(horizon, horizon*2))]

        # Grid search; called for its side effect on `forecaster`
        grid_search_forecaster(
            y=df["sales"],
            initial_train_size=len(df) - horizon,
            exog=df[df.columns.difference(["sales"])],
            forecaster=forecaster,
            param_grid=param_grid,
            lags_grid=lags_grid,
            steps=horizon,
            refit=True,
            metric="mean_absolute_percentage_error",
            return_best=True,
            verbose=False,
        )

        stores_result_CV = []
        store_list = ts_in_group['store_id'].unique()
        for store in store_list:
            # fit on store data
            print(f"processing stores {store}...")
            model_name = "store_" + str(store)

            # data: hold out all CV windows when fitting the transformer
            ts_1_store = ts[ts["store_id"] == store].set_index("date")["sales"]
            df_1_store_pro, store_transformer = preprocessing(
                ts_1_store, exog, horizon=horizon * rolls
            )

            # CV
            cv_score = cross_validation_result(
                df_1_store_pro,
                forecaster,
                model_name,
                store_transformer,
                horizon,
                rolls
            )

            # result
            stores_result_CV.append(cv_score['mape_XGB'])

        group_frames.append(
            pd.DataFrame({col_name:group,'store':store_list,'mape':stores_result_CV})
        )

    if not group_frames:
        return pd.DataFrame(columns=result_columns)
    # Concatenate once instead of growing the result inside the loop.
    return pd.concat(group_frames)
            


In [5]:
# Attach the grouping columns to every daily record via store_id.
# NOTE(review): df_store is reloaded from disk here, so the /1e6 rescaling
# applied to 'sales' when it was first loaded is not carried over - confirm
# that the unscaled values are intended for the grouping run.
df_cluster = pd.read_pickle("results/grouping/store_cluster.pkl")
df_store = pd.read_pickle("data/df_daily.pkl").merge(df_cluster, on="store_id")

In [6]:
# Grouping columns to evaluate. A previous variant of this list used
# 'store_level', 'store_segment' and 'cluster' instead of 'province'.
columns_list = ['province']
for col in columns_list:
    # One result file per grouping column.
    file_dir = f"results/grouping/result_XGB_{col}.pkl"
    final_result = grouping_pipe(df_store, col)
    final_result.to_pickle(file_dir)

Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [18:12<00:00, 1092.22s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 0.5, 'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 500, 'xgbregressor__subsample': 0.5}
  Backtesting metric: 0.20661924638686222

processing stores 307222...
processing stores 307244...
processing stores 307248...
processing stores 320264...
processing stores 328165...
processing stores 349920...
processing stores 349924...
processing stores 349952...
processing stores 349958...
processing stores 349962...
processing stores 349972...
processing stores 349978...
processing stores 349980...
processing stores 452387...
processing stores 461349...
processing stores 480733...
processing stores 528854...
processing stores 566790...
Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [18:21<00:00, 1101.03s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 1.0, 'xgbregressor__colsample_bytree': 0.7, 'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 500, 'xgbregressor__subsample': 0.7}
  Backtesting metric: 0.6040432881621337

processing stores 349998...
Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [18:25<00:00, 1105.46s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 0.7, 'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 3, 'xgbregressor__n_estimators': 500, 'xgbregressor__subsample': 0.7}
  Backtesting metric: 0.14813787321869767

processing stores 350016...
processing stores 350018...
processing stores 441997...
Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [18:56<00:00, 1136.12s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 0.7, 'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.01, 'xgbregressor__max_depth': 3, 'xgbregressor__n_estimators': 100, 'xgbregressor__subsample': 0.5}
  Backtesting metric: 0.48686173155734214

processing stores 350026...
Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [18:39<00:00, 1119.30s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 1.0, 'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 500, 'xgbregressor__subsample': 0.7}
  Backtesting metric: 0.6838382472294987

processing stores 350028...
Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [19:10<00:00, 1150.62s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 1.0, 'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 500, 'xgbregressor__subsample': 0.5}
  Backtesting metric: 0.5459120574178099

processing stores 350040...
processing stores 350046...
Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [19:54<00:00, 1194.53s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 0.5, 'xgbregressor__colsample_bytree': 0.5, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 3, 'xgbregressor__n_estimators': 1000, 'xgbregressor__subsample': 0.5}
  Backtesting metric: 1.356938363590233

processing stores 350054...
Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [22:09<00:00, 1329.08s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 0.7, 'xgbregressor__colsample_bytree': 0.5, 'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 100, 'xgbregressor__subsample': 0.7}
  Backtesting metric: 0.6443367286013325

processing stores 350056...
Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [21:48<00:00, 1308.22s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 0.7, 'xgbregressor__colsample_bytree': 0.5, 'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 3, 'xgbregressor__n_estimators': 500, 'xgbregressor__subsample': 0.5}
  Backtesting metric: 1.1111519985466216

processing stores 350060...
Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [21:30<00:00, 1290.93s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 0.5, 'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 3, 'xgbregressor__n_estimators': 1000, 'xgbregressor__subsample': 1.0}
  Backtesting metric: 0.5545333794420225

processing stores 354468...
Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [19:46<00:00, 1186.06s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 0.7, 'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.01, 'xgbregressor__max_depth': 3, 'xgbregressor__n_estimators': 100, 'xgbregressor__subsample': 0.7}
  Backtesting metric: 1.298677032206252

processing stores 387240...
Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [19:44<00:00, 1184.44s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 0.5, 'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 20, 'xgbregressor__n_estimators': 1000, 'xgbregressor__subsample': 0.7}
  Backtesting metric: 0.35727997386050053

processing stores 412585...
Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [18:06<00:00, 1086.13s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 0.7, 'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 20, 'xgbregressor__n_estimators': 100, 'xgbregressor__subsample': 0.7}
  Backtesting metric: 0.5196548920455004

processing stores 464495...
Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [18:18<00:00, 1098.52s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 1.0, 'xgbregressor__colsample_bytree': 0.5, 'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 3, 'xgbregressor__n_estimators': 1000, 'xgbregressor__subsample': 0.5}
  Backtesting metric: 0.3473484589896351

processing stores 471477...
Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [16:44<00:00, 1004.30s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 0.7, 'xgbregressor__colsample_bytree': 0.7, 'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 100, 'xgbregressor__subsample': 0.7}
  Backtesting metric: 0.4164337239481904

processing stores 476061...
processing stores 566792...
Number of models compared: 729


loop lags_grid: 100%|██████████████████████████████████████| 1/1 [14:05<00:00, 845.09s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 0.7, 'xgbregressor__colsample_bytree': 0.7, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 20, 'xgbregressor__n_estimators': 500, 'xgbregressor__subsample': 1.0}
  Backtesting metric: 0.22139816181221647

processing stores 536898...
Number of models compared: 729


loop lags_grid: 100%|██████████████████████████████████████| 1/1 [13:45<00:00, 825.87s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'xgbregressor__colsample_bylevel': 1.0, 'xgbregressor__colsample_bytree': 0.7, 'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 20, 'xgbregressor__n_estimators': 500, 'xgbregressor__subsample': 0.7}
  Backtesting metric: 0.25979358990833373

processing stores 536902...
