In [57]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

pd.options.plotting.backend = "plotly"

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sktime.performance_metrics.forecasting import (
    mean_absolute_scaled_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sktime.transformations.series.detrend import Deseasonalizer, Detrender
from sklearn.model_selection import train_test_split


# define functions

In [58]:
def preprocessing(y, X, horizon):
    y.index.freq='D'

    # split
    y_train, y_test = train_test_split(y, test_size=horizon, shuffle=False)

    # transform (deseasonalize)
    transformer = make_pipeline(Deseasonalizer(sp=7), Detrender())
    y_train_trans = transformer.fit_transform(y_train)
    y_train_trans.name = y.name
    y_test_trans = transformer.transform(y_test)
    y_test_trans.name = y.name
    y_trans = pd.concat([y_train_trans, y_test_trans])

    # join
    df = X.join(y_trans).dropna()
    
    # extract exo. variables from date index
    df['dayofweek'] = df.index.dayofweek
    df['dayofmonth'] = df.index.day
    df['dayofyear'] = df.index.dayofyear
    df['weekofyear'] = df.index.isocalendar()['week']
    df['month'] = df.index.month
    df['quarter'] = df.index.quarter
    df['year'] = df.index.year

    # rolling mean
    df['rolling_mean_2'] = df['sales'].rolling(2).mean()
    df['rolling_mean_3'] = df['sales'].rolling(3).mean()
    df['rolling_mean_4'] = df['sales'].rolling(4).mean()
    df['rolling_mean_5'] = df['sales'].rolling(5).mean()
    df['rolling_mean_6'] = df['sales'].rolling(6).mean()
    df['rolling_mean_7'] = df['sales'].rolling(7).mean()

    df.dropna(inplace=True)
    
    return df, transformer


# CV 
def cross_validation_result(data, tuned_model, model_name, transformer, horizon, rolls):
    # score model with CV on store data
    mae_CVs = []
    rmse_CVs = []
    mape_CVs = []
    mase_CVs = []
    for i in range(rolls):
        # print(f"fold {i}---------------")
        
        # split data
        y_train = data.iloc[: -(rolls - i) * horizon]
        y_test = data.iloc[
            np.r_[-(rolls - i) * horizon : -(rolls - i - 1) * horizon]]

        # fit model
        model = tuned_model
        model.fit(
            y    = y_train['sales'],
            exog = y_train[y_train.columns.difference(['sales'])]
        )
        
        # make forecast
        y_hat = model.predict(
                        steps = horizon,
                        exog = y_test[y_test.columns.difference(['sales'])]
                    )
        y_hat = pd.Series(data=y_hat, index=y_test.index)
        
        # inverse
        y_train = transformer.inverse_transform(y_train['sales'])
        y_test = transformer.inverse_transform(y_test['sales'])
        y_hat = transformer.inverse_transform(y_hat)
                
        # score
        mae_CVs.append(round(mean_absolute_error(y_test, y_hat), 3))
        rmse_CVs.append(round(mean_squared_error(y_test, y_hat, square_root=True), 3))
        mape_CVs.append(round(mean_absolute_percentage_error(y_test, y_hat), 3))
        mase_CVs.append(round(mean_absolute_scaled_error(y_test, y_hat, y_train=y_train), 3))
        
    return {'store':model_name,
        'mae_XGB':np.mean(mae_CVs),
        'rmse_XGB':np.mean(rmse_CVs),
        'mape_XGB':np.mean(mape_CVs),
        'mase_XGB':np.mean(mase_CVs),
        'fc_XGB':y_hat,
        }


# read data

In [28]:
df_store = pd.read_pickle("data/df_daily.pkl")
df_store['sales'] = df_store['sales']/1e6
df_exog = pd.read_pickle("data/df_exog.pkl")
ts_company = df_store.groupby("date").sum()["sales"]


# horizon = 7


## tune on company data
grid search:
- fit once per param set on train set
- predict & score on test set (no CV)

In [47]:
horizon = 7

# Grid search hyperparameter and lags
# ==============================================================================
pipe = make_pipeline(Normalizer(), XGBRegressor(random_state=123))

forecaster = ForecasterAutoreg(
    regressor=pipe, lags=10  # This value will be replaced in the grid search
)

# Regressor hyperparameters
# param_grid = {
#     "xgbregressor__n_estimators": [50],
#     "xgbregressor__max_depth": [5],
# } 
param_grid = {
    "xgbregressor__max_depth": [3, 10, 20],
    "xgbregressor__learning_rate": [0.01, 0.1, 0.3],
    "xgbregressor__subsample": [0.5, 0.7, 1.0],
    "xgbregressor__colsample_bytree": [0.5, 0.7, 1.0],
    "xgbregressor__colsample_bylevel": [0.5, 0.7, 1.0],
    "xgbregressor__n_estimators": [100, 500, 1000],
}

# Lags used as predictors
lags_grid = [7]

# Prepare data
df, transformer = preprocessing(ts_company, df_exog, horizon)

# Grid search
results_grid = grid_search_forecaster(
    y=df["sales"],
    initial_train_size=len(df) - horizon,
    exog=df[df.columns.difference(["sales"])],
    forecaster=forecaster,
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=horizon,
    refit=True,
    metric="mean_absolute_percentage_error",
    return_best=True,
    verbose=False,
)


Number of models compared: 729


loop lags_grid:   0%|                                               | 0/1 [00:09<?, ?it/s]


KeyboardInterrupt: 

In [43]:
# from joblib import dump, load
# # Save model
# dump(forecaster, filename='results/f8/XGB_forecaster_7.py')

# # Load model
# forecaster = load('results/f8/XGB_forecaster_7.py')


## fit on store data

In [None]:
rolls = 4
all_stores_result_CV = []

for store in df_store["store_id"].unique():
    # for store in df_store["store_id"].unique()[:4]:
    print(f"\nprocessing stores {store}...")
    model_name = "store_" + str(store)

    # data
    ts_1_store = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    df_1_store_pro, transformer = preprocessing(
        ts_1_store, 
        df_exog, 
        horizon=horizon * rolls
    )

    # CV
    cv_score = cross_validation_result(
        df_1_store_pro, 
        forecaster, 
        model_name, 
        transformer,
        horizon,
        rolls
    )

    # result
    all_stores_result_CV.append(cv_score)
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)



processing stores 307222...

processing stores 307244...

processing stores 307248...

processing stores 320264...

processing stores 328165...

processing stores 349920...

processing stores 349924...

processing stores 349952...

processing stores 349958...

processing stores 349962...

processing stores 349972...

processing stores 349978...

processing stores 349980...

processing stores 349998...

processing stores 350016...

processing stores 350018...

processing stores 350026...

processing stores 350028...

processing stores 350040...

processing stores 350046...

processing stores 350054...

processing stores 350056...

processing stores 350060...

processing stores 354468...

processing stores 387240...

processing stores 412585...

processing stores 441997...

processing stores 452387...

processing stores 461349...

processing stores 464495...

processing stores 471477...

processing stores 476061...

processing stores 480733...

processing stores 528854...

processing st

In [44]:
# all_stores_result_CV.to_pickle('results/f8/XGB_result_7.pkl')


# horizon = 14


## tune on company data
grid search:
- fit once per param set on train set
- predict & score on test set (no CV)

In [48]:
horizon = 14

# Grid search hyperparameter and lags
# ==============================================================================
pipe = make_pipeline(Normalizer(), XGBRegressor(random_state=123))

forecaster = ForecasterAutoreg(
    regressor=pipe, lags=10  # This value will be replaced in the grid search
)

param_grid = {
    "xgbregressor__max_depth": [3, 10, 20],
    "xgbregressor__learning_rate": [0.01, 0.1, 0.3],
    "xgbregressor__subsample": [0.5, 0.7, 1.0],
    "xgbregressor__colsample_bytree": [0.5, 0.7, 1.0],
    "xgbregressor__colsample_bylevel": [0.5, 0.7, 1.0],
    "xgbregressor__n_estimators": [100, 500, 1000],
}

# Lags used as predictors
lags_grid = [7]

# Prepare data
df, transformer = preprocessing(ts_company, df_exog, horizon)

# Grid search
results_grid = grid_search_forecaster(
    y=df["sales"],
    initial_train_size=len(df) - horizon,
    exog=df[df.columns.difference(["sales"])],
    forecaster=forecaster,
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=horizon,
    refit=True,
    metric="mean_absolute_percentage_error",
    return_best=True,
    verbose=False,
)


Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [39:39<00:00, 2379.78s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1 2 3 4 5 6 7] 
  Parameters: {'xgbregressor__colsample_bylevel': 1.0, 'xgbregressor__colsample_bytree': 1.0, 'xgbregressor__learning_rate': 0.01, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 1000, 'xgbregressor__subsample': 0.5}
  Backtesting metric: 0.07025408056191794



In [56]:
# from joblib import dump, load
# # Save model
# dump(forecaster, filename='results/f8/XGB_forecaster_14.py')

# # Load model
# forecaster = load('results/f8/XGB_forecaster_14.py')


## fit on store data

In [53]:
rolls = 4
all_stores_result_CV = []

for store in df_store["store_id"].unique():
    # for store in df_store["store_id"].unique()[:4]:
    # print(f"\nprocessing stores {store}...")
    model_name = "store_" + str(store)

    # data
    ts_1_store = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    df_1_store_pro, transformer = preprocessing(
        ts_1_store, 
        df_exog, 
        horizon=horizon * rolls
    )

    # CV
    cv_score = cross_validation_result(
        df_1_store_pro, 
        forecaster, 
        model_name, 
        transformer,
        horizon,
        rolls
    )

    # result
    all_stores_result_CV.append(cv_score)
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)



processing stores 307222...

processing stores 307244...

processing stores 307248...

processing stores 320264...

processing stores 328165...

processing stores 349920...

processing stores 349924...

processing stores 349952...

processing stores 349958...

processing stores 349962...

processing stores 349972...

processing stores 349978...

processing stores 349980...

processing stores 349998...

processing stores 350016...

processing stores 350018...

processing stores 350026...

processing stores 350028...

processing stores 350040...

processing stores 350046...

processing stores 350054...

processing stores 350056...

processing stores 350060...

processing stores 354468...

processing stores 387240...

processing stores 412585...

processing stores 441997...

processing stores 452387...

processing stores 461349...

processing stores 464495...

processing stores 471477...

processing stores 476061...

processing stores 480733...

processing stores 528854...

processing st

In [55]:
# all_stores_result_CV.to_pickle('results/f8/XGB_result_14.pkl')


# horizon = 21

## tune on company data
grid search:
- fit once per param set on train set
- predict & score on test set (no CV)

In [60]:
horizon = 21

# Grid search hyperparameter and lags
# ==============================================================================
pipe = make_pipeline(Normalizer(), XGBRegressor(random_state=123))

forecaster = ForecasterAutoreg(
    regressor=pipe, lags=10  # This value will be replaced in the grid search
)

param_grid = {
    "xgbregressor__max_depth": [3, 10, 20],
    "xgbregressor__learning_rate": [0.01, 0.1, 0.3],
    "xgbregressor__subsample": [0.5, 0.7, 1.0],
    "xgbregressor__colsample_bytree": [0.5, 0.7, 1.0],
    "xgbregressor__colsample_bylevel": [0.5, 0.7, 1.0],
    "xgbregressor__n_estimators": [100, 500, 1000],
}

# Lags used as predictors
lags_grid = [7]

# Prepare data
df, transformer = preprocessing(ts_company, df_exog, horizon)

# Grid search
results_grid = grid_search_forecaster(
    y=df["sales"],
    initial_train_size=len(df) - horizon,
    exog=df[df.columns.difference(["sales"])],
    forecaster=forecaster,
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=horizon,
    refit=True,
    metric="mean_absolute_percentage_error",
    return_best=True,
    verbose=False,
)


Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [51:44<00:00, 3104.05s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1 2 3 4 5 6 7] 
  Parameters: {'xgbregressor__colsample_bylevel': 1.0, 'xgbregressor__colsample_bytree': 0.7, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 3, 'xgbregressor__n_estimators': 1000, 'xgbregressor__subsample': 0.5}
  Backtesting metric: 0.12988486571050464



In [61]:
from joblib import dump, load
# Save model
dump(forecaster, filename='results/f8/XGB_forecaster_21.py')

# # Load model
# forecaster = load('results/f8/XGB_forecaster_21.py')


['results/f8/XGB_forecaster_21.py']

## fit on store data

In [62]:
rolls = 4
all_stores_result_CV = []

for store in df_store["store_id"].unique():
    # for store in df_store["store_id"].unique()[:4]:
    # print(f"\nprocessing stores {store}...")
    model_name = "store_" + str(store)

    # data
    ts_1_store = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    df_1_store_pro, transformer = preprocessing(
        ts_1_store, 
        df_exog, 
        horizon=horizon * rolls
    )

    # CV
    cv_score = cross_validation_result(
        df_1_store_pro, 
        forecaster, 
        model_name, 
        transformer,
        horizon,
        rolls
    )

    # result
    all_stores_result_CV.append(cv_score)
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)


In [63]:
all_stores_result_CV.to_pickle('results/f8/XGB_result_21.pkl')


# horizon = 28

## tune on company data
grid search:
- fit once per param set on train set
- predict & score on test set (no CV)

In [64]:
horizon = 28

# Grid search hyperparameter and lags
# ==============================================================================
pipe = make_pipeline(Normalizer(), XGBRegressor(random_state=123))

forecaster = ForecasterAutoreg(
    regressor=pipe, lags=10  # This value will be replaced in the grid search
)

param_grid = {
    "xgbregressor__max_depth": [3, 10, 20],
    "xgbregressor__learning_rate": [0.01, 0.1, 0.3],
    "xgbregressor__subsample": [0.5, 0.7, 1.0],
    "xgbregressor__colsample_bytree": [0.5, 0.7, 1.0],
    "xgbregressor__colsample_bylevel": [0.5, 0.7, 1.0],
    "xgbregressor__n_estimators": [100, 500, 1000],
}

# Lags used as predictors
lags_grid = [7]

# Prepare data
df, transformer = preprocessing(ts_company, df_exog, horizon)

# Grid search
results_grid = grid_search_forecaster(
    y=df["sales"],
    initial_train_size=len(df) - horizon,
    exog=df[df.columns.difference(["sales"])],
    forecaster=forecaster,
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=horizon,
    refit=True,
    metric="mean_absolute_percentage_error",
    return_best=True,
    verbose=False,
)


Number of models compared: 729


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [36:53<00:00, 2213.43s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1 2 3 4 5 6 7] 
  Parameters: {'xgbregressor__colsample_bylevel': 0.7, 'xgbregressor__colsample_bytree': 0.5, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 3, 'xgbregressor__n_estimators': 1000, 'xgbregressor__subsample': 1.0}
  Backtesting metric: 0.3241006999570049



In [65]:
from joblib import dump, load
# Save model
dump(forecaster, filename='results/f8/XGB_forecaster_28.py')

# # Load model
# forecaster = load('results/f8/XGB_forecaster_28.py')


['results/f8/XGB_forecaster_28.py']

## fit on store data

In [66]:
rolls = 4
all_stores_result_CV = []

for store in df_store["store_id"].unique():
    # for store in df_store["store_id"].unique()[:4]:
    # print(f"\nprocessing stores {store}...")
    model_name = "store_" + str(store)

    # data
    ts_1_store = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    df_1_store_pro, transformer = preprocessing(
        ts_1_store, 
        df_exog, 
        horizon=horizon * rolls
    )

    # CV
    cv_score = cross_validation_result(
        df_1_store_pro, 
        forecaster, 
        model_name, 
        transformer,
        horizon,
        rolls
    )

    # result
    all_stores_result_CV.append(cv_score)
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)


In [None]:
all_stores_result_CV.to_pickle('results/f8/XGB_result_28.pkl')
