In [226]:
# imports
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
pd.options.plotting.backend = "plotly"
import plotly.io as pio
pio.orca.config.timeout = 3600
pio.orca.config.default_scale = 8
pio.orca.config.default_width = 800
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.model_evaluation import evaluate
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import (
    Detrender,
    Deseasonalizer
    )
from sktime.forecasting.compose import (
    ForecastingPipeline, 
    TransformedTargetForecaster)
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.arima import ARIMA
from sktime.transformations.series.date import DateTimeFeatures
from sktime.transformations.series.adapt import TabularToSeriesAdaptor
from sktime.forecasting.compose import make_reduction
from sktime.forecasting.base import ForecastingHorizon
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import (
    Normalizer, 
    MinMaxScaler
    )
from sktime.performance_metrics.forecasting import (
    MeanAbsoluteScaledError,
    MeanAbsoluteError,
    MeanAbsolutePercentageError,
    MeanSquaredError,
    )
# Metric instances created once here so later cells can call them as
# metric(y_true, y_pred).
mase = MeanAbsoluteScaledError()
mape = MeanAbsolutePercentageError()
mae = MeanAbsoluteError()
# square_root=True: report the root of the mean squared error (RMSE).
rmse = MeanSquaredError(square_root=True)

from sktime.transformations.series.summarize import WindowSummarizer
from sktime.forecasting.model_selection import (
    SlidingWindowSplitter,
    ForecastingRandomizedSearchCV,
    ForecastingGridSearchCV,
    )



# data prep

In [11]:
# Load the per-store daily sales table and the exogenous regressors.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load files from a trusted source.
df_store = pd.read_pickle("data/df_daily.pkl")
# Rescale sales by 1e6 right after loading (same cell), so re-running the
# cell always starts again from the raw values.
df_store['sales'] = df_store['sales']/1e6
df_exog = pd.read_pickle("data/df_exog.pkl")
# Company-wide series: total sales per date. The column is selected *before*
# aggregating so that only `sales` is summed (summing every column first is
# wasteful and can fail on non-numeric columns).
ts_company = df_store.groupby("date")["sales"].sum()
# Default forecasting horizon, in steps of the series.
horizon = 7
# Min-max scaler wrapped so it can be applied to series data.
scaler = TabularToSeriesAdaptor(MinMaxScaler())



In [45]:
# prepare data
def data_prep(y, X, horizon):
    '''
    Build the scaled exogenous feature matrix for y and trim y to match it.

    Window features (lags and expanding means) are derived from y, date-time
    features are added, the result is merged with X on the index and scaled
    with the module-level `scaler`.

    Parameters
    ----------
    y: target time series (assumed to be indexed by daily dates - TODO confirm)
    X: exogenous variables, merged with the features of y on the index
    horizon: number of steps ahead to forecast; controls how many lag / mean
        windows are generated

    Returns
    -------
    y_short: raw values of y restricted to the index labels of the merged
        feature frame; index frequency set to daily
    X_trans: scaled version of X merged with the features extracted from y

    Notes
    -----
    * Relies on the module-level `scaler`, which is re-fitted on every call.
    * NOTE(review): the scaler is fitted on all rows passed in; if some of
      them are later used as test folds this leaks information - confirm
      that this is acceptable.
    '''
    # extract lags, means
    # NOTE(review): each lag_config entry is assumed to follow sktime's
    # [summarizer, [[lag, window_length], ...]] layout - confirm against the
    # installed sktime version.
    kwargs = {
        "lag_config": {
            "lag": ["lag", [[1,i+6] for i in range(horizon)]],
            "expand_mean": ["mean", [[i,horizon-1] for i in range(2, horizon+1)]],
            }}

    # rows without a complete window are dropped
    df_window = WindowSummarizer(**kwargs).fit_transform(y).dropna()

    # extract DateTimeFeatures
    df_from_y = DateTimeFeatures(ts_freq="D", feature_scope="comprehensive").fit_transform(df_window)
    # default (inner) join: only index labels present on both sides survive
    df_X = X.merge(df_from_y, left_index=True, right_index=True)

    # transform X
    X_trans = scaler.fit_transform(df_X)

    # equalize len y & X: align on the index labels of the merged frame rather
    # than taking the last n rows of y, which would silently pair the wrong
    # dates whenever X does not cover exactly the tail of y
    y_short = y.loc[df_X.index]
    y_short.index.freq = "D"

    return y_short, X_trans

# Company-level target and feature matrix for the default horizon.
y_short, X_trans = data_prep(ts_company, df_exog, horizon)


  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.Index)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.Index)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex,

# tuning

In [None]:
### tunables
# scaler
# WindowSummarizer
# forecaster: estimator, window_length, strategy
# deseasonalize
# detrend

## set up

In [28]:
# define forecasting model
# eval_metric is given as XGBoost's built-in metric name: the sktime
# MeanAbsoluteError object used before is not a metric name or a
# sklearn-style callable, which is what XGBoost documents for this argument.
forecaster = make_reduction(
    estimator=XGBRegressor(eval_metric="mae"),
    window_length=7,
    strategy="recursive",
    )

# pipeline: remove weekly seasonality and a linear trend, scale, then forecast
pipe = TransformedTargetForecaster([
    ("deseasonalize", Deseasonalizer(model="additive", sp=7)),
    ("detrend", Detrender(forecaster=PolynomialTrendForecaster(degree=1))),
    ("scale", scaler),
    ("forecaster", forecaster),
    ])

# config CV
# The training window is sized so that exactly `cv_folds` test windows of
# `horizon` steps fit at the end of y_short.
cv_folds = 4
cv = SlidingWindowSplitter(
    fh=list(range(1, horizon+1)),
    window_length=(len(y_short) - horizon * cv_folds),
    step_length=horizon,
    )


## random search

In [39]:
# TUNE with RANDOM-SEARCH
# Fractions 0.1 .. 1.0: np.linspace gives an exact number of points, whereas
# np.arange with a float step is not guaranteed to.
fractions = np.round(np.linspace(0.1, 1.0, 10), 1).tolist()
param_grid = {
    "forecaster__estimator__subsample": fractions,
    "forecaster__estimator__n_estimators": list(range(1,1000)),
    "forecaster__estimator__max_depth": list(range(1,50)),
    "forecaster__estimator__learning_rate": [0.0001, 0.001, 0.01, 0.1, 1.0],
    "forecaster__estimator__colsample_bytree": fractions,
}

# NOTE(review): n_iter=1 samples a single candidate, so this is only a smoke
# test of the search; raise n_iter for an actual search.
rscv = ForecastingRandomizedSearchCV(
    pipe,
    strategy="update",
    cv=cv,
    param_distributions=param_grid,
    n_iter=1,
    n_jobs=-1,
    random_state=42,  # fixed seed so the sampled candidates are reproducible
)

rscv.fit(y_short, X=X_trans)


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


ForecastingRandomizedSearchCV(cv=SlidingWindowSplitter(fh=[1, 2, 3, 4, 5, 6, 7], step_length=7,
           window_length=1230),
                              forecaster=TransformedTargetForecaster(steps=[('deseasonalize',
                                                                             Deseasonalizer(sp=7)),
                                                                            ('detrend',
                                                                             Detrender(forecaster=PolynomialTrendForecaster())),
                                                                            ('scale',
                                                                             TabularToSeriesAdaptor(transformer=MinMaxScaler())),
                                                                            ('forecaster',
                                                                             RecursiveTabularRegr...
                                                   'f

## grid search

In [23]:
# TUNE with GRID-SEARCH
# Narrow grid: 3 * 3 * 1 * 3 * 3 = 81 candidates, each scored on every CV fold.
param_grid = {
    "forecaster__estimator__subsample":         [0.25, 0.3, 0.35],
    "forecaster__estimator__n_estimators":      [690, 692, 695],
    "forecaster__estimator__max_depth":         [7],
    "forecaster__estimator__learning_rate":     [0.05, 0.1, 0.15],
    "forecaster__estimator__colsample_bytree":  [0.35, 0.4, 0.45],
}

gscv = ForecastingGridSearchCV(
    forecaster=pipe,
    strategy="refit",
    cv=cv,
    param_grid=param_grid,
    n_jobs=-1,
)

# Pass the exogenous features as well, as the random search and the per-store
# evaluation do; otherwise the hyper-parameters are tuned on a different
# feature set than the one the model is evaluated with.
gscv.fit(y_short, X=X_trans)


  values = pd.Int64Index(values, dtype=int)
  values = pd.Int64Index(values, dtype=int)
  values = pd.Int64Index(values, dtype=int)
  if hasattr(x, "freqstr"):
  if x.freqstr is None:
  elif "-" in x.freqstr:
  return x.freqstr
  values = pd.Int64Index(values, dtype=int)
  if hasattr(x, "freqstr"):
  if x.freqstr is None:
  elif "-" in x.freqstr:
  return x.freqstr
  values = pd.Int64Index(values, dtype=int)
  if hasattr(x, "freqstr"):
  if x.freqstr is None:
  elif "-" in x.freqstr:
  return x.freqstr
  values = pd.Int64Index(values, dtype=int)
  values = pd.Int64Index(values, dtype=int)
  if hasattr(x, "freqstr"):
  if x.freqstr is None:
  elif "-" in x.freqstr:
  return x.freqstr
  if hasattr(x, "freqstr"):
  if x.freqstr is None:
  elif "-" in x.freqstr:
  return x.freqstr
  values = pd.Int64Index(values, dtype=int)
  if hasattr(x, "freqstr"):
  if hasattr(x, "freqstr"):
  if x.freqstr is None:
  elif "-" in x.freqstr:
  return x.freqstr
  if x.freqstr is None:
  elif "-" in x.freq

# fitting

In [295]:
from joblib import dump, load
# To persist a fitted object instead of loading one, e.g.:
# dump(forecaster, filename='results/f8/XGB_forecaster_7.py')

# Load a previously saved search object. This rebinds `gscv`, replacing any
# search result computed earlier in the session.
# NOTE(review): the path ends in .py but is read with joblib.load, so it is
# presumably a joblib pickle rather than Python source - confirm. joblib.load
# can execute arbitrary code, so only load trusted files.
gscv = load('results/f9/RF_gscv.py')


In [296]:
# Pipeline used for the per-store evaluation: the same three transformations
# as the tuning pipeline, wrapped around the best forecaster found by `gscv`.
# NOTE(review): if `gscv` was fitted on a pipeline that already contains the
# deseasonalize / detrend / scale steps, `best_forecaster_` includes them too
# and they would be applied twice here - confirm what was tuned.
best_pipe = TransformedTargetForecaster([
    ("deseasonalize", Deseasonalizer(model="additive", sp=7)),
    ("detrend", Detrender(forecaster=PolynomialTrendForecaster(degree=1))),
    ("scale", scaler),
    ("forecaster", gscv.best_forecaster_),
    ])

In [297]:
# Evaluate `best_pipe` on every store with a 28-step horizon.
# NOTE: this rebinds the module-level `horizon`, `cv`, `y_short` and `X_trans`.
horizon=28
store_results = []  # one result frame per store, concatenated once at the end
for store in df_store["store_id"].unique():
    # data
    ts_y = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    y_short, X_trans = data_prep(ts_y, df_exog, horizon=horizon)

    # the last `cv_folds` blocks of `horizon` steps are used as test windows
    cv = SlidingWindowSplitter(
        fh=list(range(1, horizon+1)),
        window_length=(len(y_short) - horizon * cv_folds),
        step_length=horizon,
        )

    # evaluate
    store_result = evaluate(
        forecaster=best_pipe,
        cv=cv,
        y=y_short,
        X=X_trans,
        scoring=MeanAbsoluteScaledError(),
        return_data=True,
        )

    # one (y_test, y_pred) pair per CV fold, i.e. per row of the result frame
    fold_pairs = list(zip(store_result['y_test'], store_result['y_pred']))
    store_result = store_result.assign(
        store_id=str(store),
        mase=store_result['test_MeanAbsoluteScaledError'],
        mape=[mape(y_true, y_hat) for y_true, y_hat in fold_pairs],
        mae=[mae(y_true, y_hat) for y_true, y_hat in fold_pairs],
        rmse=[rmse(y_true, y_hat) for y_true, y_hat in fold_pairs],
    ).drop(columns=["test_MeanAbsoluteScaledError", "fit_time", "pred_time", "len_train_window"])

    store_results.append(store_result)

    print("Finish store", store)

# Single concat instead of growing the frame inside the loop (quadratic copying).
RF_all_store_result = pd.concat(store_results) if store_results else pd.DataFrame()


  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, 

Finish store 307222
Finish store 307244
Finish store 307248
Finish store 320264
Finish store 328165
Finish store 349920
Finish store 349924
Finish store 349952
Finish store 349958
Finish store 349962
Finish store 349972
Finish store 349978
Finish store 349980
Finish store 349998
Finish store 350016
Finish store 350018
Finish store 350026
Finish store 350028
Finish store 350040
Finish store 350046
Finish store 350054
Finish store 350056
Finish store 350060
Finish store 354468
Finish store 387240
Finish store 412585
Finish store 441997
Finish store 452387
Finish store 461349
Finish store 464495
Finish store 471477
Finish store 476061
Finish store 480733
Finish store 528854
Finish store 536898
Finish store 536902
Finish store 566790
Finish store 566792


In [298]:
# Persist the concatenated per-store evaluation results to disk.
RF_all_store_result.to_pickle('results/f9/RF_all_store_result.pkl')

# SNAIVE

In [99]:
# Baseline forecaster with a seasonal period of 7.
# NOTE(review): per the sktime docs, strategy="mean" with sp=7 averages the
# past values of the same season instead of repeating the last observed
# season (strategy="last") - confirm this is the intended "seasonal naive".
# Alternative tried earlier (plain naive, last value):
# model_SNAIVE = NaiveForecaster(sp=1, strategy="last")
model_SNAIVE = NaiveForecaster(sp=7, strategy="mean")


## results

In [206]:
# snaive function
def fc_snaive(df, horizon):
    '''
    Evaluate the baseline forecaster `model_SNAIVE` on every store in df.

    For each store the sales series is scored with sliding-window
    cross-validation using the module-level `cv_folds` test windows of
    `horizon` steps each.

    Parameters
    ----------
    df: frame with the columns "store_id", "date" and "sales"
    horizon: number of steps ahead to forecast in every fold

    Returns
    -------
    DataFrame with one row per store and fold and the columns
    store_id, mase, mape, mae, rmse, y_train, y_test, y_pred.
    An empty frame with these columns is returned when df has no stores.
    '''
    result_cols = [
        'store_id',
        'mase',
        'mape',
        'mae',
        'rmse',
        'y_train',
        'y_test',
        'y_pred',
        ]

    per_store = []  # collected and concatenated once, not grown in the loop
    for store in df["store_id"].unique():
        # data
        ts_y = df[df["store_id"] == store].set_index("date")["sales"]
        ts_y.index.freq='D'

        cv = SlidingWindowSplitter(
            fh=list(range(1, horizon+1)),
            window_length=(len(ts_y) - horizon * cv_folds),
            step_length=horizon,
            )

        # evaluate
        store_result = evaluate(
            forecaster=model_SNAIVE,
            cv=cv,
            y=ts_y,
            scoring=MeanAbsoluteScaledError(),
            return_data=True,
            )

        # one (y_test, y_pred) pair per CV fold, i.e. per row of the result
        fold_pairs = list(zip(store_result['y_test'], store_result['y_pred']))
        store_result = store_result.rename(
            columns={'test_MeanAbsoluteScaledError': 'mase'}
        ).assign(
            store_id=str(store),
            mape=[mape(y_true, y_hat) for y_true, y_hat in fold_pairs],
            mae=[mae(y_true, y_hat) for y_true, y_hat in fold_pairs],
            rmse=[rmse(y_true, y_hat) for y_true, y_hat in fold_pairs],
        )

        # keeping only the reported columns also discards the timing/cutoff
        # columns added by evaluate()
        per_store.append(store_result[result_cols])

    if not per_store:
        return pd.DataFrame(columns=result_cols)

    return pd.concat(per_store)

def sum_result(df):
    '''
    Collapse per-fold results into one row per store.

    The error metrics are averaged over the rows of each store, while the
    y_train / y_test / y_pred entries of the last row of each store are kept.

    Parameters
    ----------
    df: frame with the columns store_id, mape, mase, mae, rmse, y_train,
        y_test and y_pred

    Returns
    -------
    DataFrame indexed by store_id with the aggregated columns
    '''
    metric_cols = ("mape", "mase", "mae", "rmse")
    series_cols = ("y_train", "y_test", "y_pred")

    how = {col: "mean" for col in metric_cols}
    how.update({col: "last" for col in series_cols})

    return df.groupby('store_id').agg(how)


In [208]:
# Per-store baseline summaries for the 7-, 14-, 21- and 28-step horizons
# (evaluated lazily, in this order, while unpacking).
snaive_result_7, snaive_result_14, snaive_result_21, snaive_result_28 = (
    sum_result(fc_snaive(df_store, horizon=steps)) for steps in (7, 14, 21, 28)
)


In [209]:
# Rank the stores by MAPE and pick the best, the median and the worst one.
# The positions are derived from the number of stores instead of being
# hard-coded (for 38 stores they are still rows 0, 19 and 37).
snaive_ranked_7 = snaive_result_7.sort_values('mape')
n_stores = len(snaive_ranked_7)
snaive_ranked_7.iloc[[0, n_stores // 2, n_stores - 1], :]

# Test data vs. forecast for a single store.
df_plot = snaive_ranked_7.loc['349972',:]

fig = go.Figure()

fig.add_trace(go.Scatter(x=df_plot['y_test'].index,
                         y=df_plot['y_test'],
                         name='Test set',
                         line={'color': 'dodgerblue'},
                         ))
fig.add_trace(go.Scatter(x=df_plot['y_pred'].index,
                         y=df_plot['y_pred'],
                         name='Forecast',
                         line={'color': 'salmon', 'dash': 'dash'},
                         ))
fig.update_layout(title='Seasonal naive forecast')
fig.update_yaxes(title_text='Sales in million VND')

fig.show()


Unnamed: 0_level_0,mape,mase,mae,rmse,y_train,y_test,y_pred
store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
349962,0.225668,0.506817,3.487238,4.614568,2017-09-08 5.56825 2017-09-09 13.47849 ...,2021-01-25 11.4129 2021-01-26 14.0360 20...,2021-01-25 8.302224 2021-01-26 8.52989...
349972,0.416356,1.420666,11.384357,13.714064,2017-09-15 0.94230 2017-09-16 0.00000 ...,2021-01-25 14.685600 2021-01-26 20.32720...,2021-01-25 13.526297 2021-01-26 13.47030...
350018,0.783913,2.67172,38.624567,48.393936,2017-09-07 7.858100 2017-09-08 4.963...,2021-01-25 47.77190 2021-01-26 38.8677...,2021-01-25 16.667069 2021-01-26 15.62203...


In [269]:
# Export whatever figure is currently bound to `fig`.
# NOTE(review): `fig` is rebound by several plotting cells, so the exported
# image depends on which of them ran last - confirm that the 7-day plot was
# built immediately before running this cell.
fig.write_image(file="results/plots/fc_snaive_7.png")

In [240]:
# Test data vs. forecast of one store for the 28-step horizon.
df_plot = snaive_result_28.sort_values('mape').loc['349972',:]

fig = go.Figure()

# (column, legend label, line style) of each trace, in drawing order
for column, label, line_style in (
    ('y_test', 'Test set', {'color': 'dodgerblue'}),
    ('y_pred', 'Forecast', {'color': 'salmon', 'dash': 'dash'}),
):
    series = df_plot[column]
    fig.add_trace(go.Scatter(x=series.index, y=series, name=label, line=line_style))

fig.update_layout(title='Seasonal naive forecast')
fig.update_yaxes(title_text='Figures in million VND')

# The figure is written to disk only; it is not shown inline.
fig.write_image(file="results/plots/fc_snaive_28.png")

In [266]:
# Summary statistics of the 7-step baseline, rounded to two decimals.
snaive_result_7.describe().round(2)

Unnamed: 0,mape,mase,mae,rmse
count,38.0,38.0,38.0,38.0
mean,0.42,1.18,8.9,11.23
std,0.11,0.5,7.76,10.13
min,0.23,0.51,2.81,3.41
25%,0.34,0.82,3.65,4.87
50%,0.41,1.08,5.48,6.89
75%,0.48,1.41,11.46,14.02
max,0.78,2.71,38.62,48.39


In [267]:
# Display the per-store summary; iloc[:, :] selects every row and column,
# so the whole frame is rendered.
snaive_result_7.iloc[:,:]

Unnamed: 0_level_0,mape,mase,mae,rmse,y_train,y_test,y_pred
store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
307222,0.248693,0.783139,7.75666,9.989747,2017-08-28 7.366350 2017-08-29 15.39972...,2021-01-25 24.848540 2021-01-26 18.83403...,2021-01-25 17.683784 2021-01-26 19.37228...
307244,0.258776,0.643764,5.050195,6.29751,2017-08-31 10.394180 2017-09-01 18.80089...,2021-01-25 17.889600 2021-01-26 18.23920...,2021-01-25 14.611911 2021-01-26 14.93820...
307248,0.378832,1.13101,7.812281,9.186986,2017-09-07 3.04870 2017-09-08 5.42386 ...,2021-01-25 21.487100 2021-01-26 23.72387...,2021-01-25 12.301647 2021-01-26 12.53932...
320264,0.301306,0.779109,5.315448,7.766792,2017-09-15 0.000000 2017-09-16 0.00000...,2021-01-25 12.109300 2021-01-26 11.42090...,2021-01-25 10.588502 2021-01-26 10.93650...
328165,0.437284,1.461364,32.222639,45.09748,2017-08-30 0.000000 2017-08-31 0.000...,2021-01-25 41.708122 2021-01-26 54.927...,2021-01-25 28.790498 2021-01-26 29.31330...
349920,0.44721,1.264655,18.963945,23.932269,2017-09-08 0.000000 2017-09-09 0.000...,2021-01-25 26.254900 2021-01-26 29.108...,2021-01-25 20.323551 2021-01-26 19.90663...
349924,0.374602,1.133137,8.354913,10.238745,2017-09-07 0.00000 2017-09-08 0.00000 ...,2021-01-25 19.939000 2021-01-26 22.69375...,2021-01-25 11.898900 2021-01-26 12.72865...
349952,0.45984,1.421171,9.772045,11.422154,2017-09-13 10.26105 2017-09-14 0.90660 ...,2021-01-25 21.040600 2021-01-26 38.76180...,2021-01-25 11.688739 2021-01-26 11.60596...
349958,0.419526,1.390357,12.241482,14.832848,2017-09-07 10.013050 2017-09-08 2.36375...,2021-01-25 30.067400 2021-01-26 36.82551...,2021-01-25 15.670233 2021-01-26 16.04982...
349962,0.225668,0.506817,3.487238,4.614568,2017-09-08 5.56825 2017-09-09 13.47849 ...,2021-01-25 11.4129 2021-01-26 14.0360 20...,2021-01-25 8.302224 2021-01-26 8.52989...


# XGB

In [272]:
# Load the stored XGBoost results and reduce them to one row per store.
XGB_result_7 = sum_result(pd.read_pickle('results/f9/XGB_result_7.pkl'))


In [274]:
# Summary statistics of the XGBoost results, rounded to two decimals.
XGB_result_7.describe().round(2)


Unnamed: 0,mape,mase,mae,rmse
count,38.0,38.0,38.0,38.0
mean,0.4,1.17,8.14,9.61
std,0.12,0.34,4.92,5.59
min,0.22,0.53,2.87,3.28
25%,0.31,0.94,4.61,5.39
50%,0.38,1.14,6.46,7.58
75%,0.45,1.36,10.01,11.85
max,0.74,2.13,27.41,30.39


In [294]:
# Store with the highest MAPE: the last row after an ascending sort. Using
# -1 instead of the hard-coded position 37 keeps this correct when the number
# of stores changes (for 38 stores it is the same row).
df_plot = XGB_result_7.sort_values('mape').iloc[-1,:]

fig = go.Figure()

fig.add_trace(go.Scatter(x=df_plot['y_test'].index,
                         y=df_plot['y_test'],
                         name='Test set',
                         line={'color': 'dodgerblue'},
                         ))
fig.add_trace(go.Scatter(x=df_plot['y_pred'].index,
                         y=df_plot['y_pred'],
                         name='Forecast',
                         line={'color': 'salmon', 'dash': 'dash'},
                         ))
fig.update_layout(title='XGBoost forecast')
fig.update_yaxes(title_text='Sales in million VND')

fig.show()
