In [170]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
pd.options.plotting.backend = "plotly"
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.model_evaluation import evaluate
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import (
    Detrender,
    Deseasonalizer
    )
from sktime.forecasting.compose import (
    ForecastingPipeline, 
    TransformedTargetForecaster)
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.arima import ARIMA
from sktime.transformations.series.date import DateTimeFeatures
from sktime.transformations.series.adapt import TabularToSeriesAdaptor
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.compose import make_reduction
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import (
    Normalizer, 
    MinMaxScaler
    )
from sktime.performance_metrics.forecasting import (
    MeanAbsoluteScaledError,
    MeanAbsoluteError,
    MeanAbsolutePercentageError,
    MeanSquaredError,
    )
mase = MeanAbsoluteScaledError()
mape = MeanAbsolutePercentageError()
mae = MeanAbsoluteError()
rmse = MeanSquaredError(square_root=True)

from sktime.transformations.series.summarize import WindowSummarizer
from sktime.forecasting.model_selection import (
    SlidingWindowSplitter,
    ForecastingRandomizedSearchCV,
    ForecastingGridSearchCV,
    )


horizon = 7
sp = 7
cv_folds = 4

# data prep

In [171]:
# load data 
df_store = pd.read_pickle("data/df_daily.pkl")
df_store['sales'] = df_store['sales']/1e6
df_exog = pd.read_pickle("data/df_exog.pkl")
ts_company = df_store.groupby("date").sum()["sales"]


In [172]:
df_exog = pd.read_pickle("data/df_exog.pkl")
ts_y = ts_company

# extract lags, means, medians
kwargs = {
    "lag_config": {
        "lag": ["lag", [[1,i+6] for i in range(horizon)]], 
            # sales_lag_1_6 = lag 7
        "expand_mean": ["mean", [[i,horizon-1] for i in range(2, horizon+1)]], 
            # sales_expand_mean_2_6 = mean of 2 lags starting from lag 7, i.e lag 7 & lag 8
            # sales_expand_mean_3_6 = mean of 3 lags starting from lag 7, i.e lag 7 & lag 8 & lag 9
            # etc.
        }}
df_window = WindowSummarizer(**kwargs).fit_transform(ts_y).dropna()

# extract DateTimeFeatures
df_from_y = DateTimeFeatures(ts_freq="D", feature_scope="comprehensive").fit_transform(df_window)
df_X = df_exog.merge(df_from_y, left_index=True, right_index=True)

# train/test split
y_train, y_test, X_train, X_test = temporal_train_test_split(
    X=df_X, 
    y=ts_y.tail(len(df_window)), 
    test_size=horizon)
y_train.index.freq = 'D'

# forecast horizon
fh = ForecastingHorizon(X_test.index, is_relative=False)

# transform X
scaler = TabularToSeriesAdaptor(MinMaxScaler())
X_train_trans = scaler.fit_transform(X_train)
X_test_trans = scaler.transform(X_test)

# use RF as forecaster
forecaster = make_reduction(
    estimator=RandomForestRegressor(criterion="absolute_error"), 
    window_length=sp, 
    strategy="recursive",
    )

# transform y
pipe = TransformedTargetForecaster([
    ("deseasonalize", Deseasonalizer(model="additive", sp=sp)),
    ("detrend", Detrender(forecaster=PolynomialTrendForecaster(degree=1))),
    ("scale", scaler),
    ("forecaster", forecaster),
    ])


  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, 

# tuning

## cross validation

In [173]:
# config CV
horizon=7
cv_folds=4

# train/test split
# y_train, y_test, X_train, X_test = temporal_train_test_split(
#     X=df_X, 
#     y=ts_y.tail(len(df_window)), 
#     test_size=horizon)

cv = SlidingWindowSplitter(
    fh=[i for i in range(1, horizon+1)],
    window_length=(len(y_train) - len(y_test) * cv_folds),
    step_length=horizon,
    )

for train, test in cv.split(y_train):
    print("%s %s" % (train, test))


[   0    1    2 ... 1223 1224 1225] [1226 1227 1228 1229 1230 1231 1232]
[   7    8    9 ... 1230 1231 1232] [1233 1234 1235 1236 1237 1238 1239]
[  14   15   16 ... 1237 1238 1239] [1240 1241 1242 1243 1244 1245 1246]
[  21   22   23 ... 1244 1245 1246] [1247 1248 1249 1250 1251 1252 1253]


## random search


In [169]:
# TUNE with RANDOM-SEARCH
param_grid = {
    'forecaster__estimator__n_estimators': [500],   # no. of trees
    'forecaster__estimator__max_depth': [100],      # of each tree
    'forecaster__estimator__max_features': [.33],   # at each split
    # 'forecaster__estimator__max_samples': np.arange(0.1, 1, 0.1),     # of each tree
    # 'forecaster__estimator__max_samples': np.append(np.arange(0.1, 1, 0.1), None),
    'forecaster__estimator__max_samples': [0.1],
    
    'forecaster__estimator__min_samples_split': [2],#, 5, 10, 50],
    # 'forecaster__estimator__min_samples_leaf': [1, 2, 4, 8, 16],
    }

cv = SlidingWindowSplitter(
    fh=[i for i in range(1, horizon+1)],
    window_length=(len(y_train) - len(y_test) * cv_folds),
    step_length=horizon,
    )

rscv = ForecastingRandomizedSearchCV(
    pipe, 
    strategy="refit", 
    cv=cv, 
    param_distributions=param_grid, 
    n_iter=2,
    n_jobs=-1,   
)

y_train.index.freq = 'D'
rscv.fit(y_train, X=X_train_trans)
rscv_y_pred = rscv.predict(fh=fh, X=X_test_trans)



The total space of parameters 1 is smaller than n_iter=2. Running 1 iterations. For exhaustive searches, use GridSearchCV.

  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  values = pd.Int64Index(values, dtype=int)
  if hasattr(x, "freqstr"):
  if x.freqstr is None:
  elif "-" in x.freqstr:
  return x.freqstr
  return pd.Int64Index([d.n / count for d in duration])
  values = pd.Int64Index(values, dtype=int)


KeyboardInterrupt: 

In [None]:
rscv.cv_results_.sort_values('rank_test_MeanAbsolutePercentageError')
# rscv.best_params_


Unnamed: 0,mean_test_MeanAbsolutePercentageError,mean_fit_time,mean_pred_time,params,rank_test_MeanAbsolutePercentageError
0,0.28045,9.760612,0.307832,"{'forecaster__estimator__n_estimators': 500, '...",1.0


## grid search

In [174]:
# TUNE with GRID-SEARCH
param_grid = {
    "forecaster__window_length": [7],#,14,21],
    "forecaster__estimator__n_estimators": [10],#,50,100,500,1000,5000],   # no. of trees
    "forecaster__estimator__max_features": [None],   # at each split
    "forecaster__estimator__max_depth": [None],      # of each tree
    "forecaster__estimator__min_samples_split": [2],
    }

# cv = SlidingWindowSplitter(
#     fh=[i for i in range(1, horizon+1)],
#     window_length=(len(y_train) - len(y_test) * cv_folds),
#     step_length=horizon,
#     )

gscv = ForecastingGridSearchCV(
    forecaster=pipe, 
    strategy="refit", 
    cv=cv, 
    param_grid=param_grid, 
    n_jobs=-1,
    verbose=1,
    # refit=False,
)

gscv.fit(y_train)

gscv_y_pred = gscv.predict(fh)
gscv.best_params_


Fitting 4 folds for each of 1 candidates, totalling 4 fits


  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  values = pd.Int64Index(values, dtype=int)
  if hasattr(x, "freqstr"):
  if x.freqstr is None:
  elif "-" in x.freqstr:
  return x.freqstr
  return pd.Int64Index([d.n / count for d in duration])
  values = pd.Int64Index(values, dtype=int)
  if not hasattr(cutoff, "freqstr") or cutoff.freqstr is None:
  assert cutoff.freqstr == index.freqstr
  if hasattr(x, "freqstr"):
  if x.freqstr is None:
  elif "-" in x.freqstr:
  return x.freqstr
  return pd.Int64Index([d.n / count for d in duration])
  assert isinstance(by, (int, np.integer, pd.Int64Index)), type(by)
  if not hasattr(x, "freq") or x.freq is None:
  by *= x.freq
  if hasattr(x, "freqstr"):
  if x.freqstr is None:
  elif "-" in x.freqstr:
  return x.freqstr
  return pd.Int64Index([d.n / count for d in duration])
  warn(
  results = results.append(
  values = pd.In

{'forecaster__estimator__max_depth': None,
 'forecaster__estimator__max_features': None,
 'forecaster__estimator__min_samples_split': 2,
 'forecaster__estimator__n_estimators': 10,
 'forecaster__window_length': 7}

## result

In [2]:
gscv_results = pd.read_pickle('results/f9/gscv_results.pkl')
gscv_results = gscv_results.join(gscv_results.params.apply(pd.Series).iloc[:,-2:]).drop(columns='params')
gscv_results


Unnamed: 0,mean_test_MeanAbsolutePercentageError,mean_fit_time,mean_pred_time,rank_test_MeanAbsolutePercentageError,forecaster__estimator__n_estimators,forecaster__window_length
0,0.252333,1.860852,0.077746,1.0,10.0,7.0
1,0.306031,3.344768,0.064443,16.0,10.0,14.0
2,0.278313,4.975771,0.070513,10.0,10.0,21.0
3,0.265601,8.420335,0.19329,5.0,50.0,7.0
4,0.276955,15.509201,0.128148,8.0,50.0,14.0
5,0.329182,22.040735,0.131831,18.0,50.0,21.0
6,0.27407,16.112373,0.253241,7.0,100.0,7.0
7,0.310525,31.07576,0.265399,17.0,100.0,14.0
8,0.28791,46.049389,0.285377,15.0,100.0,21.0
9,0.282024,77.312614,1.034352,11.0,500.0,7.0


### MAPE heat map 

In [3]:
# MAPE heat map data
px_data = gscv_results.pivot(
    index='forecaster__window_length',
    columns='forecaster__estimator__n_estimators',
    values='mean_test_MeanAbsolutePercentageError'
    )
px_data.index = px_data.index.astype('str')
px_data.columns = px_data.columns.astype('str')
px_data


forecaster__estimator__n_estimators,10.0,50.0,100.0,500.0,1000.0,5000.0
forecaster__window_length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7.0,0.252333,0.265601,0.27407,0.282024,0.273477,0.27744
14.0,0.306031,0.276955,0.310525,0.26279,0.264661,0.264617
21.0,0.278313,0.329182,0.28791,0.284198,0.286141,0.286484


In [None]:
# MAPE heat map plot
fig = px.imshow(
    px_data,
    text_auto=True,
    color_continuous_scale='RdBu_r'
    )
fig.update_xaxes(side="top")
fig.show()


### feature importances

In [33]:
# refit estimator to get feature_importances_
y_train_trans = pipe.fit_transform(y_train)
forecaster.get_params()['estimator'].fit(y=y_train_trans, X=X_train_trans)


RandomForestRegressor(criterion='absolute_error')

In [36]:
feature_importances = pd.DataFrame({
    'feature': forecaster.get_params()['estimator'].feature_names_in_,
    'importance': forecaster.get_params()['estimator'].feature_importances_,
    })

feature_importances


Unnamed: 0,feature,importance
0,off_day,0.005799
1,promo_day,0.068791
2,sales_lag_0_7,0.029413
3,sales_lag_1_8,0.035805
4,sales_lag_2_9,0.026794
5,sales_lag_3_10,0.02948
6,sales_lag_4_11,0.029412
7,sales_lag_5_12,0.029483
8,sales_lag_6_13,0.034394
9,sales_expand_mean_2_0,0.201331


In [43]:
feature_importances.sort_values('importance', ascending=False).set_index('feature').plot()

# fitting

In [69]:
best_forecaster = gscv.best_forecaster_


TransformedTargetForecaster(steps=[('deseasonalize', Deseasonalizer(sp=7)),
                                   ('detrend',
                                    Detrender(forecaster=PolynomialTrendForecaster())),
                                   ('scale',
                                    TabularToSeriesAdaptor(transformer=MinMaxScaler())),
                                   ('forecaster',
                                    RecursiveTabularRegressionForecaster(estimator=RandomForestRegressor(criterion='absolute_error',
                                                                                                         max_features=None,
                                                                                                         n_estimators=10),
                                                                         window_length=7))])

In [151]:
all_store_result = pd.DataFrame()

# for store in df_store["store_id"].unique():
for store in df_store["store_id"].unique()[:2]:
    # data
    ts_y = df_store[df_store["store_id"] == store].set_index("date")["sales"]

    # extract lags, means, medians
    df_window = WindowSummarizer(**kwargs).fit_transform(ts_y).dropna()

    # extract DateTimeFeatures
    df_from_y = DateTimeFeatures(ts_freq="D", feature_scope="comprehensive").fit_transform(df_window)
    df_X = df_exog.merge(df_from_y, left_index=True, right_index=True)

    # transform
    df_X_trans = scaler.fit_transform(df_X)
    ts_y_trans = ts_y.tail(len(df_window))
    ts_y_trans.index.freq = 'D'
    
    # cv
    cv = SlidingWindowSplitter(
        fh=[i for i in range(1, horizon+1)],
        window_length=(len(ts_y_trans) - horizon * cv_folds),
        step_length=horizon,
        )

    # evaluate
    store_result = evaluate(
        forecaster=best_forecaster, 
        cv=cv, 
        y=ts_y_trans, 
        X=df_X_trans, 
        scoring=MeanAbsoluteScaledError(),
        return_data=True,
        )

    store_result['store_id'] = str(store)
    store_result['mase'] = store_result['test_MeanAbsoluteScaledError']
    store_result['mape'] = [mape(store_result.loc[i,'y_test'], store_result.loc[i,'y_pred']) for i in range(cv_folds)] 
    store_result['mae'] = [mae(store_result.loc[i,'y_test'], store_result.loc[i,'y_pred']) for i in range(cv_folds)] 
    store_result['rmse'] = [rmse(store_result.loc[i,'y_test'], store_result.loc[i,'y_pred']) for i in range(cv_folds)] 
    store_result.drop(columns=["test_MeanAbsoluteScaledError", "fit_time", "pred_time", "len_train_window"], inplace=True)

    all_store_result = pd.concat([all_store_result, store_result])


  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.Index)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.Index)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeI

In [152]:
all_store_result


Unnamed: 0,cutoff,y_train,y_test,y_pred,store_id,mase,mape,mae,rmse
0,2021-01-03,2017-08-20 20.122770 2017-08-21 0.27880...,2021-01-04 17.041805 2021-01-05 14.57060...,2021-01-04 22.282136 2021-01-05 31.16253...,307222,1.194258,0.404317,11.837216,13.83716
1,2021-01-10,2017-08-27 26.070100 2017-08-28 7.36635...,2021-01-11 16.588700 2021-01-12 20.27640...,2021-01-11 20.741697 2021-01-12 28.08367...,307222,0.589542,0.203565,5.859277,7.551745
2,2021-01-17,2017-09-03 16.873060 2017-09-04 4.81842...,2021-01-18 18.799600 2021-01-19 27.78320...,2021-01-18 18.372920 2021-01-19 22.16116...,307222,0.721123,0.206256,7.161457,9.106298
3,2021-01-24,2017-09-10 0.000000 2017-09-11 0.00000...,2021-01-25 24.848540 2021-01-26 18.83403...,2021-01-25 29.815955 2021-01-26 26.40952...,307222,0.675228,0.178919,6.725487,9.008792
0,2021-01-03,2017-08-23 0.000000 2017-08-24 0.00000...,2021-01-04 10.987803 2021-01-05 15.22061...,2021-01-04 13.482549 2021-01-05 24.12468...,307244,1.125682,0.397409,8.857679,12.306703
1,2021-01-10,2017-08-30 12.356000 2017-08-31 10.39418...,2021-01-11 8.271600 2021-01-12 16.20540...,2021-01-11 18.610576 2021-01-12 16.77261...,307244,0.880974,0.357499,6.930244,8.129375
2,2021-01-17,2017-09-06 4.596500 2017-09-07 6.82201...,2021-01-18 10.078400 2021-01-19 17.20618...,2021-01-18 19.822354 2021-01-19 21.62157...,307244,0.912302,0.386004,7.200405,8.053555
3,2021-01-24,2017-09-13 20.684930 2017-09-14 9.24962...,2021-01-25 17.889600 2021-01-26 18.23920...,2021-01-25 21.496057 2021-01-26 12.23619...,307244,0.943731,0.337032,7.45954,8.944529
