In [170]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
pd.options.plotting.backend = "plotly"
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.model_evaluation import evaluate
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import (
    Detrender,
    Deseasonalizer
    )
from sktime.forecasting.compose import (
    ForecastingPipeline, 
    TransformedTargetForecaster)
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.arima import ARIMA
from sktime.transformations.series.date import DateTimeFeatures
from sktime.transformations.series.adapt import TabularToSeriesAdaptor
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.compose import make_reduction
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import (
    Normalizer, 
    MinMaxScaler
    )
from sktime.performance_metrics.forecasting import (
    MeanAbsoluteScaledError,
    MeanAbsoluteError,
    MeanAbsolutePercentageError,
    MeanSquaredError,
    )
mase = MeanAbsoluteScaledError()
mape = MeanAbsolutePercentageError()
mae = MeanAbsoluteError()
rmse = MeanSquaredError(square_root=True)

from sktime.transformations.series.summarize import WindowSummarizer
from sktime.forecasting.model_selection import (
    SlidingWindowSplitter,
    ForecastingRandomizedSearchCV,
    ForecastingGridSearchCV,
    )


horizon = 7
sp = 7
cv_folds = 4

# data prep

In [171]:
# load data 
df_store = pd.read_pickle("data/df_daily.pkl")
df_store['sales'] = df_store['sales']/1e6
df_exog = pd.read_pickle("data/df_exog.pkl")
ts_company = df_store.groupby("date").sum()["sales"]


In [172]:
df_exog = pd.read_pickle("data/df_exog.pkl")
ts_y = ts_company

# extract lags, means, medians
kwargs = {
    "lag_config": {
        "lag": ["lag", [[1,i+6] for i in range(horizon)]], 
            # sales_lag_1_6 = lag 7
        "expand_mean": ["mean", [[i,horizon-1] for i in range(2, horizon+1)]], 
            # sales_expand_mean_2_6 = mean of 2 lags starting from lag 7, i.e lag 7 & lag 8
            # sales_expand_mean_3_6 = mean of 3 lags starting from lag 7, i.e lag 7 & lag 8 & lag 9
            # etc.
        }}
df_window = WindowSummarizer(**kwargs).fit_transform(ts_y).dropna()

# extract DateTimeFeatures
df_from_y = DateTimeFeatures(ts_freq="D", feature_scope="comprehensive").fit_transform(df_window)
df_X = df_exog.merge(df_from_y, left_index=True, right_index=True)

# train/test split
y_train, y_test, X_train, X_test = temporal_train_test_split(
    X=df_X, 
    y=ts_y.tail(len(df_window)), 
    test_size=horizon)
y_train.index.freq = 'D'

# forecast horizon
fh = ForecastingHorizon(X_test.index, is_relative=False)

# transform X
scaler = TabularToSeriesAdaptor(MinMaxScaler())
X_train_trans = scaler.fit_transform(X_train)
X_test_trans = scaler.transform(X_test)

# use RF as forecaster
forecaster = make_reduction(
    estimator=RandomForestRegressor(criterion="absolute_error"), 
    window_length=sp, 
    strategy="recursive",
    )

# transform y
pipe = TransformedTargetForecaster([
    ("deseasonalize", Deseasonalizer(model="additive", sp=sp)),
    ("detrend", Detrender(forecaster=PolynomialTrendForecaster(degree=1))),
    ("scale", scaler),
    ("forecaster", forecaster),
    ])


  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, 

# tuning

## cross validation

In [173]:
# config CV
horizon=7
cv_folds=4

cv = SlidingWindowSplitter(
    fh=[i for i in range(1, horizon+1)],
    window_length=(len(y_train) - len(y_test) * cv_folds),
    step_length=horizon,
    )

for train, test in cv.split(y_train):
    print("%s %s" % (train, test))


[   0    1    2 ... 1223 1224 1225] [1226 1227 1228 1229 1230 1231 1232]
[   7    8    9 ... 1230 1231 1232] [1233 1234 1235 1236 1237 1238 1239]
[  14   15   16 ... 1237 1238 1239] [1240 1241 1242 1243 1244 1245 1246]
[  21   22   23 ... 1244 1245 1246] [1247 1248 1249 1250 1251 1252 1253]


## random search


In [204]:
X_train_trans.shape[1]

26

In [None]:
# TUNE with RANDOM-SEARCH
param_grid = {
    'forecaster__estimator__n_estimators': list(range(1,1000)),   # no. of trees
    'forecaster__estimator__min_samples_split': list(range(1,5000)),
    'forecaster__estimator__min_samples_leaf': list(range(1,4000)),
    'forecaster__estimator__max_leaf_nodes': list(range(1,200)),
    'forecaster__estimator__max_features': list(range(1,X_train_trans.shape[1])),   # at each split
    'forecaster__estimator__max_depth': list(range(1,50)),      # of each tree
    }

rscv = ForecastingRandomizedSearchCV(
    pipe, 
    strategy="refit", 
    cv=cv, 
    param_distributions=param_grid, 
    n_iter=100,
    n_jobs=-1,   
)

y_train.index.freq = 'D'
rscv.fit(y_train, X=X_train_trans)
rscv_y_pred = rscv.predict(fh=fh, X=X_test_trans)


  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeInd

In [201]:
rscv.cv_results_.sort_values('rank_test_MeanAbsolutePercentageError')


Unnamed: 0,mean_test_MeanAbsolutePercentageError,mean_fit_time,mean_pred_time,params,rank_test_MeanAbsolutePercentageError
68,0.254735,52.553523,0.662173,"{'forecaster__estimator__n_estimators': 413, '...",1.0
94,0.281255,29.911624,0.420842,"{'forecaster__estimator__n_estimators': 455, '...",2.0
11,0.402141,1.368905,0.310841,"{'forecaster__estimator__n_estimators': 75, 'f...",3.0
38,0.402152,1.265036,0.198305,"{'forecaster__estimator__n_estimators': 35, 'f...",4.0
51,0.402626,3.330325,0.993299,"{'forecaster__estimator__n_estimators': 283, '...",5.0
...,...,...,...,...,...
84,0.403299,4.567549,1.250572,"{'forecaster__estimator__n_estimators': 369, '...",96.0
37,0.403303,5.419687,1.568233,"{'forecaster__estimator__n_estimators': 621, '...",97.0
47,0.403307,4.827724,1.524254,"{'forecaster__estimator__n_estimators': 452, '...",98.0
86,0.403354,3.074664,0.805368,"{'forecaster__estimator__n_estimators': 190, '...",99.0


In [202]:
rscv.best_params_


{'forecaster__estimator__n_estimators': 413,
 'forecaster__estimator__min_samples_split': 443,
 'forecaster__estimator__min_samples_leaf': 91,
 'forecaster__estimator__max_leaf_nodes': 66,
 'forecaster__estimator__max_features': 22,
 'forecaster__estimator__max_depth': 43}

## grid search

In [206]:
# TUNE with GRID-SEARCH
param_grid = {
    'forecaster__estimator__n_estimators':      [410, 420],
    'forecaster__estimator__min_samples_split': [440, 450],
    'forecaster__estimator__min_samples_leaf':  [90, 92],
    'forecaster__estimator__max_leaf_nodes':    [65, 67],
    'forecaster__estimator__max_features':      [21, 23],
    'forecaster__estimator__max_depth':         [42, 44],
    }

gscv = ForecastingGridSearchCV(
    forecaster=pipe, 
    strategy="refit", 
    cv=cv, 
    param_grid=param_grid, 
    n_jobs=-1,
    # verbose=1,
    # refit=False,
)

gscv.fit(y_train, X=X_train_trans)
gscv_y_pred = gscv.predict(fh=fh, X=X_test_trans)


Fitting 4 folds for each of 64 candidates, totalling 256 fits


  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeInd

In [None]:
# from joblib import dump, load
# Save model
# dump(gscv, filename='results/f9/RF_gscv.py')
# dump(rscv, filename='results/f9/RF_rscv.py')

# Load model
# gscv = load('results/f9/RF_gscv.py')
# gscv = load('results/f9/RF_gscv.py')


In [216]:
gscv.cv_results_.sort_values('rank_test_MeanAbsolutePercentageError').head(3)


Unnamed: 0,mean_test_MeanAbsolutePercentageError,mean_fit_time,mean_pred_time,params,rank_test_MeanAbsolutePercentageError
61,0.196564,70.221242,0.985276,"{'forecaster__estimator__max_depth': 44, 'fore...",1.0
24,0.200924,60.337559,0.779058,"{'forecaster__estimator__max_depth': 42, 'fore...",2.0
29,0.203423,61.123751,0.978964,"{'forecaster__estimator__max_depth': 42, 'fore...",3.0


In [207]:
gscv.best_params_


{'forecaster__estimator__max_depth': 44,
 'forecaster__estimator__max_features': 23,
 'forecaster__estimator__max_leaf_nodes': 67,
 'forecaster__estimator__min_samples_leaf': 92,
 'forecaster__estimator__min_samples_split': 440,
 'forecaster__estimator__n_estimators': 420}

## result

In [2]:
gscv_results = pd.read_pickle('results/f9/gscv_results.pkl')
gscv_results = gscv_results.join(gscv_results.params.apply(pd.Series).iloc[:,-2:]).drop(columns='params')
gscv_results


Unnamed: 0,mean_test_MeanAbsolutePercentageError,mean_fit_time,mean_pred_time,rank_test_MeanAbsolutePercentageError,forecaster__estimator__n_estimators,forecaster__window_length
0,0.252333,1.860852,0.077746,1.0,10.0,7.0
1,0.306031,3.344768,0.064443,16.0,10.0,14.0
2,0.278313,4.975771,0.070513,10.0,10.0,21.0
3,0.265601,8.420335,0.19329,5.0,50.0,7.0
4,0.276955,15.509201,0.128148,8.0,50.0,14.0
5,0.329182,22.040735,0.131831,18.0,50.0,21.0
6,0.27407,16.112373,0.253241,7.0,100.0,7.0
7,0.310525,31.07576,0.265399,17.0,100.0,14.0
8,0.28791,46.049389,0.285377,15.0,100.0,21.0
9,0.282024,77.312614,1.034352,11.0,500.0,7.0


### MAPE heat map 

In [3]:
# MAPE heat map data
px_data = gscv_results.pivot(
    index='forecaster__window_length',
    columns='forecaster__estimator__n_estimators',
    values='mean_test_MeanAbsolutePercentageError'
    )
px_data.index = px_data.index.astype('str')
px_data.columns = px_data.columns.astype('str')
px_data


forecaster__estimator__n_estimators,10.0,50.0,100.0,500.0,1000.0,5000.0
forecaster__window_length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7.0,0.252333,0.265601,0.27407,0.282024,0.273477,0.27744
14.0,0.306031,0.276955,0.310525,0.26279,0.264661,0.264617
21.0,0.278313,0.329182,0.28791,0.284198,0.286141,0.286484


In [None]:
# MAPE heat map plot
fig = px.imshow(
    px_data,
    text_auto=True,
    color_continuous_scale='RdBu_r'
    )
fig.update_xaxes(side="top")
fig.show()


### feature importances

In [33]:
# refit estimator to get feature_importances_
y_train_trans = pipe.fit_transform(y_train)
forecaster.get_params()['estimator'].fit(y=y_train_trans, X=X_train_trans)


RandomForestRegressor(criterion='absolute_error')

In [36]:
feature_importances = pd.DataFrame({
    'feature': forecaster.get_params()['estimator'].feature_names_in_,
    'importance': forecaster.get_params()['estimator'].feature_importances_,
    })

feature_importances


Unnamed: 0,feature,importance
0,off_day,0.005799
1,promo_day,0.068791
2,sales_lag_0_7,0.029413
3,sales_lag_1_8,0.035805
4,sales_lag_2_9,0.026794
5,sales_lag_3_10,0.02948
6,sales_lag_4_11,0.029412
7,sales_lag_5_12,0.029483
8,sales_lag_6_13,0.034394
9,sales_expand_mean_2_0,0.201331


In [43]:
feature_importances.sort_values('importance', ascending=False).set_index('feature').plot()

# fitting

In [218]:

best_forecaster = gscv.best_forecaster_

all_store_result = pd.DataFrame()

for store in df_store["store_id"].unique():#[:2]:
    # data
    ts_y = df_store[df_store["store_id"] == store].set_index("date")["sales"]

    # extract lags, means, medians
    df_window = WindowSummarizer(**kwargs).fit_transform(ts_y).dropna()

    # extract DateTimeFeatures
    df_from_y = DateTimeFeatures(ts_freq="D", feature_scope="comprehensive").fit_transform(df_window)
    df_X = df_exog.merge(df_from_y, left_index=True, right_index=True)

    # transform
    df_X_trans = scaler.fit_transform(df_X)
    ts_y_trans = ts_y.tail(len(df_window))
    ts_y_trans.index.freq = 'D'
    
    # cv
    cv = SlidingWindowSplitter(
        fh=[i for i in range(1, horizon+1)],
        window_length=(len(ts_y_trans) - horizon * cv_folds),
        step_length=horizon,
        )

    # evaluate
    store_result = evaluate(
        forecaster=best_forecaster, 
        cv=cv, 
        y=ts_y_trans, 
        X=df_X_trans, 
        scoring=MeanAbsoluteScaledError(),
        return_data=True,
        )

    store_result['store_id'] = str(store)
    store_result['mase'] = store_result['test_MeanAbsoluteScaledError']
    store_result['mape'] = [mape(store_result.loc[i,'y_test'], store_result.loc[i,'y_pred']) for i in range(cv_folds)] 
    store_result['mae'] = [mae(store_result.loc[i,'y_test'], store_result.loc[i,'y_pred']) for i in range(cv_folds)] 
    store_result['rmse'] = [rmse(store_result.loc[i,'y_test'], store_result.loc[i,'y_pred']) for i in range(cv_folds)] 
    store_result.drop(columns=["test_MeanAbsoluteScaledError", "fit_time", "pred_time", "len_train_window"], inplace=True)

    all_store_result = pd.concat([all_store_result, store_result])


  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.Ra

In [220]:
all_store_result.groupby('store_id').mean().mean()


mase     1.217682
mape     0.448011
mae      9.028615
rmse    11.609702
dtype: float64

In [221]:
# all_store_result.to_pickle('results/f9/RF_result_7.pkl')