In [2]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
pd.options.plotting.backend = "plotly"
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.transformations.series.detrend import (Detrender,
                                                   Deseasonalizer)
from sktime.forecasting.compose import (ForecastingPipeline, TransformedTargetForecaster)
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.arima import ARIMA
from sktime.transformations.series.date import DateTimeFeatures
from sktime.transformations.series.adapt import TabularToSeriesAdaptor
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.compose import make_reduction

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import (Normalizer, MinMaxScaler)
from sktime.performance_metrics.forecasting import (
    mean_absolute_scaled_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)
from sktime.transformations.series.summarize import WindowSummarizer

horizon = 7
sp = 7
cv_folds = 4

In [67]:
# load data 
df_store = pd.read_pickle("data/df_daily.pkl")
df_store['sales'] = df_store['sales']/1e6
df_exog = pd.read_pickle("data/df_exog.pkl")
ts_company = df_store.groupby("date").sum()["sales"]


# PIPELINE 1


In [29]:
# extract DateTimeFeatures
df_dt = DateTimeFeatures(ts_freq="D", feature_scope="comprehensive").fit_transform(ts_company)
df_exog = df_exog.merge(df_dt, left_index=True, right_index=True).drop(columns='sales')

# split
y_train, y_test, X_train, X_test = temporal_train_test_split(X=df_exog, y=ts_company, test_size=horizon)
y_train.index.freq = 'D'

# fh
fh = ForecastingHorizon(X_test.index, is_relative=False)
fh


ForecastingHorizon(['2021-01-25', '2021-01-26', '2021-01-27', '2021-01-28',
               '2021-01-29', '2021-01-30', '2021-01-31'],
              dtype='datetime64[ns]', name='date', freq='D', is_relative=False)

In [72]:
# transform X
transformer_X = TabularToSeriesAdaptor(MinMaxScaler())
X_train_trans = transformer_X.fit_transform(X_train)
X_test_trans = transformer_X.transform(X_test)

# pipeline
pipe_y = TransformedTargetForecaster([
    ("deseasonalize", Deseasonalizer(model="additive", sp=sp)),
    ("detrend", Detrender(forecaster=PolynomialTrendForecaster(degree=1))),
    ("scale", TabularToSeriesAdaptor(MinMaxScaler())),
    ("forecaster", make_reduction(KNeighborsRegressor(n_neighbors=1), window_length=sp, strategy="recursive")),
    ])

# fit
pipe_y.fit(y_train, X=X_train_trans, fh=fh)

# predict (already inverse)
y_pred = pipe_y.predict(fh=fh, X=X_test_trans)

# score
mape = mean_absolute_percentage_error(y_test, y_pred)
print(mape, '\n\n', y_pred)


ValueError: y must be univariate, but found more than one variable

# PIPELINE 2
add lags, rolling means, expanding means

## data

In [69]:
# extract lags, means, medians
kwargs = {
    "lag_config": {
        "lag": ["lag", [[1,i] for i in range(horizon)]],
        "mean": ["mean", [[i,0] for i in range(2, horizon+1)]],
        # "median": ["median", [[i,0] for i in range(2, horizon+1)]],
        }}
df_window = WindowSummarizer(**kwargs).fit_transform(ts_company).dropna()

# extract DateTimeFeatures
df_datetime = DateTimeFeatures(ts_freq="D", feature_scope="comprehensive").fit_transform(df_window)
df_exog = pd.read_pickle("data/df_exog.pkl")
df_exog = df_exog.merge(df_datetime, left_index=True, right_index=True)

# train/test split
y_train, y_test, X_train, X_test = temporal_train_test_split(
    X=df_exog, 
    y=ts_company.tail(len(df_window)), 
    test_size=horizon)
y_train.index.freq = 'D'

# forecast horizon
fh = ForecastingHorizon(X_test.index, is_relative=False)

# transform X
scaler = TabularToSeriesAdaptor(MinMaxScaler())
X_train_trans = scaler.fit_transform(X_train)
X_test_trans = scaler.transform(X_test)

# use RF as forecaster
forecaster = make_reduction(
    estimator=RandomForestRegressor(criterion="absolute_error"), 
    window_length=sp, 
    strategy="recursive",
    )

# transform y
pipe = TransformedTargetForecaster([
    ("deseasonalize", Deseasonalizer(model="additive", sp=sp)),
    ("detrend", Detrender(forecaster=PolynomialTrendForecaster(degree=1))),
    ("scale", scaler),
    ("forecaster", forecaster),
    ])


In [3]:
[[1,i] for i in range(horizon)]

[[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]]

In [None]:
# # fit & predict
# pipe = pipe.fit(y_train, X=X_train_trans)
# y_pred = pipe.predict(fh=fh, X=X_test_trans)

# # score
# mape = mean_absolute_percentage_error(y_test, y_pred)
# print(mape, '\n\n', y_pred)


## cross validation

In [63]:
# config CV

horizon=7
cv_folds=4

# train/test split
y_train, y_test, X_train, X_test = temporal_train_test_split(
    X=df_exog, 
    y=ts_company.tail(len(df_window)), 
    test_size=horizon)

cv = SlidingWindowSplitter(
    fh=[i for i in range(1, horizon+1)],
    window_length=(len(y_train) - len(y_test) * cv_folds),
    step_length=horizon,
    )

for train, test in cv.split(y_train):
    print("%s %s" % (train, test))


[   0    1    2 ... 1229 1230 1231] [1232 1233 1234 1235 1236 1237 1238]
[   7    8    9 ... 1236 1237 1238] [1239 1240 1241 1242 1243 1244 1245]
[  14   15   16 ... 1243 1244 1245] [1246 1247 1248 1249 1250 1251 1252]
[  21   22   23 ... 1250 1251 1252] [1253 1254 1255 1256 1257 1258 1259]


## RANDOM-SEARCH

In [119]:
# TUNE with RANDOM-SEARCH
from sktime.forecasting.model_selection import (
    ForecastingRandomizedSearchCV,
    SlidingWindowSplitter,
)
from scipy.stats import loguniform

param_grid = {
    'forecaster__estimator__n_estimators': [500],   # no. of trees
    'forecaster__estimator__max_depth': [100],      # of each tree
    'forecaster__estimator__max_features': [.33],   # at each split
    # 'forecaster__estimator__max_samples': np.arange(0.1, 1, 0.1),     # of each tree
    'forecaster__estimator__max_samples': np.append(np.arange(0.1, 1, 0.1), None),
    
    'forecaster__estimator__min_samples_split': [2, 5, 10, 50],
    # 'forecaster__estimator__min_samples_leaf': [1, 2, 4, 8, 16],
    }

cv = SlidingWindowSplitter(
    fh=[i for i in range(1, horizon+1)],
    window_length=(len(y_train) - len(y_test) * cv_folds),
    step_length=horizon,
    )

rscv = ForecastingRandomizedSearchCV(
    pipe, 
    strategy="refit", 
    cv=cv, 
    param_distributions=param_grid, 
    n_iter=2,
    n_jobs=-1,   
)

rscv.fit(y_train, X=X_train_trans)
rscv_y_pred = rscv.predict(fh=fh, X=X_test_trans)


  pd.Int64Index,
  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.Index)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.

In [121]:
rscv.cv_results_.sort_values('rank_test_MeanAbsolutePercentageError')
rscv.best_params_


{'forecaster__estimator__max_samples': 0.7000000000000001,
 'forecaster__estimator__max_features': 'sqrt'}

## GRID-SEARCH

In [161]:
# TUNE with GRID-SEARCH
from sktime.forecasting.model_selection import (
    ForecastingGridSearchCV,
    SlidingWindowSplitter,
)

param_grid = {
    "forecaster__window_length": [7,14,21],
    'forecaster__estimator__n_estimators': [10,50,100,500,1000,5000],   # no. of trees
    'forecaster__estimator__max_features': [None],   # at each split
    'forecaster__estimator__max_depth': [None],      # of each tree
    'forecaster__estimator__min_samples_split': [2],
    }

cv = SlidingWindowSplitter(
    fh=[i for i in range(1, horizon+1)],
    window_length=(len(y_train) - len(y_test) * cv_folds),
    step_length=horizon,
    )

gscv = ForecastingGridSearchCV(
    forecaster=pipe, 
    strategy="refit", 
    cv=cv, 
    param_grid=param_grid, 
    n_jobs=-1,
    verbose=1,
    refit=False,
)

gscv.fit(y_train)
gscv_y_pred = gscv.predict(fh)


Fitting 4 folds for each of 18 candidates, totalling 72 fits


  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  pd.Int64Index,
  RELATIVE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.TimedeltaIndex)
  ABSOLUTE_TYPES = (pd.Int64Index, pd.RangeIndex, pd.DatetimeIndex, pd.PeriodIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.Index)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  VALID_MULTIINDEX_TYPES = (pd.Int64Index, pd.RangeIndex)
  VALID_INDEX_TYPES = (pd.Int64Index, pd.

## result

In [2]:
gscv_results = pd.read_pickle('results/f9/gscv_results.pkl')
gscv_results = gscv_results.join(gscv_results.params.apply(pd.Series).iloc[:,-2:]).drop(columns='params')
gscv_results


Unnamed: 0,mean_test_MeanAbsolutePercentageError,mean_fit_time,mean_pred_time,rank_test_MeanAbsolutePercentageError,forecaster__estimator__n_estimators,forecaster__window_length
0,0.252333,1.860852,0.077746,1.0,10.0,7.0
1,0.306031,3.344768,0.064443,16.0,10.0,14.0
2,0.278313,4.975771,0.070513,10.0,10.0,21.0
3,0.265601,8.420335,0.19329,5.0,50.0,7.0
4,0.276955,15.509201,0.128148,8.0,50.0,14.0
5,0.329182,22.040735,0.131831,18.0,50.0,21.0
6,0.27407,16.112373,0.253241,7.0,100.0,7.0
7,0.310525,31.07576,0.265399,17.0,100.0,14.0
8,0.28791,46.049389,0.285377,15.0,100.0,21.0
9,0.282024,77.312614,1.034352,11.0,500.0,7.0


### MAPE heat map 

In [3]:
# MAPE heat map data
px_data = gscv_results.pivot(
    index='forecaster__window_length',
    columns='forecaster__estimator__n_estimators',
    values='mean_test_MeanAbsolutePercentageError'
    )
px_data.index = px_data.index.astype('str')
px_data.columns = px_data.columns.astype('str')
px_data


forecaster__estimator__n_estimators,10.0,50.0,100.0,500.0,1000.0,5000.0
forecaster__window_length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7.0,0.252333,0.265601,0.27407,0.282024,0.273477,0.27744
14.0,0.306031,0.276955,0.310525,0.26279,0.264661,0.264617
21.0,0.278313,0.329182,0.28791,0.284198,0.286141,0.286484


In [None]:
# MAPE heat map plot
fig = px.imshow(
    px_data,
    text_auto=True,
    color_continuous_scale='RdBu_r'
    )
fig.update_xaxes(side="top")
fig.show()


### feature importances

In [144]:
# refit estimator to get feature_importances_
y_train_trans = pipe.fit_transform(y_train)
forecaster.get_params()['estimator'].fit(y=y_train_trans, X=X_train_trans)


RandomForestRegressor(criterion='absolute_error')

In [157]:
feature_importances = pd.DataFrame({
    'feature': forecaster.get_params()['estimator'].feature_names_in_,
    'importance': forecaster.get_params()['estimator'].feature_importances_,
    }).sort_values('importance', ascending=False).reset_index(drop=True)

feature_importances


Unnamed: 0,feature,importance
0,sales_lag_1_0,0.254381
1,sales_mean_4_0,0.103663
2,weekday,0.078442
3,year,0.05107
4,sales_mean_2_0,0.044772
5,week_of_year,0.043552
6,promo_day,0.036981
7,sales_lag_1_6,0.033552
8,day_of_quarter,0.033032
9,sales_mean_7_0,0.032125


In [159]:
feature_importances.set_index('feature').plot()