In [16]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

pd.options.plotting.backend = "plotly"

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sktime.performance_metrics.forecasting import (
    mean_absolute_scaled_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sktime.transformations.series.detrend import Deseasonalizer, Detrender
from sklearn.model_selection import train_test_split



# define functions
1. train/test split y (to prevent data leak)
3. transform y with STL
1. join X & y
4. scale X & y
5. train/grid-search
6. predict & score

In [17]:
def preprocessing(y, X, horizon):
    y.index.freq='D'

    # split
    y_train, y_test = train_test_split(y, test_size=horizon, shuffle=False)

    # deseasonalize & detrend
    transformer = make_pipeline(Deseasonalizer(sp=7), Detrender())
    y_train_trans = transformer.fit_transform(y_train)
    y_train_trans.name = y.name
    y_test_trans = transformer.transform(y_test)
    y_test_trans.name = y.name
    y_trans = pd.concat([y_train_trans, y_test_trans])

    # join
    df = X.join(y_trans).dropna()
    
    # extract exo. variables from date index
    df['dayofweek'] = df.index.dayofweek
    df['dayofmonth'] = df.index.day
    df['dayofyear'] = df.index.dayofyear
    df['weekofyear'] = df.index.isocalendar()['week']
    df['month'] = df.index.month
    df['quarter'] = df.index.quarter
    df['year'] = df.index.year

    # rolling mean
    df['rolling_mean_2'] = df['sales'].rolling(2).mean()
    df['rolling_mean_3'] = df['sales'].rolling(3).mean()
    df['rolling_mean_4'] = df['sales'].rolling(4).mean()
    df['rolling_mean_5'] = df['sales'].rolling(5).mean()
    df['rolling_mean_6'] = df['sales'].rolling(6).mean()
    df['rolling_mean_7'] = df['sales'].rolling(7).mean()

    # expanding mean
    # df['expanding_mean'] = df['sales'].expanding(2).mean()

    df.dropna(inplace=True)
    
    return df, transformer


# CV 
def cross_validation_result(data, tuned_model, model_name, transformer, horizon, rolls=4):
    # score model with CV on store data
    mae_CVs = []
    rmse_CVs = []
    mape_CVs = []
    mase_CVs = []
    for i in range(rolls):
        # print(f"fold {i}---------------")
        
        # split data
        y_train = data.iloc[: -(rolls - i) * horizon]
        y_test = data.iloc[
            np.r_[-(rolls - i) * horizon : -(rolls - i - 1) * horizon]]

        # fit model
        model = tuned_model
        model.fit(
            y    = y_train['sales'],
            exog = y_train[y_train.columns.difference(['sales'])]
        )
        
        # make forecast
        y_hat = model.predict(
                        steps = horizon,
                        exog = y_test[y_test.columns.difference(['sales'])]
                    )
        y_hat = pd.Series(data=y_hat, index=y_test.index)
        
        # inverse
        y_train = transformer.inverse_transform(y_train['sales'])
        y_test = transformer.inverse_transform(y_test['sales'])
        y_hat = transformer.inverse_transform(y_hat)
                
        # score
        mae_CVs.append(round(mean_absolute_error(y_test, y_hat), 3))
        rmse_CVs.append(round(mean_squared_error(y_test, y_hat, square_root=True), 3))
        mape_CVs.append(round(mean_absolute_percentage_error(y_test, y_hat), 3))
        mase_CVs.append(round(mean_absolute_scaled_error(y_test, y_hat, y_train=y_train), 3))
        
    return {'store':model_name,
        'mae_RF':np.mean(mae_CVs),
        'rmse_RF':np.mean(rmse_CVs),
        'mape_RF':np.mean(mape_CVs),
        'mase_RF':np.mean(mase_CVs),
        'fc_RF':y_hat,
        }


# read data

In [18]:
df_store = pd.read_pickle("data/df_daily.pkl")
df_store['sales'] = df_store['sales']/1e6
df_exog = pd.read_pickle("data/df_exog.pkl")
ts_company = df_store.groupby("date").sum()["sales"]


# horizon = 7

## tune on company data
grid search
- fit once per param set on train set
- predict & score on test set (no CV)

In [19]:
horizon = 7

# data
df, transformer = preprocessing(ts_company, df_exog, horizon)

# Grid search hyperparameters and lags
from sklearn.preprocessing import Normalizer
pipe = make_pipeline(
    Normalizer(), 
    RandomForestRegressor(random_state=123)
    )

forecaster = ForecasterAutoreg(
    regressor=pipe, 
    lags=10  # This value will be replaced in the grid search
    )

# Regressor hyperparameters
param_grid = {
    'randomforestregressor__max_depth': [10, 50, 100],
    'randomforestregressor__max_features': ['auto', 'sqrt'],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__n_estimators': [200, 800, 2000]
    }

# Lags used as predictors
lags_grid = [list(range(horizon, horizon*2))]

# Grid search
results_grid = grid_search_forecaster(
    y=df["sales"],
    initial_train_size=len(df) - horizon,
    exog=df[df.columns.difference(["sales"])],
    forecaster=forecaster,
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=horizon,
    refit=True,
    metric="mean_absolute_percentage_error",
    return_best=True,
    verbose=False,
)


Number of models compared: 162


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [35:26<00:00, 2126.90s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 7  8  9 10 11 12 13] 
  Parameters: {'randomforestregressor__max_depth': 100, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__n_estimators': 800}
  Backtesting metric: 0.32189474986261335



In [20]:
from joblib import dump, load
# Save model
dump(forecaster, filename='results/f81/RF_forecaster_7.py')

# Load model
forecaster = load('results/f81/RF_forecaster_7.py')


## fit on store data

In [21]:
all_stores_result_CV = []
rolls = 4
for store in df_store["store_id"].unique():
    print(f"processing stores {store}...")
    model_name = "store_" + str(store)

    # data
    ts_1_store = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    df_1_store_pro, transformer = preprocessing(
        ts_1_store, df_exog, horizon=horizon * rolls
    )

    # CV
    cv_score = cross_validation_result(
        df_1_store_pro, forecaster, model_name, transformer, horizon
    )

    # result
    all_stores_result_CV.append(cv_score)
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)


processing stores 307222...
processing stores 307244...
processing stores 307248...
processing stores 320264...
processing stores 328165...
processing stores 349920...
processing stores 349924...
processing stores 349952...
processing stores 349958...
processing stores 349962...
processing stores 349972...
processing stores 349978...
processing stores 349980...
processing stores 349998...
processing stores 350016...
processing stores 350018...
processing stores 350026...
processing stores 350028...
processing stores 350040...
processing stores 350046...
processing stores 350054...
processing stores 350056...
processing stores 350060...
processing stores 354468...
processing stores 387240...
processing stores 412585...
processing stores 441997...
processing stores 452387...
processing stores 461349...
processing stores 464495...
processing stores 471477...
processing stores 476061...
processing stores 480733...
processing stores 528854...
processing stores 536898...
processing stores 53

In [22]:
all_stores_result_CV.to_pickle('results/f81/RF_result_7.pkl')


# horizon = 14

## tune on company data
grid search
- fit once per param set on train set
- predict & score on test set (no CV)

In [23]:
horizon = 14

# data
df_company, transformer = preprocessing(ts_company, df_exog, horizon)

# Grid search hyperparameters and lags
from sklearn.preprocessing import Normalizer
pipe = make_pipeline(
    Normalizer(), 
    RandomForestRegressor(random_state=123)
    )

forecaster = ForecasterAutoreg(
    regressor=pipe, 
    lags=10  # This value will be replaced in the grid search
    )

# Regressor hyperparameters
param_grid = {
    'randomforestregressor__max_depth': [100, 10, 50, 100],
    'randomforestregressor__max_features': ['auto', 'sqrt'],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__n_estimators': [100, 200, 800, 2000]
    }

# Lags used as predictors
lags_grid = [list(range(horizon, horizon*2))]

# Grid search
results_grid = grid_search_forecaster(
    y=df_company["sales"],
    initial_train_size=len(df_company) - horizon,
    exog=df_company[df_company.columns.difference(["sales"])],
    forecaster=forecaster,
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=horizon,
    refit=True,
    metric="mean_absolute_percentage_error",
    return_best=True,
    verbose=False,
)


Number of models compared: 288


loop lags_grid: 100%|███████████████████████████████████| 1/1 [1:19:38<00:00, 4778.49s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [14 15 16 17 18 19 20 21 22 23 24 25 26 27] 
  Parameters: {'randomforestregressor__max_depth': 100, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__min_samples_leaf': 2, 'randomforestregressor__min_samples_split': 10, 'randomforestregressor__n_estimators': 2000}
  Backtesting metric: 0.44922080793972136



In [24]:
# Save model
from joblib import dump, load
dump(forecaster, filename='results/f81/RF_forecaster_14.py')

# Load model
forecaster = load('results/f81/RF_forecaster_14.py')


## fit on store data

In [25]:
all_stores_result_CV = []
rolls = 4
for store in df_store["store_id"].unique():
    model_name = "store_" + str(store)

    # data
    ts_1_store = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    df_1_store_pro, transformer = preprocessing(
        ts_1_store, df_exog, horizon=horizon * rolls
    )

    # CV
    cv_score = cross_validation_result(
        df_1_store_pro, forecaster, model_name, transformer, horizon
    )

    # result
    all_stores_result_CV.append(cv_score)
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)


In [26]:
all_stores_result_CV.to_pickle('results/f81/RF_result_14.pkl')


# horizon = 21
## tune on company data
grid search
- fit once per param set on train set
- predict & score on test set (no CV)

In [27]:
horizon = 21

# data
df, transformer = preprocessing(ts_company, df_exog, horizon)

# Grid search hyperparameters and lags
from sklearn.preprocessing import Normalizer
pipe = make_pipeline(
    Normalizer(), 
    RandomForestRegressor(random_state=123)
    )

forecaster = ForecasterAutoreg(
    regressor=pipe, 
    lags=10  # This value will be replaced in the grid search
    )

# Regressor hyperparameters
param_grid = {
    'randomforestregressor__max_depth': [100, 10, 50, 100],
    'randomforestregressor__max_features': ['auto', 'sqrt'],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__n_estimators': [800, 200, 800, 2000]
    }

# Lags used as predictors
lags_grid = [list(range(horizon, horizon*2))]

# Grid search
results_grid = grid_search_forecaster(
    y=df["sales"],
    initial_train_size=len(df) - horizon,
    exog=df[df.columns.difference(["sales"])],
    forecaster=forecaster,
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=horizon,
    refit=True,
    metric="mean_absolute_percentage_error",
    return_best=True,
    verbose=False,
)


Number of models compared: 288


loop lags_grid: 100%|███████████████████████████████████| 1/1 [1:01:19<00:00, 3679.63s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41] 
  Parameters: {'randomforestregressor__max_depth': 100, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 10, 'randomforestregressor__n_estimators': 800}
  Backtesting metric: 0.6154811574016966



In [28]:
from joblib import dump, load
# Save model
dump(forecaster, filename='results/f81/RF_forecaster_21.py')

# Load model
forecaster = load('results/f81/RF_forecaster_21.py')


## fit on store data

In [29]:
all_stores_result_CV = []
rolls = 4
for store in df_store["store_id"].unique():
    model_name = "store_" + str(store)

    # data
    ts_1_store = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    df_1_store_pro, transformer = preprocessing(
        ts_1_store, df_exog, horizon=horizon * rolls
    )

    # CV
    cv_score = cross_validation_result(
        df_1_store_pro, forecaster, model_name, transformer, horizon
    )

    # result
    all_stores_result_CV.append(cv_score)
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)


In [30]:
all_stores_result_CV.to_pickle('results/f81/RF_result_21.pkl')


# horizon = 28
## tune on company data
grid search
- fit once per param set on train set
- predict & score on test set (no CV)

In [31]:
horizon = 28

# data
df, transformer = preprocessing(ts_company, df_exog, horizon)

# Grid search hyperparameters and lags
from sklearn.preprocessing import Normalizer
pipe = make_pipeline(
    Normalizer(), 
    RandomForestRegressor(random_state=123)
    )

forecaster = ForecasterAutoreg(
    regressor=pipe, 
    lags=10  # This value will be replaced in the grid search
    )

# Regressor hyperparameters
param_grid = {
    'randomforestregressor__max_depth': [100, 10, 50, 100],
    'randomforestregressor__max_features': ['auto', 'sqrt'],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__n_estimators': [800, 200, 800, 2000]
    }

# Lags used as predictors
lags_grid = [list(range(horizon, horizon*2))]

# Grid search
results_grid = grid_search_forecaster(
    y=df["sales"],
    initial_train_size=len(df) - horizon,
    exog=df[df.columns.difference(["sales"])],
    forecaster=forecaster,
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=horizon,
    refit=True,
    metric="mean_absolute_percentage_error",
    return_best=True,
    verbose=False,
)


Number of models compared: 288


loop lags_grid: 100%|███████████████████████████████████| 1/1 [1:12:02<00:00, 4322.27s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
 52 53 54 55] 
  Parameters: {'randomforestregressor__max_depth': 10, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 5, 'randomforestregressor__n_estimators': 200}
  Backtesting metric: 1.1780400049758997



In [32]:
from joblib import dump, load
# Save model
dump(forecaster, filename='results/f81/RF_forecaster_28.py')

# Load model
forecaster = load('results/f81/RF_forecaster_28.py')


## fit on store data

In [33]:
all_stores_result_CV = []
rolls = 4
for store in df_store["store_id"].unique():
    model_name = "store_" + str(store)

    # data
    ts_1_store = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    df_1_store_pro, transformer = preprocessing(
        ts_1_store, df_exog, horizon=horizon * rolls
    )

    # CV
    cv_score = cross_validation_result(
        df_1_store_pro, forecaster, model_name, transformer, horizon
    )

    # result
    all_stores_result_CV.append(cv_score)
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)


In [None]:
all_stores_result_CV.to_pickle('results/f81/RF_result_28.pkl')
