In [32]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

pd.options.plotting.backend = "plotly"

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sktime.performance_metrics.forecasting import (
    mean_absolute_scaled_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# define functions
1. train/test split y (to prevent data leak)
3. transform y with STL
1. join X & y
4. scale X & y
5. train/grid-search
6. predict & score

In [76]:
def preprocessing(y, X, horizon):
    from sktime.transformations.series.detrend import Deseasonalizer, Detrender
    from sklearn.model_selection import train_test_split
    y.index.freq='D'

    # split
    y_train, y_test = train_test_split(y, test_size=horizon, shuffle=False)

    # transform (deseasonalize)
    transformer = make_pipeline(Deseasonalizer(sp=7), Detrender())
    y_train_trans = transformer.fit_transform(y_train)
    y_train_trans.name = y.name
    y_test_trans = transformer.transform(y_test)
    y_test_trans.name = y.name
    y_trans = pd.concat([y_train_trans, y_test_trans])

    # join
    df = X.join(y_trans).dropna()
    
    # extract exo. variables from date index
    df['dayofweek'] = df.index.dayofweek
    df['dayofmonth'] = df.index.day
    df['dayofyear'] = df.index.dayofyear
    df['weekofyear'] = df.index.isocalendar()['week']
    df['month'] = df.index.month
    df['quarter'] = df.index.quarter
    df['year'] = df.index.year

    # rolling mean
    df['rolling_mean_2'] = df['sales'].rolling(2).mean()
    df['rolling_mean_3'] = df['sales'].rolling(3).mean()
    df['rolling_mean_4'] = df['sales'].rolling(4).mean()
    df['rolling_mean_5'] = df['sales'].rolling(5).mean()
    df['rolling_mean_6'] = df['sales'].rolling(6).mean()
    df['rolling_mean_7'] = df['sales'].rolling(7).mean()

    df.dropna(inplace=True)
    
    return df, transformer


# CV 
def cross_validation_result(data, tuned_model, model_name, transformer, horizon, rolls=4):
    # score model with CV on store data
    mae_CVs = []
    rmse_CVs = []
    mape_CVs = []
    mase_CVs = []
    for i in range(rolls):
        # print(f"fold {i}---------------")
        
        # split data
        y_train = data.iloc[: -(rolls - i) * horizon]
        y_test = data.iloc[
            np.r_[-(rolls - i) * horizon : -(rolls - i - 1) * horizon]]

        # fit model
        model = tuned_model
        model.fit(
            y    = y_train['sales'],
            exog = y_train[y_train.columns.difference(['sales'])]
        )
        
        # make forecast
        y_hat = model.predict(
                        steps = horizon,
                        exog = y_test[y_test.columns.difference(['sales'])]
                    )
        y_hat = pd.Series(data=y_hat, index=y_test.index)
        
        # inverse
        y_train = transformer.inverse_transform(y_train['sales'])
        y_test = transformer.inverse_transform(y_test['sales'])
        y_hat = transformer.inverse_transform(y_hat)
                
        # score
        mae_CVs.append(round(mean_absolute_error(y_test, y_hat), 3))
        rmse_CVs.append(round(mean_squared_error(y_test, y_hat, square_root=True), 3))
        mape_CVs.append(round(mean_absolute_percentage_error(y_test, y_hat), 3))
        mase_CVs.append(round(mean_absolute_scaled_error(y_test, y_hat, y_train=y_train), 3))
        
    return {'store':model_name,
        'mae_RF':np.mean(mae_CVs),
        'rmse_RF':np.mean(rmse_CVs),
        'mape_RF':np.mean(mape_CVs),
        'mase_RF':np.mean(mase_CVs),
        'fc_RF':y_hat,
        }


# read data

In [33]:
df_store = pd.read_pickle("data/df_daily.pkl")
df_store['sales'] = df_store['sales']/1e6
df_exog = pd.read_pickle("data/df_exog.pkl")
ts_company = df_store.groupby("date").sum()["sales"]


# horizon = 7

## tune on company data
grid search
- fit once per param set on train set
- predict & score on test set (no CV)

In [60]:
horizon = 7

# data
df, transformer = preprocessing(ts_company, df_exog, horizon)

# Grid search hyperparameters and lags
from sklearn.preprocessing import Normalizer
pipe = make_pipeline(
    Normalizer(), 
    RandomForestRegressor(random_state=123)
    )

forecaster = ForecasterAutoreg(
    regressor=pipe, 
    lags=10  # This value will be replaced in the grid search
    )

# Regressor hyperparameters
param_grid = {
    'randomforestregressor__max_depth': [100],#10, 50, 100],
    'randomforestregressor__max_features': ['auto'],#, 'sqrt'],
    'randomforestregressor__min_samples_leaf': [1],#, 2, 4],
    'randomforestregressor__min_samples_split': [5],#],#2, 5, 10],
    'randomforestregressor__n_estimators': [800],#200, 800, 2000]
    }

# Lags used as predictors
lags_grid = [7]

# Grid search
results_grid = grid_search_forecaster(
    y=df["sales"],
    initial_train_size=len(df) - horizon,
    exog=df[df.columns.difference(["sales"])],
    forecaster=forecaster,
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=horizon,
    refit=True,
    metric="mean_absolute_percentage_error",
    return_best=True,
    verbose=False,
)


  if hasattr(x, "freqstr"):
  if x.freqstr is None:
  elif "-" in x.freqstr:
  return x.freqstr
  return pd.Int64Index([d.n / count for d in duration])
  if hasattr(x, "freqstr"):
  if x.freqstr is None:
  elif "-" in x.freqstr:
  return x.freqstr
  return pd.Int64Index([d.n / count for d in duration])


In [63]:
# from joblib import dump, load
# Save model
# dump(forecaster, filename='results/f8/RF_forecaster_7.py')

# Load model
# forecaster = load('results/f8/RF_forecaster_7.py')


## fit on store data

In [81]:
all_stores_result_CV = []
rolls = 4
for store in df_store["store_id"].unique():
    # for store in df_store["store_id"].unique()[:4]:
    print(f"processing stores {store}...")
    model_name = "store_" + str(store)

    # data
    ts_1_store = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    df_1_store_pro, transformer = preprocessing(
        ts_1_store, df_exog, test_size=horizon * rolls
    )

    # CV
    cv_score = cross_validation_result(
        df_1_store_pro, forecaster, model_name, transformer, horizon
    )

    # result
    all_stores_result_CV.append(cv_score)
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)


processing stores 307222...
processing stores 307244...
processing stores 307248...
processing stores 320264...


In [89]:
# all_stores_result_CV.to_pickle('results/f8/RF_result_7.pkl')


# horizon = 14

## tune on company data
grid search
- fit once per param set on train set
- predict & score on test set (no CV)

In [90]:
horizon = 14

# data
df, transformer = preprocessing(ts_company, df_exog, horizon)

# Grid search hyperparameters and lags
from sklearn.preprocessing import Normalizer
pipe = make_pipeline(
    Normalizer(), 
    RandomForestRegressor(random_state=123)
    )

forecaster = ForecasterAutoreg(
    regressor=pipe, 
    lags=10  # This value will be replaced in the grid search
    )

# Regressor hyperparameters
param_grid = {
    'randomforestregressor__max_depth': [100, 10, 50, 100],
    'randomforestregressor__max_features': ['auto', 'sqrt'],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__n_estimators': [800, 200, 800, 2000]
    }

# Lags used as predictors
lags_grid = [7]

# Grid search
results_grid = grid_search_forecaster(
    y=df["sales"],
    initial_train_size=len(df) - horizon,
    exog=df[df.columns.difference(["sales"])],
    forecaster=forecaster,
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=horizon,
    refit=True,
    metric="mean_absolute_percentage_error",
    return_best=True,
    verbose=False,
)


Number of models compared: 288


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [51:49<00:00, 3109.84s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1 2 3 4 5 6 7] 
  Parameters: {'randomforestregressor__max_depth': 100, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__n_estimators': 200}
  Backtesting metric: 0.22167603625811044



In [91]:
from joblib import dump, load
# Save model
dump(forecaster, filename='results/f8/RF_forecaster_14.py')

# Load model
forecaster = load('results/f8/RF_forecaster_14.py')


## fit on store data

In [92]:
all_stores_result_CV = []
rolls = 4
for store in df_store["store_id"].unique():
    # for store in df_store["store_id"].unique()[:4]:
    print(f"processing stores {store}...")
    model_name = "store_" + str(store)

    # data
    ts_1_store = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    df_1_store_pro, transformer = preprocessing(
        ts_1_store, df_exog, test_size=horizon * rolls
    )

    # CV
    cv_score = cross_validation_result(
        df_1_store_pro, forecaster, model_name, transformer, horizon
    )

    # result
    all_stores_result_CV.append(cv_score)
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)


processing stores 307222...
processing stores 307244...
processing stores 307248...
processing stores 320264...
processing stores 328165...
processing stores 349920...
processing stores 349924...
processing stores 349952...
processing stores 349958...
processing stores 349962...
processing stores 349972...
processing stores 349978...
processing stores 349980...
processing stores 349998...
processing stores 350016...
processing stores 350018...
processing stores 350026...
processing stores 350028...
processing stores 350040...
processing stores 350046...
processing stores 350054...
processing stores 350056...
processing stores 350060...
processing stores 354468...
processing stores 387240...
processing stores 412585...
processing stores 441997...
processing stores 452387...
processing stores 461349...
processing stores 464495...
processing stores 471477...
processing stores 476061...
processing stores 480733...
processing stores 528854...
processing stores 536898...
processing stores 53

In [93]:
all_stores_result_CV.to_pickle('results/f8/RF_result_14.pkl')


In [94]:
all_stores_result_CV

Unnamed: 0,store,mae_RF,rmse_RF,mape_RF,mase_RF,fc_RF
0,store_307222,2.169786,2.845929,0.065143,0.222,date 2021-01-25 28.474187 2021-01-26 17....
1,store_307244,1.366929,1.634929,0.058929,0.175357,date 2021-01-25 17.554629 2021-01-26 18....
2,store_307248,2.024214,2.548714,0.1065,0.294143,date 2021-01-25 23.305246 2021-01-26 20....
3,store_320264,1.354429,1.957929,0.0935,0.199,date 2021-01-25 26.052705 2021-01-26 6....
4,store_328165,7.620429,9.613,0.13,0.354286,date 2021-01-25 56.828440 2021-01-26 4...
5,store_349920,5.280357,6.658143,0.133929,0.361,date 2021-01-25 29.759017 2021-01-26 2...
6,store_349924,2.244571,2.676143,0.103143,0.306429,date 2021-01-25 20.371845 2021-01-26 21....
7,store_349952,2.172357,2.643571,0.110286,0.3175,date 2021-01-25 21.154298 2021-01-26 34....
8,store_349958,2.535429,3.874714,0.072929,0.291,date 2021-01-25 29.986048 2021-01-26 36....
9,store_349962,1.2925,1.597857,0.091786,0.189214,date 2021-01-25 10.773370 2021-01-26 16....
