In [18]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

pd.options.plotting.backend = "plotly"

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sktime.performance_metrics.forecasting import (
    mean_absolute_scaled_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sktime.transformations.series.detrend import Deseasonalizer, Detrender
from sklearn.model_selection import train_test_split



# define functions
1. train/test split y (to prevent data leak)
3. transform y with STL
1. join X & y
4. scale X & y
5. train/grid-search
6. predict & score

In [30]:
def preprocessing(y, X, horizon):
    y.index.freq='D'

    # split
    y_train, y_test = train_test_split(y, test_size=horizon, shuffle=False)

    # deseasonalize & detrend
    transformer = make_pipeline(Deseasonalizer(sp=7), Detrender())
    y_train_trans = transformer.fit_transform(y_train)
    y_train_trans.name = y.name
    y_test_trans = transformer.transform(y_test)
    y_test_trans.name = y.name
    y_trans = pd.concat([y_train_trans, y_test_trans])

    # join
    df = X.join(y_trans).dropna()
    
    # extract exo. variables from date index
    df['dayofweek'] = df.index.dayofweek
    df['dayofmonth'] = df.index.day
    df['dayofyear'] = df.index.dayofyear
    df['weekofyear'] = df.index.isocalendar()['week']
    df['month'] = df.index.month
    df['quarter'] = df.index.quarter
    df['year'] = df.index.year

    # rolling mean
    df['rolling_mean_2'] = df['sales'].rolling(2).mean()
    df['rolling_mean_3'] = df['sales'].rolling(3).mean()
    df['rolling_mean_4'] = df['sales'].rolling(4).mean()
    df['rolling_mean_5'] = df['sales'].rolling(5).mean()
    df['rolling_mean_6'] = df['sales'].rolling(6).mean()
    df['rolling_mean_7'] = df['sales'].rolling(7).mean()

    # expanding mean
    # df['expanding_mean'] = df['sales'].expanding(2).mean()

    df.dropna(inplace=True)
    
    return df, transformer


# CV 
def cross_validation_result(data, tuned_model, model_name, transformer, horizon, rolls=4):
    # score model with CV on store data
    mae_CVs = []
    rmse_CVs = []
    mape_CVs = []
    mase_CVs = []
    for i in range(rolls):
        # print(f"fold {i}---------------")
        
        # split data
        y_train = data.iloc[: -(rolls - i) * horizon]
        y_test = data.iloc[
            np.r_[-(rolls - i) * horizon : -(rolls - i - 1) * horizon]]

        # fit model
        model = tuned_model
        model.fit(
            y    = y_train['sales'],
            exog = y_train[y_train.columns.difference(['sales'])]
        )
        
        # make forecast
        y_hat = model.predict(
                        steps = horizon,
                        exog = y_test[y_test.columns.difference(['sales'])]
                    )
        y_hat = pd.Series(data=y_hat, index=y_test.index)
        
        # inverse
        y_train = transformer.inverse_transform(y_train['sales'])
        y_test = transformer.inverse_transform(y_test['sales'])
        y_hat = transformer.inverse_transform(y_hat)
                
        # score
        mae_CVs.append(round(mean_absolute_error(y_test, y_hat), 3))
        rmse_CVs.append(round(mean_squared_error(y_test, y_hat, square_root=True), 3))
        mape_CVs.append(round(mean_absolute_percentage_error(y_test, y_hat), 3))
        mase_CVs.append(round(mean_absolute_scaled_error(y_test, y_hat, y_train=y_train), 3))
        
    return {'store':model_name,
        'mae_RF':np.mean(mae_CVs),
        'rmse_RF':np.mean(rmse_CVs),
        'mape_RF':np.mean(mape_CVs),
        'mase_RF':np.mean(mase_CVs),
        'fc_RF':y_hat,
        }


# read data

In [20]:
df_store = pd.read_pickle("data/df_daily.pkl")
df_store['sales'] = df_store['sales']/1e6
df_exog = pd.read_pickle("data/df_exog.pkl")
ts_company = df_store.groupby("date").sum()["sales"]


# horizon = 7

## tune on company data
grid search
- fit once per param set on train set
- predict & score on test set (no CV)

In [60]:
horizon = 7

# data
df, transformer = preprocessing(ts_company, df_exog, horizon)

# Grid search hyperparameters and lags
from sklearn.preprocessing import Normalizer
pipe = make_pipeline(
    Normalizer(), 
    RandomForestRegressor(random_state=123)
    )

forecaster = ForecasterAutoreg(
    regressor=pipe, 
    lags=10  # This value will be replaced in the grid search
    )

# Regressor hyperparameters
param_grid = {
    'randomforestregressor__max_depth': [100],#10, 50, 100],
    'randomforestregressor__max_features': ['auto'],#, 'sqrt'],
    'randomforestregressor__min_samples_leaf': [1],#, 2, 4],
    'randomforestregressor__min_samples_split': [5],#],#2, 5, 10],
    'randomforestregressor__n_estimators': [800],#200, 800, 2000]
    }

# Lags used as predictors
lags_grid = [7]

# Grid search
results_grid = grid_search_forecaster(
    y=df["sales"],
    initial_train_size=len(df) - horizon,
    exog=df[df.columns.difference(["sales"])],
    forecaster=forecaster,
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=horizon,
    refit=True,
    metric="mean_absolute_percentage_error",
    return_best=True,
    verbose=False,
)


  if hasattr(x, "freqstr"):
  if x.freqstr is None:
  elif "-" in x.freqstr:
  return x.freqstr
  return pd.Int64Index([d.n / count for d in duration])
  if hasattr(x, "freqstr"):
  if x.freqstr is None:
  elif "-" in x.freqstr:
  return x.freqstr
  return pd.Int64Index([d.n / count for d in duration])


In [31]:
from joblib import dump, load
# Save model
# dump(forecaster, filename='results/f8/RF_forecaster_7.py')

# Load model
forecaster = load('results/f8/RF_forecaster_7.py')


## fit on store data

In [33]:
all_stores_result_CV = []
rolls = 4
# for store in df_store["store_id"].unique():
for store in df_store["store_id"].unique()[:4]:
    print(f"processing stores {store}...")
    model_name = "store_" + str(store)

    # data
    ts_1_store = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    df_1_store_pro, transformer = preprocessing(
        ts_1_store, df_exog, horizon=horizon * rolls
    )

    # CV
    cv_score = cross_validation_result(
        df_1_store_pro, forecaster, model_name, transformer, horizon
    )

    # result
    all_stores_result_CV.append(cv_score)
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)


processing stores 307222...
processing stores 307244...
processing stores 307248...
processing stores 320264...


In [34]:
all_stores_result_CV.mean()

mae_RF     2.243375
rmse_RF    3.184312
mape_RF    0.113125
mase_RF    0.283500
dtype: float64

In [None]:
all_stores_result_CV.mean()

mae_RF     2.228063
rmse_RF    3.166375
mape_RF    0.112813
mase_RF    0.281312
dtype: float64

In [35]:
forecaster.get_feature_importance().sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
15,rolling_mean_2,0.799138
0,lag_1,0.077854
10,expanding_mean,0.074942
1,lag_2,0.006483
13,promo_day,0.006076
8,dayofweek,0.00542
3,lag_4,0.003773
16,rolling_mean_3,0.003503
6,lag_7,0.002576
4,lag_5,0.002192


In [89]:
# all_stores_result_CV.to_pickle('results/f8/RF_result_7.pkl')


# horizon = 14

## tune on company data
grid search
- fit once per param set on train set
- predict & score on test set (no CV)

In [90]:
horizon = 14

# data
df_company, transformer = preprocessing(ts_company, df_exog, horizon)

# Grid search hyperparameters and lags
from sklearn.preprocessing import Normalizer
pipe = make_pipeline(
    Normalizer(), 
    RandomForestRegressor(random_state=123)
    )

forecaster = ForecasterAutoreg(
    regressor=pipe, 
    lags=10  # This value will be replaced in the grid search
    )

# Regressor hyperparameters
param_grid = {
    'randomforestregressor__max_depth': [100, 10, 50, 100],
    'randomforestregressor__max_features': ['auto', 'sqrt'],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__n_estimators': [100, 200, 800, 2000]
    }

# Lags used as predictors
lags_grid = [7]

# Grid search
results_grid = grid_search_forecaster(
    y=df_company["sales"],
    initial_train_size=len(df_company) - horizon,
    exog=df_company[df_company.columns.difference(["sales"])],
    forecaster=forecaster,
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=horizon,
    refit=True,
    metric="mean_absolute_percentage_error",
    return_best=True,
    verbose=False,
)


Number of models compared: 288


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [51:49<00:00, 3109.84s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1 2 3 4 5 6 7] 
  Parameters: {'randomforestregressor__max_depth': 100, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__n_estimators': 200}
  Backtesting metric: 0.22167603625811044



In [36]:
# # Save model
# from joblib import dump, load
# dump(forecaster, filename='results/f8/RF_forecaster_14.py')

# # Load model
# forecaster = load('results/f8/RF_forecaster_14.py')


In [53]:
forecaster.last_window
# forecaster.exog_col_names



date
2021-01-25    141.094374
2021-01-26    203.304939
2021-01-27    133.004034
2021-01-28    177.037215
2021-01-29    189.516012
2021-01-30    411.217959
2021-01-31    816.393535
Freq: D, Name: y, dtype: float64

In [67]:
# id = np.where((df["sales"].values >= 800) & (df["sales"].values < 820))
# df.iloc[id]


Unnamed: 0_level_0,off_day,promo_day,sales,dayofweek,dayofmonth,dayofyear,weekofyear,month,quarter,year,rolling_mean_2,rolling_mean_3,rolling_mean_4,rolling_mean_5,rolling_mean_6,rolling_mean_7
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2018-11-19,0,1,813.375525,0,19,323,47,11,4,2018,1288.146506,1524.261493,2006.812461,1583.452202,1313.360974,1123.501059
2019-04-29,0,1,812.380174,0,29,119,18,4,2,2019,702.717049,710.849244,680.831651,676.914212,639.53781,603.82344
2020-06-21,0,0,808.911958,6,21,173,25,6,2,2020,742.714563,699.215118,481.050698,351.94952,263.695518,198.278268
2020-09-19,0,0,818.159063,5,19,263,38,9,3,2020,753.091294,403.298884,262.973428,181.558576,114.304427,69.336985


In [71]:
df["sales"].tail(7) - forecaster.last_window
df.describe()

Unnamed: 0,off_day,promo_day,sales,dayofweek,dayofmonth,dayofyear,weekofyear,month,quarter,year,rolling_mean_2,rolling_mean_3,rolling_mean_4,rolling_mean_5,rolling_mean_6,rolling_mean_7
count,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0,1268.0
mean,0.016562,0.12776,6.463512,3.002366,15.835174,191.515773,27.887224,6.792587,2.59858,2018.827287,5.932806,5.532582,5.236885,4.988671,4.769183,4.559066
std,0.127672,0.333954,392.648113,2.001774,8.801438,108.689397,15.518832,3.556118,1.139337,1.044086,366.189085,348.306515,332.858127,320.236176,309.933254,301.469081
min,0.0,0.0,-797.642367,0.0,1.0,1.0,1.0,1.0,1.0,2017.0,-746.432061,-697.223656,-633.51726,-597.246105,-563.522146,-538.397212
25%,0.0,0.0,-182.014248,1.0,8.0,96.0,14.0,4.0,2.0,2018.0,-178.065022,-171.861408,-169.020787,-161.825946,-155.808104,-155.05575
50%,0.0,0.0,-76.457893,3.0,16.0,201.5,29.0,7.0,3.0,2019.0,-73.052742,-73.659816,-76.147679,-76.980813,-78.181095,-74.369569
75%,0.0,0.0,50.846637,5.0,23.0,286.25,41.0,10.0,4.0,2020.0,63.228982,84.691423,89.79819,99.740466,111.40403,104.664291
max,1.0,1.0,4012.253328,6.0,31.0,366.0,53.0,12.0,4.0,2021.0,3298.773719,3100.791967,2580.401323,2244.036432,1999.24237,1852.640349


## fit on store data

In [92]:
all_stores_result_CV = []
rolls = 4
for store in df_store["store_id"].unique():
    # for store in df_store["store_id"].unique()[:4]:
    # print(f"processing stores {store}...")
    model_name = "store_" + str(store)

    # data
    ts_1_store = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    df_1_store_pro, transformer = preprocessing(
        ts_1_store, df_exog, horizon=horizon * rolls
    )

    # CV
    cv_score = cross_validation_result(
        df_1_store_pro, forecaster, model_name, transformer, horizon
    )

    # result
    all_stores_result_CV.append(cv_score)
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)


processing stores 307222...
processing stores 307244...
processing stores 307248...
processing stores 320264...
processing stores 328165...
processing stores 349920...
processing stores 349924...
processing stores 349952...
processing stores 349958...
processing stores 349962...
processing stores 349972...
processing stores 349978...
processing stores 349980...
processing stores 349998...
processing stores 350016...
processing stores 350018...
processing stores 350026...
processing stores 350028...
processing stores 350040...
processing stores 350046...
processing stores 350054...
processing stores 350056...
processing stores 350060...
processing stores 354468...
processing stores 387240...
processing stores 412585...
processing stores 441997...
processing stores 452387...
processing stores 461349...
processing stores 464495...
processing stores 471477...
processing stores 476061...
processing stores 480733...
processing stores 528854...
processing stores 536898...
processing stores 53

In [93]:
# all_stores_result_CV.to_pickle('results/f8/RF_result_14.pkl')


# horizon = 21
## tune on company data
grid search
- fit once per param set on train set
- predict & score on test set (no CV)

In [4]:
horizon = 21

# data
df, transformer = preprocessing(ts_company, df_exog, horizon)

# Grid search hyperparameters and lags
from sklearn.preprocessing import Normalizer
pipe = make_pipeline(
    Normalizer(), 
    RandomForestRegressor(random_state=123)
    )

forecaster = ForecasterAutoreg(
    regressor=pipe, 
    lags=10  # This value will be replaced in the grid search
    )

# Regressor hyperparameters
param_grid = {
    'randomforestregressor__max_depth': [100, 10, 50, 100],
    'randomforestregressor__max_features': ['auto', 'sqrt'],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__n_estimators': [800, 200, 800, 2000]
    }

# Lags used as predictors
lags_grid = [7]

# Grid search
results_grid = grid_search_forecaster(
    y=df["sales"],
    initial_train_size=len(df) - horizon,
    exog=df[df.columns.difference(["sales"])],
    forecaster=forecaster,
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=horizon,
    refit=True,
    metric="mean_absolute_percentage_error",
    return_best=True,
    verbose=False,
)


Number of models compared: 288


loop lags_grid: 100%|███████████████████████████████████| 1/1 [1:29:36<00:00, 5376.61s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1 2 3 4 5 6 7] 
  Parameters: {'randomforestregressor__max_depth': 50, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__min_samples_leaf': 4, 'randomforestregressor__min_samples_split': 5, 'randomforestregressor__n_estimators': 200}
  Backtesting metric: 0.3078443142480022



In [5]:
from joblib import dump, load
# Save model
dump(forecaster, filename='results/f8/RF_forecaster_21.py')

# Load model
# forecaster = load('results/f8/RF_forecaster_21.py')


['results/f8/RF_forecaster_21.py']

## fit on store data

In [12]:
all_stores_result_CV = []
rolls = 4
for store in df_store["store_id"].unique():
    # for store in df_store["store_id"].unique()[:4]:
    # print(f"processing stores {store}...")
    model_name = "store_" + str(store)

    # data
    ts_1_store = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    df_1_store_pro, transformer = preprocessing(
        ts_1_store, df_exog, horizon=horizon * rolls
    )

    # CV
    cv_score = cross_validation_result(
        df_1_store_pro, forecaster, model_name, transformer, horizon
    )

    # result
    all_stores_result_CV.append(cv_score)
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)


In [13]:
all_stores_result_CV.to_pickle('results/f8/RF_result_21.pkl')


# horizon = 28
## tune on company data
grid search
- fit once per param set on train set
- predict & score on test set (no CV)

In [14]:
horizon = 28

# data
df, transformer = preprocessing(ts_company, df_exog, horizon)

# Grid search hyperparameters and lags
from sklearn.preprocessing import Normalizer
pipe = make_pipeline(
    Normalizer(), 
    RandomForestRegressor(random_state=123)
    )

forecaster = ForecasterAutoreg(
    regressor=pipe, 
    lags=10  # This value will be replaced in the grid search
    )

# Regressor hyperparameters
param_grid = {
    'randomforestregressor__max_depth': [100, 10, 50, 100],
    'randomforestregressor__max_features': ['auto', 'sqrt'],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__n_estimators': [800, 200, 800, 2000]
    }

# Lags used as predictors
lags_grid = [7]

# Grid search
results_grid = grid_search_forecaster(
    y=df["sales"],
    initial_train_size=len(df) - horizon,
    exog=df[df.columns.difference(["sales"])],
    forecaster=forecaster,
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=horizon,
    refit=True,
    metric="mean_absolute_percentage_error",
    return_best=True,
    verbose=False,
)


Number of models compared: 288


loop lags_grid: 100%|█████████████████████████████████████| 1/1 [48:27<00:00, 2907.46s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1 2 3 4 5 6 7] 
  Parameters: {'randomforestregressor__max_depth': 50, 'randomforestregressor__max_features': 'sqrt', 'randomforestregressor__min_samples_leaf': 4, 'randomforestregressor__min_samples_split': 10, 'randomforestregressor__n_estimators': 200}
  Backtesting metric: 0.6319211028337506



In [15]:
from joblib import dump, load
# Save model
dump(forecaster, filename='results/f8/RF_forecaster_28.py')

# Load model
# forecaster = load('results/f8/RF_forecaster_28.py')


['results/f8/RF_forecaster_28.py']

## fit on store data

In [16]:
all_stores_result_CV = []
rolls = 4
for store in df_store["store_id"].unique():
    # for store in df_store["store_id"].unique()[:4]:
    # print(f"processing stores {store}...")
    model_name = "store_" + str(store)

    # data
    ts_1_store = df_store[df_store["store_id"] == store].set_index("date")["sales"]
    df_1_store_pro, transformer = preprocessing(
        ts_1_store, df_exog, horizon=horizon * rolls
    )

    # CV
    cv_score = cross_validation_result(
        df_1_store_pro, forecaster, model_name, transformer, horizon
    )

    # result
    all_stores_result_CV.append(cv_score)
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)


In [17]:
all_stores_result_CV.to_pickle('results/f8/RF_result_28.pkl')
