In [6]:
# imports
from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics
import itertools
from prophet import Prophet
from sktime.performance_metrics.forecasting import mean_absolute_scaled_error, mean_absolute_error, mean_absolute_percentage_error
import pandas as pd
import numpy as np
import plotly.express as px
from pmdarima.preprocessing import FourierFeaturizer
from pmdarima import auto_arima, ARIMA
import matplotlib.pyplot as plt
import holidays
import time
import datetime
pd.options.plotting.backend = 'plotly'

In [7]:
def preprocessing_ARIMA(ts, ts_holiday, split=True, yearly_seasonality=True, steps_ahead=30):
    '''
    Build the target series and the exogenous regressors used by the ARIMA models.

    ts: DataFrame with a `sales` column; sales are divided by 1e6. The index
        must be convertible to a DatetimeIndex with daily frequency.
    ts_holiday: DataFrame (or Series named `holiday`) with a `holiday` flag;
        NOTE(review): assumed to be indexed by date like `ts` - confirm.
    split: to split into train and test set
    yearly_seasonality: when True, add the terms produced by
        FourierFeaturizer(365.25, 2) to the exogenous frame; when False the
        exogenous frame only holds the holiday columns.
    steps_ahead: number of trailing observations held out as test set
        (only used when `split` is True).

    Returns a dict with keys `y_train`, `y_test`, `exog_train`, `exog_test`
    when `split` is True, otherwise a dict with keys `y` and `exog`.

    Raises ValueError if the index does not conform to a daily frequency.
    '''
    print(f'Preprocessing timeseries data with {steps_ahead} steps ahead')
    ts = ts['sales']/1e6
    # Attach the daily frequency on a fresh index object: assigning
    # `ts.index.freq` directly would write to an index that may be shared
    # with the caller's DataFrame.
    ts.index = pd.DatetimeIndex(ts.index, freq='D')
    if yearly_seasonality:
        # yearly seasonality
        fouri_terms = FourierFeaturizer(365.25, 2)
        y_prime, df_fouri = fouri_terms.fit_transform(ts)
        df_fouri.index = y_prime.index
        # combine exog. variables
        df_exog = pd.concat([df_fouri, ts_holiday], axis=1)
    else:
        # Without Fourier terms there is nothing to anchor the concat on, so
        # align the holiday columns to the target's dates explicitly.
        y_prime = ts
        df_exog = pd.concat([ts_holiday], axis=1).reindex(y_prime.index)
    # Dates missing from the holiday data are regular days (flag 0).
    df_exog['holiday'] = df_exog['holiday'].fillna(False).astype('int')
    # Drops holiday rows that fall outside the target's date range (their
    # Fourier columns are NaN after the outer concat).
    df_exog = df_exog.dropna()
    if split:
        # split
        y_train = y_prime.iloc[:-steps_ahead]
        y_test = y_prime.iloc[-steps_ahead:]
        exog_train = df_exog.iloc[:-steps_ahead]
        exog_test = df_exog.iloc[-steps_ahead:]
        return {'y_train':y_train,
                'y_test':y_test,
                'exog_train':exog_train,
                'exog_test':exog_test}
    else:
        return {'y':y_prime, 'exog': df_exog}

def preprocessing_prophet(ts, ts_holiday, steps_ahead=30, split = True):
    '''
    Reshape the sales data into the two-column (`ds`, `y`) frame used for Prophet.

    ts: DataFrame with a `sales` column and an index named `date`; sales are
        divided by 1e6 and the input frame is left untouched.
    ts_holiday: unused here, kept so existing callers keep working.
    steps_ahead: number of trailing rows held out as test set
        (only used when `split` is True).
    split: to split into train and test set

    Returns a dict with keys `y_train` and `y_test` when `split` is True,
    otherwise the full frame.
    '''
    # Build the frame in one non-mutating chain; assigning a column on a
    # `ts[['sales']]` selection triggers pandas' SettingWithCopyWarning.
    fb_df = (
        ts[['sales']]
        .div(1e6)
        .reset_index()
        .rename({'date': 'ds', 'sales': 'y'}, axis=1)
    )
    if split:
        fb_train = fb_df.iloc[:-steps_ahead]
        fb_test = fb_df.iloc[-steps_ahead:]
        return {'y_train':fb_train,
                'y_test':fb_test}
    else:
        return fb_df

def holidays_ts_prophet(promotion = True, holiday_neg=False):
    '''
    Build the holidays frame (`holiday`, `ds`, `lower_window`, `upper_window`)
    passed to Prophet.

    promotion: include `big_promotion` events - the day before each
        'Vietnamese New Year' entry and the hard-coded Black Friday dates of
        2018-2020 - each with a window starting 14 days earlier.
    holiday_neg: include `holiday_off` events for the holidays that are
        flagged as affecting sales negatively (single-day window).

    Returns the concatenated frame, or None when both flags are False
    (None is also Prophet's default for `holidays`).
    '''
    holiday = pd.DataFrame(holidays.Vietnam(years=[2018, 2019, 2020, 2021]).items()).rename({0:'date', 1:'holiday_neg'}, axis=1)
    # Add Tet promotion
    promo = pd.DataFrame(holiday[holiday.holiday_neg == 'Vietnamese New Year'].date - datetime.timedelta(days=1))
    promo['promotion'] = "1 days before Tet Holiday"
    # Add Black Friday
    black_friday = pd.DataFrame({'date':[datetime.date(2020,11,27),
                                         datetime.date(2019,11,29),
                                         datetime.date(2018,11,23)],
                                 'promotion':["Black Friday","Black Friday","Black Friday"]})
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    promo = pd.concat([promo, black_friday])
    # set to 1 if holiday affect sales negatively; every other holiday name
    # becomes NaN and is dropped
    holiday_off = holiday.replace({
            'Vietnamese New Year.*': 1,
             '.*day of Tet Holiday': 1,
             'International Labor Day': 1,
             r'\D': np.nan},
            regex=True).dropna()
    frames = []
    if promotion:
        frames.append(pd.DataFrame({
                'holiday':'big_promotion',
                'ds':promo.date,
                'lower_window': -14, # 2 weeks before
                'upper_window': 0}))
    if holiday_neg:
        frames.append(pd.DataFrame({
                'holiday':'holiday_off',
                'ds':holiday_off.date,
                'lower_window': 0,
                'upper_window': 0}))
    if not frames:
        # pd.concat raises ValueError when it has nothing to concatenate.
        return None
    return pd.concat(frames)

def auto_arima_model(y_train, exog_train, diff_num):
    '''
    Search for an ARIMA order with pmdarima's auto_arima and return the fitted model.

    y_train: target series to fit on.
    exog_train: exogenous regressors aligned with `y_train`.
    diff_num: passed as `D`, i.e. the order of *seasonal* differencing
        (the weekly season, m=7), not the regular differencing order `d`.

    Prints the selected model and the elapsed wall-clock time.
    '''
    time_start = time.time()
    print('start auto arima...')
    # Fit model to the level to find common order
    arima_model = auto_arima(
        y=y_train,
        # `X` is the current keyword for exogenous regressors (`exogenous` is
        # deprecated) and matches the `model.fit(..., X=...)` calls elsewhere
        # in this notebook.
        X=exog_train,
        D=diff_num,
        seasonal=True, m=7 # Weekly seasonality
    )
    time_stop = time.time()
    print(f'finished auto arima, model: {arima_model}, total time: {round(time_stop-time_start)} sec')
    return arima_model

def cross_validation_prophet(prophet_data, param_grid, steps_ahead = 30):
    '''
    Grid-search Prophet hyper-parameters with Prophet's built-in cross-validation.

    prophet_data: dict whose `y_train` entry is the training frame.
    param_grid: dict mapping Prophet keyword arguments to lists of candidate
        values; every combination is evaluated.
    steps_ahead: forecast horizon in days; also used to size the initial
        training window (training rows minus two horizons).

    Returns a DataFrame with one row per combination and the columns
    `params`, `mape` and `rmse`, where the metrics are the values in the last
    row of `performance_metrics`.
    '''
    all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
    train = prophet_data['y_train']
    # Loop-invariant CV settings.
    initial = str(train.shape[0]-steps_ahead*2)+' days'
    # Follow `steps_ahead` instead of a hard-coded '30 days' so horizon and
    # initial window stay consistent.
    horizon = str(steps_ahead)+' days'
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every iteration.
    records = []
    for params in all_params:
        # set up model
        prophet_model = Prophet(**params
                                ).add_seasonality(name='weekly', period=7, fourier_order=5, prior_scale=10
                                ).add_seasonality(name='yearly', period=365.25, fourier_order=5, prior_scale=1)
        prophet_model.fit(train)
        # set up CV
        df_cv = cross_validation(prophet_model, initial=initial, period='7 days', horizon=horizon, parallel="processes")
        # evaluate
        df_p = performance_metrics(df_cv)
        records.append({'params':params,
                        'mape'  :df_p['mape'].values[-1],
                        'rmse'  :df_p['rmse'].values[-1]})
    return pd.DataFrame(records, columns=['params','mape','rmse'])


def cross_validation_result(data, model_name, model, rolls=4, horizon=30, prophet_params=None):
    '''
    Rolling-origin evaluation over the last `rolls * horizon` observations.

    For fold i the model is fitted on everything before the last
    `(rolls - i) * horizon` rows and scored on the following `horizon` rows.

    data: for 'arima' a dict with keys `y` and `exog`; for 'prophet' a
        DataFrame with a `y` column that Prophet can fit on.
    model_name: 'arima' or 'prophet'.
    model: fitted pmdarima model, refitted in place on every fold (so after
        the call it holds the last fold's fit); ignored for 'prophet'.
    rolls: number of folds.
    horizon: number of observations per test fold.
    prophet_params: keyword arguments for Prophet; None means Prophet defaults.

    Returns a list with the out-of-sample MAPE of each fold, rounded to 3 decimals.

    Raises ValueError for an unknown `model_name`.
    '''
    cv_score = []
    for i in range(rolls):
        # Negative positions counted from the end of the data; an explicit
        # position array is needed because the last fold ends at position 0,
        # which a plain slice `[-horizon:0]` would turn into an empty selection.
        train_end = -(rolls-i)*horizon
        test_pos = np.arange(train_end, train_end+horizon)
        if model_name=='arima':
            model.fit(y=data['y'].iloc[:train_end],
                      X=data['exog'].iloc[:train_end])
            # `X` matches the keyword used in fit (`exogenous` is deprecated).
            y_hat = model.predict(n_periods=horizon,
                                  X=data['exog'].iloc[test_pos])
            y_test = data['y'].iloc[test_pos]
        elif model_name=='prophet':
            # A Prophet object can only be fitted once, so build a new one per fold.
            fold_model = Prophet(**(prophet_params or {})
                           ).add_seasonality(name='weekly', period=7, fourier_order=5, prior_scale=10
                           ).add_seasonality(name='yearly', period=365.25, fourier_order=5, prior_scale=1)
            fold_model.fit(data.iloc[:train_end,:])
            y_hat = fold_model.predict(data.iloc[test_pos,:])['yhat']
            y_test = data['y'].iloc[test_pos]
        else:
            raise ValueError(f"Unknown model_name {model_name!r}; expected 'arima' or 'prophet'")
        mape_OOS = round(mean_absolute_percentage_error(y_test, y_hat), 3)
        cv_score.append(mape_OOS)
    return cv_score

In [8]:
# NOTE: unpickling can execute arbitrary code - only load pickle files from a
# trusted source.
df_store = pd.read_pickle('data/df_daily.pkl')
# holiday
ts_holiday = pd.read_pickle('data/holiday.pkl')
# Total sales per date. Select `sales` before aggregating so that columns that
# cannot be summed (or are not needed) never go through the aggregation.
df_company = df_store.groupby('date')[['sales']].sum()

In [9]:
# Models evaluated by the pipeline and the shared forecast horizon.
models_to_run = ['arima', 'prophet']
steps_ahead = 30

# Promotion and negative-holiday events handed to Prophet.
prophet_holidays = holidays_ts_prophet(promotion=True, holiday_neg=True)

# Candidate Prophet settings; every combination is cross-validated.
# The built-in seasonalities are switched off here (custom weekly/yearly
# seasonalities are added when the models are constructed).
param_grid_prophet = dict(
    changepoint_prior_scale=[0.01, 0.1, 1, 10, 20],
    changepoint_range=[0.8, 0.9],
    holidays_prior_scale=[0.1, 1],
    seasonality_mode=['additive'],
    holidays=[prophet_holidays],
    daily_seasonality=[False],
    weekly_seasonality=[False],
    yearly_seasonality=[False],
)

In [22]:
# Rolling cross-validation scores and the selected model / parameters, keyed by model name.
cv_pipe_result = {}
best_model = {}
for model_name in models_to_run:
    if model_name == 'arima':
        print(f'Start cross-validation for {model_name} models')
        time_start = time.time()
        # Order selection runs on the training split only.
        data = preprocessing_ARIMA(
            ts=df_company,
            ts_holiday=ts_holiday,
            split=True,
            yearly_seasonality=True,
            steps_ahead=steps_ahead,
        )
        arima_model = auto_arima_model(
            y_train=data['y_train'],
            exog_train=data['exog_train'],
            diff_num=1,
        )
        # Now get the unsplit data for cross-validation and start the process.
        data = preprocessing_ARIMA(
            ts=df_company,
            ts_holiday=ts_holiday,
            split=False,
            yearly_seasonality=True,
            steps_ahead=steps_ahead,
        )
        cv_score = cross_validation_result(
            data=data,
            model_name=model_name,
            model=arima_model,
            rolls=4,
            horizon=steps_ahead,
        )
        # Save result.
        cv_pipe_result[model_name] = cv_score
        best_model[model_name] = arima_model
        time_stop = time.time()
        print(f'Finished cross-validation, total time: {round(time_stop-time_start)} sec')
    elif model_name == 'prophet':
        print(f'Start cross-validation for {model_name} models')
        time_start = time.time()
        # Hyper-parameter search runs on the training split only.
        data = preprocessing_prophet(
            ts=df_company,
            ts_holiday=prophet_holidays,
            steps_ahead=steps_ahead,
            split=True,
        )
        cv_prophet_result = cross_validation_prophet(
            prophet_data=data,
            param_grid=param_grid_prophet,
            steps_ahead=steps_ahead,
        )
        # Parameter set with the lowest MAPE (first column of the best row).
        prophet_params = cv_prophet_result.sort_values('mape').iloc[0, 0]
        # Now get the unsplit data for cross-validation and start the process.
        data = preprocessing_prophet(
            ts=df_company,
            ts_holiday=prophet_holidays,
            steps_ahead=steps_ahead,
            split=False,
        )
        cv_score = cross_validation_result(
            data=data,
            model_name=model_name,
            model=None,
            rolls=4,
            horizon=steps_ahead,
            prophet_params=prophet_params,
        )
        # Save result.
        cv_pipe_result[model_name] = cv_score
        best_model[model_name] = prophet_params
        time_stop = time.time()
        print(f'Finished cross-validation, total time: {round(time_stop-time_start)} sec')

Start cross-validation for arima models
Preprocessing timeseries data with 30 steps ahead
start auto arima...
finished auto arima, model:  ARIMA(4,0,0)(2,1,0)[7]          , total time: 143 sec
Preprocessing timeseries data with 30 steps ahead
Finished cross-validation, total time: 158 sec
Start cross-validation for prophet models


INFO:prophet:Making 5 forecasts with cutoffs between 2020-11-04 00:00:00 and 2020-12-02 00:00:00
INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x00000228E03D0AC0>
INFO:prophet:Making 5 forecasts with cutoffs between 2020-11-04 00:00:00 and 2020-12-02 00:00:00
INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x00000228D5C638E0>
INFO:prophet:Making 5 forecasts with cutoffs between 2020-11-04 00:00:00 and 2020-12-02 00:00:00
INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x00000229365AB460>
INFO:prophet:Making 5 forecasts with cutoffs between 2020-11-04 00:00:00 and 2020-12-02 00:00:00
INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x00000228E032B070>
INFO:prophet:Making 5 forecasts with cutoffs between 2020-11-04 00:00:00 and 2020-12-02 00:00:00
INFO:prophet:Applying in parallel with <concurr

Finished cross-validation, total time: 407 sec


In [23]:
# Per-model list of out-of-sample MAPE values (rounded to 3 decimals), one entry per rolling fold.
cv_pipe_result

{'arima': [0.338, 0.428, 0.448, 0.202], 'prophet': [0.15, 0.442, 0.282, 0.266]}