In [3]:
# imports
from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics
import itertools
from prophet import Prophet
from sktime.performance_metrics.forecasting import mean_absolute_scaled_error, mean_absolute_error, mean_absolute_percentage_error
import pandas as pd
import numpy as np
import plotly.express as px
from pmdarima.preprocessing import FourierFeaturizer
from pmdarima import auto_arima, ARIMA
import matplotlib.pyplot as plt
import holidays
import time
import datetime

pd.options.plotting.backend = 'plotly'

  VALID_INDEX_TYPES = (pd.Int64Index, pd.RangeIndex, pd.PeriodIndex, pd.DatetimeIndex)
  from pandas import Int64Index as NumericIndex


In [5]:
# Daily sales records; later cells read the 'store_id', 'date' and 'sales' columns.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
df_store = pd.read_pickle('data/df_daily.pkl')
# holiday table; it is passed to preprocessing_prophet, which does not read it
ts_holiday = pd.read_pickle('data/holiday.pkl')
#print(ts_holiday.index)

In [49]:
def preprocessing_prophet(ts, ts_holiday, steps_ahead=30, split = True):
    """Reshape a sales frame into the ``ds`` / ``y`` layout and optionally split it.

    Parameters
    ----------
    ts : pandas.DataFrame
        Frame with a ``sales`` column. Its index is moved into a column by
        ``reset_index``; an index level named ``date`` becomes ``ds``.
    ts_holiday : object
        Not used by this function; kept so existing calls keep working.
    steps_ahead : int, default 30
        Number of trailing rows placed in the test set when ``split`` is true.
    split : bool, default True
        If true, return a train/test dict; otherwise return the whole frame.

    Returns
    -------
    dict or pandas.DataFrame
        ``{'y_train': ..., 'y_test': ...}`` when ``split`` is true, else the
        full reshaped frame. ``y`` is ``sales`` divided by 1e6.
    """
    # Scale on a freshly computed frame instead of assigning into a column
    # slice of ``ts`` (the original triggered SettingWithCopyWarning).
    fb_df = (ts[['sales']] / 1e6).reset_index().rename({'date': 'ds', 'sales': 'y'}, axis=1)
    if not split:
        return fb_df
    # Use a non-negative split position: with steps_ahead == 0 the negative
    # slices ``iloc[:-0]`` / ``iloc[-0:]`` would send every row to the test set.
    split_at = max(len(fb_df) - steps_ahead, 0)
    return {'y_train': fb_df.iloc[:split_at],
            'y_test': fb_df.iloc[split_at:]}

def holidays_ts(promotion = True, holiday_neg=False):
    """Build the holiday/event table handed to ``Prophet(holidays=...)``.

    Parameters
    ----------
    promotion : bool, default True
        Include ``'big_promotion'`` events: the day before each
        'Vietnamese New Year' entry and three hard-coded Black Friday dates
        (2018-2020), each with a window starting 14 days earlier.
    holiday_neg : bool, default False
        Include ``'holiday_off'`` events for the holidays matched by the
        regular expressions below (New Year / Tet days / Labor Day).

    Returns
    -------
    pandas.DataFrame
        Columns ``holiday``, ``ds``, ``lower_window``, ``upper_window``.

    Raises
    ------
    ValueError
        If both flags are false, so there is nothing to return.
    """
    # (date, name) pairs from the `holidays` package for the covered years.
    # NOTE(review): the matching below depends on the exact holiday names the
    # installed `holidays` version emits -- confirm after upgrading it.
    holiday = pd.DataFrame(
        holidays.Vietnam(years=[2018, 2019, 2020, 2021]).items()
    ).rename({0: 'date', 1: 'holiday_neg'}, axis=1)

    frames = []
    if promotion:
        # Add Tet promotion: the day before each 'Vietnamese New Year' entry
        tet_promo = pd.DataFrame(
            holiday[holiday.holiday_neg == 'Vietnamese New Year'].date - datetime.timedelta(days=1))
        tet_promo['promotion'] = "1 days before Tet Holiday"
        # Add Black Friday
        black_friday = pd.DataFrame({'date': [datetime.date(2020, 11, 27),
                                              datetime.date(2019, 11, 29),
                                              datetime.date(2018, 11, 23)],
                                     'promotion': ["Black Friday", "Black Friday", "Black Friday"]})
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        promo = pd.concat([tet_promo, black_friday])
        frames.append(pd.DataFrame({
            'holiday': 'big_promotion',
            'ds': promo['date'],
            'lower_window': -14,  # 2 weeks before
            'upper_window': 0}))
    if holiday_neg:
        # Set to 1 if the holiday affects sales negatively; every other name
        # (anything still containing a non-digit) becomes NaN and is dropped.
        holiday_off = holiday.replace({
            'Vietnamese New Year.*': 1,
            '.*day of Tet Holiday': 1,
            'International Labor Day': 1,
            r'\D': np.nan},
            regex=True).dropna()
        frames.append(pd.DataFrame({
            'holiday': 'holiday_off',
            'ds': holiday_off['date'],
            'lower_window': 0,
            'upper_window': 0}))
    if not frames:
        # pd.concat would raise a less helpful ValueError for the same condition
        raise ValueError("holidays_ts: at least one of 'promotion' or 'holiday_neg' must be True")
    return pd.concat(frames)

def cross_validation_prophet(prophet_data, param_grid, prophet_holidays, steps_ahead = 30):
    """Grid-search Prophet hyper-parameters with Prophet's own cross-validation.

    Parameters
    ----------
    prophet_data : dict
        Must contain ``'y_train'``, the training frame passed to ``Prophet.fit``.
    param_grid : dict
        Maps Prophet constructor argument names to lists of candidate values;
        every combination is evaluated.
    prophet_holidays : pandas.DataFrame
        Passed as ``holidays`` to every candidate model.
    steps_ahead : int, default 30
        Forecast horizon in days. The initial training window is the number of
        training rows minus ``2 * steps_ahead``, expressed in days
        (assumes one row per day -- TODO confirm).

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination with columns ``params``, ``mape``
        and ``rmse`` (metrics taken from the last row of
        ``performance_metrics``). ``mape`` is NaN when Prophet does not
        report it.
    """
    n_train = prophet_data['y_train'].shape[0]
    initial = f'{n_train - steps_ahead * 2} days'
    # Tie the horizon to steps_ahead (it was hard-coded to '30 days' before);
    # the default steps_ahead=30 gives the same value.
    horizon = f'{steps_ahead} days'

    records = []
    for values in itertools.product(*param_grid.values()):
        params = dict(zip(param_grid.keys(), values))
        # set up model: built-in seasonalities off, custom Fourier terms on
        prophet_model = Prophet(holidays=prophet_holidays,
                                daily_seasonality=False,
                                weekly_seasonality=False,
                                yearly_seasonality=False,
                                **params
                                ).add_seasonality(name='weekly', period=7, fourier_order=5, prior_scale=10
                                ).add_seasonality(name='yearly', period=365.25, fourier_order=5, prior_scale=1)
        prophet_model.fit(prophet_data['y_train'])
        # set up CV
        df_cv = cross_validation(prophet_model, initial=initial, period='7 days',
                                 horizon=horizon, parallel="processes")
        # evaluate
        df_p = performance_metrics(df_cv)
        # performance_metrics leaves out 'mape' when y has values close to zero
        mape = df_p['mape'].values[-1] if 'mape' in df_p.columns else np.nan
        records.append({'params': params,
                        'mape': mape,
                        'rmse': df_p['rmse'].values[-1]})
    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    return pd.DataFrame(records, columns=['params', 'mape', 'rmse'])

def cross_validation_result(data, model, rolls=4, horizon=30, ARIMA=True, holidays=None):
    """Rolling-origin evaluation over the last ``rolls * horizon`` observations.

    For roll ``i`` the model is fitted on everything before the i-th of the
    last ``rolls`` blocks of ``horizon`` rows and scored on that block.

    Parameters
    ----------
    data : mapping or pandas.DataFrame
        ARIMA branch: object with ``data['y']`` and ``data['exog']``, both
        supporting ``.iloc``. Prophet branch: DataFrame with a ``y`` column
        that can be passed to ``Prophet.fit`` / ``Prophet.predict``.
    model : object
        ARIMA branch: estimator with ``fit(y=, X=)`` and
        ``predict(n_periods=, X=)``; it is refitted in place on every roll.
        Prophet branch: ignored, a new Prophet instance with fixed
        hyper-parameters is built per roll.
    rolls : int, default 4
        Number of evaluation blocks.
    horizon : int, default 30
        Rows per evaluation block.
    ARIMA : bool, default True
        Selects the ARIMA branch (true) or the Prophet branch (false).
    holidays : pandas.DataFrame, optional
        Holiday table for the Prophet branch. When omitted, the notebook-level
        ``prophet_holidays`` is used, as before.

    Returns
    -------
    list of float
        MAPE per roll, rounded to 3 decimals, oldest block first.

    Raises
    ------
    ValueError
        If there are not more than ``rolls * horizon`` observations, which
        would leave the first roll with no training data.
    """
    n_obs = len(data['y'])
    if rolls * horizon >= n_obs:
        raise ValueError(
            f'need more than rolls*horizon={rolls * horizon} observations, got {n_obs}')
    if not ARIMA and holidays is None:
        # Backward-compatible fallback to the notebook global
        holidays = prophet_holidays

    cv_score = []
    for i in range(rolls):
        # Positive positions of the i-th evaluation block
        start = n_obs - (rolls - i) * horizon
        stop = start + horizon
        y_test = data['y'].iloc[start:stop]
        if ARIMA:
            model.fit(y=data['y'].iloc[:start],
                      X=data['exog'].iloc[:start])
            # Use X= like fit() does; `exogenous=` is the deprecated spelling
            y_hat = model.predict(n_periods=horizon,
                                  X=data['exog'].iloc[start:stop])
        else:
            # Built fresh for every roll instead of reusing `model`
            fold_model = Prophet(changepoint_prior_scale=0.01,
                                 holidays_prior_scale=1,
                                 changepoint_range=0.9,
                                 daily_seasonality=False,
                                 weekly_seasonality=False,
                                 yearly_seasonality=False,
                                 seasonality_mode='additive',
                                 holidays=holidays
                                 ).add_seasonality(name='weekly', period=7, fourier_order=5, prior_scale=10
                                 ).add_seasonality(name='yearly', period=365.25, fourier_order=5, prior_scale=1)
            fold_model.fit(data.iloc[:start, :])
            y_hat = fold_model.predict(data.iloc[start:stop, :])['yhat']
        cv_score.append(round(mean_absolute_percentage_error(y_test, y_hat), 3))
    return cv_score


In [7]:
# Company-level daily sales. Select 'sales' before aggregating so that only
# that column is summed (summing every column first also adds up store_id and
# can fail on non-numeric columns in recent pandas).
df_company = df_store.groupby('date')[['sales']].sum()
# Train / test split with the last 30 rows held out
prophet_data = preprocessing_prophet(df_company, ts_holiday, steps_ahead=30)
# Promotion windows plus negative-impact holidays
prophet_holidays = holidays_ts(promotion=True, holiday_neg=True)

In [8]:
prophet_data['y_train']

Unnamed: 0,ds,y
0,2017-08-07,1.527650
1,2017-08-08,0.000000
2,2017-08-09,34.942320
3,2017-08-10,20.151840
4,2017-08-11,24.305380
...,...,...
1239,2020-12-28,519.237551
1240,2020-12-29,557.447782
1241,2020-12-30,675.009125
1242,2020-12-31,881.599760


In [35]:
# Grid search set up
# cross_validation_prophet evaluates every combination of the values below
# (5 * 2 * 2 * 1 = 20 candidates), fitting a Prophet model and running
# Prophet's cross-validation for each one.
param_grid = {
    'changepoint_prior_scale': [0.01, 0.1, 1, 10,20],
    'changepoint_range': [0.8, 0.9],
    'holidays_prior_scale':[0.1, 1],
    'seasonality_mode': ['additive'],
}

# cv_result has one row per combination with columns 'params', 'mape', 'rmse'.
cv_result = cross_validation_prophet(prophet_data=prophet_data, 
                                     param_grid=param_grid, 
                                     prophet_holidays=prophet_holidays, 
                                     steps_ahead = 30)

INFO:prophet:Making 5 forecasts with cutoffs between 2020-11-04 00:00:00 and 2020-12-02 00:00:00
INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x00000141A8483070>
INFO:prophet:Making 5 forecasts with cutoffs between 2020-11-04 00:00:00 and 2020-12-02 00:00:00
INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x00000141A84DCF40>
INFO:prophet:Making 5 forecasts with cutoffs between 2020-11-04 00:00:00 and 2020-12-02 00:00:00
INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x00000141A850F8E0>
INFO:prophet:Making 5 forecasts with cutoffs between 2020-11-04 00:00:00 and 2020-12-02 00:00:00
INFO:prophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x00000141A849E340>
INFO:prophet:Making 5 forecasts with cutoffs between 2020-11-04 00:00:00 and 2020-12-02 00:00:00
INFO:prophet:Applying in parallel with <concurr

In [39]:
cv_result.sort_values('rmse').iloc[0,0]

{'changepoint_prior_scale': 0.01,
 'changepoint_range': 0.9,
 'holidays_prior_scale': 1,
 'seasonality_mode': 'additive'}

In [37]:
cv_result.sort_values('rmse').iloc[1,0]

{'changepoint_prior_scale': 0.01,
 'changepoint_range': 0.8,
 'holidays_prior_scale': 1,
 'seasonality_mode': 'additive'}

In [9]:
# Best parameter set by MAPE, plus the holiday table. A new dict is built so
# that the entry stored inside cv_result is not modified (the original added
# 'holidays' to the very dict object held in the results table).
prophet_parasm = {**cv_result.sort_values('mape').iloc[0, 0],
                  'holidays': prophet_holidays}

In [46]:
# Company-level Prophet model: built-in seasonalities are switched off and
# replaced by the two custom Fourier seasonalities added right after.
prophet_model = Prophet(
    holidays=prophet_holidays,
    seasonality_mode='additive',
    changepoint_prior_scale=0.01,
    changepoint_range=0.9,
    holidays_prior_scale=1,
    daily_seasonality=False,
    weekly_seasonality=False,
    yearly_seasonality=False,
)
prophet_model.add_seasonality(name='weekly', period=7, fourier_order=5, prior_scale=10)
prophet_model.add_seasonality(name='yearly', period=365.25, fourier_order=5, prior_scale=1)
prophet_model.fit(prophet_data['y_train'])

# Forecasts on the training rows (IS) and on the held-out rows (OOS)
prophet_prediction_IS = prophet_model.predict(prophet_data['y_train'])
prophet_prediction_OOS = prophet_model.predict(prophet_data['y_test'])

# mape_PROPHET is assigned twice; after this cell it holds the OOS value
mape_PROPHET = round(
    mean_absolute_percentage_error(prophet_data['y_train']['y'],
                                   prophet_prediction_IS['yhat']),
    3)
print(f'MAPE IS: {mape_PROPHET}')
mape_PROPHET = round(
    mean_absolute_percentage_error(prophet_data['y_test']['y'],
                                   prophet_prediction_OOS['yhat']),
    3)
print(f'MAPE OOS: {mape_PROPHET}')

MAPE IS: 0.344
MAPE OOS: 0.266


In [47]:
# Align in-sample actuals ('y') and fitted values ('yhat') on their 'ds'
# values; 'inner' is the positional `how` argument of pd.merge.
res = pd.merge(
    prophet_data['y_train'].set_index('ds')[['y']],
    prophet_prediction_IS.set_index('ds')[['yhat']],
    'inner', left_index=True, right_index=True).reset_index()


# %%
# Actual vs fitted; .plot() goes through the pandas plotting backend, which the
# first cell sets to plotly.
fig = res[['y','yhat']].plot()
# NOTE(review): despite the 'arima_' prefix this is the mean of the Prophet
# fitted values in res['yhat'].
arima_storelevel_mean = round(res['yhat'].mean(), 3)
# Dotted horizontal line at that mean, labelled with its value
fig.add_hline(y=arima_storelevel_mean, line_dash="dot", line_color='blue',
              annotation_text=str(arima_storelevel_mean),
              annotation_position="top left")

In [48]:
# Align held-out actuals ('y') and forecasts ('yhat') on their 'ds' values;
# 'inner' is the positional `how` argument of pd.merge.
res = pd.merge(
    prophet_data['y_test'].set_index('ds')[['y']],
    prophet_prediction_OOS.set_index('ds')[['yhat']],
    'inner', left_index=True, right_index=True).reset_index()


# %%
# Actual vs forecast for the test rows
fig = res[['y','yhat']].plot()
# NOTE(review): despite the 'arima_' prefix this is the mean of the Prophet
# forecasts; it is computed but not drawn because the hline call is disabled.
arima_storelevel_mean = round(res['yhat'].mean(), 3)
# fig.add_hline(y=arima_storelevel_mean, line_dash="dot", line_color='blue',
#               annotation_text=str(arima_storelevel_mean),
#               annotation_position="top left")


## Cross-validation (rolling origin, company-level series)

In [50]:
# Full company series in ds/y layout, without a train/test split
data = preprocessing_prophet(df_company, ts_holiday, steps_ahead=30, split=False)

# ARIMA=False selects the Prophet branch, which constructs its own Prophet
# instance on every roll, so the prophet_model passed here is not the object
# that gets fitted. Defaults apply: rolls=4, horizon=30.
cv_score = cross_validation_result(data=data,
                                model=prophet_model,
                                   ARIMA=False)

In [51]:
cv_score

[0.15, 0.442, 0.282, 0.266]

In [52]:
np.mean(cv_score)

0.285

## Loop over all stores

In [23]:
df_data

Unnamed: 0_level_0,sales
date,Unnamed: 1_level_1
2018-12-15,0.0
2018-12-16,37507350.0
2018-12-17,3547800.0
2018-12-18,7403700.0
2018-12-19,3852900.0
...,...
2021-01-27,1125400.0
2021-01-28,1666800.0
2021-01-29,3342400.0
2021-01-30,1622200.0


In [24]:
# Fit one Prophet model per store and collect in-sample (IS) and
# out-of-sample (OOS) errors. Rows are gathered in a list and turned into a
# frame once: DataFrame.append was removed in pandas 2.0 and appending inside
# a loop is quadratic.
store_results = []
for store in df_store['store_id'].unique():
    print(f'processing stores {store}...')
    df_data = df_store[df_store['store_id'] == store].set_index('date')[['sales']]
    prophet_store_data = preprocessing_prophet(df_data, ts_holiday, steps_ahead=30)
    # Same model structure for every store (fixed hyper-parameters, shared
    # holiday table, custom weekly/yearly seasonalities)
    prophet_model = Prophet(changepoint_prior_scale=1,
                            holidays_prior_scale=10,
                            daily_seasonality=False,
                            weekly_seasonality=False,
                            yearly_seasonality=False,
                            seasonality_mode='additive',
                            holidays=prophet_holidays
                            ).add_seasonality(name='weekly', period=7, fourier_order=5, prior_scale=10
                            ).add_seasonality(name='yearly', period=365.25, fourier_order=5, prior_scale=1)
    model_name = 'store_' + str(store)

    prophet_model.fit(prophet_store_data['y_train'])
    prophet_prediction_IS = prophet_model.predict(prophet_store_data['y_train'])
    prophet_prediction_OOS = prophet_model.predict(prophet_store_data['y_test'])
    # MAPE keeps 3 decimals; MAE is rounded to a whole number
    mape_IS = round(mean_absolute_percentage_error(prophet_store_data['y_train']['y'], prophet_prediction_IS['yhat']), 3)
    mape_OOS = round(mean_absolute_percentage_error(prophet_store_data['y_test']['y'], prophet_prediction_OOS['yhat']), 3)
    mae_IS = round(mean_absolute_error(prophet_store_data['y_train']['y'], prophet_prediction_IS['yhat']))
    mae_OOS = round(mean_absolute_error(prophet_store_data['y_test']['y'], prophet_prediction_OOS['yhat']))
    store_results.append({'model': model_name,
                          'mae_IS': mae_IS,
                          'mae_OOS': mae_OOS,
                          'mape_IS': mape_IS,
                          'mape_OOS': mape_OOS})
all_stores_result = pd.DataFrame(store_results)

processing stores 307222...
processing stores 307244...
processing stores 307248...
processing stores 320264...
processing stores 328165...
processing stores 349920...
processing stores 349924...
processing stores 349952...
processing stores 349958...
processing stores 349962...
processing stores 349972...
processing stores 349978...
processing stores 349980...
processing stores 349998...
processing stores 350016...
processing stores 350018...
processing stores 350026...
processing stores 350028...
processing stores 350040...
processing stores 350046...
processing stores 350054...
processing stores 350056...
processing stores 350060...
processing stores 354468...
processing stores 387240...
processing stores 412585...
processing stores 441997...
processing stores 452387...
processing stores 461349...
processing stores 464495...
processing stores 471477...
processing stores 476061...
processing stores 480733...
processing stores 528854...
processing stores 536898...
processing stores 53

In [27]:
all_stores_result.mape_OOS.mean()

0.3567368421052631

In [4]:
# Saving is disabled below; the per-store summary is instead read back from the
# CSV at the same path, using its first column as the index.
#all_stores_result.to_csv('results/grouping/all_stores_result_prophet.csv')
all_stores_result = pd.read_csv('results/grouping/all_stores_result_prophet.csv', index_col=0)
all_stores_result.head()

Unnamed: 0,mae_IS,mae_OOS,mape_IS,mape_OOS,model
0,10.0,8.0,0.475,0.294,store_307222
1,9.0,6.0,0.485,0.294,store_307244
2,7.0,5.0,0.465,0.232,store_307248
3,7.0,5.0,0.535,0.301,store_320264
4,20.0,31.0,0.601,0.397,store_328165


In [5]:
all_stores_result.describe()

Unnamed: 0,mae_IS,mae_OOS,mape_IS,mape_OOS
count,38.0,38.0,38.0,38.0
mean,6.5,7.289474,0.495263,0.356737
std,3.546982,5.713464,0.049034,0.09782
min,3.0,3.0,0.38,0.222
25%,4.0,4.0,0.464,0.29425
50%,5.5,5.5,0.4955,0.3265
75%,7.75,8.75,0.53075,0.40575
max,20.0,31.0,0.601,0.677


In [7]:
all_stores_result_ARIMA = pd.read_csv('results/grouping/all_stores_result_ARIMA.csv',index_col=0)

In [8]:
all_stores_result_ARIMA.describe()

Unnamed: 0,mae_IS,mae_OOS,mape_IS,mape_OOS
count,38.0,38.0,38.0,38.0
mean,6.710526,7.921053,0.554579,0.380316
std,3.287223,5.965584,0.057218,0.116763
min,3.0,3.0,0.44,0.243
25%,4.0,4.0,0.511,0.30875
50%,6.0,6.5,0.546,0.3435
75%,8.0,9.0,0.59175,0.41775
max,18.0,33.0,0.674,0.842


In [7]:
# Per-store out-of-sample MAPE of both models side by side, matched on 'model'.
# The merge suffixes the overlapping column as _x (left, ARIMA) and
# _y (right, Prophet); both are then given readable names.
res = (
    all_stores_result_ARIMA.set_index('model')[['mape_OOS']]
    .merge(all_stores_result.set_index('model')[['mape_OOS']],
           how='inner', left_index=True, right_index=True)
    .reset_index()
    .rename(columns={'mape_OOS_x': 'ARIMA', 'mape_OOS_y': 'PROPHET'})
)

res.head()

NameError: name 'all_stores_result' is not defined

In [39]:
# %%
# Per-store out-of-sample MAPE of both models in one figure
fig = res[['ARIMA',
           'PROPHET'
           ]].plot()

# Mean ARIMA MAPE across stores, drawn as a dotted blue line
arima_whole_mean = round(res['ARIMA'].mean(), 3)
fig.add_hline(y=arima_whole_mean, line_dash="dot", line_color='blue',
              annotation_text=str(arima_whole_mean),
              annotation_position="bottom right"
            )

# NOTE(review): despite its name, this variable holds the mean of the PROPHET
# column; it is drawn as a dotted red line.
arima_storelevel_mean = round(res['PROPHET'].mean(), 3)
fig.add_hline(y=arima_storelevel_mean, line_dash="dot", line_color='red',
              annotation_text=str(arima_storelevel_mean),
              annotation_position="top left")


In [11]:
res_RW_ARIMA = pd.read_pickle('results/res_RW_ARIMA_whole_holiday_promo.pkl')


In [12]:
res_RW_ARIMA.describe()


Unnamed: 0,RW_mae_OOS,RW_mape_OOS,arima_mae_OOS,arima_mape_OOS,store_id
count,38.0,38.0,38.0,38.0,38.0
mean,11.789474,0.624921,10.763158,0.546842,394977.868421
std,10.97456,0.341407,9.788118,0.191051,77692.774383
min,4.0,0.435,3.0,0.374,307222.0
25%,6.0,0.4605,5.0,0.452,349964.5
50%,9.5,0.511,7.5,0.4915,350043.0
75%,12.0,0.58825,11.75,0.55575,459108.5
max,62.0,2.0,50.0,1.338,566792.0
