# import


In [10]:
from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics
import itertools
from prophet import Prophet
from sktime.performance_metrics.forecasting import mean_absolute_scaled_error, mean_absolute_error, mean_absolute_percentage_error
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from pmdarima.preprocessing import FourierFeaturizer
from pmdarima import auto_arima, ARIMA
import matplotlib.pyplot as plt
import holidays
pd.options.plotting.backend = 'plotly'


# ARIMA - 3 years

## data


In [24]:
df_store = pd.read_pickle('data/df_daily.pkl')
df_store = df_store.set_index('date').loc['2017-09-01':'2021-01-31']
ts_company = df_store.groupby('date').sum()['sales']/1e6
ts_company.index.freq = 'D'
# ts_company.plot(title='Sales TS in Million VND (aggregated data)')

# Split the time series as well as exogenous features data into train and test splits
steps_ahead = 14 # 2 weeks
y_train = ts_company.iloc[:-steps_ahead]
y_test = ts_company.iloc[-steps_ahead:]


In [12]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=y_train.index, y=y_train, name='y_train'))
fig.add_trace(go.Scatter(x=y_test.index, y=y_test, name='y_test', mode='lines'))
fig.show()


## model


In [26]:
# find optimal SARIMA order
model_SARIMA = auto_arima(
    y=y_train,
    m=7, # Weekly seasonality
    n_jobs=-1, stepwise=False
    )


In [27]:
# Forecast
y_forecast_SARIMA = model_SARIMA.predict(n_periods=len(y_test))
y_forecast_SARIMA = pd.Series(y_forecast_SARIMA,
                             name='forecast',
                             index=y_test.index)

# out-sample metrics
mape_SARIMA = round(mean_absolute_percentage_error(
    y_test, y_forecast_SARIMA), 3)

print(model_SARIMA)
print(f'MAPE: {mape_SARIMA}')

 ARIMA(0,1,3)(1,0,0)[7] intercept
MAPE: 0.249


## stores

In [28]:
mape_SARIMA_stores = []
for store in df_store['store_id'].unique():  # print(store)
    # data
    ts_store = df_store[df_store['store_id'] == store]['sales']/1e6
    ts_store.index.freq = 'D'
    y_train_store = ts_store.iloc[:-steps_ahead]
    y_test_store = ts_store.iloc[-steps_ahead:]
    
    # model
    model_SARIMA_store = ARIMA(
        order=model_SARIMA.order,
        seasonal_order=model_SARIMA.seasonal_order
    ).fit(y_train_store)
    
    # forecast
    y_forecast_SARIMA_store = model_SARIMA.predict(n_periods=len(y_test_store))
    y_forecast_SARIMA_store = pd.Series(y_forecast_SARIMA_store,
                                name='forecast',
                                index=y_test_store.index)

    # out-sample metrics
    mape_SARIMA_store = round(mean_absolute_percentage_error(
        y_test_store, y_forecast_SARIMA_store), 3)
    mape_SARIMA_stores.append(mape_SARIMA_store)    

df_mape_SARIMA_stores = pd.DataFrame({
    'store': df_store['store_id'].unique(),
    'mape': mape_SARIMA_stores})

df_mape_SARIMA_stores.mean()


Maximum Likelihood optimization failed to converge. Check mle_retvals


Maximum Likelihood optimization failed to converge. Check mle_retvals



store    394977.868421
mape          1.890895
dtype: float64

# ARIMA - 2 years

## data

In [30]:
y_train_2y = ts_company[:'2020-01-09'].iloc[:-steps_ahead]
y_test_2y = ts_company[:'2020-01-09'].iloc[-steps_ahead:]


## model

In [31]:
# find optimal SARIMA order
model_SARIMA_2y = auto_arima(
    y=y_train_2y,
    m=7, # Weekly seasonality
    n_jobs=-1, stepwise=False
    )


In [32]:
# Forecast
y_forecast_SARIMA_2y = model_SARIMA_2y.predict(n_periods=len(y_test_2y))
y_forecast_SARIMA_2y = pd.Series(y_forecast_SARIMA_2y,
                             name='forecast',
                             index=y_test_2y.index)

# out-sample metrics
mape_SARIMA_2y = round(mean_absolute_percentage_error(
    y_test_2y, y_forecast_SARIMA_2y), 3)

print(model_SARIMA_2y)
print(f'MAPE: {mape_SARIMA_2y}')

 ARIMA(4,1,0)(1,0,0)[7] intercept
MAPE: 0.212


In [71]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=y_test.index, y=y_test, name='y_test'))
fig.add_trace(go.Scatter(x=y_test.index, y=y_forecast_SARIMA, name='forecast'))
fig.show()


In [64]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=y_test_2y.index, y=y_test_2y, name='y_test'))
fig.add_trace(go.Scatter(x=y_test_2y.index, y=y_forecast_SARIMA_2y, name='forecast'))
fig.show()


## stores

In [33]:
mape_SARIMA_stores_2y = []
for store in df_store['store_id'].unique():  # print(store)
    # data
    ts_store = df_store[df_store['store_id'] == store]['sales']/1e6
    ts_store.index.freq = 'D'
    y_train_store_2y = ts_store[:'2020-01-09'].iloc[:-steps_ahead]
    y_test_store_2y = ts_store[:'2020-01-09'].iloc[-steps_ahead:]
    
    # model
    model_SARIMA_store_2y = ARIMA(
        order=model_SARIMA_2y.order,
        seasonal_order=model_SARIMA_2y.seasonal_order
    ).fit(y_train_store_2y)
    
    # forecast
    y_forecast_SARIMA_store_2y = model_SARIMA_2y.predict(n_periods=len(y_test_store_2y))
    y_forecast_SARIMA_store_2y = pd.Series(y_forecast_SARIMA_store_2y,
                                name='forecast',
                                index=y_test_store_2y.index)

    # out-sample metrics
    mape_SARIMA_store_2y = round(mean_absolute_percentage_error(
        y_test_store_2y, y_forecast_SARIMA_store_2y), 3)
    mape_SARIMA_stores_2y.append(mape_SARIMA_store_2y)    

df_mape_SARIMA_stores_2y = pd.DataFrame({
    'store': df_store['store_id'].unique(),
    'mape': mape_SARIMA_stores_2y})

df_mape_SARIMA_stores_2y.mean()

store    394977.868421
mape          1.887816
dtype: float64