In [2]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

pd.options.plotting.backend = "plotly"

from pmdarima.preprocessing import FourierFeaturizer
from sktime.performance_metrics.forecasting import (
    mean_absolute_scaled_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)
from pmdarima import auto_arima, ARIMA
import time


  pd.Int64Index,


# read data

In [3]:
# Daily sales records; later cells read the "store_id", "date" and "sales" columns.
# NOTE(review): presumably one row per store and day - confirm against the pickle.
df_store = pd.read_pickle("data/df_daily.pkl")
# Company-wide daily total. "sales" is selected *before* aggregating so that no
# other column (e.g. the store identifier) is summed and then thrown away.
df_company = df_store.groupby("date")[["sales"]].sum()
# Exogenous regressors; preprocessing() aligns them to a series via .loc on its index.
df_exog = pd.read_pickle("data/df_exog.pkl")


# define functions

In [4]:
def preprocessing(ts, df_exog, horizon, split=True):
    """
    Scale a sales series, align the exogenous data to it and optionally
    hold out the last ``horizon`` observations.

    Parameters
    ----------
    ts : pandas.DataFrame
        Frame with a "sales" column, indexed by date.
    df_exog : pandas.DataFrame
        Exogenous regressors; must contain every index label of ``ts``.
    horizon : int
        Number of trailing observations used as the test set. Only used for
        the split and for the log message.
    split : bool, default True
        split: to split into train and test set

    Returns
    -------
    dict
        ``y_train``, ``y_test``, ``exog_train``, ``exog_test`` when ``split``
        is true; otherwise ``y`` and ``exog`` (the full, unsplit data).

    Raises
    ------
    ValueError
        If ``split`` is true and ``horizon`` is not between 1 and
        ``len(ts) - 1``.
    """
    print(f"Preprocessing timeseries data with {horizon} steps ahead")
    ts = ts["sales"] / 1e6  # rescale: raw sales divided by one million
    # Declare the index as daily. NOTE(review): pandas is expected to reject
    # this assignment when the dates are not a gap-free daily range - confirm.
    ts.index.freq = "D"
    df_exog = df_exog.loc[ts.index]

    if not split:
        return {"y": ts, "exog": df_exog}

    # Negative slicing breaks down at the boundaries: iloc[:-0] is empty and
    # iloc[-0:] is the whole series, so a bad horizon would silently produce
    # an empty train set instead of failing.
    if not 0 < horizon < len(ts):
        raise ValueError(
            f"horizon must be between 1 and {len(ts) - 1} to split, got {horizon}"
        )

    return {
        "y_train": ts.iloc[:-horizon],
        "y_test": ts.iloc[-horizon:],
        "exog_train": df_exog.iloc[:-horizon],
        "exog_test": df_exog.iloc[-horizon:],
    }


def auto_arima_model(y_train, exog_train, diff_num):
    """
    Run pmdarima's ``auto_arima`` order search on a training series.

    Parameters
    ----------
    y_train : array-like
        Training target series.
    exog_train : array-like
        Exogenous regressors aligned with ``y_train``.
    diff_num : int
        Order of seasonal differencing, forwarded as ``D``.

    Returns
    -------
    The fitted estimator returned by ``auto_arima``.
    """
    time_start = time.time()
    print("start auto arima...")
    # Fit model to the level to find common order
    arima_model = auto_arima(
        y=y_train,
        # `X` is the keyword the rest of this notebook already uses for the
        # regressors (fit / predict_in_sample); `exogenous` is the deprecated
        # spelling of the same argument.
        X=exog_train,
        D=diff_num,
        seasonal=True,
        m=7,  # Weekly seasonality
    )
    time_stop = time.time()
    print(f"finished auto arima, total time: {time_stop-time_start}")
    return arima_model


def fit_arima_model(arima_model, model_name, arima_data):
    """
    Score an already-fitted ARIMA model in-sample and on the hold-out set.

    Despite its name this function does not fit anything: it only calls
    ``predict_in_sample`` and ``predict`` on ``arima_model``.

    Parameters
    ----------
    arima_model
        Fitted model exposing ``predict_in_sample(X=...)`` and
        ``predict(n_periods=..., X=...)``.
    model_name : str
        Label stored under ``"model"`` in the result.
    arima_data : dict
        Must provide ``y_train``, ``y_test``, ``exog_train`` and ``exog_test``
        (the dict returned by ``preprocessing(..., split=True)``).

    Returns
    -------
    dict
        ``model`` plus ``mae_IS``, ``mae_OOS``, ``mape_IS``, ``mape_OOS``,
        each rounded to 3 decimals.
    """
    y_train = arima_data["y_train"]
    y_test = arima_data["y_test"]
    exog_train = arima_data["exog_train"]
    exog_test = arima_data["exog_test"]

    # Forecast
    arima_y_fitted = arima_model.predict_in_sample(X=exog_train)
    arima_y_forecast = arima_model.predict(n_periods=len(y_test), X=exog_test)
    # Attach the test index positionally. Going through np.asarray prevents
    # pandas from re-aligning (and NaN-filling) when predict() already returns
    # a Series carrying a different index.
    arima_y_forecast = pd.Series(
        np.asarray(arima_y_forecast), name="forecast", index=y_test.index
    )

    # metrics evaluation
    # in-sample (3 decimals, same precision as the out-of-sample metrics)
    mae_IS = round(mean_absolute_error(y_train, arima_y_fitted), 3)
    mape_IS = round(mean_absolute_percentage_error(y_train, arima_y_fitted), 3)

    # out-sample
    mae_OOS = round(mean_absolute_error(y_test, arima_y_forecast), 3)
    mape_OOS = round(mean_absolute_percentage_error(y_test, arima_y_forecast), 3)
    return {
        "model": model_name,
        "mae_IS": mae_IS,
        "mae_OOS": mae_OOS,
        "mape_IS": mape_IS,
        "mape_OOS": mape_OOS,
    }


def cross_validation_result(data, model, model_name, horizon, rolls=4):
    """
    Expanding-window cross-validation of ``model`` on one series.

    The last ``rolls * horizon`` observations are cut into ``rolls``
    consecutive test windows of ``horizon`` points. For each window the model
    is refitted on everything before it and asked to forecast the window.

    Parameters
    ----------
    data : dict
        ``{"y": series, "exog": frame}`` of equal length (what
        ``preprocessing(..., split=False)`` returns).
    model
        Estimator exposing ``fit(y=..., X=...)`` and
        ``predict(n_periods=..., X=...)``. ``fit`` is called on this very
        object, so the caller's model is refitted as a side effect.
    model_name : str
        Label stored under ``"store"`` in the result.
    horizon : int
        Length of each test window.
    rolls : int, default 4
        Number of folds.

    Returns
    -------
    dict
        ``store``; ``mae_ARIMA``, ``rmse_ARIMA``, ``mape_ARIMA``,
        ``mase_ARIMA`` (mean over folds of the per-fold value rounded to 3
        decimals); ``fc_ARIMA`` (the forecast of the final fold only).

    Raises
    ------
    ValueError
        If ``horizon`` or ``rolls`` is not positive, if ``y`` and ``exog``
        differ in length, or if the series is too short to leave training
        data before the first test window.
    """
    y = data["y"]
    exog = data["exog"]
    n_obs = len(y)

    if horizon <= 0 or rolls <= 0:
        raise ValueError("horizon and rolls must be positive")
    if len(exog) != n_obs:
        raise ValueError("data['y'] and data['exog'] must have the same length")
    if n_obs <= rolls * horizon:
        raise ValueError(
            f"need more than rolls * horizon = {rolls * horizon} observations, got {n_obs}"
        )

    mae_CVs = []
    rmse_CVs = []
    mape_CVs = []
    mase_CVs = []
    for i in range(rolls):
        # Positional bounds of this fold's test window. Counting from the
        # front avoids the `-0` stop that a negative slice would hit on the
        # last fold.
        test_start = n_obs - (rolls - i) * horizon
        test_stop = test_start + horizon

        y_train = y.iloc[:test_start]
        y_test = y.iloc[test_start:test_stop]

        model.fit(y=y_train, X=exog.iloc[:test_start])

        # `X` matches the keyword used in fit(); `exogenous` is the
        # deprecated spelling of the same argument.
        y_hat = model.predict(n_periods=horizon, X=exog.iloc[test_start:test_stop])

        mae_CVs.append(round(mean_absolute_error(y_test, y_hat), 3))
        rmse_CVs.append(round(mean_squared_error(y_test, y_hat, square_root=True), 3))
        mape_CVs.append(round(mean_absolute_percentage_error(y_test, y_hat), 3))
        mase_CVs.append(round(mean_absolute_scaled_error(y_test, y_hat, y_train=y_train), 3))

    return {
        'store': model_name,
        'mae_ARIMA': np.mean(mae_CVs),
        'rmse_ARIMA': np.mean(rmse_CVs),
        'mape_ARIMA': np.mean(mape_CVs),
        'mase_ARIMA': np.mean(mase_CVs),
        'fc_ARIMA': y_hat,
    }


# horizon = 7

## tune on company data

In [5]:
# Forecast horizon for this section; the same number of trailing days is held out as the test set.
horizon = 7
# Company-level series split into train/test (split=True is preprocessing()'s default, spelled out here).
arima_data = preprocessing(df_company, df_exog, horizon, split=True)
print(arima_data.keys())


Preprocessing timeseries data with 7 steps ahead
dict_keys(['y_train', 'y_test', 'exog_train', 'exog_test'])


In [11]:
# Inspect the company-level training series (sales divided by 1e6 in preprocessing()).
arima_data['y_train']

date
2017-08-07       1.527650
2017-08-08       0.000000
2017-08-09      34.942320
2017-08-10      20.151840
2017-08-11      24.305380
                 ...     
2021-01-20     653.033192
2021-01-21     595.027497
2021-01-22     745.327841
2021-01-23    1207.594290
2021-01-24    1672.900088
Freq: D, Name: sales, Length: 1267, dtype: float64

In [93]:
# Order search on the company-level training split with one seasonal difference (D=1).
# The recorded output of this cell shows ARIMA(1,0,1)(0,1,1)[7] intercept; an earlier
# note on this line read ARIMA(1,0,3)(0,1,1)[7] intercept.
arima_model_whole_company = auto_arima_model(
    arima_data["y_train"],
    arima_data["exog_train"],
    diff_num=1,
)
print(arima_model_whole_company)


start auto arima...
finished auto arima, total time: 103.73086977005005
 ARIMA(1,0,1)(0,1,1)[7] intercept


In [95]:
# from joblib import dump, load
# # Save model
# dump(arima_model_whole_company, filename='results/f8/ARIMA_forecaster_7.py')

# # Load model
# arima_model_whole_company = load('results/f8/ARIMA_forecaster_7.py')


## fit on store data

In [70]:
# arima_model_whole_company = ARIMA((1,0,3), (0,1,1,7))

In [71]:
# Cross-validate the company-level ARIMA specification on every store.
# NOTE: cross_validation_result() calls model.fit() on the object it is given, so
# `arima_model_whole_company` is refitted on store data as a side effect.
store_cv_records = []
for store in df_store["store_id"].unique():  # append [:2] for a quick smoke run
    time_start = time.time()
    print(f"\nprocessing stores {store}...")
    model_name = "store_" + str(store)

    # data: this store's sales, unsplit (the CV helper does its own splitting).
    # Kept in `store_data` instead of `arima_data` so the company-level
    # train/test dict is not overwritten and the tuning cell stays re-runnable.
    df_data = df_store[df_store["store_id"] == store].set_index("date")[["sales"]]
    store_data = preprocessing(
        ts=df_data,
        df_exog=df_exog,
        horizon=horizon,
        split=False,
    )

    # fit model to each store
    cv_score = cross_validation_result(
        data=store_data,
        model=arima_model_whole_company,
        model_name=model_name,
        horizon=horizon,
    )

    # result
    store_cv_records.append(cv_score)

    # timing
    time_stop = time.time()
    print(f"finished stores {store}, total time: {time_stop-time_start}")

# One row per store; built once from the list of result dicts.
all_stores_result_CV = pd.DataFrame(store_cv_records)



processing stores 307222...
Preprocessing timeseries data with 7 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307222, total time: 34.460044145584106

processing stores 307244...
Preprocessing timeseries data with 7 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307244, total time: 31.19917106628418

processing stores 307248...
Preprocessing timeseries data with 7 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307248, total time: 26.74669098854065

processing stores 320264...
Preprocessing timeseries data with 7 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 320264, total time: 28.19735312461853

processing stores 328165...
Preprocessing timeseries data with 7 steps ahead
fold 0---------------
fold 1-------------

## result

In [75]:
# all_stores_result_CV.to_pickle("results/f8/ARIMA_result_7.pkl")


# horizon = 14

## tune on company data

In [83]:
# Forecast horizon for this section; the same number of trailing days is held out as the test set.
horizon = 14
# Company-level series split into train/test (split=True is preprocessing()'s default, spelled out here).
arima_data = preprocessing(df_company, df_exog, horizon, split=True)
print(arima_data.keys())


Preprocessing timeseries data with 14 steps ahead
dict_keys(['y_train', 'y_test', 'exog_train', 'exog_test'])


In [84]:
# Order search on the company-level training split with one seasonal difference (D=1).
# The recorded output of this cell shows ARIMA(0,0,3)(0,1,1)[7]; an earlier
# note on this line read ARIMA(1,0,3)(0,1,1)[7] intercept.
arima_model_whole_company = auto_arima_model(
    arima_data["y_train"],
    arima_data["exog_train"],
    diff_num=1,
)
print(arima_model_whole_company)


start auto arima...
finished auto arima, total time: 204.3140480518341
 ARIMA(0,0,3)(0,1,1)[7]          


In [None]:
# from joblib import dump, load
# # Save model
# dump(arima_model_whole_company, filename='results/f8/ARIMA_forecaster_14.py')

# # Load model
# arima_model_whole_company = load('results/f8/ARIMA_forecaster_14.py')



## fit on store data

In [None]:
# arima_model_whole_company = ARIMA((1,0,3), (0,1,1,7))

In [85]:
# Cross-validate the company-level ARIMA specification on every store.
# NOTE: cross_validation_result() calls model.fit() on the object it is given, so
# `arima_model_whole_company` is refitted on store data as a side effect.
store_cv_records = []
for store in df_store["store_id"].unique():  # append [:2] for a quick smoke run
    time_start = time.time()
    # print(f"\nprocessing stores {store}...")
    model_name = "store_" + str(store)

    # data: this store's sales, unsplit (the CV helper does its own splitting).
    # Kept in `store_data` instead of `arima_data` so the company-level
    # train/test dict is not overwritten and the tuning cell stays re-runnable.
    df_data = df_store[df_store["store_id"] == store].set_index("date")[["sales"]]
    store_data = preprocessing(
        ts=df_data,
        df_exog=df_exog,
        horizon=horizon,
        split=False,
    )

    # fit model to each store
    cv_score = cross_validation_result(
        data=store_data,
        model=arima_model_whole_company,
        model_name=model_name,
        horizon=horizon,
    )

    # result
    store_cv_records.append(cv_score)

    # timing
    time_stop = time.time()
    print(f"finished stores {store}, total time: {time_stop-time_start}")

# One row per store; built once from the list of result dicts.
all_stores_result_CV = pd.DataFrame(store_cv_records)



processing stores 307222...
Preprocessing timeseries data with 14 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307222, total time: 13.54040002822876

processing stores 307244...
Preprocessing timeseries data with 14 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307244, total time: 13.858042240142822

processing stores 307248...
Preprocessing timeseries data with 14 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307248, total time: 13.794012069702148

processing stores 320264...
Preprocessing timeseries data with 14 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 320264, total time: 16.285012006759644

processing stores 328165...
Preprocessing timeseries data with 14 steps ahead
fold 0---------------
fold 1------

## result

In [88]:
# Save result
# all_stores_result_CV.to_pickle("results/f8/ARIMA_result_14.pkl")


# horizon = 21

## tune on company data

In [98]:
# Forecast horizon for this section; the same number of trailing days is held out as the test set.
horizon = 21
# Company-level series split into train/test (split=True is preprocessing()'s default, spelled out here).
arima_data = preprocessing(df_company, df_exog, horizon, split=True)
print(arima_data.keys())


Preprocessing timeseries data with 21 steps ahead
dict_keys(['y_train', 'y_test', 'exog_train', 'exog_test'])


In [99]:
# Order search on the company-level training split with one seasonal difference (D=1).
# The recorded output of this cell shows ARIMA(0,0,3)(0,1,1)[7] intercept; an earlier
# note on this line read ARIMA(1,0,3)(0,1,1)[7] intercept.
arima_model_whole_company = auto_arima_model(
    arima_data["y_train"],
    arima_data["exog_train"],
    diff_num=1,
)
print(arima_model_whole_company)


start auto arima...
finished auto arima, total time: 161.90804886817932
 ARIMA(0,0,3)(0,1,1)[7] intercept


In [100]:
from joblib import dump, load
# Save model
# NOTE(review): the target is written by joblib.dump despite its ".py" extension - it is not Python source.
dump(arima_model_whole_company, filename='results/f8/ARIMA_forecaster_21.py')

# Load model
# Reads back the file written just above. joblib.load unpickles its input, so only point it at trusted files.
arima_model_whole_company = load('results/f8/ARIMA_forecaster_21.py')



## fit on store data

In [101]:
# arima_model_whole_company = ARIMA((1,0,3), (0,1,1,7))

In [102]:
# Cross-validate the company-level ARIMA specification on every store.
# NOTE: cross_validation_result() calls model.fit() on the object it is given, so
# `arima_model_whole_company` is refitted on store data as a side effect.
store_cv_records = []
for store in df_store["store_id"].unique():  # append [:2] for a quick smoke run
    time_start = time.time()
    # print(f"\nprocessing stores {store}...")
    model_name = "store_" + str(store)

    # data: this store's sales, unsplit (the CV helper does its own splitting).
    # Kept in `store_data` instead of `arima_data` so the company-level
    # train/test dict is not overwritten and the tuning cell stays re-runnable.
    df_data = df_store[df_store["store_id"] == store].set_index("date")[["sales"]]
    store_data = preprocessing(
        ts=df_data,
        df_exog=df_exog,
        horizon=horizon,
        split=False,
    )

    # fit model to each store
    cv_score = cross_validation_result(
        data=store_data,
        model=arima_model_whole_company,
        model_name=model_name,
        horizon=horizon,
    )

    # result
    store_cv_records.append(cv_score)

    # timing
    time_stop = time.time()
    # print(f"finished stores {store}, total time: {time_stop-time_start}")

# One row per store; built once from the list of result dicts.
all_stores_result_CV = pd.DataFrame(store_cv_records)


Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead
Preprocessing timeseries data with 21 steps ahead


## result

In [103]:
# Save result
# Persists the per-store CV table for horizon 21; its "fc_ARIMA" column holds each store's last-fold forecast.
all_stores_result_CV.to_pickle("results/f8/ARIMA_result_21.pkl")


# horizon = 28

## tune on company data

In [104]:
# Forecast horizon for this section; the same number of trailing days is held out as the test set.
horizon = 28
# Company-level series split into train/test (split=True is preprocessing()'s default, spelled out here).
arima_data = preprocessing(df_company, df_exog, horizon, split=True)
print(arima_data.keys())


Preprocessing timeseries data with 28 steps ahead
dict_keys(['y_train', 'y_test', 'exog_train', 'exog_test'])


In [105]:
# Order search on the company-level training split with one seasonal difference (D=1).
# The recorded output of this cell shows ARIMA(0,0,3)(0,1,1)[7] intercept; an earlier
# note on this line read ARIMA(1,0,3)(0,1,1)[7] intercept.
arima_model_whole_company = auto_arima_model(
    arima_data["y_train"],
    arima_data["exog_train"],
    diff_num=1,
)
print(arima_model_whole_company)


start auto arima...
finished auto arima, total time: 322.09505581855774
 ARIMA(0,0,3)(0,1,1)[7] intercept


In [106]:
from joblib import dump, load
# Save model
# NOTE(review): the target is written by joblib.dump despite its ".py" extension - it is not Python source.
dump(arima_model_whole_company, filename='results/f8/ARIMA_forecaster_28.py')

# Load model (disabled; path now matches the horizon-28 file saved above)
# arima_model_whole_company = load('results/f8/ARIMA_forecaster_28.py')



['results/f8/ARIMA_forecaster_28.py']

## fit on store data

In [107]:
# arima_model_whole_company = ARIMA((1,0,3), (0,1,1,7))

In [108]:
# Cross-validate the company-level ARIMA specification on every store.
# NOTE: cross_validation_result() calls model.fit() on the object it is given, so
# `arima_model_whole_company` is refitted on store data as a side effect.
store_cv_records = []
for store in df_store["store_id"].unique():  # append [:2] for a quick smoke run
    time_start = time.time()
    # print(f"\nprocessing stores {store}...")
    model_name = "store_" + str(store)

    # data: this store's sales, unsplit (the CV helper does its own splitting).
    # Kept in `store_data` instead of `arima_data` so the company-level
    # train/test dict is not overwritten and the tuning cell stays re-runnable.
    df_data = df_store[df_store["store_id"] == store].set_index("date")[["sales"]]
    store_data = preprocessing(
        ts=df_data,
        df_exog=df_exog,
        horizon=horizon,
        split=False,
    )

    # fit model to each store
    cv_score = cross_validation_result(
        data=store_data,
        model=arima_model_whole_company,
        model_name=model_name,
        horizon=horizon,
    )

    # result
    store_cv_records.append(cv_score)

    # timing
    time_stop = time.time()
    # print(f"finished stores {store}, total time: {time_stop-time_start}")

# One row per store; built once from the list of result dicts.
all_stores_result_CV = pd.DataFrame(store_cv_records)


Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead
Preprocessing timeseries data with 28 steps ahead


## result

In [None]:
# Save result
# Persists the per-store CV table for horizon 28; its "fc_ARIMA" column holds each store's last-fold forecast.
all_stores_result_CV.to_pickle("results/f8/ARIMA_result_28.pkl")
