In [2]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

pd.options.plotting.backend = "plotly"

from pmdarima.preprocessing import FourierFeaturizer
from sktime.performance_metrics.forecasting import (
    mean_absolute_scaled_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)
from pmdarima import auto_arima, ARIMA
import time


## read data

In [24]:
df_store = pd.read_pickle("data/df_daily.pkl")
df_company = df_store.groupby("date").sum()[["sales"]]
df_exog = pd.read_pickle("data/df_exog.pkl")


## define functions

In [88]:
def preprocessing(ts, df_exog, split=True, steps_ahead=30):
    """
    split: to split into train and test set
    """
    print(f"Preprocessing timeseries data with {steps_ahead} steps ahead")
    ts = ts["sales"] / 1e6
    ts.index.freq = "D"
    df_exog=df_exog.loc[ts.index]
    
    if split:
        # split
        y_train = ts.iloc[:-steps_ahead]
        y_test = ts.iloc[-steps_ahead:]
        exog_train = df_exog.iloc[:-steps_ahead]
        exog_test = df_exog.iloc[-steps_ahead:]
        return {
            "y_train": y_train,
            "y_test": y_test,
            "exog_train": exog_train,
            "exog_test": exog_test,
        }
    else:
        return {"y": ts, "exog": df_exog}


def auto_arima_model(y_train, exog_train, diff_num):
    time_start = time.time()
    print("start auto arima...")
    # Fit model to the level to find common order
    arima_model = auto_arima(
        y=y_train,
        exogenous=exog_train,
        D=diff_num,
        seasonal=True,
        m=7,  # Weekly seasonality
    )
    time_stop = time.time()
    print(f"finished auto arima, total time: {time_stop-time_start}")
    return arima_model


def fit_arima_model(arima_model, model_name, arima_data):
    y_train = arima_data["y_train"]
    y_test = arima_data["y_test"]
    exog_train = arima_data["exog_train"]
    exog_test = arima_data["exog_test"]
    # Forecast
    arima_y_fitted = arima_model.predict_in_sample(X=exog_train)

    arima_y_forecast = arima_model.predict(n_periods=len(y_test), exogenous=exog_test)

    arima_y_forecast = pd.Series(arima_y_forecast, name="forecast", index=y_test.index)

    # metrics evaluation
    # in-sample
    mae_IS = round(mean_absolute_error(y_train, arima_y_fitted))
    mape_IS = round(mean_absolute_percentage_error(y_train, arima_y_fitted), 3)

    # out-sample
    mae_OOS = round(mean_absolute_error(y_test, arima_y_forecast), 3)
    mape_OOS = round(mean_absolute_percentage_error(y_test, arima_y_forecast), 3)
    return {
        "model": model_name,
        "mae_IS": mae_IS,
        "mae_OOS": mae_OOS,
        "mape_IS": mape_IS,
        "mape_OOS": mape_OOS,
    }


def cross_validation_result(data, model, model_name, rolls=4, horizon=30):
    """ """
    mae_CVs = []
    mape_CVs = []
    for i in range(rolls):
        print(f'fold {i}---------------')
        model.fit(
            y=data["y"].iloc[: -(rolls - i) * horizon],
            X=data["exog"].iloc[: -(rolls - i) * horizon],
        )
        y_test = data["y"].iloc[
            np.r_[-(rolls - i) * horizon : -(rolls - i - 1) * horizon]
        ]
        y_hat = model.predict(
            n_periods=horizon,
            exogenous=data["exog"].iloc[
                np.r_[-(rolls - i) * horizon : -(rolls - i - 1) * horizon]
            ],
        )
        mae_CV = round(mean_absolute_error(y_test, y_hat), 3)
        mape_CV = round(mean_absolute_percentage_error(y_test, y_hat), 3)
        mae_CVs.append(mae_CV)
        mape_CVs.append(mape_CV)
    return {
        "model": model_name,
        "mae_CV": np.mean(mae_CVs),
        "mape_CV": np.mean(mape_CVs),
    }


## tune on company data

In [25]:
arima_data = preprocessing(
    ts=df_company, df_exog=df_exog, steps_ahead=30
)

print(arima_data.keys())


Preprocessing timeseries data with 30 steps ahead

dict_keys(['y_train', 'y_test', 'exog_train', 'exog_test'])


In [26]:
arima_data["exog_train"]

Unnamed: 0_level_0,off_day,promo_day
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-08-07,0,0
2017-08-08,0,0
2017-08-09,0,0
2017-08-10,0,0
2017-08-11,0,0
...,...,...
2020-12-28,0,0
2020-12-29,0,0
2020-12-30,0,0
2020-12-31,0,0


In [27]:
arima_model_whole_company = auto_arima_model(
    y_train=arima_data["y_train"], 
    exog_train=arima_data["exog_train"], 
    diff_num=1
)
print(arima_model_whole_company)


start auto arima...
finished auto arima, total time: 253.79850220680237
 ARIMA(1,0,3)(0,1,1)[7] intercept


In [28]:
result = fit_arima_model(
    arima_model=arima_model_whole_company,
    model_name="whole_company",
    arima_data=arima_data,
)


In [29]:
result


{'model': 'whole_company',
 'mae_IS': 127,
 'mae_OOS': 208.445,
 'mape_IS': 0.277,
 'mape_OOS': 0.26}

## fit on store data

In [87]:
all_stores_result_CV = []
for store in df_store["store_id"].unique():
    time_start = time.time()
    print(f"\nprocessing stores {store}...")
    model_name = "store_" + str(store)

    # data
    df_data = df_store[df_store["store_id"] == store].set_index("date")[["sales"]]
    arima_data = preprocessing(
        ts=df_data,
        df_exog=df_exog,
        split=False,
        steps_ahead=30,
    )

    # fit model to each store
    cv_score = cross_validation_result(
        data=arima_data, model=arima_model_whole_company, model_name=model_name
    )

    # result
    all_stores_result_CV.append(cv_score)

    # timing
    time_stop = time.time()
    print(f"finished stores {store}, total time: {time_stop-time_start}")
    
all_stores_result_CV = pd.DataFrame(all_stores_result_CV)


processing stores 307222...
Preprocessing timeseries data with 30 steps ahead

fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307222, total time: 26.484768629074097
processing stores 307244...
Preprocessing timeseries data with 30 steps ahead

fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307244, total time: 28.495460033416748
processing stores 307248...
Preprocessing timeseries data with 30 steps ahead

fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307248, total time: 28.795305967330933
processing stores 320264...
Preprocessing timeseries data with 30 steps ahead

fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 320264, total time: 28.824748039245605
processing stores 328165...
Preprocessing timeseries data with 30 steps ahead

fold 0---------------
fold 1-----

## result

In [89]:
all_stores_result_CV.to_pickle("results/f8/ARIMA_promo.pkl")


In [90]:
all_stores_result_CV.head()


Unnamed: 0,model,mae_CV,mape_CV
0,store_307222,9.407,0.32875
1,store_307244,7.12975,0.3545
2,store_307248,7.0615,0.41025
3,store_320264,5.616,0.4395
4,store_328165,29.0005,0.453


In [91]:
all_stores_result_CV.describe()


Unnamed: 0,mae_CV,mape_CV
count,38.0,38.0
mean,7.847007,0.452066
std,5.777859,0.085296
min,3.01525,0.322
25%,4.020062,0.383438
50%,6.554,0.4405
75%,9.063438,0.50075
max,29.0005,0.65925
