In [64]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

pd.options.plotting.backend = "plotly"

from pmdarima.preprocessing import FourierFeaturizer
from sktime.performance_metrics.forecasting import (
    mean_absolute_scaled_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)
from pmdarima import auto_arima, ARIMA
import time


# read data

In [65]:
# NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.
# Store-level daily sales; later cells use its "date", "store_id" and "sales" columns.
df_store = pd.read_pickle("data/df_daily.pkl")
# Company-level series: total sales per date. Selecting "sales" *before*
# aggregating avoids summing unrelated columns (e.g. "store_id") and gives
# the same single-column frame.
df_company = df_store.groupby("date")[["sales"]].sum()
# Exogenous regressors; `preprocessing` looks rows up by the sales index
# via `.loc`, so every sales date must be present in its index.
df_exog = pd.read_pickle("data/df_exog.pkl")


# define functions

In [66]:
horizon = 7


def preprocessing(ts, df_exog, split=True, steps_ahead=horizon):
    """
    Scale a daily sales frame and align the exogenous regressors to it.

    Parameters
    ----------
    ts : pd.DataFrame
        Frame with a "sales" column and a gap-free daily date index.
    df_exog : pd.DataFrame
        Exogenous regressors; must contain a row for every date in ``ts``.
    split : bool
        to split into train and test set
    steps_ahead : int
        Number of trailing observations held out as the test set when
        ``split`` is true.

    Returns
    -------
    dict
        ``{"y_train", "y_test", "exog_train", "exog_test"}`` when ``split``
        is true, otherwise ``{"y", "exog"}``. Sales are divided by 1e6.

    Raises
    ------
    ValueError
        If the index is not a regular daily index, or if ``split`` is true
        and ``steps_ahead`` is not in ``1 .. len(ts) - 1``.
    """
    print(f"Preprocessing timeseries data with {steps_ahead} steps ahead")
    ts = ts["sales"] / 1e6
    # Attach the daily frequency to a *new* index object. The scaled series
    # still shares its index with the caller's frame, so assigning
    # `ts.index.freq` directly would silently modify the caller's data too.
    ts.index = pd.DatetimeIndex(ts.index, freq="D")
    df_exog = df_exog.loc[ts.index]

    if not split:
        return {"y": ts, "exog": df_exog}

    # `iloc[:-0]` is empty and `iloc[-0:]` is the whole series, so a zero
    # (or oversized) hold-out would produce a nonsensical split.
    if not 0 < steps_ahead < len(ts):
        raise ValueError(
            f"steps_ahead must be between 1 and {len(ts) - 1}, got {steps_ahead}"
        )

    return {
        "y_train": ts.iloc[:-steps_ahead],
        "y_test": ts.iloc[-steps_ahead:],
        "exog_train": df_exog.iloc[:-steps_ahead],
        "exog_test": df_exog.iloc[-steps_ahead:],
    }


def auto_arima_model(y_train, exog_train, m=7):
    """
    Run pmdarima's ``auto_arima`` order search on a training series.

    Parameters
    ----------
    y_train : pd.Series
        Training target series.
    exog_train : pd.DataFrame
        Exogenous regressors aligned row-for-row with ``y_train``.
    m : int
        Seasonal period passed to ``auto_arima`` (default 7: weekly
        seasonality on daily data).

    Returns
    -------
    The fitted model object returned by ``auto_arima``.
    """
    time_start = time.time()
    print("start auto arima...")
    # Fit model to the level to find common order.
    # `X=` is the current name of the exogenous argument; the rest of this
    # notebook (`predict_in_sample`, `fit`) already uses it, and the older
    # `exogenous=` spelling is deprecated.
    arima_model = auto_arima(
        y=y_train,
        X=exog_train,
        seasonal=True,
        m=m,
    )
    time_stop = time.time()
    print(f"finished auto arima, total time: {time_stop-time_start}")
    return arima_model


def fit_arima_model(arima_model, model_name, arima_data):
    """
    Score an already fitted ARIMA model in-sample and on the hold-out set.

    Despite its name this function does not fit anything: it only calls
    ``predict_in_sample`` and ``predict`` on ``arima_model``.

    Parameters
    ----------
    arima_model
        Fitted model exposing ``predict_in_sample`` and ``predict``.
    model_name : str
        Label stored under the "model" key of the result.
    arima_data : dict
        Split dictionary with keys "y_train", "y_test", "exog_train" and
        "exog_test" (as returned by ``preprocessing(split=True)``).

    Returns
    -------
    dict
        ``{"model", "mae_IS", "mae_OOS", "mape_IS", "mape_OOS"}``; every
        metric is rounded to 3 decimals.
    """
    y_train = arima_data["y_train"]
    y_test = arima_data["y_test"]
    exog_train = arima_data["exog_train"]
    exog_test = arima_data["exog_test"]

    # Forecast (use `X=` for both calls; `exogenous=` is the deprecated name).
    arima_y_fitted = arima_model.predict_in_sample(X=exog_train)
    arima_y_forecast = arima_model.predict(n_periods=len(y_test), X=exog_test)
    # Strip any index the forecast may carry before relabelling it: passing
    # a labelled Series together with `index=` would *reindex* it and could
    # introduce NaNs instead of simply attaching the test dates.
    arima_y_forecast = pd.Series(
        np.asarray(arima_y_forecast), name="forecast", index=y_test.index
    )

    # metrics evaluation
    # in-sample (3 decimals, consistent with the out-of-sample metrics;
    # the previous bare `round()` truncated the MAE to an integer)
    mae_IS = round(mean_absolute_error(y_train, arima_y_fitted), 3)
    mape_IS = round(mean_absolute_percentage_error(y_train, arima_y_fitted), 3)

    # out-sample
    mae_OOS = round(mean_absolute_error(y_test, arima_y_forecast), 3)
    mape_OOS = round(mean_absolute_percentage_error(y_test, arima_y_forecast), 3)
    return {
        "model": model_name,
        "mae_IS": mae_IS,
        "mae_OOS": mae_OOS,
        "mape_IS": mape_IS,
        "mape_OOS": mape_OOS,
    }


def cross_validation_result(data, model, model_name, rolls=4, horizon=horizon):
    """
    Expanding-window cross-validation over the last ``rolls * horizon`` rows.

    Fold ``i`` trains on everything before the last ``(rolls - i) * horizon``
    observations and forecasts the next ``horizon`` observations, so the
    final fold is tested on the last ``horizon`` rows of the series.

    Parameters
    ----------
    data : dict
        ``{"y": Series, "exog": DataFrame}`` aligned row-for-row (as
        returned by ``preprocessing(split=False)``).
    model
        Estimator exposing ``fit(y=..., X=...)`` and
        ``predict(n_periods=..., X=...)``. It is refit **in place** on every
        fold, so on return it holds the fit of the last fold.
    model_name : str
        Label stored under the "store" key of the result.
    rolls : int
        Number of folds.
    horizon : int
        Forecast length per fold.

    Returns
    -------
    dict
        Mean MAE / RMSE / MAPE / MASE over the folds (each fold value is
        rounded to 3 decimals before averaging) plus ``fc_ARIMA``, the
        forecast of the last fold.

    Raises
    ------
    ValueError
        If ``rolls`` or ``horizon`` is smaller than 1, or the series is too
        short to leave any training data for the first fold.
    """
    if rolls < 1 or horizon < 1:
        raise ValueError(
            f"rolls and horizon must be >= 1, got rolls={rolls}, horizon={horizon}"
        )
    if rolls * horizon >= len(data["y"]):
        raise ValueError(
            f"need more than rolls * horizon = {rolls * horizon} observations, "
            f"got {len(data['y'])}"
        )

    mae_CVs = []
    rmse_CVs = []
    mape_CVs = []
    mase_CVs = []
    for i in range(rolls):
        print(f"fold {i}---------------")
        # Number of trailing rows withheld from training in this fold.
        holdout = (rolls - i) * horizon
        # Positions (counted from the end) of this fold's test window.
        # np.r_ expands the slice to explicit positions, which keeps the
        # last fold correct: there the stop is 0, and a plain slice `[-h:0]`
        # would be empty.
        test_pos = np.r_[-holdout : -(holdout - horizon)]

        y_train = data["y"].iloc[:-holdout]
        y_test = data["y"].iloc[test_pos]

        model.fit(y=y_train, X=data["exog"].iloc[:-holdout])

        # `X=` matches the keyword used for `fit` above; `exogenous=` is the
        # deprecated spelling.
        y_hat = model.predict(n_periods=horizon, X=data["exog"].iloc[test_pos])

        mae_CVs.append(round(mean_absolute_error(y_test, y_hat), 3))
        rmse_CVs.append(round(mean_squared_error(y_test, y_hat, square_root=True), 3))
        mape_CVs.append(round(mean_absolute_percentage_error(y_test, y_hat), 3))
        mase_CVs.append(round(mean_absolute_scaled_error(y_test, y_hat, y_train=y_train), 3))

    return {'store':model_name,
           'mae_ARIMA':np.mean(mae_CVs),
           'rmse_ARIMA':np.mean(rmse_CVs),
           'mape_ARIMA':np.mean(mape_CVs),
           'mase_ARIMA':np.mean(mase_CVs),
           'fc_ARIMA':y_hat,
           }


# tune on company data

In [67]:
# Company-level train/test split; the hold-out length is the default
# `steps_ahead` of `preprocessing`.
arima_data = preprocessing(df_company, df_exog)
print(arima_data.keys())


Preprocessing timeseries data with 7 steps ahead
dict_keys(['y_train', 'y_test', 'exog_train', 'exog_test'])


## grid-search 
- with auto_arima and no CV

In [68]:
# Order search on the company-level training split only (no cross-validation here).
# The run recorded below selected ARIMA(5,1,0)(2,0,0)[7]; an earlier note
# in this cell mentioned ARIMA(1,0,3)(0,1,1)[7] intercept.
arima_model_whole_company = auto_arima_model(
    arima_data["y_train"],
    arima_data["exog_train"],
)
print(arima_model_whole_company)


start auto arima...
finished auto arima, total time: 186.13248801231384
 ARIMA(5,1,0)(2,0,0)[7]          


## score with CV

In [69]:
# Score the selected model with rolling CV on the full (unsplit) company series.
arima_data_CV = preprocessing(df_company, df_exog, split=False)
result_CV = cross_validation_result(
    data=arima_data_CV,
    model=arima_model_whole_company,
    model_name="company",
)
result_CV


Preprocessing timeseries data with 7 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------


{'store': 'company',
 'mae_ARIMA': 284.57075,
 'rmse_ARIMA': 317.89475,
 'mape_ARIMA': 0.331,
 'mase_ARIMA': 1.74275,
 'fc_ARIMA': array([1087.77647678, 1040.70904204,  969.89440579,  959.64106677,
        1269.41375397, 1522.40235338, 1719.64910467])}

# fit on store data

In [70]:
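# Optional manual override (disabled): uncomment to replace the auto_arima
# result with a fixed (order, seasonal_order) before scoring the stores.
# arima_model_whole_company = ARIMA((1,0,3), (0,1,1,7))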

In [71]:
# Score the company-level model on every store with rolling CV.
# The same model object is refit in place for each store and fold (see
# `cross_validation_result`), so after this cell it holds the fit of the
# last store's last fold.
# Loop-local names (`store_sales`, `store_data`) are used on purpose so the
# split company data in `arima_data` is not overwritten.
store_cv_records = []
for store in df_store["store_id"].unique():
    time_start = time.time()
    print(f"\nprocessing stores {store}...")
    model_name = "store_" + str(store)

    # data
    store_sales = df_store[df_store["store_id"] == store].set_index("date")[["sales"]]
    store_data = preprocessing(
        ts=store_sales,
        df_exog=df_exog,
        split=False,
    )

    # fit model to each store
    cv_score = cross_validation_result(
        data=store_data,
        model=arima_model_whole_company,
        model_name=model_name,
    )

    # result
    store_cv_records.append(cv_score)

    # timing
    time_stop = time.time()
    print(f"finished stores {store}, total time: {time_stop-time_start}")

# One row per store; built once from the collected records.
all_stores_result_CV = pd.DataFrame(store_cv_records)



processing stores 307222...
Preprocessing timeseries data with 7 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307222, total time: 34.460044145584106

processing stores 307244...
Preprocessing timeseries data with 7 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307244, total time: 31.19917106628418

processing stores 307248...
Preprocessing timeseries data with 7 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307248, total time: 26.74669098854065

processing stores 320264...
Preprocessing timeseries data with 7 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 320264, total time: 28.19735312461853

processing stores 328165...
Preprocessing timeseries data with 7 steps ahead
fold 0---------------
fold 1-------------

# result

In [75]:
# Persist the per-store CV table. The target directory "results/f8" must
# already exist; to_pickle does not create it.
all_stores_result_CV.to_pickle("results/f8/ARIMA_7.pkl")


In [76]:
# Preview the first rows: one row per store with the fold-averaged metrics
# and the last fold's forecast (`fc_ARIMA`).
all_stores_result_CV.head()


Unnamed: 0,store,mae_ARIMA,rmse_ARIMA,mape_ARIMA,mase_ARIMA,fc_ARIMA
0,store_307222,11.27125,13.6105,0.3555,1.14025,"[48.20032137436968, 42.39733446660982, 38.4800..."
1,store_307244,7.5565,9.543,0.357,0.96875,"[21.193627554872037, 19.93864384353813, 18.663..."
2,store_307248,7.62575,8.43425,0.32725,1.1095,"[30.379947609565903, 27.74868495508247, 26.293..."
3,store_320264,9.5075,11.30125,0.48525,1.40275,"[38.18944299332359, 33.939920033410274, 29.909..."
4,store_328165,44.94325,50.0105,0.5595,2.04875,"[104.85617978504158, 96.91847754174306, 86.387..."


In [77]:
# Distribution of the CV metrics across stores; only the numeric metric
# columns appear in the summary.
all_stores_result_CV.describe()


Unnamed: 0,mae_ARIMA,rmse_ARIMA,mape_ARIMA,mase_ARIMA
count,38.0,38.0,38.0,38.0
mean,9.787678,11.374257,0.428039,1.298664
std,8.108257,9.190228,0.09249,0.338098
min,2.8605,3.516,0.304,0.7765
25%,4.6085,5.365375,0.355875,1.103125
50%,7.54475,8.852375,0.418,1.22375
75%,11.164,12.703813,0.480875,1.4615
max,44.94325,50.0105,0.68075,2.20375
