In [31]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

pd.options.plotting.backend = "plotly"

from pmdarima.preprocessing import FourierFeaturizer
from sktime.performance_metrics.forecasting import (
    mean_absolute_scaled_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
)
from pmdarima import auto_arima, ARIMA
import time


# read data

In [32]:
# NOTE(security): unpickling can execute arbitrary code -- only load pickle
# files that were produced by this project's own pipeline.
df_store = pd.read_pickle("data/df_daily.pkl")

# Company-level series: daily sales summed over all stores. Selecting "sales"
# before aggregating avoids summing (and then discarding) every other column.
df_company = df_store.groupby("date")[["sales"]].sum()

# Exogenous features; `preprocessing` aligns them to each series by index label.
df_exog = pd.read_pickle("data/df_exog.pkl")


# define functions

In [45]:
def preprocessing(ts, df_exog, split=True, steps_ahead=30):
    """
    Scale a sales series, align the exogenous features to it and optionally
    hold out the trailing observations as a test set.

    Parameters
    ----------
    ts : pandas.DataFrame
        Frame with a "sales" column. The column is divided by 1e6 and the
        index is tagged with daily frequency (assumes a date-like index with
        one row per calendar day -- TODO confirm against the data pipeline).
    df_exog : pandas.DataFrame
        Exogenous features. Rows are selected with ``.loc[ts.index]``, so every
        index label of ``ts`` has to be present.
    split : bool, default True
        to split into train and test set
    steps_ahead : int, default 30
        Number of trailing observations held out as the test set. Only affects
        the result when ``split`` is True.

    Returns
    -------
    dict
        ``{"y_train", "y_test", "exog_train", "exog_test"}`` when ``split`` is
        True, otherwise ``{"y", "exog"}`` holding the full series.

    Raises
    ------
    ValueError
        If ``split`` is True and ``steps_ahead`` is negative.
    """
    if split and steps_ahead < 0:
        raise ValueError(f"steps_ahead must be non-negative, got {steps_ahead}")

    print(f"Preprocessing timeseries data with {steps_ahead} steps ahead")
    ts = ts["sales"] / 1e6
    ts.index.freq = "D"
    df_exog = df_exog.loc[ts.index]

    if not split:
        return {"y": ts, "exog": df_exog}

    # Split on an explicit position instead of `-steps_ahead`: with
    # steps_ahead == 0 the negative-zero slices `[:-0]` / `[-0:]` would return
    # an empty train set and the whole series as the test set.
    split_at = max(len(ts) - steps_ahead, 0)
    return {
        "y_train": ts.iloc[:split_at],
        "y_test": ts.iloc[split_at:],
        "exog_train": df_exog.iloc[:split_at],
        "exog_test": df_exog.iloc[split_at:],
    }


def auto_arima_model(y_train, exog_train, diff_num):
    """
    Run pmdarima's ``auto_arima`` order search with weekly seasonality.

    Parameters
    ----------
    y_train : array-like
        Training target series.
    exog_train : array-like
        Exogenous features aligned with ``y_train``.
    diff_num : int
        Passed to ``auto_arima`` as ``D``, i.e. the *seasonal* differencing
        order (the non-seasonal ``d`` is left for ``auto_arima`` to choose).

    Returns
    -------
    The model object returned by ``auto_arima``.
    """
    print("start auto arima...")
    # perf_counter is monotonic, so the elapsed time is immune to clock changes.
    time_start = time.perf_counter()
    # Search on the company-level series to find one order shared by all stores.
    arima_model = auto_arima(
        y=y_train,
        # `X` is the keyword the rest of this notebook already uses for
        # exogenous data; the older `exogenous` alias is deprecated.
        X=exog_train,
        D=diff_num,
        seasonal=True,
        m=7,  # Weekly seasonality
    )
    elapsed = time.perf_counter() - time_start
    print(f"finished auto arima, total time: {elapsed}")
    return arima_model


def fit_arima_model(arima_model, model_name, arima_data):
    """
    Score an already-fitted ARIMA model in-sample and on the hold-out set.

    Despite the name, this function does not fit anything: it only calls
    ``predict_in_sample`` and ``predict`` on ``arima_model``.

    Parameters
    ----------
    arima_model
        Fitted model exposing ``predict_in_sample(X=...)`` and
        ``predict(n_periods=..., X=...)``.
    model_name : str
        Label stored under the "model" key of the result.
    arima_data : dict
        Mapping with the keys "y_train", "y_test", "exog_train" and
        "exog_test" (the layout ``preprocessing(split=True)`` returns).

    Returns
    -------
    dict
        ``{"model", "mae_IS", "mae_OOS", "mape_IS", "mape_OOS"}`` with every
        metric rounded to 3 decimals.
    """
    y_train = arima_data["y_train"]
    y_test = arima_data["y_test"]
    exog_train = arima_data["exog_train"]
    exog_test = arima_data["exog_test"]

    # Forecast
    arima_y_fitted = arima_model.predict_in_sample(X=exog_train)
    arima_y_forecast = arima_model.predict(n_periods=len(y_test), X=exog_test)
    # np.asarray forces positional alignment with y_test: if predict() returns
    # a labelled Series, re-indexing it directly could introduce NaNs.
    arima_y_forecast = pd.Series(
        np.asarray(arima_y_forecast), name="forecast", index=y_test.index
    )

    # metrics evaluation
    # in-sample (MAE previously lacked the precision argument and was rounded
    # to a whole number, unlike every other metric)
    mae_IS = round(mean_absolute_error(y_train, arima_y_fitted), 3)
    mape_IS = round(mean_absolute_percentage_error(y_train, arima_y_fitted), 3)

    # out-sample
    mae_OOS = round(mean_absolute_error(y_test, arima_y_forecast), 3)
    mape_OOS = round(mean_absolute_percentage_error(y_test, arima_y_forecast), 3)
    return {
        "model": model_name,
        "mae_IS": mae_IS,
        "mae_OOS": mae_OOS,
        "mape_IS": mape_IS,
        "mape_OOS": mape_OOS,
    }


def cross_validation_result(data, model, model_name, rolls=4, horizon=30):
    """
    Expanding-window cross-validation over the last ``rolls * horizon`` points.

    Fold ``i`` trains on everything before its test window and forecasts the
    next ``horizon`` observations; the windows are consecutive and the final
    one ends at the last observation.

    Parameters
    ----------
    data : dict
        Mapping with "y" (target series) and "exog" (exogenous features of the
        same length), the layout ``preprocessing(split=False)`` returns.
    model
        Estimator exposing ``fit(y=..., X=...)`` and
        ``predict(n_periods=..., X=...)``. It is re-fitted in place on every
        fold, so after the call it holds the fit of the last fold.
    model_name : str
        Label stored under the "store" key of the result.
    rolls : int, default 4
        Number of folds.
    horizon : int, default 30
        Length of each test window.

    Returns
    -------
    dict
        Fold-averaged "mae_ARIMA", "rmse_ARIMA", "mape_ARIMA" and "mase_ARIMA"
        (each fold value is rounded to 3 decimals before averaging), plus
        "fc_ARIMA" holding the forecast of the final fold only.

    Raises
    ------
    ValueError
        If ``rolls`` or ``horizon`` is below 1, if "y" and "exog" differ in
        length, or if the series is too short to leave training data for the
        first fold.
    """
    y = data["y"]
    exog = data["exog"]
    n_obs = len(y)

    if rolls < 1 or horizon < 1:
        raise ValueError(
            f"rolls and horizon must both be >= 1, got rolls={rolls}, horizon={horizon}"
        )
    if len(exog) != n_obs:
        raise ValueError(
            f"'y' and 'exog' must have the same length, got {n_obs} and {len(exog)}"
        )
    if n_obs <= rolls * horizon:
        raise ValueError(
            f"need more than rolls * horizon = {rolls * horizon} observations, got {n_obs}"
        )

    mae_CVs = []
    rmse_CVs = []
    mape_CVs = []
    mase_CVs = []
    for i in range(rolls):
        print(f"fold {i}---------------")
        # Positive positions avoid the `-0` stop of the last fold, which a
        # plain negative slice would turn into an empty window.
        test_start = n_obs - (rolls - i) * horizon
        test_stop = test_start + horizon

        y_train = y.iloc[:test_start]
        y_test = y.iloc[test_start:test_stop]

        model.fit(y=y_train, X=exog.iloc[:test_start])
        y_hat = model.predict(n_periods=horizon, X=exog.iloc[test_start:test_stop])

        mae_CVs.append(round(mean_absolute_error(y_test, y_hat), 3))
        rmse_CVs.append(round(mean_squared_error(y_test, y_hat, square_root=True), 3))
        mape_CVs.append(round(mean_absolute_percentage_error(y_test, y_hat), 3))
        mase_CVs.append(round(mean_absolute_scaled_error(y_test, y_hat, y_train=y_train), 3))

    return {
        'store': model_name,
        'mae_ARIMA': np.mean(mae_CVs),
        'rmse_ARIMA': np.mean(rmse_CVs),
        'mape_ARIMA': np.mean(mape_CVs),
        'mase_ARIMA': np.mean(mase_CVs),
        'fc_ARIMA': y_hat,
    }


# tune on company data

In [34]:
# Company-level train/test split: the trailing 30 observations are held out.
arima_data = preprocessing(df_company, df_exog, split=True, steps_ahead=30)
print(arima_data.keys())


Preprocessing timeseries data with 30 steps ahead
dict_keys(['y_train', 'y_test', 'exog_train', 'exog_test'])


## grid-search 
- with auto_arima and no CV

In [36]:
# Order search on the company-level training split; diff_num=1 is forwarded
# to auto_arima as the seasonal differencing order D.
arima_model_whole_company = auto_arima_model(
    arima_data["y_train"],
    arima_data["exog_train"],
    diff_num=1,
)
# Order selected in the recorded run: ARIMA(1,0,3)(0,1,1)[7] intercept
print(arima_model_whole_company)


start auto arima...
finished auto arima, total time: 244.10508608818054
 ARIMA(1,0,3)(0,1,1)[7] intercept


## score with CV

In [41]:
# split=False returns the whole series under "y" / "exog", which is the
# layout cross_validation_result reads.
arima_data_CV = preprocessing(
    ts=df_company,
    df_exog=df_exog,
    split=False,
    steps_ahead=30,
)
result_CV = cross_validation_result(
    data=arima_data_CV,
    model=arima_model_whole_company,
    model_name="company",
)
result_CV


Preprocessing timeseries data with 30 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------


{'store': 'company',
 'mae_ARIMA': 218.0315,
 'rmse_ARIMA': 289.91725,
 'mape_ARIMA': 0.29475,
 'mase_ARIMA': 1.3667500000000001}

# fit on store data

In [27]:
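# Optional shortcut: to skip the slow auto_arima search above, uncomment the
# next line to rebuild the order it selected, ARIMA(1,0,3)(0,1,1)[7].
# arima_model_whole_company = ARIMA((1,0,3), (0,1,1,7))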

In [46]:
# Score the company-level ARIMA order on every store with rolling CV.
# Per-store results are collected in their own list so that
# `all_stores_result_CV` is only ever a DataFrame.
store_cv_records = []
for store in df_store["store_id"].unique():
    time_start = time.perf_counter()
    print(f"\nprocessing stores {store}...")
    model_name = "store_" + str(store)

    # data
    df_data = df_store[df_store["store_id"] == store].set_index("date")[["sales"]]
    # Kept under its own name: reusing `arima_data` here would overwrite the
    # company-level train/test split that the auto_arima cell reads, making
    # that cell fail with a KeyError when re-run.
    store_data = preprocessing(
        ts=df_data,
        df_exog=df_exog,
        split=False,
        steps_ahead=30,
    )

    # fit model to each store (the shared model object is re-fitted in place)
    cv_score = cross_validation_result(
        data=store_data,
        model=arima_model_whole_company,
        model_name=model_name,
    )

    # result
    store_cv_records.append(cv_score)

    # timing
    elapsed = time.perf_counter() - time_start
    print(f"finished stores {store}, total time: {elapsed}")

all_stores_result_CV = pd.DataFrame(store_cv_records)



processing stores 307222...
Preprocessing timeseries data with 30 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307222, total time: 38.93562602996826

processing stores 307244...
Preprocessing timeseries data with 30 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307244, total time: 41.625200271606445

processing stores 307248...
Preprocessing timeseries data with 30 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 307248, total time: 41.72073292732239

processing stores 320264...
Preprocessing timeseries data with 30 steps ahead
fold 0---------------
fold 1---------------
fold 2---------------
fold 3---------------
finished stores 320264, total time: 39.95443797111511

processing stores 328165...
Preprocessing timeseries data with 30 steps ahead
fold 0---------------
fold 1--------

# result

In [49]:
# all_stores_result_CV.to_pickle("results/f8/ARIMA.pkl")


In [47]:
# Preview the first rows of the per-store cross-validation results.
all_stores_result_CV.head()


Unnamed: 0,store,mae_ARIMA,rmse_ARIMA,mape_ARIMA,mase_ARIMA,fc_ARIMA
0,store_307222,9.407,12.852,0.32875,0.96175,"[37.029268763271205, 35.85993168857752, 20.326..."
1,store_307244,7.12975,9.58825,0.3545,0.919,"[16.621216340771806, 19.598568769438476, 11.74..."
2,store_307248,7.0615,9.1115,0.41025,1.02925,"[21.925906231844007, 20.782152493217513, 12.86..."
3,store_320264,5.616,7.47675,0.4395,0.82175,"[11.04554411780473, 13.87814908741479, 6.02455..."
4,store_328165,29.0005,40.30175,0.453,1.355,"[189.20752770854932, 166.71628694822704, 68.30..."


In [48]:
# Summary statistics of the numeric metric columns across all stores.
all_stores_result_CV.describe()


Unnamed: 0,mae_ARIMA,rmse_ARIMA,mape_ARIMA,mase_ARIMA
count,38.0,38.0,38.0,38.0
mean,7.847007,10.608467,0.452066,1.068914
std,5.777859,7.997232,0.085296,0.221264
min,3.01525,3.95375,0.322,0.79575
25%,4.020062,5.288375,0.383438,0.940687
50%,6.554,8.794875,0.4405,1.020375
75%,9.063438,12.218813,0.50075,1.184063
max,29.0005,40.30175,0.65925,1.90525
