In [117]:
import pandas as pd
import numpy as np
from pathlib import Path

from warnings import simplefilter
simplefilter("ignore")

In [118]:
dataset_dir = Path("dataset")

## CONFIG 

In [122]:
filter_store_nbr = [str(i) for i in range(55)]
filter_family_list = ["AUTOMOTIVE", "SEAFOOD"]
val_ratio = 0.2
train_start_date = "2013-01-02"
train_end_date = "2016-12-30"
test_start_date= "2017-01-01"
test_end_date= "2017-08-15"

from sktime.forecasting.base import ForecastingHorizon
fh = ForecastingHorizon(np.arange(1,228))
# fh = ForecastingHorizon(
#     pd.PeriodIndex(pd.date_range('2017-08-16', periods=15, freq="D")), is_relative=False
# )
fh

ForecastingHorizon([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
       ...
       218, 219, 220, 221, 222, 223, 224, 225, 226, 227],
      dtype='int32', length=227, is_relative=True)

In [None]:
def fh_calculator():
    pass

In [123]:
train = pd.read_csv(dataset_dir / 'train.csv',                            
    usecols=['store_nbr', 'family', 'date', 'sales'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
        'onpromotion': 'uint32'
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
train['date'] = train.date.dt.to_period('D')

train = (train
       .set_index(['store_nbr','family','date'])         # Setting MultiIndex to make unique identifiers for each 'sales' item
       .sort_index()
      )
# train = train[train.index.isin(store_nbr_list)]

In [98]:
# store_nbr_list = train['store_nbr'].unique()
# family_list = train['family'].unique()
# train.groupby(["store_nbr","family"]).count()

## MULTI-INDEX FILTERING

In [124]:
c1=train.index.get_level_values(0).isin(filter_store_nbr)
c2 = train.index.get_level_values(1).isin(filter_family_list)
df = train[c1 & c2]

In [125]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2013-01-01,0.000000
1,AUTOMOTIVE,2013-01-02,2.000000
1,AUTOMOTIVE,2013-01-03,3.000000
1,AUTOMOTIVE,2013-01-04,3.000000
1,AUTOMOTIVE,2013-01-05,5.000000
...,...,...,...
9,SEAFOOD,2017-08-11,23.830999
9,SEAFOOD,2017-08-12,16.859001
9,SEAFOOD,2017-08-13,20.000000
9,SEAFOOD,2017-08-14,17.000000


In [126]:
df.index.get_level_values(1).unique()

CategoricalIndex(['AUTOMOTIVE', 'SEAFOOD'], categories=['AUTOMOTIVE', 'BABY CARE', 'BEAUTY', 'BEVERAGES', ..., 'PREPARED FOODS', 'PRODUCE', 'SCHOOL AND OFFICE SUPPLIES', 'SEAFOOD'], ordered=False, dtype='category', name='family')

## DATE FILTERING

In [128]:
filtered_train = df[(df.index.get_level_values('date') >= train_start_date) & (df.index.get_level_values('date') <= train_end_date)]
filtered_test = df[(df.index.get_level_values('date') >= test_start_date) & (df.index.get_level_values('date') <= test_end_date)]

In [129]:
filtered_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2013-01-02,2.0
1,AUTOMOTIVE,2013-01-03,3.0
1,AUTOMOTIVE,2013-01-04,3.0
1,AUTOMOTIVE,2013-01-05,5.0
1,AUTOMOTIVE,2013-01-06,2.0
...,...,...,...
9,SEAFOOD,2016-12-26,12.0
9,SEAFOOD,2016-12-27,10.0
9,SEAFOOD,2016-12-28,7.0
9,SEAFOOD,2016-12-29,8.0


In [130]:
filtered_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2017-01-01,0.000000
1,AUTOMOTIVE,2017-01-02,5.000000
1,AUTOMOTIVE,2017-01-03,4.000000
1,AUTOMOTIVE,2017-01-04,1.000000
1,AUTOMOTIVE,2017-01-05,2.000000
...,...,...,...
9,SEAFOOD,2017-08-11,23.830999
9,SEAFOOD,2017-08-12,16.859001
9,SEAFOOD,2017-08-13,20.000000
9,SEAFOOD,2017-08-14,17.000000


In [131]:
test = pd.read_csv(dataset_dir/'test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
test['date'] = test.date.dt.to_period('D')
test = (test
       .set_index(['store_nbr','family','date'])         # Setting MultiIndex to make unique identifiers for each 'sales' item
       .sort_index()
      )

In [132]:
class ForecastingProcess:
    def __init__(self):
        pass

    def build_model(self,model_name,**kwargs):
        model = None
        if model_name == "ARIMA":
            from sktime.forecasting.arima import ARIMA
            model=ARIMA()
        elif model_name == "AutoARIMA":
            from sktime.forecasting.arima import AutoARIMA 
            model=AutoARIMA()
        elif model_name == "SARIMAX":
            from sktime.forecasting.sarimax import SARIMAX
            model=SARIMAX()
        elif model_name == "BATS":
            from sktime.forecasting.bats import BATS
            model=BATS()
        elif model_name == "TBATS":
            from sktime.forecasting.tbats import TBATS
            model=TBATS()
        elif model_name == "StatsForecastArima":
            from sktime.forecasting.statsforecast import StatsForecastAutoARIMA
            model=StatsForecastAutoARIMA()
        elif model_name == "StatsForecastTheta":
            from sktime.forecasting.statsforecast import StatsForecastAutoTheta
            model=StatsForecastAutoTheta()
        elif model_name == "ExpSmoothing":
            from sktime.forecasting.exp_smoothing import ExponentialSmoothing
            model=ExponentialSmoothing()
            
        elif model_name == "structural":
            from sktime.forecasting.structural import UnobservedComponents
            model = UnobservedComponents()
            
        return model
    
    def train_model(self, model, y_train, fh=None):
            
        model.fit(y_train)
        return model
    
    def test_model(self,model,y_test,fh=None):
        y_pred = model.predict(fh)
        print(y_pred)
        # 
        from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_log_error,mean_absolute_percentage_error

        print(f"MAPE: {mean_absolute_percentage_error(y_test,y_pred)}")
        print(f'MAE:  {mean_absolute_error(y_test, y_pred)}')
        print(f'MSE:  {mean_squared_error(y_test, y_pred)}')
        print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
        print(f"RMSLE: {root_mean_squared_log_error(y_test,y_pred)}")
    
    def tune_model(self,model, param_grid, y_train):
        from sktime.forecasting.model_selection import ForecastingGridSearchCV
        from sktime.split import SlidingWindowSplitter

        cv = SlidingWindowSplitter(initial_window=int(len(y_train) * 0.8), window_length=20)
        gscv = ForecastingGridSearchCV(
            model, strategy="refit", cv=cv, param_grid=param_grid
        )
        gscv.fit(y_train)
        print(f"gscv best params: {gscv.best_params_}")
        return gscv

## ARIMA

In [142]:
forecasting = ForecastingProcess()
arima = forecasting.build_model("ARIMA")
arima= forecasting.train_model(arima,filtered_train)

In [143]:
fh = ForecastingHorizon(np.arange(1,228))

In [144]:
forecasting.test_model(arima, filtered_test, fh)

                                     sales
store_nbr family     date                 
1         AUTOMOTIVE 2016-12-31   3.204962
                     2017-01-01   3.133669
                     2017-01-02   3.127276
                     2017-01-03   3.126703
                     2017-01-04   3.126651
...                                    ...
9         SEAFOOD    2017-08-10  16.731863
                     2017-08-11  16.731863
                     2017-08-12  16.731863
                     2017-08-13  16.731863
                     2017-08-14  16.731863

[24516 rows x 1 columns]
MAPE: 864930941807896.5
MAE:  5.924748922520457
MSE:  110.76892220528092
RMSE: 10.524681572631113


ValueError: Root Mean Squared Logarithmic Error cannot be used when targets contain negative values.

In [79]:
filtered_test.index.get_level_values(1).unique()

CategoricalIndex(['SEAFOOD'], categories=['AUTOMOTIVE', 'BABY CARE', 'BEAUTY', 'BEVERAGES', ..., 'PREPARED FOODS', 'PRODUCE', 'SCHOOL AND OFFICE SUPPLIES', 'SEAFOOD'], ordered=False, dtype='category', name='family')