In [17]:
import pandas as pd
import numpy as np
from pathlib import Path

from warnings import simplefilter
simplefilter("ignore")

In [18]:
dataset_dir = Path("dataset")

## CONFIG 

In [19]:
filter_store_nbr = ["1","2"]
filter_family_list = ["AUTOMATIVE", "SEAFOOD"]
val_ratio = 0.2
train_start_date = "2013-01-02"
train_end_date = "2016-12-30"
test_start_date= "2017-01-01"
test_end_date= "2017-08-15"

from sktime.forecasting.base import ForecastingHorizon
fh = ForecastingHorizon(np.arange(1,16))
fh = ForecastingHorizon(
    pd.PeriodIndex(pd.date_range('2017-08-16', periods=15, freq="D")), is_relative=False
)
fh

In [20]:
train = pd.read_csv(dataset_dir / 'train.csv',                            
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
        'onpromotion': 'uint32'
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
train['date'] = train.date.dt.to_period('D')

train = (train
       .set_index(['store_nbr','family','date'])         # Setting MultiIndex to make unique identifiers for each 'sales' item
       .sort_index()
      )
# train = train[train.index.isin(store_nbr_list)]

In [21]:
# store_nbr_list = train['store_nbr'].unique()
# family_list = train['family'].unique()
# train.groupby(["store_nbr","family"]).count()

## MULTI-INDEX FILTERING

In [22]:
c1=train.index.get_level_values(0).isin(filter_store_nbr)
c2 = train.index.get_level_values(1).isin(filter_family_list)
df = train[c1 & c2]

In [23]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,SEAFOOD,2013-01-01,0.000000,0
1,SEAFOOD,2013-01-02,38.028999,0
1,SEAFOOD,2013-01-03,17.366001,0
1,SEAFOOD,2013-01-04,29.907001,0
1,SEAFOOD,2013-01-05,24.841999,0
...,...,...,...,...
2,SEAFOOD,2017-08-11,25.675001,5
2,SEAFOOD,2017-08-12,26.413000,0
2,SEAFOOD,2017-08-13,44.046001,0
2,SEAFOOD,2017-08-14,34.644001,0


## DATE FILTERING

In [25]:
filtered_train = df[(df.index.get_level_values('date') >= train_start_date) & (df.index.get_level_values('date') <= train_end_date)]
filtered_train.to_csv('filtered_train.csv')
filtered_test = df[(df.index.get_level_values('date') >= test_start_date) & (df.index.get_level_values('date') <= test_end_date)]
filtered_test.to_csv("filtered_test.csv")

In [26]:
filtered_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,SEAFOOD,2013-01-02,38.028999,0
1,SEAFOOD,2013-01-03,17.366001,0
1,SEAFOOD,2013-01-04,29.907001,0
1,SEAFOOD,2013-01-05,24.841999,0
1,SEAFOOD,2013-01-06,5.000000,0
...,...,...,...,...
2,SEAFOOD,2016-12-26,43.312000,0
2,SEAFOOD,2016-12-27,32.037998,0
2,SEAFOOD,2016-12-28,27.914000,0
2,SEAFOOD,2016-12-29,13.324000,1


In [34]:
filtered_train.index.to_list()

[('1', 'SEAFOOD', Period('2013-01-02', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-03', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-04', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-05', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-06', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-07', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-08', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-09', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-10', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-11', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-12', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-13', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-14', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-15', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-16', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-17', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-18', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-19', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-20', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-21', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-22', 'D')),
 ('1', 'SEAFOOD', Period('2013-01-

CategoricalIndex(['1', '2'], categories=['1', '10', '11', '12', ..., '6', '7', '8', '9'], ordered=False, dtype='category', name='store_nbr')

In [71]:
test = pd.read_csv(dataset_dir/'test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
test['date'] = test.date.dt.to_period('D'

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,2017-08-16,3000888,0
1,AUTOMOTIVE,2017-08-17,3002670,0
1,AUTOMOTIVE,2017-08-18,3004452,0
1,AUTOMOTIVE,2017-08-19,3006234,0
1,AUTOMOTIVE,2017-08-20,3008016,0
...,...,...,...,...
2,SEAFOOD,2017-08-27,3020885,1
2,SEAFOOD,2017-08-28,3022667,0
2,SEAFOOD,2017-08-29,3024449,1
2,SEAFOOD,2017-08-30,3026231,1


In [74]:
class MODELS:
    def __init__(self,model_name:str):
        self.model = self.build_model(model_name)

    def build_model(self,model_name,model_param=None):
        model = None
        if model_name == "ARIMA":
            from sktime.forecasting.arima import ARIMA
            model=ARIMA()
        elif model_name == "AutoARIMA":
            from sktime.forecasting.autoarima import AutoARIMA 
            model=AutoARIMA()
        elif model_name == "SARIMAX":
            from sktime.forecasting.sarimax import SARIMAX
            model=SARIMAX()
        elif model_name == "BATS":
            from sktime.forecasting.bats import BATS
            model=BATS()
        elif model_name == "TBATS":
            from sktime.forecasting.tbats import TBATS
            model=TBATS()
        elif model_name == "StatsForecastArima":
            from sktime.forecasting.statsforecast import StatsForecastAutoARIMA
            model=StatsForecastAutoARIMA()
        elif model_name == "StatsForecastTheta":
            from sktime.forecasting.statsforecast import StatsForecastAutoTheta
            model=StatsForecastAutoTheta()
        elif model_name == "ExpSmoothing":
            from sktime.forecasting.expsmoothing import ExponentialSmoothing
            model=ExponentialSmoothing()
            
        elif model_name == "structural":
            from sktime.forecasting.structural import UnobservedComponents
            model = UnobservedComponents()
            
        elif model_name == "prophet":
            from sktime.forecasting.fbprophet import Prophet
            model=Prophet()
        return model
    
    def train_model(self, model, y_train, fh=None):
        model.fit(y_train)
        return model
    
    def test_model(self,model,y_test,fh=None):
        y_pred = model.predict(fh)
        
        from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_log_error,mean_absolute_percentage_error

        print(f"MAPE: {mean_absolute_percentage_error(y_test,y_pred)}")
        print(f'MAE:  {mean_absolute_error(y_test, y_pred)}')
        print(f'MSE:  {mean_squared_error(y_test, y_pred)}')
        print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
        print(f"RMSLE: {root_mean_squared_log_error(y_test,y_pred)}")
        

        
    def tune_model(self,model, param_grid, y_train):
        from sktime.forecasting.model_selection import ForecastingGridSearchCV
        cv = SlidingWindowSplitter(initial_window=int(len(y_train) * 0.8), window_length=20)
        gscv = ForecastingGridSearchCV(
            model, strategy="refit", cv=cv, param_grid=param_grid
        )
        gscv.fit(y_train)
        print(f"gscv best params: {gscv.best_params_}")
        return gscv

ForecastingHorizon(['2017-08-16', '2017-08-17', '2017-08-18', '2017-08-19',
             '2017-08-20', '2017-08-21', '2017-08-22', '2017-08-23',
             '2017-08-24', '2017-08-25', '2017-08-26', '2017-08-27',
             '2017-08-28', '2017-08-29', '2017-08-30'],
            dtype='period[D]', is_relative=False)

ModuleNotFoundError: SARIMAX requires package 'statsmodels' to be present in the python environment, but 'statsmodels' was not found. 'statsmodels' is a soft dependency and not included in the base sktime installation. Please run: `pip install statsmodels` to install the statsmodels package. To install all soft dependencies, run: `pip install sktime[all_extras]`

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,2017-08-16,4.704427,6.991693e-274
1,AUTOMOTIVE,2017-08-17,4.541561,3.674356e-274
1,AUTOMOTIVE,2017-08-18,4.497597,5.258373e-275
1,AUTOMOTIVE,2017-08-19,4.497597,1.081728e-274
1,AUTOMOTIVE,2017-08-20,4.497597,1.806772e-274
...,...,...,...,...
3,SEAFOOD,2017-08-26,87.628143,1.697403e+00
3,SEAFOOD,2017-08-27,103.236091,8.315984e-01
3,SEAFOOD,2017-08-28,74.746845,-6.047275e-01
3,SEAFOOD,2017-08-29,75.150595,-7.486686e-01


family,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,...,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD
family,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,...,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD
store_nbr,1,10,11,12,13,14,15,16,17,18,...,5,50,51,52,53,54,6,7,8,9
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-08-25,4.483274,1.811184,7.89436,5.932794,5.003217,3.829323,3.946422,4.902763,9.191286,4.312973,...,9.746943,19.963582,50.982105,9.403427,4.284481,2.579434,37.288262,43.194023,35.799047,17.768955


MultiIndex([('AUTOMOTIVE', 'AUTOMOTIVE',  '1'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '10'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '11'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '12'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '13'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '14'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '15'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '16'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '17'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '18'),
            ...
            (   'SEAFOOD',    'SEAFOOD',  '5'),
            (   'SEAFOOD',    'SEAFOOD', '50'),
            (   'SEAFOOD',    'SEAFOOD', '51'),
            (   'SEAFOOD',    'SEAFOOD', '52'),
            (   'SEAFOOD',    'SEAFOOD', '53'),
            (   'SEAFOOD',    'SEAFOOD', '54'),
            (   'SEAFOOD',    'SEAFOOD',  '6'),
            (   'SEAFOOD',    'SEAFOOD',  '7'),
            (   'SEAFOOD',    'SEAFOOD',  '8'),
            (   'SEAFOOD',    'SEAFOOD',  '9')],
           names=['fami