In [173]:
import pandas as pd
import numpy as np
from pathlib import Path

from warnings import simplefilter
simplefilter("ignore")

In [174]:
dataset_dir = Path("dataset")

## CONFIG 

In [175]:
filter_store_nbr = [str(i) for i in range(55)]
filter_family_list = []
val_ratio = 0.2
train_start_date = "2013-01-02"
train_end_date = "2017-08-15"
test_start_date= "2017-01-01"
test_end_date= "2017-08-15"

from sktime.forecasting.base import ForecastingHorizon
fh = ForecastingHorizon(np.arange(1,16))
# fh = ForecastingHorizon(
#     pd.PeriodIndex(pd.date_range('2017-08-16', periods=15, freq="D")), is_relative=False
# )
fh

ForecastingHorizon([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], dtype='int32', is_relative=True)

In [176]:
def fh_calculator():
    pass

In [177]:
train = pd.read_csv(dataset_dir / 'train.csv',                            
    usecols=['store_nbr', 'family', 'date', 'sales'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
        'onpromotion': 'uint32'
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
train['date'] = train.date.dt.to_period('D')

train = (train
       .set_index(['store_nbr','family','date'])         # Setting MultiIndex to make unique identifiers for each 'sales' item
       .sort_index()
      )
# train = train[train.index.isin(store_nbr_list)]

In [178]:
def root_mean_squared_log_error_func(real, predicted):
    sum=0.0
    for x in range(len(predicted)):
        if predicted[x]<0 or real[x]<0: # check for negative values
            continue
        p = np.log(predicted[x]+1)
        r = np.log(real[x]+1)
        sum = sum + (p - r)**2
    return (sum/len(predicted))**0.5

In [179]:
def normalize(group):
    min_vals = group.min()
    max_vals = group.max()
    return (group - min_vals) / (max_vals - min_vals)


In [180]:
normalized_df = train.groupby(level="family").apply(normalize)
normalized_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sales
family,store_nbr,family,date,Unnamed: 4_level_1
AUTOMOTIVE,1,AUTOMOTIVE,2013-01-01,0.000000
AUTOMOTIVE,1,AUTOMOTIVE,2013-01-02,0.007843
AUTOMOTIVE,1,AUTOMOTIVE,2013-01-03,0.011765
AUTOMOTIVE,1,AUTOMOTIVE,2013-01-04,0.011765
AUTOMOTIVE,1,AUTOMOTIVE,2013-01-05,0.019608
...,...,...,...,...
SEAFOOD,9,SEAFOOD,2017-08-11,0.086835
SEAFOOD,9,SEAFOOD,2017-08-12,0.061431
SEAFOOD,9,SEAFOOD,2017-08-13,0.072876
SEAFOOD,9,SEAFOOD,2017-08-14,0.061944


## MULTI-INDEX FILTERING

In [181]:
filter_family_list = train.index.get_level_values(1).unique().to_list()
c1=train.index.get_level_values(0).isin(filter_store_nbr)
c2 = train.index.get_level_values(1).isin(filter_family_list)
df = train[c1 & c2]

In [182]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2013-01-01,0.000000
1,AUTOMOTIVE,2013-01-02,2.000000
1,AUTOMOTIVE,2013-01-03,3.000000
1,AUTOMOTIVE,2013-01-04,3.000000
1,AUTOMOTIVE,2013-01-05,5.000000
...,...,...,...
9,SEAFOOD,2017-08-11,23.830999
9,SEAFOOD,2017-08-12,16.859001
9,SEAFOOD,2017-08-13,20.000000
9,SEAFOOD,2017-08-14,17.000000


In [183]:
df.index.get_level_values(1).unique()

CategoricalIndex(['AUTOMOTIVE', 'BABY CARE', 'BEAUTY', 'BEVERAGES', 'BOOKS',
                  'BREAD/BAKERY', 'CELEBRATION', 'CLEANING', 'DAIRY', 'DELI',
                  'EGGS', 'FROZEN FOODS', 'GROCERY I', 'GROCERY II',
                  'HARDWARE', 'HOME AND KITCHEN I', 'HOME AND KITCHEN II',
                  'HOME APPLIANCES', 'HOME CARE', 'LADIESWEAR',
                  'LAWN AND GARDEN', 'LINGERIE', 'LIQUOR,WINE,BEER',
                  'MAGAZINES', 'MEATS', 'PERSONAL CARE', 'PET SUPPLIES',
                  'PLAYERS AND ELECTRONICS', 'POULTRY', 'PREPARED FOODS',
                  'PRODUCE', 'SCHOOL AND OFFICE SUPPLIES', 'SEAFOOD'],
                 categories=['AUTOMOTIVE', 'BABY CARE', 'BEAUTY', 'BEVERAGES', ..., 'PREPARED FOODS', 'PRODUCE', 'SCHOOL AND OFFICE SUPPLIES', 'SEAFOOD'], ordered=False, dtype='category', name='family')

## DATE FILTERING

In [184]:
filtered_train = df[(df.index.get_level_values('date') >= train_start_date) & (df.index.get_level_values('date') <= train_end_date)]
# filtered_test = df[(df.index.get_level_values('date') >= test_start_date) & (df.index.get_level_values('date') <= test_end_date)]

In [185]:
filtered_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2013-01-02,2.000000
1,AUTOMOTIVE,2013-01-03,3.000000
1,AUTOMOTIVE,2013-01-04,3.000000
1,AUTOMOTIVE,2013-01-05,5.000000
1,AUTOMOTIVE,2013-01-06,2.000000
...,...,...,...
9,SEAFOOD,2017-08-11,23.830999
9,SEAFOOD,2017-08-12,16.859001
9,SEAFOOD,2017-08-13,20.000000
9,SEAFOOD,2017-08-14,17.000000


In [186]:
test = pd.read_csv(dataset_dir/'test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
test['date'] = test.date.dt.to_period('D')
test = (test
       .set_index(['store_nbr','family','date'])         # Setting MultiIndex to make unique identifiers for each 'sales' item
       .sort_index()
      )

In [187]:
from statsmodels.tsa.arima_model import ARIMA
from sktime.forecasting.arima import ARIMA
class ForecastingProcess:
    def __init__(self):
        pass

    def build_model(self,model_name,**kwargs):
        model = None
        if model_name == "ARIMA":
            from sktime.forecasting.arima import ARIMA
            model=ARIMA()
        elif model_name == "AutoARIMA":
            from sktime.forecasting.arima import AutoARIMA 
            model=AutoARIMA()
        elif model_name == "SARIMAX":
            from sktime.forecasting.sarimax import SARIMAX
            model=SARIMAX()
        elif model_name == "BATS":
            from sktime.forecasting.bats import BATS
            model=BATS()
        elif model_name == "TBATS":
            from sktime.forecasting.tbats import TBATS
            model=TBATS()
        elif model_name == "StatsForecastArima":
            from sktime.forecasting.statsforecast import StatsForecastAutoARIMA
            model=StatsForecastAutoARIMA()
        elif model_name == "StatsForecastTheta":
            from sktime.forecasting.statsforecast import StatsForecastAutoTheta
            model=StatsForecastAutoTheta()
        elif model_name == "ExpSmoothing":
            from sktime.forecasting.exp_smoothing import ExponentialSmoothing
            model=ExponentialSmoothing()
            
        elif model_name == "structural":
            from sktime.forecasting.structural import UnobservedComponents
            model = UnobservedComponents()

         
        return model
    
    def train_model(self, model, y_train, fh=None):
            
        model.fit(y_train)
        return model
    
    def test_model(self,model, fh, y_test=None,submission=None):
        y_pred = model.predict(fh)
        print(np.sum((y_pred["sales"]<0).values.ravel()))
        print(y_pred["sales"])
        y_pred["sales"] = np.abs(y_pred["sales"])
        
        if submission is None:
            from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_log_error,mean_absolute_percentage_error
            
            print(f"MAPE: {mean_absolute_percentage_error(y_test,y_pred)}")
            print(f'MAE:  {mean_absolute_error(y_test, y_pred)}')
            print(f'MSE:  {mean_squared_error(y_test, y_pred)}')
            print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
            #print(f"RMSLE_func: {root_mean_squared_log_error_func(y_test,y_pred)}")
            print(f"RMSLE: {root_mean_squared_log_error(y_test,y_pred)}")
        return y_pred

    def tune_model(self,model, param_grid, y_train):
        from sktime.forecasting.model_selection import ForecastingGridSearchCV
        from sktime.split import SlidingWindowSplitter

        cv = SlidingWindowSplitter(initial_window=int(len(y_train) * 0.8), window_length=20)
        gscv = ForecastingGridSearchCV(
            model, strategy="refit", cv=cv, param_grid=param_grid
        )
        gscv.fit(y_train)
        print(f"gscv best params: {gscv.best_params_}")
        return gscv

## ARIMA

In [188]:
forecasting = ForecastingProcess()
arima = forecasting.build_model("ARIMA")
arima= forecasting.train_model(arima,filtered_train)

In [191]:
fh = ForecastingHorizon(np.arange(1,17))

In [192]:
y_pred = forecasting.test_model(arima, fh,submission=True)

848
store_nbr  family      date      
1          AUTOMOTIVE  2017-08-16     3.327836
                       2017-08-17     3.260719
                       2017-08-18     3.254017
                       2017-08-19     3.253347
                       2017-08-20     3.253281
                                       ...    
9          SEAFOOD     2017-08-27    16.895777
                       2017-08-28    16.895777
                       2017-08-29    16.895777
                       2017-08-30    16.895777
                       2017-08-31    16.895777
Name: sales, Length: 28512, dtype: float64


In [193]:
y_pred.to_csv("y_pred.csv")

In [195]:
test# filtered_test.index.get_level_values(1).unique()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,2017-08-16,3000888,0
1,AUTOMOTIVE,2017-08-17,3002670,0
1,AUTOMOTIVE,2017-08-18,3004452,0
1,AUTOMOTIVE,2017-08-19,3006234,0
1,AUTOMOTIVE,2017-08-20,3008016,0
...,...,...,...,...
9,SEAFOOD,2017-08-27,3022271,0
9,SEAFOOD,2017-08-28,3024053,0
9,SEAFOOD,2017-08-29,3025835,0
9,SEAFOOD,2017-08-30,3027617,0
