In [65]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from warnings import simplefilter
simplefilter("ignore")

In [66]:
dataset_dir = Path("dataset")

## CONFIG 

In [67]:
store_nbr_list = ["1","2"]
family_list = ["AUTOMATIVE", "SEAFOOD"]
val_ratio = 0.2

In [68]:
train = pd.read_csv(dataset_dir / 'train.csv',                            
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
        'onpromotion': 'uint32'
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
train['date'] = train.date.dt.to_period('D')

train = (train
       .set_index(['store_nbr','family','date'])         # Setting MultiIndex to make unique identifiers for each 'sales' item
       .sort_index()
      )
# train = train[train.index.isin(store_nbr_list)]
train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,2013-01-01,0.000000,0
1,AUTOMOTIVE,2013-01-02,2.000000,0
1,AUTOMOTIVE,2013-01-03,3.000000,0
1,AUTOMOTIVE,2013-01-04,3.000000,0
1,AUTOMOTIVE,2013-01-05,5.000000,0
...,...,...,...,...
9,SEAFOOD,2017-08-11,23.830999,0
9,SEAFOOD,2017-08-12,16.859001,4
9,SEAFOOD,2017-08-13,20.000000,0
9,SEAFOOD,2017-08-14,17.000000,0


## MULTI-INDEX FILTERING

In [69]:
c1=train.index.get_level_values(0).isin(['1', '2', '3'])
c2 = train.index.get_level_values(1).isin(["AUTOMOTIVE","SEAFOOD"])
train = train[c1 & c2]

## DATE FILTERING

In [70]:
# start_date = "2013-01-02"
# end_date = "2013-01-06"
# filtered_df = train[(train.index.get_level_values('date') >= start_date) & (train.index.get_level_values('date') <= end_date)]
# filtered_df.to_csv('filtered_df.csv')

In [71]:
test = pd.read_csv(dataset_dir/'test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)
test['date'] = test.date.dt.to_period('D')
test = test.set_index(['store_nbr','family','date']).sort_index()
test = test[test.index.isin(store_nbr_list)]
test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,2017-08-16,3000888,0
1,AUTOMOTIVE,2017-08-17,3002670,0
1,AUTOMOTIVE,2017-08-18,3004452,0
1,AUTOMOTIVE,2017-08-19,3006234,0
1,AUTOMOTIVE,2017-08-20,3008016,0
...,...,...,...,...
2,SEAFOOD,2017-08-27,3020885,1
2,SEAFOOD,2017-08-28,3022667,0
2,SEAFOOD,2017-08-29,3024449,1
2,SEAFOOD,2017-08-30,3026231,1


In [72]:
c1=test.index.get_level_values(0).isin(['1', '2', '3'])
c2 = test.index.get_level_values(1).isin(["AUTOMOTIVE","SEAFOOD"])
test = test[c1 & c2]
test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,2017-08-16,3000888,0
1,AUTOMOTIVE,2017-08-17,3002670,0
1,AUTOMOTIVE,2017-08-18,3004452,0
1,AUTOMOTIVE,2017-08-19,3006234,0
1,AUTOMOTIVE,2017-08-20,3008016,0
...,...,...,...,...
2,SEAFOOD,2017-08-27,3020885,1
2,SEAFOOD,2017-08-28,3022667,0
2,SEAFOOD,2017-08-29,3024449,1
2,SEAFOOD,2017-08-30,3026231,1


In [96]:
df_test = pd.DataFrame([])
for store_nbr in store_nbr_list:
    df = test[test.index.isin([store_nbr])]
    

                                      id  onpromotion
store_nbr family     date                            
1         AUTOMOTIVE 2017-08-16  3000888            0
                     2017-08-17  3002670            0
                     2017-08-18  3004452            0
                     2017-08-19  3006234            0
                     2017-08-20  3008016            0
...                                  ...          ...
          SEAFOOD    2017-08-27  3020522            0
                     2017-08-28  3022304            1
                     2017-08-29  3024086            0
                     2017-08-30  3025868            1
                     2017-08-31  3027650            0

[528 rows x 2 columns]
                                      id  onpromotion
store_nbr family     date                            
2         AUTOMOTIVE 2017-08-16  3001251            0
                     2017-08-17  3003033            0
                     2017-08-18  3004815            0
    

In [73]:
def split():
    pass

In [74]:
class MODELS:
    def __init__(model_name):
        pass
        

In [75]:
from sktime.forecasting.exp_smoothing import ExponentialSmoothing
from sktime.forecasting.sarimax import SARIMAX
from sktime.forecasting.arima import AutoARIMA
from sktime.forecasting.base import ForecastingHorizon

In [76]:
fh = ForecastingHorizon(
    pd.PeriodIndex(pd.date_range('2017-08-16', periods=15, freq="D")), is_relative=False
)
fh

ForecastingHorizon(['2017-08-16', '2017-08-17', '2017-08-18', '2017-08-19',
             '2017-08-20', '2017-08-21', '2017-08-22', '2017-08-23',
             '2017-08-24', '2017-08-25', '2017-08-26', '2017-08-27',
             '2017-08-28', '2017-08-29', '2017-08-30'],
            dtype='period[D]', is_relative=False)

In [77]:
fh2 = ForecastingHorizon(np.arange(1,16))
fh2

ForecastingHorizon([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], dtype='int32', is_relative=True)

In [78]:
forecaster =  AutoARIMA()
forecaster.fit(train)



In [79]:
y_pred = forecaster.predict(fh2)

In [80]:
y_pred

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,2017-08-16,4.704427,6.991693e-274
1,AUTOMOTIVE,2017-08-17,4.541561,3.674356e-274
1,AUTOMOTIVE,2017-08-18,4.497597,5.258373e-275
1,AUTOMOTIVE,2017-08-19,4.497597,1.081728e-274
1,AUTOMOTIVE,2017-08-20,4.497597,1.806772e-274
...,...,...,...,...
3,SEAFOOD,2017-08-26,87.628143,1.697403e+00
3,SEAFOOD,2017-08-27,103.236091,8.315984e-01
3,SEAFOOD,2017-08-28,74.746845,-6.047275e-01
3,SEAFOOD,2017-08-29,75.150595,-7.486686e-01


In [37]:
y_pred

family,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,...,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD
family,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,AUTOMOTIVE,...,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD,SEAFOOD
store_nbr,1,10,11,12,13,14,15,16,17,18,...,5,50,51,52,53,54,6,7,8,9
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-08-25,4.483274,1.811184,7.89436,5.932794,5.003217,3.829323,3.946422,4.902763,9.191286,4.312973,...,9.746943,19.963582,50.982105,9.403427,4.284481,2.579434,37.288262,43.194023,35.799047,17.768955


MultiIndex([('AUTOMOTIVE', 'AUTOMOTIVE',  '1'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '10'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '11'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '12'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '13'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '14'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '15'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '16'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '17'),
            ('AUTOMOTIVE', 'AUTOMOTIVE', '18'),
            ...
            (   'SEAFOOD',    'SEAFOOD',  '5'),
            (   'SEAFOOD',    'SEAFOOD', '50'),
            (   'SEAFOOD',    'SEAFOOD', '51'),
            (   'SEAFOOD',    'SEAFOOD', '52'),
            (   'SEAFOOD',    'SEAFOOD', '53'),
            (   'SEAFOOD',    'SEAFOOD', '54'),
            (   'SEAFOOD',    'SEAFOOD',  '6'),
            (   'SEAFOOD',    'SEAFOOD',  '7'),
            (   'SEAFOOD',    'SEAFOOD',  '8'),
            (   'SEAFOOD',    'SEAFOOD',  '9')],
           names=['fami