In [1]:
%reload_kedro
train_data = catalog.load("train_data")
test_data = catalog.load("eval_data")

2022-04-04 08:48:36,688 - kedro.framework.hooks.manager - INFO - Registered hooks from 1 installed plugin(s): kedro-mlflow-0.7.6
2022-04-04 08:48:36,753 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2022-04-04 08:48:36,788 - kedro.config.config - INFO - Config from path `/home/matheus/projects/time_series_kedro/conf/local` will override the following existing top-level config keys: fr_horizon, initial, models, n_jobs, sampling, stride, use_exog
2022-04-04 08:48:36,791 - root - INFO - ** Kedro project time_series_kedro
2022-04-04 08:48:36,792 - root - INFO - Defined global variable `context`, `session`, `catalog` and `pipelines`
2022-04-04 08:48:36,811 - root - INFO - Registered line magic `run_viz`
2022-04-04 08:48:36,813 - root - INFO - Registered line magic `reload_kedro_mlflow`
2022-04-04 08:48:36,813 - kedro.io.data_catalog - INFO - Loading data from `train_data` (CSVDataSet)...
2022-04-04 08:48:36,925 - kedro.io.dat

  data_set = class_obj(**config)  # type: ignore
  catalog._data_sets[name] = MlflowMetricsDataSet(prefix=name)


In [2]:
train_data

Unnamed: 0,date,serie_id,sales,dcoilwtico,group
0,2013-01-01,"(10, 'HOME APPLIANCES')",0.0,0.00,2
1,2013-01-02,"(10, 'HOME APPLIANCES')",0.0,93.14,2
2,2013-01-03,"(10, 'HOME APPLIANCES')",0.0,92.97,2
3,2013-01-04,"(10, 'HOME APPLIANCES')",0.0,93.12,2
4,2013-01-07,"(10, 'HOME APPLIANCES')",0.0,93.20,2
...,...,...,...,...,...
59495,2017-07-18,"(9, 'HOME AND KITCHEN I')",33.0,46.40,4
59496,2017-07-19,"(9, 'HOME AND KITCHEN I')",46.0,47.10,4
59497,2017-07-20,"(9, 'HOME AND KITCHEN I')",53.0,46.73,4
59498,2017-07-21,"(9, 'HOME AND KITCHEN I')",42.0,45.78,4


In [30]:
import numpy as np
import pandas as pd
from sklearn.base import RegressorMixin, BaseEstimator
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso
from sklearn.pipeline import  make_pipeline
from numpy.lib.stride_tricks import sliding_window_view
import logging
import warnings
warnings.filterwarnings("ignore")

class RegressionModel(RegressorMixin, BaseEstimator):
    '''

    '''
    def __init__(self, base_estimator, lags=1, poly_degree=1, **kwargs):
        self._base_estimator = base_estimator
        self.lags = lags
        self.poly_degree = poly_degree
        self.params = ['lags','poly_degree'] + list(kwargs.keys())
        for parameter, value in kwargs.items():
            setattr(self, parameter, value)

    def _create_lagged_data(
        self, 
        target_series, 
    ):
        lagged_data = sliding_window_view(target_series, self.lags + 1)
        X = lagged_data[:, :-1]
        y = lagged_data[:, -1]
        return X, y        

    def fit(self, y, X=None):
        logging.disable(logging.ERROR)
        
        ts = y.values
        
        self._train_series = ts.copy()
        X_train, y_train = self._create_lagged_data(ts)
        if X is not None:
            X_train = np.concatenate((X_train, X.values[self.lags:,]), axis=1)
            """
            Exog window
            
            exog_lagged_data = []
            for exog in X.columns:
                X_exog, y_exog= self._create_lagged_data(X[exog])
                exog_lagged_data = np.concatenate((X_exog, y_exog.reshape(-1, 1)), axis=1)
                X_train = np.concatenate([X_train, exog_lagged_data], axis=1)
            """
        self._lagged_data = (X_train, y_train)
        model_params = self.get_params().copy()
        del model_params["lags"]
        del model_params["poly_degree"]
        
        steps = []
        if self.poly_degree > 1:
            steps.append(PolynomialFeatures(self.poly_degree))
        steps.append(self._base_estimator(**model_params))
        self._model = make_pipeline(*steps)
        print(self._lagged_data[0].shape)
        X, y = self._lagged_data
        self._model.fit(X, y)
        logging.disable(logging.NOTSET)
        return self
        
    
    def predict(self, n_periods, X=None):
        logging.disable(logging.ERROR)
        X_hist = np.zeros(self.lags)
        X_hist[:self.lags] = self._lagged_data[0][-1, :self.lags]
        X_hist[-1] = self._lagged_data[1][-1]
        preds = []

        for i in range(n_periods):
            if X is not None:
                exog_values = X.iloc[i, :] 
                X_pred = np.concatenate((X_hist, X.iloc[i, :]))
                pred = self._model.predict(X_pred.reshape(1, -1))
            else:
                pred = self._model.predict(X_hist.reshape(1, -1))
            preds.append(pred[0])
            X_hist = np.roll(X_hist, -1)
            X_hist[self.lags-1] = pred
            
        logging.disable(logging.NOTSET)
        return np.array(preds)
    
    
    def get_params(self, deep=True):
        
        self.parameters = {}
        for p in self.params:
            self.parameters[p] = getattr(self, p)
    
        return self.parameters
    
    
    
    def set_params(self, **parameters):
        
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
            
        return self

class RandomForestForecaster(RegressionModel):
    def __init__(self, lags=1, **kwargs):
        super().__init__(base_estimator=RandomForestRegressor, lags=lags, **kwargs)

class SVRForecaster(RegressionModel):
    def __init__(self, lags=1, **kwargs):
        super().__init__(base_estimator=LinearSVR, lags=lags, **kwargs)  
        
class AdaForecaster(RegressionModel):
    def __init__(self, lags=1, **kwargs):
        
        super().__init__(base_estimator=AdaBoostRegressor, lags=lags, **kwargs)

class RidgeForecaster(RegressionModel):
    def __init__(self, lags=1, **kwargs):
        
        super().__init__(base_estimator=Ridge, lags=lags, **kwargs)

class LassoForecaster(RegressionModel):
    def __init__(self, lags=1, **kwargs):
        super().__init__(base_estimator=Lasso, lags=lags, **kwargs)

In [31]:
date_col = "date"
serie_data = train_data[(train_data.serie_id == "(9, 'HOME AND KITCHEN I')")]
exog_info = catalog.load("params:exog")
exog_columns = []
if exog_info is not None:
    for exog_name in exog_info:
        exog_columns += exog_info[exog_name]["target_columns"]
ts = serie_data.set_index("date").sales
X = serie_data[exog_columns + [date_col]].set_index(date_col)
X_test = test_data[exog_columns + [date_col]].set_index(date_col)
estimator = RidgeForecaster(lags=6)
estimator.fit(ts, X=X)
estimator.predict(3, X=X_test)

2022-04-04 09:23:34,690 - kedro.io.data_catalog - INFO - Loading data from `params:exog` (MemoryDataSet)...
(1184, 7)
[61. 55. 33. 46. 53. 47.]
[61.   55.   33.   46.   53.   47.   47.77]
[46.44200778]
[55.         33.         46.         53.         47.         46.44200778
 48.58      ]
[45.80130294]
[33.         46.         53.         47.         46.44200778 45.80130294
 49.05      ]
[43.81260107]


array([46.44200778, 45.80130294, 43.81260107])

In [13]:
((ts.shape[0] - ts.shape[0]*0.7) - 16)/60

5.683333333333334

In [17]:
serie_data.tail(10)

Unnamed: 0,date,serie_id,sales,dcoilwtico,group
59490,2017-07-11,"(9, 'HOME AND KITCHEN I')",52.0,45.06,4
59491,2017-07-12,"(9, 'HOME AND KITCHEN I')",32.0,45.48,4
59492,2017-07-13,"(9, 'HOME AND KITCHEN I')",20.0,46.06,4
59493,2017-07-14,"(9, 'HOME AND KITCHEN I')",61.0,46.53,4
59494,2017-07-17,"(9, 'HOME AND KITCHEN I')",55.0,46.02,4
59495,2017-07-18,"(9, 'HOME AND KITCHEN I')",33.0,46.4,4
59496,2017-07-19,"(9, 'HOME AND KITCHEN I')",46.0,47.1,4
59497,2017-07-20,"(9, 'HOME AND KITCHEN I')",53.0,46.73,4
59498,2017-07-21,"(9, 'HOME AND KITCHEN I')",42.0,45.78,4
59499,2017-07-24,"(9, 'HOME AND KITCHEN I')",47.0,46.21,4


In [12]:
estimator._lagged_data

(array([[ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  , 97.48],
        [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  , 97.03],
        [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  , 97.3 ],
        ...,
        [33.  , 45.  , 53.  , ..., 33.  , 46.  , 46.73],
        [45.  , 53.  , 62.  , ..., 46.  , 53.  , 45.78],
        [53.  , 62.  , 51.  , ..., 53.  , 42.  , 46.21]]),
 array([ 0.,  0.,  0., ..., 53., 42., 47.]))