In [3]:
%reload_kedro
train_data = catalog.load("train_data")
test_data = catalog.load("eval_data")

2022-02-14 21:20:41,435 - kedro.framework.hooks.manager - INFO - Registered hooks from 1 installed plugin(s): kedro-mlflow-0.7.6
2022-02-14 21:20:41,519 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2022-02-14 21:20:41,564 - kedro.config.config - INFO - Config from path `/home/matheus/projects/time_series_kedro/conf/local` will override the following existing top-level config keys: fr_horizon, initial, models, n_jobs, sampling, stride
2022-02-14 21:20:41,566 - root - INFO - ** Kedro project time_series_kedro
2022-02-14 21:20:41,567 - root - INFO - Defined global variable `context`, `session`, `catalog` and `pipelines`
2022-02-14 21:20:41,587 - root - INFO - Registered line magic `run_viz`
2022-02-14 21:20:41,588 - root - INFO - Registered line magic `reload_kedro_mlflow`
2022-02-14 21:20:41,589 - kedro.io.data_catalog - INFO - Loading data from `train_data` (CSVDataSet)...


  data_set = class_obj(**config)  # type: ignore
  catalog._data_sets[name] = MlflowMetricsDataSet(prefix=name)


2022-02-14 21:20:42,788 - kedro.io.data_catalog - INFO - Loading data from `eval_data` (CSVDataSet)...


In [4]:
import numpy as np
import pandas as pd
from sklearn.base import RegressorMixin, BaseEstimator
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso
from sklearn.pipeline import  make_pipeline

import logging
import warnings
warnings.filterwarnings("ignore")

class RegressionModel(RegressorMixin, BaseEstimator):
    '''

    '''
    def __init__(self, base_estimator, lags=1, poly_degree=1, **kwargs):
        self._base_estimator = base_estimator
        self.lags = lags
        self.poly_degree = poly_degree
        self.params = ['lags','poly_degree'] + list(kwargs.keys())
        for parameter, value in kwargs.items():
            setattr(self, parameter, value)

    def _create_lagged_data(
        self, 
        target_series, 
    ):
        n_in = self.lags
        n_out = 1
        data = target_series.copy()
        
        n_vars = 1 if len(data.shape) == 1 else data.shape[-1]
        
        df = pd.DataFrame(data)
        cols, names = list(), list()
        # input sequence (t-n, ... t-1)
        for i in range(n_in, 0, -1):
            cols.append(df.shift(i))
            names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
        # forecast sequence (t, t+1, ... t+n)
        for i in range(0, n_out):
            cols.append(df.shift(-i))
            if i == 0:
                names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
            else:
                names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]

        # put it all together
        agg = pd.concat(cols, axis=1)
        agg.columns = names
        agg.dropna(inplace=True)
        X = agg.iloc[:, :-1].values
        y = agg.iloc[:, -1].values
        return X, y        

    def fit(self, y, X=None):
        logging.disable(logging.ERROR)
        
        ts = y.values
        
        self._train_series = ts.copy()
        X_train, y_train = self._create_lagged_data(ts)
        if X is not None:
            exog_lagged_data = []
            for exog in X.columns:
                X_exog, y_exog= self._create_lagged_data(X[exog])
                exog_lagged_data = np.concatenate((X_exog, y_exog.reshape(-1, 1)), axis=1)
                X_train = np.concatenate([X_train, exog_lagged_data], axis=1)
        self._lagged_data = (X_train, y_train)
        model_params = self.get_params().copy()
        del model_params["lags"]
        del model_params["poly_degree"]
        
        steps = []
        if self.poly_degree > 1:
            steps.append(PolynomialFeatures(self.poly_degree))
        steps.append(self._base_estimator(**model_params))
        self._model = make_pipeline(*steps)
        print(self._lagged_data[0].shape)
        X, y = self._lagged_data
        self._model.fit(X, y)
        logging.disable(logging.NOTSET)
        return self
        
    
    def predict(self, n_periods, X=None):
        logging.disable(logging.ERROR)
        X_hist = np.zeros(self._lagged_data[0].shape[-1])
        X_hist[:self.lags] = self._lagged_data[0][-1, :self.lags]
        X_hist[self.lags - 1] = self._lagged_data[1][-1]
        preds = []

        for i in range(n_periods):
            if X is not None:
                for i, exog in enumerate(X.columns):
                    X_hist[self.lags + i] = X[exog].iloc[i]
            pred = self._model.predict(X_hist.reshape(1, -1))
            preds.append(pred[0])
            X_hist = np.roll(X_hist, -1)
            X_hist[self.lags-1] = pred
            
        logging.disable(logging.NOTSET)
        return np.array(preds)
    
    
    def get_params(self, deep=True):
        
        self.parameters = {}
        for p in self.params:
            self.parameters[p] = getattr(self, p)
    
        return self.parameters
    
    
    
    def set_params(self, **parameters):
        
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
            
        return self

class RandomForestForecaster(RegressionModel):
    def __init__(self, lags=1, **kwargs):
        super().__init__(base_estimator=RandomForestRegressor, lags=lags, **kwargs)

class SVRForecaster(RegressionModel):
    def __init__(self, lags=1, **kwargs):
        super().__init__(base_estimator=LinearSVR, lags=lags, **kwargs)  
        
class AdaForecaster(RegressionModel):
    def __init__(self, lags=1, **kwargs):
        
        super().__init__(base_estimator=AdaBoostRegressor, lags=lags, **kwargs)

class RidgeForecaster(RegressionModel):
    def __init__(self, lags=1, **kwargs):
        
        super().__init__(base_estimator=Ridge, lags=lags, **kwargs)

class LassoForecaster(RegressionModel):
    def __init__(self, lags=1, **kwargs):
        super().__init__(base_estimator=Lasso, lags=lags, **kwargs)

In [6]:
date_col = "date"
serie_data = train_data[(train_data.store_nbr==27) & (train_data.family=='HOME AND KITCHEN II')]
exog_info = catalog.load("params:exog")
exog_columns = []
if exog_info is not None:
    for exog_name in exog_info:
        exog_columns += exog_info[exog_name]["target_columns"]
ts = serie_data.set_index("date").sales
X = serie_data[exog_columns + [date_col]].set_index(date_col)
X_test = test_data[exog_columns + [date_col]].set_index(date_col)
estimator = RidgeForecaster(lags=30)
estimator.fit(ts, X=X)
estimator.predict(16, X=X_test)

(1160, 61)


array([25.4749459 , 28.01467371, 25.17464311, 28.9783228 , 23.91126674,
       26.27670411, 29.49711263, 27.17997935, 29.25323414, 25.26206865,
       28.57824594, 25.32019107, 28.90779148, 29.80519692, 25.73226984,
       27.03503651])

In [13]:
((ts.shape[0] - ts.shape[0]*0.7) - 16)/60

5.683333333333334