In [None]:
import os
# Run the rest of the notebook from the project root (read from the
# PROJECT_ROOT environment variable) so that relative paths resolve.
# Raises KeyError if the variable is not set.
os.chdir(os.environ['PROJECT_ROOT'])

In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import sklearn
import pdpipe as pdp
from pandas.core.common import SettingWithCopyWarning
from sklearn.base import BaseEstimator, RegressorMixin, MetaEstimatorMixin, TransformerMixin, clone
from datetime import timedelta
from statistics import median, mean
from pdpipe import df
from pathlib import Path
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, cross_val_score, TimeSeriesSplit, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from mentorship.ml.models.reg import PositiveRegressor
from mentorship.ml.models.common import SplitPipeline
from mentorship.ml.models.kaggle.storesales.linear import PipelineLinearV1
from mentorship.ml.models.kaggle.storesales.ridge import PipelineRidgeV1
from mentorship.ml.models.kaggle.storesales.lasso import PipelineLassoV1
from mentorship.ml.models.kaggle.storesales.elasticnet import PipelineElasticNetV1
from mentorship.features.kaggle.storesales.etl import ETLTransformer
from mentorship.ml.cv.split import DateTimeSeriesSplit
from mentorship.ml.cv.util import print_cv_test_scores


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence FutureWarning and pandas' SettingWithCopyWarning for the whole session.
# NOTE(review): hiding SettingWithCopyWarning can mask writes to DataFrame
# slices — consider re-enabling it while debugging data preparation.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
# scikit-learn scorer names handed to `cross_validate(scoring=...)` further down.
# The `neg_*` scorers report the sign-flipped error (scikit-learn convention:
# greater is better), so values closer to zero are better.
CV_METRICS = [
    'neg_mean_squared_log_error',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_error',
    # 'neg_mean_absolute_percentage_error',  # currently disabled
    'r2'
]

In [None]:
from pdpipe.skintegrate import PdPipelineAndSklearnEstimator

class PipelineLinearV2(PdPipelineAndSklearnEstimator):
    """pdpipe preprocessing followed by a positive-constrained linear model.

    Preprocessing stages, in order:

    1. ``pdp.Scale('MinMaxScaler', num_columns)``
    2. ``pdp.OneHotEncode(cat_columns)``
    3. ``pdp.ColDrop`` of ``date_column`` and the hard-coded ``'store_nbr'``

    The estimator is ``PositiveRegressor(LinearRegression())``.

    Parameters
    ----------
    num_columns : list of str
        Column labels forwarded to the ``Scale`` stage.
    cat_columns : list of str
        Column labels forwarded to the ``OneHotEncode`` stage.
    date_column : str, default 'date'
        Column dropped (together with ``'store_nbr'``) before fitting.
    """

    def __init__(self, num_columns, cat_columns, date_column='date'):
        # Constructor arguments are stored under their own names so that
        # introspection-based parameter access can find them.
        self.num_columns = num_columns
        self.cat_columns = cat_columns
        self.date_column = date_column

        # NOTE(review): the meaning of Scale's second positional argument
        # (columns to scale vs. columns to exclude) depends on the installed
        # pdpipe version — confirm that `num_columns` are the scaled ones.
        preprocessing_stages = [
            pdp.Scale('MinMaxScaler', self.num_columns),
            pdp.OneHotEncode(self.cat_columns),
            pdp.ColDrop([self.date_column, 'store_nbr']),
        ]

        super().__init__(
            pipeline=pdp.PdPipeline(preprocessing_stages),
            estimator=PositiveRegressor(LinearRegression()),
        )

In [None]:
from sklearn.base import clone


class SplitPipelineV2:
    """Fit and apply one independent copy of ``base_pipeline`` per store.

    Parameters
    ----------
    base_pipeline : estimator
        Unfitted template. It is cloned with ``sklearn.base.clone`` for every
        distinct value of ``X['store_nbr']`` seen in :meth:`fit`.

    Attributes
    ----------
    pipelines_ : dict
        Maps a store number to the pipeline fitted on that store's rows.
    """

    def __init__(self, base_pipeline):
        self.base_pipeline = base_pipeline
        # Created here as well as in ``fit`` so the attribute always exists.
        self.pipelines_ = {}

    def fit(self, X, y):
        """Fit one clone of ``base_pipeline`` on the rows of each store.

        Parameters
        ----------
        X : pandas.DataFrame
            Features; must contain a ``'store_nbr'`` column.
        y : pandas.Series
            Target, indexed with the same labels as ``X``.

        Returns
        -------
        self
        """
        # Start from scratch: a refit must not keep models of stores that are
        # absent from the new training data.
        self.pipelines_ = {}

        store_ids = X['store_nbr']
        for current_store_nbr in store_ids.unique():
            X_part = X[store_ids == current_store_nbr]
            y_part = y.loc[X_part.index]
            self.pipelines_[current_store_nbr] = clone(self.base_pipeline).fit(X_part, y_part)
        return self

    def predict(self, X):
        """Predict every row with the pipeline fitted for that row's store.

        The input frame is never modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Features; must contain a ``'store_nbr'`` column.

        Returns
        -------
        pandas.Series
            Float predictions named ``'forecast'``, indexed like ``X``.

        Raises
        ------
        KeyError
            If ``X`` contains a store for which no pipeline was fitted.
        """
        # A frame that went through an older, mutating version of this method
        # may still carry a 'forecast' column; never feed it to the models.
        features = X.drop(columns='forecast') if 'forecast' in X.columns else X

        store_ids = features['store_nbr']
        # Predictions are collected in a separate float buffer instead of a
        # temporary column on the caller's DataFrame.
        forecast = np.zeros(len(features), dtype=float)
        for current_store_nbr in store_ids.unique():
            if current_store_nbr not in self.pipelines_:
                raise KeyError(
                    f'no pipeline fitted for store_nbr={current_store_nbr!r}'
                )
            # Positional mask, so repeated index labels cannot mix up rows.
            rows = (store_ids == current_store_nbr).to_numpy()
            part_pred = self.pipelines_[current_store_nbr].predict(features.loc[rows])
            forecast[rows] = np.asarray(part_pred, dtype=float).reshape(-1)

        return pd.Series(forecast, index=X.index, name='forecast')

    def get_params(self, deep=True):
        """Return the constructor parameters (``deep`` is accepted but unused)."""
        return {'base_pipeline': self.base_pipeline}

    def set_params(self, **parameters):
        """Set each given parameter as an attribute and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [None]:
# Directory holding the Kaggle store-sales CSV files (relative path).
DATA_ROOT = Path('data') / 'kaggle' / 'store-sales-time-series-forecasting'

In [None]:
# Load the raw training set and preview its first rows.
train = pd.read_csv(DATA_ROOT.joinpath('train.csv'))
train.head()

In [None]:
# Training frame without the promotion column (`train` itself is left untouched).
X = train.copy().drop(columns='onpromotion')
# Test set, with the same column removed.
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
# Dates covered by the test set; used later to pick the test rows back out of `all_data`.
dates_test_data = test_data['date'].unique()
# Stack train on top of test so that the lag of a test row can look back into
# training sales. No `ignore_index`: each frame keeps its own index labels.
all_data = pd.concat([X, test_data])
# Sales 16 rows earlier within each (store_nbr, family) group.
# NOTE(review): `shift` counts rows, not calendar days — this assumes every
# group is date-sorted with exactly one row per day; confirm.
all_data['lag_16'] = all_data.groupby(['store_nbr', 'family'])['sales'].shift(16)

In [None]:
# Copy the lag computed on the stacked frame back onto the test rows.
# The assignment aligns on index labels, which works because `all_data` kept
# `test_data`'s original labels.
# NOTE(review): assumes no training row shares a date with the test set — confirm.
test_data['lag_16'] = all_data[all_data['date'].isin(dates_test_data)]['lag_16']
# Same 16-row lag for the training frame, computed on the training rows alone.
X['lag_16'] = X.groupby(['store_nbr', 'family'])['sales'].shift(16)

In [None]:
# Run the project's ETL step over the training frame.
# NOTE(review): only the first element of `transform`'s return value is kept —
# presumably the transformed feature frame; confirm against ETLTransformer.
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(X)[0]

In [None]:
# Split the target off the feature frame.
y = X['sales'].copy()
X = X.drop(columns='sales')

# Rows without an oil price are removed from both X and y.
missing_oil = X['dcoilwtico'].isna()
# Labels of the removed rows; kept in case a later cell inspects them.
inds = X.index[missing_oil]

# One shared *positional* mask filters features and target together, so they
# stay aligned even if the index coming out of the ETL step repeats labels
# (dropping `y` by label would then remove more rows than were removed from X).
keep = (~missing_oil).to_numpy()
X = X.loc[keep].reset_index(drop=True)
y = y.loc[keep].reset_index(drop=True)

assert len(X) == len(y), 'features and target went out of sync'
# NOTE(review): rows whose `lag_16` is NaN are not filtered here — confirm they
# are handled upstream before reaching the regressor.
X.head()

In [None]:
# Template pipeline: `dcoilwtico` is the numeric input, `family` the categorical one.
base_pipeline = PipelineLinearV2(
    num_columns=['dcoilwtico'],
    cat_columns=['family'],
)
# Wrapper that fits a separate clone of the template for every store.
modelling_pipeline = SplitPipelineV2(base_pipeline=base_pipeline)
# Cross-validation splitter with its default settings.
splitter = DateTimeSeriesSplit()

In [None]:
# Cross-validate the per-store model on every metric in CV_METRICS and keep
# the estimator fitted on each fold.
scores = cross_validate(
    estimator=modelling_pipeline,
    X=X,
    y=y,
    scoring=CV_METRICS,
    cv=splitter,
    return_estimator=True,
)

In [None]:
# Report the test scores collected by `cross_validate` (project helper).
print_cv_test_scores(scores)