In [None]:
import os
# Work from the project root so that the relative data paths used below resolve.
# Raises KeyError if the PROJECT_ROOT environment variable is not set.
os.chdir(os.environ['PROJECT_ROOT'])

In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import sklearn
import lightgbm as lgb
import optuna
import shap
import pdpipe as pdp
import statsmodels.api as sm
from optuna.integration.lightgbm import LightGBMTunerCV
from lightgbm import early_stopping
from lightgbm import LGBMRegressor
from boruta import BorutaPy
from pandas.core.common import SettingWithCopyWarning
from sklearn.base import BaseEstimator, RegressorMixin, MetaEstimatorMixin, TransformerMixin, clone
from datetime import timedelta
from category_encoders import TargetEncoder
from statistics import median, mean, stdev
from pdpipe import df, PdPipelineStage
from pathlib import Path
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_validate, cross_val_score, TimeSeriesSplit, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from mentorship.ml.models.reg import PositiveRegressor
from mentorship.ml.models.common import SplitPipeline
from mentorship.ml.models.kaggle.storesales.boosting import PipelineLGBMRegressor
from mentorship.features.kaggle.storesales.etl import ETLTransformer
from mentorship.features.history import cut_history
from mentorship.ml.cv.split import DateTimeSeriesSplit
from mentorship.ml.cv.util import print_cv_test_scores


%matplotlib inline
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
# Scorer names passed as `scoring=` to cross_validate in the cells below.
# The 'neg_' scorers return negated errors; the score helpers flip the sign back.
CV_METRICS = [
    'neg_mean_squared_log_error',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_error',
    'r2'
]

In [None]:
# Directory holding the competition CSV files (relative to the working directory).
DATA_ROOT = Path('data', 'kaggle', 'store-sales-time-series-forecasting')

In [None]:
# Raw training data; later cells rely on the columns id, date, store_nbr, family and sales.
train = pd.read_csv(DATA_ROOT / 'train.csv')
train.head()

In [None]:
# One time series per (store, family) pair.
N_STORES = train['store_nbr'].nunique()
N_FAMILIES = train['family'].nunique()
N_TIME_SERIES = N_STORES * N_FAMILIES

DAYS_IN_YEAR = 365
# Number of days per test fold used by the TimeSeriesSplit cells below.
N_HORIZONS = 16

In [None]:
class PositiveRegressor1(BaseEstimator, TransformerMixin):
    """Wrap a regressor so that its predictions are never negative.

    Parameters
    ----------
    estimator : object
        Regressor exposing ``fit`` and ``predict``.
    fit_params : dict, optional
        Extra keyword arguments forwarded to ``estimator.fit``.
        ``None`` (the default) means "no extra arguments".
    """

    def __init__(self, estimator, fit_params=None):
        # Constructor arguments are stored untouched so that sklearn's
        # clone() / get_params() keep working.
        self.estimator = estimator
        self.fit_params = fit_params

    def fit(self, X, y=None):
        """Fit the wrapped estimator and return this wrapper.

        Returning ``self`` (not the inner estimator) keeps chained calls such
        as ``wrapper.fit(X, y).predict(X)`` on the clipping ``predict``.
        """
        self.estimator.fit(X, y, **(self.fit_params or {}))
        return self

    def predict(self, X):
        """Predict with the wrapped estimator and clip the result at zero."""
        y_pred = self.estimator.predict(X)
        return np.clip(y_pred, 0, None)

    def __getattr__(self, item):
        """
        Return attributes of the underlying estimator
        (for easier hyper-parameter tuning).

        Only called when normal attribute lookup has already failed.
        """
        if item == 'estimator':
            # `estimator` itself is missing (e.g. instance created without
            # __init__); delegating would recurse forever.
            raise AttributeError(item)
        return getattr(self.estimator, item)

In [None]:
class RecursiveTSEstimator1(BaseEstimator):
    """Fit one clone of ``base_pipeline`` per value of ``split_key`` and forecast recursively.

    At prediction time the days of each group are processed in ascending
    order and the ``lag_<k>`` features of later days are filled with the
    predictions made for earlier days.

    Parameters
    ----------
    base_pipeline : estimator
        Template pipeline; must expose ``use_final_metric``, ``date_column``
        and ``lags`` attributes as well as ``fit`` / ``predict``.
    split_key : str
        Column of ``X`` whose values define the groups.
    """

    def __init__(self, base_pipeline, split_key):
        self.split_key = split_key
        self.base_pipeline = base_pipeline
        self.pipelines_ = {}

    def fit(self, X, y):
        """Fit a fresh clone of the base pipeline on every group of ``X``."""
        # Start from a clean slate so that a refit never keeps pipelines
        # of groups that are absent from the new training data.
        self.pipelines_ = {}
        for key, X_part in X.groupby(self.split_key):
            y_part = y.loc[X_part.index]
            pipeline = clone(self.base_pipeline)

            if pipeline.use_final_metric:
                # train on log(y + 1); predict() applies the inverse transform
                y_part = np.log(y_part + 1)

            pipeline = pipeline.fit(X_part.drop(columns=[self.split_key]), y_part)

            self.pipelines_[key] = pipeline
        return self

    def predict(self, X):
        """Return a Series of forecasts aligned with ``X.index``."""
        y_preds = {}
        for split_key_value, X_part in X.groupby(self.split_key):
            pipeline = self.pipelines_[split_key_value]

            sorted_dates = X_part[pipeline.date_column].unique()
            sorted_dates.sort()
            for current_day_number, current_day in enumerate(sorted_dates):
                # explicit copy: lag columns are written below and must not
                # touch (or warn about) the caller's frame
                X_current_day = X_part[X_part[pipeline.date_column] == current_day].copy()
                for lag in pipeline.lags:
                    # filling lags with the predictions of previous test days
                    if current_day_number + 1 > lag:
                        # NOTE(review): looks up the calendar day `lag` days back;
                        # assumes consecutive dates stored as 'YYYY-MM-DD' strings.
                        lag_day = str(pd.to_datetime(current_day) - timedelta(days=lag)).split(' ')[0]
                        X_current_day['lag_{}'.format(lag)] = y_preds[(split_key_value, lag_day)].tolist()

                y_pred = pipeline.predict(X_current_day.drop(columns=self.split_key))
                y_preds[(split_key_value, current_day)] = pd.Series(data=y_pred, index=X_current_day.index, name='forecast')

            if pipeline.use_final_metric:
                # undo log(y + 1) only after the whole group is done, so the
                # lag features above stay on the scale the model was trained on
                for current_key in [key for key in y_preds.keys() if key[0] == split_key_value]:
                    y_preds[current_key] = np.exp(y_preds[current_key]) - 1

        y_pred = pd.concat(y_preds.values()).loc[X.index]
        return y_pred

In [None]:
class LagComputer1(PdPipelineStage):
    """pdpipe stage that fills ``lag_<k>`` columns of target-less frames from the training tail.

    During fitting the last ``max(lags)`` days of the training frame are kept.
    When a frame without ``target_col`` is transformed, each lag whose source
    day lies inside that kept window is filled with the stored target values.
    """

    def __init__(self, target_col, lags, split_key, date_column='date'):
        self.split_key = split_key
        self.lags = lags
        self.target_col = target_col
        self.date_column = date_column
        self.is_fitted = False
        super().__init__(exraise=True, desc='Pipeline for lags computing')

    def _prec(self, X: pd.DataFrame) -> bool:
        # the stage is applicable only when the date column is present
        return self.date_column in X.columns

    def _fit_transform(self, X, verbose=None):
        newest_date = pd.to_datetime(X[self.date_column].max())

        # keep the most recent `max(lags)` days of the training data for later use
        window = pd.Timedelta(days=max(self.lags))
        cutoff = str(newest_date - window).split(' ')[0]
        self.last_days_train = X[X[self.date_column] > cutoff]

        self.is_fitted = True
        return X

    def _transform(self, X, verbose=None):
        # frames that still carry the target are passed through untouched
        if self.target_col in X.columns:
            return X

        for current_lag in self.lags:
            # the source day is measured from the first date that appears in X
            first_date = pd.to_datetime(X[self.date_column].unique()[0])
            source_day = str(first_date - pd.Timedelta(days=current_lag)).split(' ')[0]
            if source_day in self.last_days_train[self.date_column].unique():
                stored_rows = self.last_days_train[self.last_days_train[self.date_column] == source_day]
                X['lag_{}'.format(current_lag)] = stored_rows[self.target_col].tolist()

        return X

In [None]:
from pdpipe.skintegrate import PdPipelineAndSklearnEstimator

class LinearPipeline1(PdPipelineAndSklearnEstimator):
    """pdpipe preprocessing followed by a non-negative ``LinearRegression``.

    Preprocessing stages, in order: lag filling (``LagComputer1``), min-max
    scaling of ``cols_to_scale``, one-hot encoding of ``cols_to_encode`` and
    dropping of ``drop_columns`` plus the target and date columns.

    Parameters
    ----------
    split_key, target_col, date_column : str
        Column names forwarded to ``LagComputer1``.
    cols_to_scale, cols_to_encode : list, optional
        Columns handed to ``pdp.Scale`` / ``pdp.OneHotEncode``.
    drop_columns : list, optional
        Extra columns to drop before the regressor; ``None`` means none.
    lags : list of int
        Lag offsets; required by ``LagComputer1`` when the pipeline is fitted.
    fit_params : dict, optional
        Keyword arguments for the regressor's ``fit``; ``None`` means none.
    use_final_metric : bool
        Flag read by ``RecursiveTSEstimator1`` to train on ``log(y + 1)``.
    """

    def __init__(self, split_key, target_col, cols_to_scale=None, cols_to_encode=None, drop_columns=None, date_column='date',
                 lags=None, fit_params=None, use_final_metric=False):
        # constructor arguments are stored untouched (required by sklearn's clone)
        self.cols_to_scale = cols_to_scale
        self.cols_to_encode = cols_to_encode
        self.date_column = date_column
        self.drop_columns = drop_columns
        self.lags = lags
        self.split_key = split_key
        self.target_col = target_col
        self.fit_params = fit_params
        self.use_final_metric = use_final_metric

        # the default drop_columns=None must not break the list concatenation
        extra_drops = [] if self.drop_columns is None else self.drop_columns

        pipeline = pdp.PdPipeline([
            LagComputer1(target_col=self.target_col, lags=self.lags, split_key=self.split_key, date_column=self.date_column),
            pdp.Scale('MinMaxScaler', self.cols_to_scale),
            pdp.OneHotEncode(self.cols_to_encode),
            pdp.ColDrop(extra_drops + [self.target_col, self.date_column], errors='ignore'),
        ])

        model = PositiveRegressor1(LinearRegression(),
                                   fit_params={} if self.fit_params is None else self.fit_params)

        super().__init__(pipeline=pipeline, estimator=model)

In [None]:
from pdpipe.skintegrate import PdPipelineAndSklearnEstimator

class LGBMPipeline1(PdPipelineAndSklearnEstimator):
    """pdpipe preprocessing followed by a non-negative ``LGBMRegressor``.

    Preprocessing stages, in order: lag filling (``LagComputer1``) and
    dropping of ``drop_columns`` plus the target and date columns.

    Parameters
    ----------
    split_key, target_col, date_column : str
        Column names forwarded to ``LagComputer1``.
    drop_columns : list, optional
        Extra columns to drop before the regressor; ``None`` means none.
    lags : list of int
        Lag offsets; required by ``LagComputer1`` when the pipeline is fitted.
    fit_params : dict, optional
        Keyword arguments for the regressor's ``fit``; ``None`` means none.
    use_final_metric : bool
        Flag read by ``RecursiveTSEstimator1`` to train on ``log(y + 1)``.
    """

    def __init__(self, split_key, target_col, drop_columns=None, date_column='date', lags=None, fit_params=None,
                 use_final_metric=False):
        # constructor arguments are stored untouched (required by sklearn's clone)
        self.date_column = date_column
        self.drop_columns = drop_columns
        self.lags = lags
        self.split_key = split_key
        self.target_col = target_col
        self.fit_params = fit_params
        self.use_final_metric = use_final_metric

        # the default drop_columns=None must not break the list concatenation
        extra_drops = [] if self.drop_columns is None else self.drop_columns

        pipeline = pdp.PdPipeline([
            LagComputer1(target_col=self.target_col, lags=self.lags, split_key=self.split_key, date_column=self.date_column),
            pdp.ColDrop(extra_drops + [self.target_col, self.date_column], errors='ignore'),
        ])

        model = PositiveRegressor1(LGBMRegressor(),
                                   fit_params={} if self.fit_params is None else self.fit_params)

        super().__init__(pipeline=pipeline, estimator=model)

In [None]:
class ETLTransformer1(BaseEstimator, TransformerMixin):
    """Join oil prices onto the sales frame and optionally add lagged-target features.

    Parameters
    ----------
    date_column : str
        Column used to join the oil prices.
    id_column : str
        Column used to sort the rows; dropped afterwards.
    adding_lags : bool
        Whether ``lag_<k>`` columns are added (needs the target column in ``X``).
    lags : list of int, optional
        Lag offsets, in rows of each (store_nbr, family) series.
    target_col : str, optional
        Column the lags are computed from; ``None`` falls back to ``'sales'``.
    use_final_metric : bool
        If true, lags are computed from ``log(target + 1)``.
    """

    def __init__(self, date_column, id_column, adding_lags=True, lags=None, target_col=None, use_final_metric=False):
        self.date_column = date_column
        self.id_column = id_column
        # oil prices are read once, when the transformer is created
        self.oil_data = pd.read_csv(DATA_ROOT / 'oil.csv')
        self.adding_lags = adding_lags
        self.lags = lags
        self.target_col = target_col
        self.use_final_metric = use_final_metric

    def fit(self, X, y=None):
        """Stateless: nothing is learned from the data."""
        return self

    def transform(self, X, y=None):
        """Return the tuple ``(transformed X, y)``; ``y`` is passed through unchanged."""
        X = X.merge(self.oil_data, on=self.date_column, how='left')
        # forward-fill days without an oil quote (leading gaps stay NaN)
        X['dcoilwtico'] = X['dcoilwtico'].ffill()
        X = X.sort_values(by=[self.id_column], ascending=True, ignore_index=True)
        X = X.drop(columns=[self.id_column])
        X['family'] = X['family'].str.lower()

        # lags for train set
        if self.adding_lags and self.lags:
            target_col = 'sales' if self.target_col is None else self.target_col
            target = X[target_col]
            if self.use_final_metric:
                target = np.log(target + 1)
            # shift within each (store, family) series so lags never cross series
            per_series = target.groupby([X['store_nbr'], X['family']])
            for current_lag in self.lags:
                X['lag_{}'.format(current_lag)] = per_series.shift(current_lag)
        return X, y

In [None]:
def print_cv_test_scores1(scores):
    """Print ``mean ± std`` of every test metric of a ``cross_validate`` result.

    Keys not starting with ``'test_'`` are ignored. Scores of ``'neg_'``
    scorers are sign-flipped, and the mean squared log error is reported as
    its square root (RMSLE). If that metric is present, its per-fold values
    are also plotted; otherwise no plot is produced.

    Parameters
    ----------
    scores : dict
        Mapping of metric name to an array of per-fold values.
    """
    scores_RMSLE = None
    for metric_name, metric_values in scores.items():
        if not metric_name.startswith('test_'):
            continue

        metric_name = metric_name[len('test_'):]
        if metric_name.startswith('neg_'):
            metric_name = metric_name[len('neg_'):]
            metric_values = -metric_values.copy()

        if metric_name == 'mean_squared_log_error':
            metric_name = 'root_mean_squared_log_error'
            metric_values = np.sqrt(metric_values)
            scores_RMSLE = metric_values

        print(f'{metric_name}: {metric_values.mean():.3f} ± {metric_values.std():.3f}')

    # Without an MSLE scorer there is nothing to plot.
    if scores_RMSLE is None:
        return

    plt.plot(scores_RMSLE)
    plt.xlabel('Fold number')
    plt.ylabel('RMSLE value')

In [None]:
def save_cv_test_scores1(scores):
    """Summarise the test metrics of a ``cross_validate`` result as ``'mean ± std'`` strings.

    Only keys starting with ``'test_'`` are used. A ``'neg_'`` prefix is
    stripped and the values are sign-flipped; the mean squared log error is
    converted to its square root and renamed accordingly.

    Returns
    -------
    dict
        Metric name -> formatted ``'mean ± std'`` string.
    """
    test_prefix, neg_prefix = 'test_', 'neg_'
    summary = {}
    for raw_name, fold_values in scores.items():
        if not raw_name.startswith(test_prefix):
            continue

        name = raw_name[len(test_prefix):]
        if name.startswith(neg_prefix):
            name = name[len(neg_prefix):]
            fold_values = -fold_values.copy()

        if name == 'mean_squared_log_error':
            name, fold_values = 'root_mean_squared_log_error', np.sqrt(fold_values)

        summary[name] = f'{fold_values.mean():.3f} ± {fold_values.std():.3f}'

    return summary

In [None]:
from typing import Optional, Tuple

def cut_history1(
    X: pd.DataFrame,
    date_column: str,
    keep_interval: pd.Timedelta,
    y: Optional[pd.Series] = None
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
    """Keep only the rows whose date lies within ``keep_interval`` of the latest date.

    The cut-off is the latest date in ``date_column`` minus ``keep_interval``,
    rendered as a date string and compared against the column values
    (cut-off day included). If ``y`` is given it must share ``X``'s index and
    is reduced to the same rows.

    Returns
    -------
    tuple
        ``(X restricted to the recent rows, matching y or None)``.
    """
    newest = pd.to_datetime(X[date_column].max())
    cutoff = str(newest - keep_interval).partition(' ')[0]
    recent_rows = X.loc[X[date_column] >= cutoff]

    if y is None:
        return recent_rows, None

    assert X.index.equals(y.index)
    return recent_rows, y.loc[recent_rows.index]

In [None]:
def make_submission_file(test_data, model, output_path):
    """Write the model's predictions for ``test_data`` into a submission CSV.

    The sample submission under ``DATA_ROOT`` is used as the template; its
    'sales' column is replaced with ``model.predict(test_data)`` and the
    result is saved (without the index) to ``DATA_ROOT / output_path``.
    """
    template = pd.read_csv(DATA_ROOT / 'sample_submission.csv')
    forecasts = model.predict(test_data)
    template['sales'] = forecasts
    template.to_csv(DATA_ROOT / output_path, index=False)

# 0.1 test (linear regression with best lags)

In [None]:
# ETL stage for the train data

# lag offsets added as `lag_<k>` features
lags = [1, 2, 4, 6, 7, 14]

X = train.copy()
# use_final_metric=True: the lag features are computed from log(sales + 1)
train_transformer = ETLTransformer1(date_column='date', id_column='id', lags=lags, use_final_metric=True)
# transform() returns an (X, y) tuple; only the frame is needed here
X = train_transformer.transform(X)[0]

# the target stays on the raw scale; the 'sales' column is also left inside X
y = X['sales'].copy()

X.head()

In [None]:
# cross validation

splitter = DateTimeSeriesSplit()
# one linear pipeline is cloned and fitted per product family
base_pipeline = LinearPipeline1(cols_to_scale=['dcoilwtico'], cols_to_encode=['store_nbr'], drop_columns=['onpromotion'], 
                                lags=lags, split_key='family', target_col='sales', use_final_metric=True)
modelling_pipeline = RecursiveTSEstimator1(base_pipeline, split_key='family')

# error_score='raise' surfaces any failure inside a fold instead of recording NaN
scores = cross_validate(modelling_pipeline, X, y, cv=splitter, scoring=CV_METRICS, return_estimator=True, error_score='raise')
print_cv_test_scores1(scores)

In [None]:
# getting X_train and y_train for the final model training
# (only the last DAYS_IN_YEAR days of history are kept)

X_train, y_train = cut_history1(X=X, date_column='date', keep_interval=pd.Timedelta(days=DAYS_IN_YEAR), y=y)

In [None]:
# ETL stage for the test data

test_data = pd.read_csv(DATA_ROOT / 'test.csv')
# adding_lags=False: no lag columns are built here; the fitted pipeline fills
# them at prediction time
test_transformer = ETLTransformer1(date_column='date', id_column='id', adding_lags=False)
test_data = test_transformer.transform(test_data)[0]
test_data.head()

In [None]:
# fitting the final model and making submission file
# (the CSV is written under DATA_ROOT)

modelling_pipeline.fit(X_train, y_train)
make_submission_file(test_data, modelling_pipeline, 'linreg_with_recursive_lags.csv')

# 0.2 test (LGBMRegressor with default params, same features)

In [None]:
# ETL stage for the train data (same lags as in test 0.1)
lags = [1, 2, 4, 6, 7, 14]

X = train.copy()
# use_final_metric is left at its default (False): lags are built from raw sales
train_transformer = ETLTransformer1(date_column='date', id_column='id', lags=lags)
X = train_transformer.transform(X)[0]

# the 'sales' column is also left inside X
y = X['sales'].copy()

X.head()

In [None]:
# Open question: how should the 'categorical_feature' parameter be passed to
# PositiveRegressor(lgb.LGBMRegressor)?
# Inside 'fit' the X matrix is an np.ndarray, so a feature name cannot be used for
# 'categorical_feature'. Column indices are passed instead; they only become known
# after the ColDrop stage.
# NOTE(review): index 0 is presumably 'store_nbr' after ColDrop — verify.


fit_params = {'categorical_feature': [0], 'eval_metric': 'rmse'}

splitter = DateTimeSeriesSplit()
# one LightGBM pipeline is cloned and fitted per product family
base_pipeline = LGBMPipeline1(drop_columns=['onpromotion'], lags=lags, split_key='family', target_col='sales', fit_params=fit_params)
modelling_pipeline = RecursiveTSEstimator1(base_pipeline, split_key='family')

scores = cross_validate(modelling_pipeline, X, y, cv=splitter, scoring=CV_METRICS, return_estimator=True, error_score='raise')
print_cv_test_scores1(scores)

In [None]:
# keep only the last DAYS_IN_YEAR days of history for the final fit
X_train, y_train = cut_history1(X=X, date_column='date', keep_interval=pd.Timedelta(days=DAYS_IN_YEAR), y=y)

In [None]:
# ETL stage for the test data (no lag columns are built here)
test_data = pd.read_csv(DATA_ROOT / 'test.csv')
test_transformer = ETLTransformer1(date_column='date', id_column='id', adding_lags=False)
test_data = test_transformer.transform(test_data)[0]
test_data.head()

In [None]:
# fit the final per-family LightGBM models and write the submission under DATA_ROOT
modelling_pipeline.fit(X_train, y_train)
make_submission_file(test_data, modelling_pipeline, 'default_LGBM_with_recursive_lags.csv')

##########################################################################################

The code below is not ready yet.

##########################################################################################

# 1. LightGBM Regressor (one for all families), features: 'store_nbr', 'family', 'dcoilwtico', 'lag' (1, 2, 4, 6, 7, 14)

In [None]:
# This section uses the packaged ETLTransformer (imported from mentorship),
# not the ETLTransformer1 class defined in this notebook.
X = train.copy()
train_transformer = ETLTransformer(date_column='date', id_column='id')
# the first element of the returned value is taken as the transformed frame
X = train_transformer.transform(X)[0]

In [None]:
# Lag offsets for this section, as listed in the section heading.
# They were previously not defined anywhere in the notebook, so this cell
# failed with NameError on a fresh kernel.
days_to_shift = [1, 2, 4, 6, 7, 14]

# drop rows that have no oil price
X = X[X['dcoilwtico'].notna()]
X = X.reset_index(drop=True)

# lagged sales, shifted within each (store, family) series
for current_lag in days_to_shift:
    X['lag_{}'.format(current_lag)] = X.groupby(['store_nbr', 'family'])['sales'].shift(current_lag)

# the first rows of every series have no lag values and are removed
X = X.dropna()
X.head()

In [None]:
# separate the target from the features
y = X['sales'].copy()
X = X.drop(columns=['sales'])

In [None]:
# integer-encode the family names so LightGBM can treat the column as categorical
X['family'] = LabelEncoder().fit_transform(X['family'])

In [None]:
# Row-count based splits: sizes are expressed as days * number of series.
# NOTE(review): assumes rows are in chronological order with exactly one row per
# series per day — confirm after the notna()/dropna() filtering above.
tscv = TimeSeriesSplit(gap=0, max_train_size=DAYS_IN_YEAR * N_TIME_SERIES, n_splits=4, test_size=N_HORIZONS * N_TIME_SERIES)

In [None]:
# Day-index boundaries used by the recursive prediction loops below: for a lag
# k the boundary is 16 - k, listed in ascending order.
# NOTE(review): 16 presumably equals N_HORIZONS — confirm.
ends = [(16 - x) for x in days_to_shift]
ends.reverse()

In [None]:
# Manual cross-validation of a single LightGBM model with recursive lag filling.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    X_test = X.iloc[test_indices]
    y_test = y.iloc[test_indices]
    
    model = lgb.LGBMRegressor()
    model.fit(X_train.drop(columns=['date']), y_train, categorical_feature=['store_nbr', 'family'])
    
    # NOTE(review): X_test is a selection of X; the writes below go into that selection
    X_test.loc[:, 'pred'] = 0
    days_to_shift_copy = days_to_shift.copy()
    start = 0
    current_day_index = 0
    # Walk through the test days in order. After each boundary in `ends` the
    # largest remaining lag is dropped from days_to_shift_copy.
    # NOTE(review): presumably because current_day_index + lag would otherwise
    # point past the last test day — assumes 16 distinct dates per fold.
    for end in ends:
        for current_day in X_test['date'].unique()[start:end]:
            # date that lies `current_lag` positions after the current day
            current_day_plus_x = {}
            for current_lag in days_to_shift_copy:
                current_day_plus_x[current_lag] = X_test['date'].unique()[current_day_index + current_lag]
                
            X_test_for_current_day = X_test[X_test['date'] == current_day].drop(columns=['date', 'pred'])
            predictions = model.predict(X_test_for_current_day)
            # sales cannot be negative
            predictions[predictions < 0] = 0
                
            # today's predictions become the lag features of the future days
            for current_lag in days_to_shift_copy:
                X_test.loc[X_test[X_test['date'] == current_day_plus_x[current_lag]].index, 'lag_{}'.format(current_lag)] = predictions
                    
            current_day_index += 1
                
        days_to_shift_copy = days_to_shift_copy[:-1]
        start = end

        
    # final prediction over the whole fold, using the recursively filled lags
    y_pred = model.predict(X_test.drop(columns=['date', 'pred']))
    y_pred[y_pred < 0] = 0
    
    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    scores['RMSE'].append(mean_squared_error(y_test, y_pred, squared=False))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

# mean and standard deviation over the folds
for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# RMSLE per fold
plt.plot(scores['RMSLE'])

# 2. LightGBM Regressor for every family, features: 'store_nbr', 'dcoilwtico', 'lag' (1, 2, 4, 6, 7, 14)

In [None]:
# ETL stage for the train data; lags are computed from log(sales + 1)
lags = [1, 2, 4, 6, 7, 14]

X = train.copy()
train_transformer = ETLTransformer1(date_column='date', id_column='id', lags=lags, use_final_metric=True)
X = train_transformer.transform(X)[0]

# the 'sales' column is also left inside X
y = X['sales'].copy()

X.head()

#### Compare scores for each family (linear regression and simplest lgbm regressor)

In [None]:
# Row-count based splits over all series (sizes are days * number of series).
# NOTE(review): the comparison cell below uses DateTimeSeriesSplit instead.
tscv = TimeSeriesSplit(gap=0, max_train_size=DAYS_IN_YEAR * N_TIME_SERIES, n_splits=4, test_size=N_HORIZONS * N_TIME_SERIES)

In [None]:
# Cross-validate the linear and the LightGBM recursive pipelines separately for
# every family and keep the formatted scores for comparison.
splitter = DateTimeSeriesSplit()

base_pipeline_linear = LinearPipeline1(cols_to_scale=['dcoilwtico'], cols_to_encode=['store_nbr'], drop_columns=['onpromotion'],
                                       lags=lags, split_key='family', target_col='sales', use_final_metric=True)
modelling_pipeline_linear = RecursiveTSEstimator1(base_pipeline_linear, split_key='family')

# categorical feature is given by column index (see the note in test 0.2)
fit_params_lgbm = {'categorical_feature': [0], 'eval_metric': 'rmse'}
base_pipeline_lgbm = LGBMPipeline1(drop_columns=['onpromotion'], lags=lags, split_key='family', target_col='sales', 
                                   fit_params=fit_params_lgbm, use_final_metric=True)
modelling_pipeline_lgbm = RecursiveTSEstimator1(base_pipeline_lgbm, split_key='family')

# family name -> {metric name: 'mean ± std'}
final_scores_linear, final_scores_lgbm = {}, {}
for current_family in X['family'].unique():
    print(current_family)
    X_current_family = X[X['family'] == current_family]
    y_current_family = y.loc[X_current_family.index]
    scores_linear = cross_validate(modelling_pipeline_linear, X_current_family, y_current_family, cv=splitter, 
                                   scoring=CV_METRICS, return_estimator=True, error_score='raise')
    scores_lgbm = cross_validate(modelling_pipeline_lgbm, X_current_family, y_current_family, cv=splitter, 
                                 scoring=CV_METRICS, return_estimator=True, error_score='raise')
    
    final_scores_linear[current_family] = save_cv_test_scores1(scores_linear)
    final_scores_lgbm[current_family] = save_cv_test_scores1(scores_lgbm)

In [None]:
# Per family: first line is the linear model's RMSLE, second line is LightGBM's.
for key in final_scores_linear.keys():
    print(f'{key}: {final_scores_linear[key]["root_mean_squared_log_error"]}')
    print(f'{key}: {final_scores_lgbm[key]["root_mean_squared_log_error"]}')
    print()

In [None]:
# Families to be modelled with linear regression.
# NOTE(review): presumably picked from the comparison printed above; this list is
# reassigned with different values in the mixed-model cells further down.
linear_categories = ['baby care', 'beauty', 'books', 'frozen foods', 'grocery ii', 'home appliances', 'ladieswear', 
                     'lawn and garden', 'magazines', 'school and office supplies']

In [None]:
# keep only the last DAYS_IN_YEAR days of history (packaged cut_history helper)
X_train, y_train = cut_history(X=X, date_column='date', keep_interval=timedelta(days=DAYS_IN_YEAR), y=y)

In [None]:
# For a few families, fit both model types on the recent history and plot what
# each of them relies on (linear weights vs. LightGBM gain importances).
# NOTE(review): X_train still seems to contain the 'sales' column here (only
# 'family' and 'date' are dropped), so the target may leak into the features — confirm.
bad_lgbm_families = ['baby care', 'books', 'home appliances']

for current_family in bad_lgbm_families:
    print(current_family)
    current_family_indices_train = X_train[X_train['family'] == current_family].index
    X_train_current_family_lgbm = X_train[X_train['family'] == current_family].drop(columns=['family'])
    y_train_current_family = y_train.loc[current_family_indices_train]
    
    # importance_type='gain' instead of LightGBM's default importance type
    lgbm_model = lgb.LGBMRegressor(importance_type='gain')
    lgbm_model.fit(X_train_current_family_lgbm.drop(columns=['date']), y_train_current_family, categorical_feature=['store_nbr'])
    
    
    # the linear model gets one-hot encoded stores and a min-max scaled oil price
    X_train_current_family_linear = pd.get_dummies(X_train_current_family_lgbm, columns=['store_nbr'], drop_first=True)
    scaler = MinMaxScaler()
    X_train_current_family_linear[['dcoilwtico']] = scaler.fit_transform(X_train_current_family_linear[['dcoilwtico']])   
    linear_model = PositiveRegressor(LinearRegression())
    linear_model.fit(X_train_current_family_linear.drop(columns=['date']), y_train_current_family)
    
    
    # linear regression feature importance (weights)
    linear_feature_imp = pd.DataFrame(sorted(zip(linear_model.coef_, X_train_current_family_linear.drop(columns=['date']).columns)), columns=['Value', 'Feature'])
    plt.figure(figsize=(20, 10))
    sns.barplot(x='Value', y='Feature', data=linear_feature_imp.sort_values(by='Value', ascending=False))
    plt.title('LinearRegression')
    plt.tight_layout()
    plt.show()
    
    # lgbm feature importance
    lgbm_feature_imp = pd.DataFrame(sorted(zip(lgbm_model.feature_importances_, X_train_current_family_lgbm.drop(columns=['date']).columns)), columns=['Value', 'Feature'])
    plt.figure(figsize=(6, 3))
    sns.barplot(x='Value', y='Feature', data=lgbm_feature_imp.sort_values(by='Value', ascending=False))
    plt.title('LightGBM')
    plt.tight_layout()
    plt.show()


#### Learning curve for default lgbm regressor

In [None]:
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None, scoring=None, train_sizes=np.linspace(0.1, 1.0, 5),
                        fit_params=None):
    """Draw training and cross-validation learning curves on the current axes.

    The scores returned by ``learning_curve`` are negated before plotting, so
    the function is meant for ``neg_*`` scorers. ``n_jobs`` is accepted but
    not forwarded to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so the caller can continue styling.
    """
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fold_train_scores, fold_test_scores = learning_curve(
        estimator, X, y, scoring=scoring, cv=cv, train_sizes=train_sizes, fit_params=fit_params)

    # Plot learning curve (mean over the folds, sign flipped back to an error)
    plt.grid()
    curves = (
        (fold_train_scores, "r", "Training score"),
        (fold_test_scores, "g", "Cross-validation score"),
    )
    for fold_scores, colour, curve_label in curves:
        plt.plot(sizes, np.mean(-fold_scores, axis=1), "o-", color=colour, label=curve_label)
    plt.legend(loc="best")
    return plt

In [None]:
from sklearn.model_selection import validation_curve

def plotting_validation_curve(estimator, X, y, param_name, param_range, split_key, scoring, cv=None, date_column='date'):
    """Plot one validation curve per value of ``split_key``.

    For every group the ``date_column`` and ``split_key`` columns are dropped,
    ``validation_curve`` is run over ``param_range`` for ``param_name`` and the
    fold-averaged training / cross-validation scores are plotted. Scores are
    negated before plotting, so the function is meant for ``neg_*`` scorers.

    Parameters
    ----------
    estimator : estimator
        Model whose hyper-parameter is varied.
    X : pd.DataFrame
        Features, including ``split_key`` and ``date_column``.
    y : pd.Series
        Target sharing ``X``'s index.
    param_name, param_range
        Hyper-parameter name and the values to try.
    split_key : str
        Column defining the groups.
    scoring
        Scorer passed to ``validation_curve``.
    cv
        Cross-validation splitter passed to ``validation_curve``.
    date_column : str
        Date column to drop before fitting.
    """
    estimator_name = type(estimator).__name__
    for current_split_key in X[split_key].unique():
        X_current_split_key = X[X[split_key] == current_split_key].drop(columns=[date_column, split_key])
        # X's index holds labels, so the target is selected by label (.loc),
        # not by position; the two only coincide for a default RangeIndex.
        y_current_split_key = y.loc[X_current_split_key.index]

        train_scores, test_scores = validation_curve(estimator, X_current_split_key, y_current_split_key, param_name=param_name,
                                                     param_range=param_range, scoring=scoring, cv=cv)

        train_scores_mean = np.mean(-train_scores, axis=1)
        test_scores_mean = np.mean(-test_scores, axis=1)

        plt.plot(param_range, train_scores_mean, 'o-', color='r', label='Training score')
        plt.plot(param_range, test_scores_mean, 'o-', color='g', label='Cross-validation score')

        plt.title('{}: Validation Curve with {}'.format(current_split_key, estimator_name))
        # label the axis with the parameter that is actually varied
        plt.xlabel(param_name)
        plt.ylabel('Score')
        plt.legend(loc="best")
        plt.show()

In [None]:
# linear models: learning curves (MAE and RMSLE) per family
# NOTE(review): X_current_family still seems to contain the 'sales' column (only
# 'family' and 'date' are dropped), so the target may leak into the features — confirm.
for i, current_family in enumerate(X['family'].unique()):
    # a new figure is created for every family; the two plots occupy row i of the grid
    plt.figure(figsize=(20, 130))
    plt.subplot(X['family'].nunique(), 2, 2 * i + 1)
    X_current_family = X[X['family'] == current_family].drop(columns=['family', 'date'])
    y_current_family = y.loc[X_current_family.index]

    # the titles name the model that is actually fitted in this cell
    titles = {'RMSLE': '{}: Learning Curves (LinearRegression), RMSLE'.format(current_family),
              'MAE': '{}: Learning Curves (LinearRegression), MAE'.format(current_family)}
    tscv = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR + 120) * N_STORES, n_splits=4, test_size=N_HORIZONS * N_STORES)
    model = PositiveRegressor(LinearRegression())
    plot_learning_curve(model, titles['MAE'], X_current_family, y_current_family, cv=tscv, scoring='neg_mean_absolute_error')
    plt.subplot(X['family'].nunique(), 2, 2 * i + 2)
    plot_learning_curve(model, titles['RMSLE'], X_current_family, y_current_family, cv=tscv, scoring='neg_mean_squared_log_error')
    plt.show()

In [None]:
# default LightGBM models: learning curves (MAE and RMSLE) per family
# NOTE(review): X_current_family still seems to contain the 'sales' column (only
# 'family' and 'date' are dropped), so the target may leak into the features — confirm.
for i, current_family in enumerate(X['family'].unique()):
    # a new figure is created for every family; the two plots occupy row i of the grid
    plt.figure(figsize=(20, 130))
    plt.subplot(X['family'].nunique(), 2, 2 * i + 1)
    X_current_family = X[X['family'] == current_family].drop(columns=['family', 'date'])
    y_current_family = y.loc[X_current_family.index]
    
    titles = {'RMSLE': '{}: Learning Curves (default LGBM Regressor), RMSLE'.format(current_family), 
              'MAE': '{}: Learning Curves (default LGBM Regressor), MAE'.format(current_family)}
    # row-count based split over the stores of a single family
    tscv = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR + 120) * N_STORES, n_splits=4, test_size=N_HORIZONS * N_STORES)
    model = PositiveRegressor(lgb.LGBMRegressor())
    plot_learning_curve(model, titles['MAE'], X_current_family, y_current_family, cv=tscv, scoring='neg_mean_absolute_error')
    plt.subplot(X['family'].nunique(), 2, 2 * i + 2)
    plot_learning_curve(model, titles['RMSLE'], X_current_family, y_current_family, cv=tscv, scoring='neg_mean_squared_log_error')
    plt.show()

In [None]:
# Validation curves over n_estimators for a default LGBMRegressor, one plot per family.
param_range = [30, 50, 100, 250, 500, 1000]
tscv = TimeSeriesSplit(gap=0, max_train_size=DAYS_IN_YEAR * N_STORES, n_splits=4, test_size=N_HORIZONS * N_STORES)
# NOTE(review): `custom_RMSLE` is not defined in the visible part of this notebook;
# this call raises NameError on a fresh kernel unless it is defined elsewhere.
plotting_validation_curve(lgb.LGBMRegressor(), X, y, param_name='n_estimators', param_range=param_range,
                          split_key='family', scoring=custom_RMSLE, cv=tscv)

#### Mixed model (linear regression or LightGBM, depending on the current family)

In [None]:
# Assignment of every family to a model variant.
# Family names are written exactly as they appear in the lower-cased 'family'
# column, i.e. with spaces, so that membership tests against it can match.
lgbm_default_RMSLE_families = ['automotive', 'baby care', 'magazines', 'pet supplies', 'school and office supplies', 'seafood']
linear_model_families = ['books', 'frozen foods']
lgbm_default_MAE_families = ['celebration', 'dairy', 'eggs', 'grocery i', 'home and kitchen i', 'liquor,wine,beer',
                             'meats', 'poultry', 'produce']
optuna_RMSLE_families = ['beauty', 'hardware', 'home appliances', 'ladieswear', 'lingerie']
optuna_MAE_families = ['beverages', 'bread/bakery', 'cleaning', 'deli', 'grocery ii', 'home and kitchen ii', 'home care',
                       'lawn and garden', 'personal care', 'players and electronics', 'prepared foods']

In [None]:
# Mixed-model cross-validation: per family either a linear regression or a
# LightGBM model tuned with Optuna's LightGBMTunerCV, evaluated with recursive lags.

# base LightGBM parameters handed to the tuner
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'num_boost_round': 1000,
    'n_jobs': -1,
}
split_key = 'family'

# outer split: evaluation over all series; inner split: tuning within one family
tscv_outer = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR + 165) * N_TIME_SERIES, n_splits=4, test_size=N_HORIZONS * N_TIME_SERIES)
tscv_inner = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR + 100) * N_STORES, n_splits=4, test_size=N_HORIZONS * N_STORES)

# families handled by the linear model; all others get a tuned LightGBM
linear_categories = ['books', 'frozen foods', 'beverages', 'cleaning', 'grocery ii', 'home and kitchen ii', 'home appliances']

scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}
for train_indices, test_indices in tscv_outer.split(X, y):
    X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
    X_test, y_test = X.iloc[test_indices], y.iloc[test_indices]
    # models are trained on log(y + 1); predictions are transformed back below
    y_train = np.log(y_train + 1)
    
    # NOTE(review): X_test is a selection of X; the writes below go into that selection
    X_test.loc[:, 'pred'] = 0
    for current_split_key_value in X['family'].unique():
        print(current_split_key_value)
        X_train_current_split_key_value = X_train[X_train[split_key] == current_split_key_value].drop(columns=[split_key, 'date'])
        y_train_current_split_key_value = y_train.loc[X_train_current_split_key_value.index]
        # the test part keeps 'date' and 'pred'; they are dropped right before predicting
        X_test_current_split_key_value = X_test[X_test[split_key] == current_split_key_value].drop(columns=split_key)
    
        model = PositiveRegressor(LinearRegression())
        if current_split_key_value in linear_categories:
            print(current_split_key_value)
            # linear branch: one-hot encoded stores and min-max scaled oil price
            X_train_current_split_key_value = pd.get_dummies(X_train_current_split_key_value, columns=['store_nbr'], drop_first=True)
            scaler = MinMaxScaler()
            X_train_current_split_key_value[['dcoilwtico']] = scaler.fit_transform(X_train_current_split_key_value[['dcoilwtico']])
            model.fit(X_train_current_split_key_value, y_train_current_split_key_value)
            # NOTE(review): current_family_indices_test is not used anywhere in this cell
            current_family_indices_test = X_test[X_test['family'] == current_split_key_value].index
            X_test_current_split_key_value = pd.get_dummies(X_test_current_split_key_value, columns=['store_nbr'], drop_first=True)
            X_test_current_split_key_value[['dcoilwtico']] = scaler.transform(X_test_current_split_key_value[['dcoilwtico']])
        
        else:
            # LightGBM branch: tune hyper-parameters on the family's training data
            dtrain = lgb.Dataset(X_train_current_split_key_value, label=y_train_current_split_key_value, 
                                 categorical_feature=['store_nbr'])

            tuner = LightGBMTunerCV(params, dtrain, folds=tscv_inner, callbacks=[early_stopping(25)], return_cvbooster=True)
            tuner.run()
        
            # keep only the tuned parameters (those not fixed in `params`)
            # NOTE(review): the first letter of the identifier below appears to be a
            # Cyrillic 'с' rather than a Latin 'c'; it is used consistently in this cell.
            сurrent_family_best_params = {key:tuner.best_params[key] for key in tuner.best_params.keys() if key not in params.keys()}
            сurrent_family_best_params['num_boost_round'] = tuner.get_best_booster().best_iteration
            print(сurrent_family_best_params)
        
            model = PositiveRegressor(lgb.LGBMRegressor(**сurrent_family_best_params)) \
                                           .fit(X_train_current_split_key_value, y_train_current_split_key_value)
    
        # Recursive lag filling, walking through the test days in order.
        # NOTE(review): `ends` comes from an earlier cell (built from days_to_shift);
        # after each boundary the largest remaining lag is dropped.
        lags_copy = lags.copy()
        start = 0
        current_day_index = 0
        for end in ends:
            for current_day in X_test_current_split_key_value['date'].unique()[start:end]:
                # date that lies `current_lag` positions after the current day
                current_day_plus_x = {}
                for current_lag in lags_copy:
                    current_day_plus_x[current_lag] = X_test_current_split_key_value['date'].unique()[current_day_index + current_lag]
                
                X_test_for_current_day = X_test_current_split_key_value[X_test_current_split_key_value['date'] == current_day].drop(columns=['date', 'pred'])
                predictions = model.predict(X_test_for_current_day)
                
                # today's predictions become the lag features of the future days
                for current_lag in lags_copy:
                    X_test_current_split_key_value.loc[X_test_current_split_key_value[X_test_current_split_key_value['date'] == current_day_plus_x[current_lag]].index, 'lag_{}'.format(current_lag)] = predictions
                    
                current_day_index += 1
                
            lags_copy = lags_copy[:-1]
            start = end
    
        # final prediction for this family, using the recursively filled lags
        X_test.loc[X_test_current_split_key_value.index, 'pred'] = model.predict(X_test_current_split_key_value.drop(columns=['date', 'pred']))
        
    y_pred = X_test['pred'].copy()
    # back from log(y + 1) to the original scale
    y_pred = np.exp(y_pred) - 1

    print(mean_squared_log_error(y_test, y_pred, squared=False), mean_squared_error(y_test, y_pred, squared=False), 
          mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred))
    scores['RMSLE'].append(mean_squared_log_error(y_test, y_pred, squared=False))
    scores['RMSE'].append(mean_squared_error(y_test, y_pred, squared=False))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

In [None]:
# Report every collected metric as mean ± sample standard deviation over the CV folds.
for metric_name in scores:
    metric_values = scores[metric_name]
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# RMSLE of each evaluated CV fold, in the order the folds were scored.
# Labels make the figure readable on its own; the trailing ';' hides the Line2D repr.
plt.plot(scores['RMSLE'])
plt.xlabel('CV fold')
plt.ylabel('RMSLE')
plt.title('RMSLE per CV fold');

In [None]:
# Tune LightGBM hyper-parameters separately for every family that is not modelled linearly.
# cut_history is called with a 530-day keep_interval (presumably the most recent 530 days
# of history are kept — confirm against cut_history).
X_train, y_train = cut_history(X=X, date_column='date', keep_interval=pd.Timedelta(days=530), y=y)
# The models are trained on log(1 + sales).
y_train = np.log(y_train + 1)

# Fixed LightGBM settings handed to the tuner; these keys are filtered out of the
# tuned result further down.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'num_boost_round': 1000,
    'n_jobs': -1,
}
split_key = 'family'

# Inner time-series CV used by the tuner; all sizes are expressed in rows (days * N_STORES).
tscv_inner = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR + 100) * N_STORES, n_splits=4, test_size=N_HORIZONS * N_STORES)

# Families excluded from tuning; the submission cell fits LinearRegression for them instead.
linear_categories = ['books', 'frozen foods', 'beverages', 'cleaning', 'grocery ii', 'home and kitchen ii', 'home appliances']

# Maps family name -> tuned LightGBM parameters.
best_params = {}
for current_split_key_value in [x for x in X[split_key].unique() if x not in linear_categories]:
    print(current_split_key_value)
    # Training rows of this family only; the split key and the date are not model features.
    X_train_current_split_key_value = X_train[X_train[split_key] == current_split_key_value].drop(columns=[split_key, 'date'])
    y_train_current_split_key_value = y_train.loc[X_train_current_split_key_value.index]
    
    dtrain = lgb.Dataset(X_train_current_split_key_value, label=y_train_current_split_key_value, 
                            categorical_feature=['store_nbr'])

    # Step-wise Optuna tuner with early stopping after 25 rounds without improvement.
    tuner = LightGBMTunerCV(params, dtrain, folds=tscv_inner, callbacks=[early_stopping(25)], return_cvbooster=True)
    tuner.run()
        
    # Keep only the parameters the tuner added on top of the fixed `params`.
    # NOTE(review): this identifier appears to start with a Cyrillic 'с' rather than a
    # Latin 'c' — TODO confirm and rename to an ASCII name.
    сurrent_family_best_params = {key:tuner.best_params[key] for key in tuner.best_params.keys() if key not in params.keys()}
    # Use the best iteration reported by the tuner's booster as the number of boosting rounds.
    сurrent_family_best_params['num_boost_round'] = tuner.get_best_booster().best_iteration
    print(сurrent_family_best_params)
    best_params[current_split_key_value] = сurrent_family_best_params

In [None]:
# Load the Kaggle test set (without `onpromotion`) and run it through ETLTransformer.
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]

# The distinct dates do not change inside the loops, so compute them once instead of
# calling .unique() on every inner iteration.
test_dates = test_data['date'].unique()
train_dates = X_train['date'].unique()

for current_lag in lags:
    lag_column = 'lag_{}'.format(current_lag)
    # Placeholder for lags whose source day lies inside the forecast window; the
    # recursive forecast overwrites them with predictions later.
    test_data.loc[:, lag_column] = 0

    # For the first `current_lag` test days the lagged target is still part of the training
    # history: test day i looks back to the (current_lag - i)-th last training day.
    # NOTE(review): the positional assignment assumes rows of one date are ordered
    # identically in train and test — TODO confirm.
    for i in range(current_lag):
        test_rows = test_data[test_data['date'] == test_dates[i]].index
        known_targets = y_train.loc[X_train['date'] == train_dates[-(current_lag - i)]]
        test_data.loc[test_rows, lag_column] = known_targets.tolist()

In [None]:
# Build the submission: one model per family (tuned LightGBM or linear regression),
# forecasting the test window recursively so lag features can be filled from predictions.
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')

for current_family in X['family'].unique():
    X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['family'])
    y_train_current_family = y_train.loc[X_train_current_family.index]
    X_test_current_family = test_data[test_data['family'] == current_family].drop(columns=['family'])
    
    if current_family not in linear_categories:
        # Tree model using the hyper-parameters tuned for this family.
        # NOTE: this rebinds the notebook-level name `params`.
        params = best_params[current_family]
        model = PositiveRegressor(lgb.LGBMRegressor(**params))
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family, categorical_feature=['store_nbr'], 
                  eval_metric='rmse')
        
    else:
        # Linear model: one-hot encode the store and min-max scale the oil price.
        # The scaler is fit on train only and re-used for test.
        X_train_current_family = pd.get_dummies(X_train_current_family, columns=['store_nbr'], drop_first=True)
        scaler = MinMaxScaler()
        X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])
        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)
        # NOTE(review): dummies are built independently for train and test; this assumes both
        # contain the same set of stores — TODO confirm.
        X_test_current_family = pd.get_dummies(X_test_current_family, columns=['store_nbr'], drop_first=True)
        X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])
    
    # Recursive forecast: predict one day at a time and write each day's predictions into
    # the lag columns of the later days that look back to it.
    lags_copy = lags.copy()
    start = 0
    current_day_index = 0
    # `ends` (defined elsewhere in the notebook) holds day-index boundaries; after each
    # boundary the last entry of `lags_copy` is dropped — presumably because day + lag would
    # fall outside the test window (TODO confirm).
    for end in ends:
        for current_day in X_test_current_family['date'].unique()[start:end]:
            # Target date for every active lag: current day shifted forward by the lag.
            current_day_plus_x = {}
            for current_lag in lags_copy:
                current_day_plus_x[current_lag] = X_test_current_family['date'].unique()[current_day_index + current_lag]
                
            X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date'])
            predictions = model.predict(X_test_for_current_day)
                
            # Predictions stay on the log scale, matching the log-transformed training target.
            for current_lag in lags_copy:
                X_test_current_family.loc[X_test_current_family[X_test_current_family['date'] == current_day_plus_x[current_lag]].index, 'lag_{}'.format(current_lag)] = predictions
                    
            current_day_index += 1
                
        lags_copy = lags_copy[:-1]
        start = end
            
        
    # With all lag columns filled, predict the whole window and undo the log(1 + y) transform.
    y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date']))
    y_pred_current_family = np.exp(y_pred_current_family) - 1
    
    # NOTE(review): assumes test_data's index labels match the submission's row labels — TODO confirm.
    test_indices = test_data[test_data['family'] == current_family].index
    submission.loc[test_indices, 'sales'] = y_pred_current_family

In [None]:
# Save the submission built from the Optuna-tuned LightGBM and linear-regression models;
# index=False keeps the DataFrame index out of the CSV. The path is relative to the
# current working directory.
submission.to_csv('./data/kaggle/store-sales-time-series-forecasting/lgbm_linear_regression_OptunaTunerCV.csv', index = False)

In [None]:
# Learning curves (MAE and MSLE scoring) for every family.
for i, current_family in enumerate(X['family'].unique()):
    # A new figure is created on every iteration; the subplot grid has one row per family
    # and this family's two plots are placed in row i of that grid.
    plt.figure(figsize=(20, 130))
    plt.subplot(X['family'].nunique(), 2, 2 * i + 1)
    X_current_family = X[X['family'] == current_family].drop(columns=['family', 'date'])
    y_current_family = y.loc[X_current_family.index]
    
    titles = {'RMSLE': '{}: Learning Curves (default LGBM Regressor), RMSLE'.format(current_family), 
              'MAE': '{}: Learning Curves (default LGBM Regressor), MAE'.format(current_family)}
    # Row-based time-series CV; sizes are expressed in rows (days * N_STORES).
    tscv = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR + 120) * N_STORES, n_splits=4, test_size=N_HORIZONS * N_STORES)
    
    # Default LightGBM unless the family is listed as tuned.
    # NOTE(review): `optuna_RMSLE_families`, `optuna_MAE_families` and
    # `current_family_best_params` are not defined in this cell — they must already exist
    # in the kernel.
    params = {}
    if current_family in optuna_RMSLE_families + optuna_MAE_families:
        params = current_family_best_params[current_family]
        
    model = PositiveRegressor(lgb.LGBMRegressor(**params))
    fit_params = {'categorical_feature': ['store_nbr']}
    # Left plot: MAE scoring.
    plot_learning_curve(model, titles['MAE'], X_current_family, y_current_family, cv=tscv, scoring='neg_mean_absolute_error',
                        fit_params=fit_params)
    plt.subplot(X['family'].nunique(), 2, 2 * i + 2)
    # Right plot: titled RMSLE, scored with sklearn's 'neg_mean_squared_log_error'.
    plot_learning_curve(model, titles['RMSLE'], X_current_family, y_current_family, cv=tscv, scoring='neg_mean_squared_log_error', 
                        fit_params=fit_params)
    plt.show()

In [None]:
# Learning curves for a few selected families with a larger maximum training window
# (DAYS_IN_YEAR + 180 days), to check whether more samples make the curves converge.

special = ['hardware', 'ladieswear', 'lawn and garden', 'magazines']
for i, current_family in enumerate(special):
    # A new figure per family; the plots go into row i of a len(special) x 2 grid.
    plt.figure(figsize=(20, 20))
    plt.subplot(len(special), 2, 2 * i + 1)
    X_current_family = X[X['family'] == current_family].drop(columns=['family', 'date'])
    y_current_family = y.loc[X_current_family.index]
    
    titles = {'RMSLE': '{}: Learning Curves (default LGBM Regressor), RMSLE'.format(current_family), 
              'MAE': '{}: Learning Curves (default LGBM Regressor), MAE'.format(current_family)}
    # Row-based time-series CV; sizes are expressed in rows (days * N_STORES).
    tscv = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR + 180) * N_STORES, n_splits=4, test_size=N_HORIZONS * N_STORES)
    
    # Default LightGBM unless the family is listed as tuned.
    # NOTE(review): `optuna_RMSLE_families`, `optuna_MAE_families` and
    # `current_family_best_params` are not defined in this cell — they must already exist
    # in the kernel.
    params = {}
    if current_family in optuna_RMSLE_families + optuna_MAE_families:
        params = current_family_best_params[current_family]
        
    model = PositiveRegressor(lgb.LGBMRegressor(**params))
    fit_params = {'categorical_feature': ['store_nbr']}
    # Left plot: MAE scoring.
    plot_learning_curve(model, titles['MAE'], X_current_family, y_current_family, cv=tscv, scoring='neg_mean_absolute_error',
                        fit_params=fit_params)
    plt.subplot(len(special), 2, 2 * i + 2)
    # Right plot: titled RMSLE, scored with sklearn's 'neg_mean_squared_log_error'.
    plot_learning_curve(model, titles['RMSLE'], X_current_family, y_current_family, cv=tscv, scoring='neg_mean_squared_log_error', 
                        fit_params=fit_params)
    plt.show()

In [None]:
# LGBM + linreg; default; 'store_nbr' -> target_encoding
# Time-series CV of the per-family models (default LightGBM, or linear regression for
# `linear_categories`) with `store_nbr` target-encoded instead of passed as a categorical.

scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

linear_categories = ['books', 'frozen foods', 'beverages', 'cleaning', 'grocery ii', 'home and kitchen ii', 'home appliances']

# Outer CV over the whole frame; sizes are expressed in rows (days * N_TIME_SERIES).
tscv = TimeSeriesSplit(gap=0, max_train_size=DAYS_IN_YEAR * N_TIME_SERIES, n_splits=4, test_size=N_HORIZONS * N_TIME_SERIES)
for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Models are fit on log(1 + sales); predictions are mapped back with exp(.) - 1 below.
    y_train = np.log(y_train + 1)
    X_test = X.iloc[test_indices]
    y_test = y.iloc[test_indices]
    
    # Scratch column collecting the log-scale predictions of every family.
    X_test.loc[:, 'pred'] = 0
    for current_family in X['family'].unique():
        print(current_family)
        X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['family'])
        y_train_current_family = y_train.loc[X_train_current_family.index]
        X_test_current_family = X_test[X_test['family'] == current_family].drop(columns=['family'])
        current_family_indices_test = X_test[X_test['family'] == current_family].index
        
        # Target-encode the store id; the encoder is fit on the training fold only.
        X_train_current_family['store_nbr'] = X_train_current_family['store_nbr'].astype('object')
        encoder = TargetEncoder()
        X_train_current_family['store_nbr'] = encoder.fit_transform(X_train_current_family['store_nbr'], y_train_current_family)
        X_test_current_family['store_nbr'] = encoder.transform(X_test_current_family['store_nbr'])
        
        if current_family not in linear_categories:
            #X_train_current_family['store_nbr'] = X_train_current_family['store_nbr'].astype('object')
            #encoder = TargetEncoder()
            #X_train_current_family['store_nbr'] = encoder.fit_transform(X_train_current_family['store_nbr'], y_train_current_family)
            model = PositiveRegressor(lgb.LGBMRegressor())
            model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family, eval_metric='rmse')#, categorical_feature=['store_nbr'])
            #X_test_current_family['store_nbr'] = encoder.transform(X_test_current_family['store_nbr'])
            
        else:
            # Linear model: min-max scale the oil price (scaler fit on train, applied to test).
            #X_train_current_family = pd.get_dummies(X_train_current_family, columns=['store_nbr'], drop_first=True)
            scaler = MinMaxScaler()
            X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])
            model = PositiveRegressor(LinearRegression())
            model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)
            #X_test_current_family = pd.get_dummies(X_test_current_family, columns=['store_nbr'], drop_first=True)
            X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])
    
        # Recursive forecast: predict one day at a time and write each day's predictions
        # into the lag columns of the later days that look back to it.
        lags_copy = lags.copy()
        start = 0
        current_day_index = 0
        # `ends` (defined elsewhere in the notebook) holds day-index boundaries; after each
        # boundary the last entry of `lags_copy` is dropped — presumably because day + lag
        # would fall outside the test window (TODO confirm).
        for end in ends:
            for current_day in X_test_current_family['date'].unique()[start:end]:
                # Target date for every active lag: current day shifted forward by the lag.
                current_day_plus_x = {}
                for current_lag in lags_copy:
                    current_day_plus_x[current_lag] = X_test_current_family['date'].unique()[current_day_index + current_lag]
                
                X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date', 'pred'])
                predictions = model.predict(X_test_for_current_day)
                
                for current_lag in lags_copy:
                    X_test_current_family.loc[X_test_current_family[X_test_current_family['date'] == current_day_plus_x[current_lag]].index, 'lag_{}'.format(current_lag)] = predictions
                    
                current_day_index += 1
                
            lags_copy = lags_copy[:-1]
            start = end

        
        # With all lag columns filled, predict the whole fold for this family.
        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        X_test.loc[current_family_indices_test, 'pred'] = y_pred_current_family

        
    # Back-transform from log(1 + y) to the sales scale before scoring.
    y_pred = X_test['pred'].copy()
    y_pred = np.exp(y_pred) - 1
    X_test = X_test.drop(columns=['pred'])
    
    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    scores['RMSE'].append(mean_squared_error(y_test, y_pred, squared=False))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

# Mean ± sample standard deviation of each metric over the folds.
for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# Final training set for the submission: cut_history with a DAYS_IN_YEAR keep_interval,
# target on the log(1 + sales) scale.
X_train, y_train = cut_history(X=X, date_column='date', keep_interval=pd.Timedelta(days=DAYS_IN_YEAR), y=y)
y_train = np.log(y_train + 1)

# Load the Kaggle test set (without `onpromotion`) and run it through ETLTransformer.
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]

# The distinct dates do not change inside the loops, so compute them once instead of
# calling .unique() on every inner iteration.
test_dates = test_data['date'].unique()
train_dates = X_train['date'].unique()

for current_lag in lags:
    lag_column = 'lag_{}'.format(current_lag)
    # Placeholder for lags whose source day lies inside the forecast window; the
    # recursive forecast overwrites them with predictions later.
    test_data.loc[:, lag_column] = 0

    # For the first `current_lag` test days the lagged target is still part of the training
    # history: test day i looks back to the (current_lag - i)-th last training day.
    # NOTE(review): the positional assignment assumes rows of one date are ordered
    # identically in train and test — TODO confirm.
    for i in range(current_lag):
        test_rows = test_data[test_data['date'] == test_dates[i]].index
        known_targets = y_train.loc[X_train['date'] == train_dates[-(current_lag - i)]]
        test_data.loc[test_rows, lag_column] = known_targets.tolist()

In [None]:
# Build the submission with target-encoded `store_nbr`: one model per family (default
# LightGBM or linear regression), forecasting the test window recursively.
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')

for current_family in X['family'].unique():
    X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['family'])
    y_train_current_family = y_train.loc[X_train_current_family.index]
    X_test_current_family = test_data[test_data['family'] == current_family].drop(columns=['family'])
    
    # Target-encode the store id; the encoder is fit on the training data only.
    X_train_current_family['store_nbr'] = X_train_current_family['store_nbr'].astype('object')
    encoder = TargetEncoder()
    X_train_current_family['store_nbr'] = encoder.fit_transform(X_train_current_family['store_nbr'], y_train_current_family)
    X_test_current_family['store_nbr'] = encoder.transform(X_test_current_family['store_nbr'])
    
    if current_family not in linear_categories:
        model = PositiveRegressor(lgb.LGBMRegressor())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family, eval_metric='rmse')#, categorical_feature=['store_nbr'])
        
    else:
        # Linear model: min-max scale the oil price (scaler fit on train, applied to test).
        #X_train_current_family = X_train_current_family.drop(columns=['onpromotion'])
        #X_train_current_family = pd.get_dummies(X_train_current_family, columns=['store_nbr'], drop_first=True)
        scaler = MinMaxScaler()
        X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])
        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)
        #X_test_current_family = X_test_current_family.drop(columns=['onpromotion'])
        #X_test_current_family = pd.get_dummies(X_test_current_family, columns=['store_nbr'], drop_first=True)
        X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])
    
    # Recursive forecast: predict one day at a time and write each day's predictions into
    # the lag columns of the later days that look back to it.
    lags_copy = lags.copy()
    start = 0
    current_day_index = 0
    # `ends` (defined elsewhere in the notebook) holds day-index boundaries; after each
    # boundary the last entry of `lags_copy` is dropped — presumably because day + lag would
    # fall outside the test window (TODO confirm).
    for end in ends:
        for current_day in X_test_current_family['date'].unique()[start:end]:
            # Target date for every active lag: current day shifted forward by the lag.
            current_day_plus_x = {}
            for current_lag in lags_copy:
                current_day_plus_x[current_lag] = X_test_current_family['date'].unique()[current_day_index + current_lag]
                
            X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date'])
            predictions = model.predict(X_test_for_current_day)
                
            for current_lag in lags_copy:
                X_test_current_family.loc[X_test_current_family[X_test_current_family['date'] == current_day_plus_x[current_lag]].index, 'lag_{}'.format(current_lag)] = predictions
                    
            current_day_index += 1
                
        lags_copy = lags_copy[:-1]
        start = end
            
        
    # With all lag columns filled, predict the whole window and undo the log(1 + y) transform.
    y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date']))
    y_pred_current_family = np.exp(y_pred_current_family) - 1
    
    # NOTE(review): assumes test_data's index labels match the submission's row labels — TODO confirm.
    test_indices = X_test_current_family.index
    submission.loc[test_indices, 'sales'] = y_pred_current_family

In [None]:
# Save the current submission; index=False keeps the DataFrame index out of the CSV.
# NOTE(review): a later cell in this notebook writes to this same file name, so whichever
# runs last wins.
submission.to_csv('./data/kaggle/store-sales-time-series-forecasting/lgbm_and_linear_regression_v3.csv', index = False)

In [None]:
# Compare the fresh submission with a previously saved one. The largest *absolute*
# per-row difference is reported (0 means identical predictions); a signed max would
# hide rows where the new prediction is larger than the saved one.
ccc = pd.read_csv(DATA_ROOT / 'lgbm_and_linear_regression.csv')
(ccc['sales'] - submission['sales']).abs().max()

# adding features from other datasets

In [None]:
# Remove the store metadata columns again. errors='ignore' keeps the cell idempotent:
# it no longer raises KeyError when the columns are absent (fresh kernel, or re-run).
X = X.drop(columns=['store_type', 'store_city'], errors='ignore')

In [None]:
# Inspect which feature columns X currently has.
X.columns

In [None]:
# Inspect the dtype of every column in X.
X.dtypes

In [None]:
# LGBM + linreg; default; 'store_type', 'store_city'
# Time-series CV of the per-family models with two extra store features for LightGBM.

# NOTE(review): this rebinds X in place, so re-running the cell applies
# adding_stores_data to a frame that already has the columns — TODO confirm that is safe.
X = train_transformer.adding_stores_data(X, columns_to_add=['type', 'city'])
# Integer-encode the new columns so LightGBM can take them as categorical features.
X['store_type'] = LabelEncoder().fit_transform(X['store_type'])
X['store_city'] = LabelEncoder().fit_transform(X['store_city'])

scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

linear_categories = ['books', 'frozen foods', 'beverages', 'cleaning', 'grocery ii', 'home and kitchen ii', 'home appliances']

# Outer CV over the whole frame; sizes are expressed in rows (days * N_TIME_SERIES).
tscv = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR + 120) * N_TIME_SERIES, n_splits=4, test_size=N_HORIZONS * N_TIME_SERIES)
for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Models are fit on log(1 + sales); predictions are mapped back with exp(.) - 1 below.
    y_train = np.log(y_train + 1)
    X_test = X.iloc[test_indices]
    y_test = y.iloc[test_indices]
    
    # Scratch column collecting the log-scale predictions of every family.
    X_test.loc[:, 'pred'] = 0
    for current_family in X['family'].unique():
        print(current_family)
        X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['family'])
        y_train_current_family = y_train.loc[X_train_current_family.index]
        X_test_current_family = X_test[X_test['family'] == current_family].drop(columns=['family'])
        current_family_indices_test = X_test[X_test['family'] == current_family].index
        
        if current_family not in linear_categories:
            # Tree model: store id, type and city are all passed as categorical features.
            model = PositiveRegressor(lgb.LGBMRegressor())
            model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family, categorical_feature=['store_nbr', 'store_type', 'store_city'], 
                      eval_metric='rmse')
            
        else:
            # Linear model: the store metadata is dropped, the store id is one-hot encoded
            # and the oil price is min-max scaled (scaler fit on train, applied to test).
            X_train_current_family = X_train_current_family.drop(columns=['store_type', 'store_city'])
            X_train_current_family = pd.get_dummies(X_train_current_family, columns=['store_nbr'], drop_first=True)
            scaler = MinMaxScaler()
            X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])
            model = PositiveRegressor(LinearRegression())
            model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)
            X_test_current_family = X_test_current_family.drop(columns=['store_type', 'store_city'])
            X_test_current_family = pd.get_dummies(X_test_current_family, columns=['store_nbr'], drop_first=True)
            X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])
    
        # Recursive forecast: predict one day at a time and write each day's predictions
        # into the lag columns of the later days that look back to it.
        lags_copy = lags.copy()
        start = 0
        current_day_index = 0
        # `ends` (defined elsewhere in the notebook) holds day-index boundaries; after each
        # boundary the last entry of `lags_copy` is dropped — presumably because day + lag
        # would fall outside the test window (TODO confirm).
        for end in ends:
            for current_day in X_test_current_family['date'].unique()[start:end]:
                # Target date for every active lag: current day shifted forward by the lag.
                current_day_plus_x = {}
                for current_lag in lags_copy:
                    current_day_plus_x[current_lag] = X_test_current_family['date'].unique()[current_day_index + current_lag]
                
                X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date', 'pred'])
                predictions = model.predict(X_test_for_current_day)
                
                for current_lag in lags_copy:
                    X_test_current_family.loc[X_test_current_family[X_test_current_family['date'] == current_day_plus_x[current_lag]].index, 'lag_{}'.format(current_lag)] = predictions
                    
                current_day_index += 1
                
            lags_copy = lags_copy[:-1]
            start = end

        
        # With all lag columns filled, predict the whole fold for this family.
        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        X_test.loc[current_family_indices_test, 'pred'] = y_pred_current_family

        
    # Back-transform from log(1 + y) to the sales scale before scoring.
    y_pred = X_test['pred'].copy()
    y_pred = np.exp(y_pred) - 1
    X_test = X_test.drop(columns=['pred'])
    
    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    scores['RMSE'].append(mean_squared_error(y_test, y_pred, squared=False))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

# Mean ± sample standard deviation of each metric over the folds.
for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

# best

In [None]:
# Time-series CV of the per-family models: default LightGBM with `store_nbr` as a
# categorical feature, or linear regression with one-hot stores for `linear_categories`.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

linear_categories = ['books', 'frozen foods', 'beverages', 'cleaning', 'grocery ii', 'home and kitchen ii', 'home appliances']

# Outer CV over the whole frame; sizes are expressed in rows (days * N_TIME_SERIES).
tscv = TimeSeriesSplit(gap=0, max_train_size=DAYS_IN_YEAR * N_TIME_SERIES, n_splits=4, test_size=N_HORIZONS * N_TIME_SERIES)
for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Models are fit on log(1 + sales); predictions are mapped back with exp(.) - 1 below.
    y_train = np.log(y_train + 1)
    X_test = X.iloc[test_indices]
    y_test = y.iloc[test_indices]
    
    # Scratch column collecting the log-scale predictions of every family.
    X_test.loc[:, 'pred'] = 0
    for current_family in X['family'].unique():
        print(current_family)
        X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['family'])
        y_train_current_family = y_train.loc[X_train_current_family.index]
        X_test_current_family = X_test[X_test['family'] == current_family].drop(columns=['family'])
        current_family_indices_test = X_test_current_family.index
        
        if current_family not in linear_categories:
            model = PositiveRegressor(lgb.LGBMRegressor())
            model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family, categorical_feature=['store_nbr'], 
                      eval_metric='rmse')
            
        else:
            # Linear model: one-hot encode the store and min-max scale the oil price
            # (scaler fit on train, applied to test).
            X_train_current_family = pd.get_dummies(X_train_current_family, columns=['store_nbr'], drop_first=True)
            scaler = MinMaxScaler()
            X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])
            model = PositiveRegressor(LinearRegression())
            model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)
            X_test_current_family = pd.get_dummies(X_test_current_family, columns=['store_nbr'], drop_first=True)
            X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])
    
        # Recursive forecast: predict one day at a time and write each day's predictions
        # into the lag columns of the later days that look back to it.
        lags_copy = lags.copy()
        start = 0
        current_day_index = 0
        # `ends` (defined elsewhere in the notebook) holds day-index boundaries; after each
        # boundary the last entry of `lags_copy` is dropped — presumably because day + lag
        # would fall outside the test window (TODO confirm).
        for end in ends:
            for current_day in X_test_current_family['date'].unique()[start:end]:
                # Target date for every active lag: current day shifted forward by the lag.
                current_day_plus_x = {}
                for current_lag in lags_copy:
                    current_day_plus_x[current_lag] = X_test_current_family['date'].unique()[current_day_index + current_lag]
                
                X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date', 'pred'])
                predictions = model.predict(X_test_for_current_day)
                
                for current_lag in lags_copy:
                    X_test_current_family.loc[X_test_current_family[X_test_current_family['date'] == current_day_plus_x[current_lag]].index, 'lag_{}'.format(current_lag)] = predictions
                    
                current_day_index += 1
                
            lags_copy = lags_copy[:-1]
            start = end

        
        # With all lag columns filled, predict the whole fold for this family.
        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        X_test.loc[current_family_indices_test, 'pred'] = y_pred_current_family

        
    # Back-transform from log(1 + y) to the sales scale before scoring.
    y_pred = X_test['pred'].copy()
    y_pred = np.exp(y_pred) - 1
    X_test = X_test.drop(columns=['pred'])
    
    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    scores['RMSE'].append(mean_squared_error(y_test, y_pred, squared=False))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

# Mean ± sample standard deviation of each metric over the folds.
for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# RMSLE of each CV fold, in the order the folds were scored.
# Labels make the figure readable on its own; the trailing ';' hides the Line2D repr.
plt.plot(scores['RMSLE'])
plt.xlabel('CV fold')
plt.ylabel('RMSLE')
plt.title('RMSLE per CV fold');

In [None]:
# Final training set for the submission: cut_history with a DAYS_IN_YEAR keep_interval,
# target on the log(1 + sales) scale.
X_train, y_train = cut_history(X=X, date_column='date', keep_interval=pd.Timedelta(days=DAYS_IN_YEAR), y=y)
y_train = np.log(y_train + 1)

# Load the Kaggle test set (without `onpromotion`) and run it through ETLTransformer.
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]

# The distinct dates do not change inside the loops, so compute them once instead of
# calling .unique() on every inner iteration.
test_dates = test_data['date'].unique()
train_dates = X_train['date'].unique()

for current_lag in lags:
    lag_column = 'lag_{}'.format(current_lag)
    # Placeholder for lags whose source day lies inside the forecast window; the
    # recursive forecast overwrites them with predictions later.
    test_data.loc[:, lag_column] = 0

    # For the first `current_lag` test days the lagged target is still part of the training
    # history: test day i looks back to the (current_lag - i)-th last training day.
    # NOTE(review): the positional assignment assumes rows of one date are ordered
    # identically in train and test — TODO confirm.
    for i in range(current_lag):
        test_rows = test_data[test_data['date'] == test_dates[i]].index
        known_targets = y_train.loc[X_train['date'] == train_dates[-(current_lag - i)]]
        test_data.loc[test_rows, lag_column] = known_targets.tolist()

In [None]:
# Build the submission: one model per family (default LightGBM with categorical
# `store_nbr`, or linear regression), forecasting the test window recursively.
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')

for current_family in X['family'].unique():
    X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['family'])
    y_train_current_family = y_train.loc[X_train_current_family.index]
    X_test_current_family = test_data[test_data['family'] == current_family].drop(columns=['family'])
    
    if current_family not in linear_categories:
        model = PositiveRegressor(lgb.LGBMRegressor())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family, categorical_feature=['store_nbr'], 
                  eval_metric='rmse')
        
    else:
        # Linear model: one-hot encode the store and min-max scale the oil price.
        # The scaler is fit on train only and re-used for test.
        X_train_current_family = pd.get_dummies(X_train_current_family, columns=['store_nbr'], drop_first=True)
        scaler = MinMaxScaler()
        X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])
        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)
        # NOTE(review): dummies are built independently for train and test; this assumes both
        # contain the same set of stores — TODO confirm.
        X_test_current_family = pd.get_dummies(X_test_current_family, columns=['store_nbr'], drop_first=True)
        X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])
    
    # Recursive forecast: predict one day at a time and write each day's predictions into
    # the lag columns of the later days that look back to it.
    lags_copy = lags.copy()
    start = 0
    current_day_index = 0
    # `ends` (defined elsewhere in the notebook) holds day-index boundaries; after each
    # boundary the last entry of `lags_copy` is dropped — presumably because day + lag would
    # fall outside the test window (TODO confirm).
    for end in ends:
        for current_day in X_test_current_family['date'].unique()[start:end]:
            # Target date for every active lag: current day shifted forward by the lag.
            current_day_plus_x = {}
            for current_lag in lags_copy:
                current_day_plus_x[current_lag] = X_test_current_family['date'].unique()[current_day_index + current_lag]
                
            X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date'])
            predictions = model.predict(X_test_for_current_day)
                
            for current_lag in lags_copy:
                X_test_current_family.loc[X_test_current_family[X_test_current_family['date'] == current_day_plus_x[current_lag]].index, 'lag_{}'.format(current_lag)] = predictions
                    
            current_day_index += 1
                
        lags_copy = lags_copy[:-1]
        start = end
            
        
    # With all lag columns filled, predict the whole window and undo the log(1 + y) transform.
    y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date']))
    y_pred_current_family = np.exp(y_pred_current_family) - 1
    
    # NOTE(review): assumes test_data's index labels match the submission's row labels — TODO confirm.
    test_indices = X_test_current_family.index
    submission.loc[test_indices, 'sales'] = y_pred_current_family

In [None]:
# Save the current submission; index=False keeps the DataFrame index out of the CSV.
# NOTE(review): an earlier cell in this notebook writes to this same file name, so this
# call overwrites that file if both are run.
submission.to_csv('./data/kaggle/store-sales-time-series-forecasting/lgbm_and_linear_regression_v3.csv', index = False)

In [None]:
# Compare the fresh submission with a previously saved one. The largest *absolute*
# per-row difference is reported (0 means identical predictions); a signed max would
# hide rows where the new prediction is larger than the saved one.
ccc = pd.read_csv(DATA_ROOT / 'lgbm_and_linear_regression.csv')
(ccc['sales'] - submission['sales']).abs().max()

#### Simplest LightGBM baseline with default parameters

In [None]:
# Baseline CV: a default LGBMRegressor per family, trained on the raw target (no log
# transform) with negative predictions clipped to 0.
# NOTE(review): `tscv`, `days_to_shift` and `ends` are not defined in this cell — they
# must already exist in the kernel.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    X_test = X.iloc[test_indices]
    y_test = y.iloc[test_indices]
    
    # Scratch column collecting the predictions of every family.
    X_test.loc[:, 'pred'] = 0
    for current_family in X['family'].unique():        
        current_family_indices_train = X_train[X_train['family'] == current_family].index
        X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['family'])
        y_train_current_family = y_train.loc[current_family_indices_train]
        
        model = lgb.LGBMRegressor()
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family, categorical_feature=['store_nbr'])
        
        current_family_indices_test = X_test[X_test['family'] == current_family].index
        X_test_current_family = X_test[X_test['family'] == current_family].drop(columns=['family'])
                  
        # Recursive forecast: predict one day at a time and write each day's predictions
        # into the lag columns of the later days that look back to it.
        days_to_shift_copy = days_to_shift.copy()
        start = 0
        current_day_index = 0
        # After each boundary in `ends` the last entry of `days_to_shift_copy` is dropped —
        # presumably because day + lag would fall outside the test window (TODO confirm).
        for end in ends:
            for current_day in X_test_current_family['date'].unique()[start:end]:
                # Target date for every active lag: current day shifted forward by the lag.
                current_day_plus_x = {}
                for current_lag in days_to_shift_copy:
                    current_day_plus_x[current_lag] = X_test_current_family['date'].unique()[current_day_index + current_lag]
                
                X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date', 'pred'])
                predictions = model.predict(X_test_for_current_day)
                # Clip negative predictions to 0 before they are used as lag values.
                predictions[predictions < 0] = 0
                
                for current_lag in days_to_shift_copy:
                    X_test_current_family.loc[X_test_current_family[X_test_current_family['date'] == current_day_plus_x[current_lag]].index, 'lag_{}'.format(current_lag)] = predictions
                    
                current_day_index += 1
                
            days_to_shift_copy = days_to_shift_copy[:-1]
            start = end

        
        # With all lag columns filled, predict the whole fold for this family and clip at 0.
        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        y_pred_current_family[y_pred_current_family < 0] = 0
        X_test.loc[current_family_indices_test, 'pred'] = y_pred_current_family

        
    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])
    
    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    scores['RMSE'].append(mean_squared_error(y_test, y_pred, squared=False))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

# Mean ± sample standard deviation of each metric over the folds.
for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# RMSLE of each CV fold, in the order the folds were scored.
# Labels make the figure readable on its own; the trailing ';' hides the Line2D repr.
plt.plot(scores['RMSLE'])
plt.xlabel('CV fold')
plt.ylabel('RMSLE')
plt.title('RMSLE per CV fold');

# 3. LightGBM Regressor for every family, features: 'store_nbr', 'dcoilwtico', 'lag' (1, 2, 4, 6, 7, 14) with optuna

In [None]:
# Rebuild X from the raw training frame: drop `onpromotion` (drop returns a new frame,
# so `train` itself is left untouched) and keep the first element returned by the ETL.
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(train.drop(columns='onpromotion'))[0]

In [None]:
# Row offsets used for the lagged-target features.
lags = [1, 2, 4, 6, 7, 14]

# Log-transformed sales live in a separate copy, so the raw `sales` column of X is kept.
X_copy = X.assign(sales=np.log(X['sales'] + 1))

# Shift inside each (store, family) series so a lag never crosses into another series.
for current_lag in lags:
    lag_column = 'lag_{}'.format(current_lag)
    X[lag_column] = X_copy.groupby(['store_nbr', 'family'])['sales'].shift(current_lag)

X.head()

In [None]:
# Separate the target from the features. Both right-hand expressions are evaluated on
# the original X before either name is rebound.
X, y = X.drop(columns=['sales']), X['sales'].copy()

In [None]:
# Training window for the learning curves: cut_history with a
# (DAYS_IN_YEAR + N_HORIZONS)-day keep_interval, minus the dates returned by cut_history
# with an N_HORIZONS-day keep_interval (i.e. the final horizon is held out).
X_train, y_train = cut_history(X=X, date_column='date', keep_interval=timedelta(days=DAYS_IN_YEAR + N_HORIZONS), y=y)
X_train = X_train[~X_train['date'].isin(cut_history(X=X, date_column='date', keep_interval=timedelta(days=N_HORIZONS), y=y)[0]['date'].unique())]
y_train = y_train.loc[X_train.index]

for i, current_family in enumerate(X['family'].unique()):
    # A new figure is created on every iteration; the subplot grid has one row per family
    # and this family's two plots are placed in row i of that grid.
    plt.figure(figsize=(20, 130))
    plt.subplot(X['family'].nunique(), 2, 2 * i + 1)
    X_current_family = X_train[X_train['family'] == current_family].drop(columns=['family', 'date'])
    y_current_family = y_train.loc[X_current_family.index]
    
    titles = {'RMSLE': '{}: Learning Curves (default LGBM Regressor), RMSLE'.format(current_family), 
              'MAE': '{}: Learning Curves (default LGBM Regressor), MAE'.format(current_family)}
    # Row-based time-series CV; sizes are expressed in rows (days * N_STORES).
    tscv = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR - 4 * N_HORIZONS) * N_STORES, n_splits=4, test_size=N_HORIZONS * N_STORES)
    model = PositiveRegressor(lgb.LGBMRegressor())
    # Left plot: MAE scoring.
    plot_learning_curve(model, titles['MAE'], X_current_family, y_current_family, cv=tscv, scoring='neg_mean_absolute_error')
    plt.subplot(X['family'].nunique(), 2, 2 * i + 2)
    # Right plot: the title key must be 'RMSLE' — `titles` has no 'MSLE' entry, so the
    # previous lookup raised KeyError on the first family.
    plot_learning_curve(model, titles['RMSLE'], X_current_family, y_current_family, cv=tscv, scoring='neg_mean_squared_log_error')
    plt.show()

In [None]:
# Families excluded from the LightGBM tuning loop in this section.
linear_categories = ['books', 'frozen foods']

In [None]:
# Fixed LightGBM settings for the MAE-oriented tuning run: L1 objective and L1 metric.
params = dict(
    objective='regression_l1',
    metric='l1',
    verbosity=-1,
    boosting_type='gbdt',
    num_boost_round=1000,
    n_jobs=-1,
)

In [None]:
# Hand-computed MSLE on a toy example: mean of squared differences of log(1 + x).
a = [1, 2, 3]
b = [5, 6, 7]
np.square(np.log1p(a) - np.log1p(b)).mean()

In [None]:
from typing import Tuple


def msle_for_lgbm(preds: np.ndarray, data: "lgb.Dataset") -> Tuple[str, float, bool]:
    """Custom LightGBM evaluation metric: mean squared logarithmic error (MSLE).

    Args:
        preds: Raw model predictions for the rows of ``data``.
        data: Dataset being evaluated; only ``data.get_label()`` is used.

    Returns:
        ``(eval_name, eval_result, is_higher_better)``: the metric name ``'msle'``,
        the value ``mean((log1p(max(pred, 0)) - log1p(label)) ** 2)``, and ``False``
        because a lower error is better.
    """
    label = np.asarray(data.get_label(), dtype=float)
    # Negative predictions are floored at zero so log1p is defined. This works on a
    # fresh array, so the caller's prediction buffer is left untouched.
    clipped_preds = np.clip(np.asarray(preds, dtype=float), 0, None)
    # log1p already adds the 1; no extra "+ 1" inside.
    msle = float(np.mean(np.square(np.log1p(clipped_preds) - np.log1p(label))))
    return 'msle', msle, False

In [None]:
# optimize MAE

best_params_MAE = {}

# Iterate in the order families appear in X (a set difference yields an arbitrary,
# run-dependent order), skipping the families listed in linear_categories.
boosted_families = [family for family in X['family'].unique() if family not in linear_categories]

for current_family in boosted_families:
    print(current_family)
    X_current_family = X_train[X_train['family'] == current_family].drop(columns=['family', 'date'])
    y_current_family = y_train.loc[X_current_family.index]

    dtrain = lgb.Dataset(X_current_family, label=y_current_family, categorical_feature=['store_nbr'])
    tscv_inner = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR - 4 * N_HORIZONS) * N_STORES, n_splits=4, test_size=N_HORIZONS * N_STORES)

    tuner = LightGBMTunerCV(params, dtrain, folds=tscv_inner, callbacks=[early_stopping(25)], return_cvbooster=True)
    tuner.run()

    # Keep only what the tuner added on top of the base `params`, then record the
    # best iteration as the boosting-round count. A plain ASCII local name is used so
    # it cannot be confused with the notebook-level `current_family_best_params` dict.
    tuned_family_params = {key: value for key, value in tuner.best_params.items() if key not in params}
    tuned_family_params['num_boost_round'] = tuner.get_best_booster().best_iteration
    print(tuned_family_params)

    best_params_MAE[current_family] = tuned_family_params

In [None]:
from optuna.integration.lightgbm import LightGBMTunerCV
from lightgbm import early_stopping

# Outer split over the whole panel; only the training indices of its last fold are kept.
tscv_outer = TimeSeriesSplit(gap=0, max_train_size=DAYS_IN_YEAR * N_TIME_SERIES, n_splits=4, test_size=N_HORIZONS * N_TIME_SERIES)
X_for_lgbm_Tuner_CV_indices = list(tscv_outer.split(X, y))[-1][0]
X_for_lgbm_Tuner_CV = X.iloc[X_for_lgbm_Tuner_CV_indices]
y_for_lgbm_Tuner_CV = y.iloc[X_for_lgbm_Tuner_CV_indices]

# Inner split handed to the tuner for each single-family dataset.
tscv_inner = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR - 65) * N_STORES, n_splits=4, test_size=N_HORIZONS * N_STORES)

current_family_best_params = {}
current_family_best_scores = {}

# Base parameters for the RMSE-oriented tuning run; 'num_boost_round' is set per family.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'boosting_type': 'gbdt',
}

# Only the families in `special` are tuned; use X['family'].unique() to tune them all.
for current_family in special:
    print('\n\n', current_family, '\n\n')
    params['num_boost_round'] = best_n_estimators[current_family]['n_estimators']
    family_rows = X['family'] == current_family
    X_current_family = X[family_rows].drop(columns=['family'])
    y_current_family = y.loc[X_current_family.index]

    dtrain = lgb.Dataset(X_current_family.drop(columns=['date']), label=y_current_family, categorical_feature=['store_nbr'])

    tuner = LightGBMTunerCV(params, dtrain, folds=tscv_inner, callbacks=[early_stopping(25)])
    tuner.run()

    # Strip the bookkeeping entries so the remainder can be passed to LGBMRegressor.
    tuned_family_params = tuner.best_params
    for bookkeeping_key in ('objective', 'metric', 'verbosity'):
        tuned_family_params.pop(bookkeeping_key)
    current_family_best_params[current_family] = tuned_family_params
    current_family_best_scores[current_family] = tuner.best_score

In [None]:
# Print the tuned hyper-parameters, one indented "name: value" line per parameter.
for current_family, family_params in current_family_best_params.items():
    print(current_family)
    for key, value in family_params.items():
        print("    {}: {}".format(key, value))
    print('\n\n')

In [None]:
# For each family in `special`, draw two learning curves side by side: a default
# LGBMRegressor (left) and one built from the tuned parameters (right), both scored
# with custom_RMSLE (defined elsewhere in the notebook).
for i, current_family in enumerate(special):
#for i, current_family in enumerate(X['family'].unique()):
    # A new figure is opened per family; the panels are placed on row i of the grid.
    plt.figure(figsize=(20, 130))
    X_current_family = X[X['family'] == current_family].drop(columns=['family', 'date'])
    y_current_family = y.loc[X_current_family.index]
    
    titles = {'without_tuning': '{}: Learning Curves (default LGBM Regressor), RMSLE'.format(current_family), 
              'with_tuning': '{}: Learning Curves (LGBM Regressor with LGBMTunerCV), RMSLE'.format(current_family)}
    tscv = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR + 120) * N_STORES, n_splits=4, test_size=N_HORIZONS * N_STORES)
    model_default = PositiveRegressor(lgb.LGBMRegressor())
    model_tuned_params = PositiveRegressor(lgb.LGBMRegressor(**current_family_best_params[current_family]))
    plt.subplot(X['family'].nunique(), 2, 2 * i + 1)
    plot_learning_curve(model_default, titles['without_tuning'], X_current_family, y_current_family, cv=tscv, scoring=custom_RMSLE)
    plt.subplot(X['family'].nunique(), 2, 2 * i + 2)
    plot_learning_curve(model_tuned_params, titles['with_tuning'], X_current_family, y_current_family, cv=tscv, scoring=custom_RMSLE)
    plt.show()
    
    
# for i, current_family in enumerate(X['family'].unique()):
#     plt.figure(figsize=(20, 130))
#     plt.subplot(X['family'].nunique(), 2, 2 * i + 1)
#     X_current_family = X[X['family'] == current_family].drop(columns=['family', 'date'])
#     y_current_family = y.loc[X_current_family.index]
    
#     titles = {'RMSLE': '{}: Learning Curves (default LGBM Regressor), RMSLE'.format(current_family), 
#               'MAE': '{}: Learning Curves (default LGBM Regressor), MAE'.format(current_family)}
#     tscv = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR + 120) * N_STORES, n_splits=4, test_size=N_HORIZONS * N_STORES)
#     model_default = PositiveRegressor(lgb.LGBMRegressor())
#     plot_learning_curve(model, titles['MAE'], X_current_family, y_current_family, cv=tscv, scoring='neg_mean_absolute_error')
#     plt.subplot(X['family'].nunique(), 2, 2 * i + 2)
#     plot_learning_curve(model, titles['RMSLE'], X_current_family, y_current_family, cv=tscv, scoring=custom_RMSLE)
#     plt.show()

In [None]:
# Day-index boundaries for the recursive forecast, in increasing order: 16 minus each
# lag, largest lag first.
ends = [16 - lag for lag in reversed(lags)]

In [None]:
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y, n_estimators):
    """Optuna objective: time-series cross-validated RMSLE of an LGBMRegressor.

    Args:
        trial: Optuna trial used to sample the hyper-parameters below.
        X: Feature frame; must contain a 'store_nbr' column (used as categorical).
        y: Target aligned positionally with ``X``.
        n_estimators: Number of boosting rounds passed to the regressor.

    Returns:
        Tuple ``(mean RMSLE over the folds, dict of per-fold RMSLE/RMSE/MAE/R2 lists)``.
    """
    # The order of the suggest_* calls is kept fixed.
    param_grid = {
        # 'n_estimators': trial.suggest_categorical('n_estimators', [50, 100, 250, 500, 1000]),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
        'num_leaves': trial.suggest_int('num_leaves', 8, 500, step=20),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 200, 1000, step=100),
        'lambda_l1': trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 15),
    }

    tscv = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR + 100) * N_STORES, n_splits=4, test_size=N_HORIZONS * N_STORES)

    cv_scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}
    for train_indices, test_indices in tscv.split(X, y):
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

        # The fold's test part doubles as the early-stopping / pruning evaluation set.
        # NOTE(review): the pruning callback is attached in every fold of the same trial;
        # confirm how Optuna treats repeated reports for the same step.
        model = lgb.LGBMRegressor(n_estimators=n_estimators, **param_grid)
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='rmse', categorical_feature=['store_nbr'],
            callbacks=[
                LightGBMPruningCallback(trial, 'rmse'),
                lgb.early_stopping(stopping_rounds=25)
            ],
        )
        
        y_pred = model.predict(X_test)
        # Negative predictions are floored at zero before scoring (RMSLE needs y >= 0).
        y_pred[y_pred < 0] = 0
        cv_scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
        cv_scores['RMSE'].append(mean_squared_error(y_test, y_pred, squared=False))
        cv_scores['MAE'].append(mean_absolute_error(y_test, y_pred))
        cv_scores['R2'].append(r2_score(y_test, y_pred))


    return np.mean(cv_scores['RMSLE']), cv_scores

In [None]:
# Restrict the tuning data via cut_history with a 540-day keep_interval.
X_train, y_train = cut_history(X=X, date_column='date', keep_interval=timedelta(days=540), y=y)

In [None]:
# Run one Optuna study (100 trials, minimising the CV RMSLE returned by `objective`)
# per family and keep its best parameters and best value.
optuna.logging.set_verbosity(optuna.logging.WARNING)

best_params = {family: {} for family in X['family'].unique()}
best_values = {family: 0 for family in X['family'].unique()}
for current_family in X['family'].unique():
    print('\n\n')
    print(current_family)
    print('\n\n')
    family_rows = X_train['family'] == current_family
    X_current_family = X_train[family_rows].drop(columns=['date', 'family'])
    y_current_family = y_train.loc[X_current_family.index]

    study = optuna.create_study(direction='minimize', study_name='LGBM Regressor')
    # `objective` returns (score, details); only the score is given to Optuna.
    func = lambda trial: objective(trial, X_current_family, y_current_family, best_n_estimators[current_family]['n_estimators'])[0]
    study.optimize(func, n_trials=100)

    best_params[current_family].update(study.best_params)
    best_values[current_family] = study.best_value

In [None]:
# Print the best Optuna parameters of every family as "name: value" lines.
for family, all_params in best_params.items():
    print(family)
    for param, value in all_params.items():
        print('{}:'.format(param), value)
    print()

In [None]:
# Average of the best per-family study values (the cell's displayed result).
rmsle_errors = {'{}:'.format(family): best_value for family, best_value in best_values.items()}
mean(rmsle_errors.values())

In [None]:
# Cross-validate the per-family tuned models with a recursive multi-day forecast:
# inside each test fold the lag features of later days are overwritten with the model's
# own predictions for earlier days, then the whole fold is predicted in one pass.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}
tscv = TimeSeriesSplit(gap=0, max_train_size=(DAYS_IN_YEAR + 100) * N_TIME_SERIES, n_splits=4, test_size=N_HORIZONS * N_TIME_SERIES)

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Explicit copy: the fold's test frame receives a scratch 'pred' column below.
    X_test = X.iloc[test_indices].copy()
    y_test = y.iloc[test_indices]

    X_test['pred'] = 0.0
    for current_family in X['family'].unique():
        print(current_family)
        X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['family'])
        y_train_current_family = y_train.loc[X_train_current_family.index]

        current_family_hyper_params = current_family_best_params[current_family]
        print(current_family_hyper_params)

        model = lgb.LGBMRegressor(**current_family_hyper_params, verbosity=-1)
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family, categorical_feature=['store_nbr'])

        current_family_indices_test = X_test[X_test['family'] == current_family].index
        X_test_current_family = X_test[X_test['family'] == current_family].drop(columns=['family'])
        test_days = X_test_current_family['date'].unique()

        # `ends` holds increasing day-index boundaries; after each boundary the largest
        # remaining lag is dropped because its target day would fall outside the fold.
        lags_copy = lags.copy()
        start = 0
        current_day_index = 0
        for end in ends:
            for current_day in test_days[start:end]:
                X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date', 'pred'])
                predictions = model.predict(X_test_for_current_day)
                predictions[predictions < 0] = 0
                # The lag columns were built from log(1 + sales) while the model predicts
                # raw sales, so predictions are moved to the log scale before being fed back.
                predicted_lag_values = np.log1p(predictions)

                for current_lag in lags_copy:
                    target_day = test_days[current_day_index + current_lag]
                    target_rows = X_test_current_family['date'] == target_day
                    # Positional assignment: assumes every day lists the stores in the same order.
                    X_test_current_family.loc[target_rows, 'lag_{}'.format(current_lag)] = predicted_lag_values

                current_day_index += 1

            lags_copy = lags_copy[:-1]
            start = end

        # Final pass over the fold with all lag features now filled in.
        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        y_pred_current_family[y_pred_current_family < 0] = 0
        X_test.loc[current_family_indices_test, 'pred'] = y_pred_current_family

    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])

    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    scores['RMSE'].append(mean_squared_error(y_test, y_pred, squared=False))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# Load the sample submission file; its 'sales' column is filled in further below.
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')

In [None]:
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]

# Seed the test lag features from the tail of the full history (X / y) rather than from
# X_train / y_train: those names are left over from the last cross-validation fold at
# this point and do not include the final days of the training data.
# The training lags were built from log(1 + sales), so the seeded values use the same scale.
history_days = X['date'].unique()
test_days = test_data['date'].unique()
for current_lag in lags:
    lag_column = 'lag_{}'.format(current_lag)
    # Placeholder; days beyond the seeded ones are filled during the recursive forecast.
    test_data.loc[:, lag_column] = 0.0

    for i in range(current_lag):
        # Test day i looks back `current_lag` days, i.e. to the (current_lag - i)-th
        # day from the end of the history.
        source_day = history_days[-(current_lag - i)]
        target_rows = test_data['date'] == test_days[i]
        # Positional assignment: assumes both frames list the series in the same order per day.
        test_data.loc[target_rows, lag_column] = np.log1p(y.loc[X['date'] == source_day]).tolist()

In [None]:
# Visual check of the lag list and of the test frame with its seeded lag columns.
print(lags)
test_data

In [None]:
# Training data for the submission models: cut_history with a DAYS_IN_YEAR-day keep_interval.
X_train, y_train = cut_history(X=X, date_column='date', keep_interval=timedelta(days=DAYS_IN_YEAR), y=y)

In [None]:
# Fit one tuned model per family on X_train and forecast the test period recursively,
# writing the predictions into `submission`.
for current_family in X['family'].unique():
    print(current_family)
    X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['family'])
    y_train_current_family = y_train.loc[X_train_current_family.index]

    current_family_hyper_params = current_family_best_params[current_family]

    model = lgb.LGBMRegressor(**current_family_hyper_params)
    # 'store_nbr' is treated as categorical, as it was during tuning and cross-validation.
    model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family, categorical_feature=['store_nbr'])

    X_test_current_family = test_data[test_data['family'] == current_family].drop(columns=['family'])
    test_days = X_test_current_family['date'].unique()

    # `ends` holds increasing day-index boundaries; after each boundary the largest
    # remaining lag is dropped because its target day would fall outside the horizon.
    lags_copy = lags.copy()
    start = 0
    current_day_index = 0
    for end in ends:
        for current_day in test_days[start:end]:
            X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date'])
            predictions = model.predict(X_test_for_current_day)
            predictions[predictions < 0] = 0
            # The lag columns were built from log(1 + sales) while the model predicts raw
            # sales, so predictions are moved to the log scale before being fed back.
            predicted_lag_values = np.log1p(predictions)

            for current_lag in lags_copy:
                target_day = test_days[current_day_index + current_lag]
                target_rows = X_test_current_family['date'] == target_day
                # Positional assignment: assumes every day lists the stores in the same order.
                X_test_current_family.loc[target_rows, 'lag_{}'.format(current_lag)] = predicted_lag_values

            current_day_index += 1

        lags_copy = lags_copy[:-1]
        start = end

    # Final pass over the whole horizon with all lag features now filled in.
    y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date']))
    y_pred_current_family[y_pred_current_family < 0] = 0

    # NOTE(review): relies on test_data and submission sharing the same row index - confirm.
    test_indices = test_data[test_data['family'] == current_family].index
    submission.loc[test_indices, 'sales'] = y_pred_current_family

In [None]:
# Write the submission file; the path is relative to the current working directory.
submission.to_csv('./data/kaggle/store-sales-time-series-forecasting/lgbmTunerCV_lags_and_dcoilwtico.csv', index = False)

# 4. LightGBM Regressor for every family, features: 'store_nbr', 'dcoilwtico', 'lag' (1, 2, 4, 6, 7, 14), with 'boruta'

In [None]:
# Start this experiment from a fresh copy of `train` without 'onpromotion', run through
# the ETL transformer (the first element of its result is kept).
X = train.copy().drop(columns='onpromotion')
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(X)[0]

In [None]:
# Lag offsets used for the 'lag_<k>' features of this experiment.
days_to_shift = [1, 2, 4, 6, 7, 14]

In [None]:
# Keep only rows that have an oil price, then renumber the index from zero.
X = X[X['dcoilwtico'].notna()]
X = X.reset_index(drop=True)

# Lagged sales per (store_nbr, family) series; here the lags stay on the raw sales scale.
for current_lag in days_to_shift:
    X['lag_{}'.format(current_lag)] = X.groupby(['store_nbr', 'family'])['sales'].shift(current_lag)

# Drop every row that still has a NaN in any column (this includes the leading rows of
# each series, whose lags are undefined).
X = X.dropna()

In [None]:
# Split the target off the feature frame.
y = X['sales'].copy()
X = X.drop(columns=['sales'])

In [None]:
# Four folds with sizes given in rows: at most 365 * 33 * 54 for training and
# 16 * 33 * 54 for testing.
# NOTE(review): presumably days x families x stores - confirm against the data.
tscv = TimeSeriesSplit(gap=0, max_train_size=365 * 33 * 54, n_splits=4, test_size=16 * 33 * 54)

In [None]:
# Day-index boundaries for the recursive forecast, in increasing order: 16 minus each
# lag offset, largest offset first.
ends = [16 - shift for shift in reversed(days_to_shift)]

In [None]:
# Keep the rows dated within 365 days of the last date in X for feature selection.
last_day = pd.to_datetime(X['date'].unique()[-1])
first_day_of_last_year = last_day - timedelta(days=365)
# The cutoff is compared as a 'YYYY-MM-DD' string (date part of the timestamp's text form).
cutoff_date = str(first_day_of_last_year).split(' ')[0]
X_train = X[X['date'] >= cutoff_date]
indexer = X_train.index
y_train = y.loc[indexer]

In [None]:
# Fixed LightGBM settings for this experiment: small trees (depth 4, 16 leaves), a low
# learning rate and many boosting rounds.
hyper_params = {
    'boosting_type': 'gbdt',
    'learning_rate': 0.005,
    'max_depth': 4, 
    'max_bin': 128,
    'num_leaves': 16,
    "n_estimators": 10000
}

In [None]:
# Boruta feature selection per family: 'green_area' holds the confirmed features
# (support_), 'blue_area' the tentative ones (support_weak_).
best_features = {family: {} for family in X_train['family'].unique()}
for current_family in X_train['family'].unique():
    family_rows = X_train['family'] == current_family
    current_family_indices_train = X_train[family_rows].index
    X_train_current_family = X_train[family_rows].drop(columns=['date', 'family'])
    y_train_current_family = y_train.loc[current_family_indices_train]

    model = lgb.LGBMRegressor(objective='regression', **hyper_params)
    boruta = BorutaPy(estimator=model, n_estimators='auto', max_iter = 100)
    # BorutaPy is given plain arrays; column names are recovered from the frame afterwards.
    boruta.fit(np.array(X_train_current_family), np.array(y_train_current_family))

    feature_names = X_train_current_family.columns
    best_features[current_family]['green_area'] = feature_names[boruta.support_].to_list()
    best_features[current_family]['blue_area'] = feature_names[boruta.support_weak_].to_list()

In [None]:
# Print the selected feature lists of every family, one line per area.
for family, all_features in best_features.items():
    print(family)
    for area, area_features in all_features.items():
        print(area, area_features)
    print()

In [None]:
# Cross-validate per-family models restricted to their Boruta-selected features, using a
# recursive forecast: lag features of later test days are overwritten with the model's
# predictions for earlier days before the whole fold is predicted.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    X_test = X.iloc[test_indices]
    y_test = y.iloc[test_indices]
    
    # Scratch column that collects the predictions of all families for this fold.
    X_test.loc[:, 'pred'] = 0
    for current_family in X['family'].unique():
        print(current_family)
        # Everything that is neither a selected feature (green or blue area) nor 'date'
        # is dropped; 'date' is kept because the day-by-day loop below needs it.
        columns_to_drop = [feature for feature in X if feature not in best_features[current_family]['green_area'] and \
                                                       feature not in best_features[current_family]['blue_area'] and \
                                                       feature != 'date']
        current_family_indices_train = X_train[X_train['family'] == current_family].index
        X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=columns_to_drop)
        y_train_current_family = y_train.loc[current_family_indices_train]
        
        model = lgb.LGBMRegressor(objective='regression', **hyper_params)
        # 'store_nbr' can only be declared categorical if it survived feature selection.
        if 'store_nbr' in columns_to_drop:
            model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)
        else:
            model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family, categorical_feature=['store_nbr'])
        
        current_family_indices_test = X_test[X_test['family'] == current_family].index
        X_test_current_family = X_test[X_test['family'] == current_family]
                  
        # `ends` holds increasing day-index boundaries; after each boundary the largest
        # remaining lag is dropped from the list of lags that still get written.
        days_to_shift_copy = days_to_shift.copy()
        start = 0
        current_day_index = 0
        for end in ends:
            for current_day in X_test_current_family['date'].unique()[start:end]:
                # Target date of each remaining lag, counted from the current day.
                current_day_plus_x = {}
                for current_lag in days_to_shift_copy:
                    current_day_plus_x[current_lag] = X_test_current_family['date'].unique()[current_day_index + current_lag]
                
                X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=columns_to_drop) \
                                                                                                            .drop(columns=['date', 'pred'])
                predictions = model.predict(X_test_for_current_day)
                predictions[predictions < 0] = 0
                
                # Feed the predictions forward as lag features of the target days
                # (positional assignment onto the rows of each target day).
                for current_lag in days_to_shift_copy:
                    X_test_current_family.loc[X_test_current_family[X_test_current_family['date'] == current_day_plus_x[current_lag]].index, 'lag_{}'.format(current_lag)] = predictions
                    
                current_day_index += 1
                
            days_to_shift_copy = days_to_shift_copy[:-1]
            start = end

        
        # Final pass over the fold with the lag features now filled in.
        y_pred_current_family = model.predict(X_test_current_family.drop(columns=columns_to_drop).drop(columns=['date', 'pred']))
        y_pred_current_family[y_pred_current_family < 0] = 0
        X_test.loc[current_family_indices_test, 'pred'] = y_pred_current_family
        
        
    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])
    
    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    scores['RMSE'].append(mean_squared_error(y_test, y_pred, squared=False))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

# Mean and sample standard deviation of each metric across the folds.
for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# RMSLE per cross-validation fold, in fold order.
plt.plot(scores['RMSLE'])

# 5. LightGBM Regressor for every family, features: 'store_nbr', 'dcoilwtico', 'lag' (1, 2, 4, 6, 7, 14), 'rolling' with 'boruta'

In [None]:
# Start this experiment from a fresh copy of `train` without 'onpromotion', run through
# the ETL transformer (the first element of its result is kept).
X = train.copy().drop(columns='onpromotion')
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(X)[0]

In [None]:
# Lag offsets used for the 'lag_<k>' features of this experiment.
days_to_shift = [1, 2, 4, 6, 7, 14]

In [None]:
# Keep only rows that have an oil price, then renumber the index from zero.
X = X[X['dcoilwtico'].notna()]
X = X.reset_index(drop=True)

In [None]:
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]

# Build the lag columns on the training frame and seed the first days of the test frame
# with the sales of the matching last days of the training frame.
for current_lag in days_to_shift:
    lag_column = 'lag_{}'.format(current_lag)
    X.loc[:, lag_column] = X.groupby(['store_nbr', 'family'])['sales'].shift(current_lag)
    test_data.loc[:, lag_column] = 0

    for i in range(current_lag):
        # Test day i looks back to the (current_lag - i)-th date from the end of X.
        target_day = test_data['date'].unique()[i]
        source_day = X['date'].unique()[-(current_lag - i)]
        target_index = test_data[test_data['date'] == target_day].index
        source_index = X[X['date'] == source_day].index
        test_data.loc[target_index, lag_column] = X.loc[source_index, 'sales'].tolist()

In [None]:
# Rolling-window lengths (label -> window size in rows) and the aggregations applied to
# each window; every combination becomes one 'rolling_<label>_<function>' feature.
rolling_periods = {'year': 365, '6m': 183, '3m': 92, '1m': 31, '16d': 16, '10d': 10, '7d': 7, '5d': 5, '3d': 3}
aggregate_functions = ['median', 'mean', 'sum', 'max', 'min']

In [None]:
# Rolling statistics of past sales per (store_nbr, family) series. The series is shifted
# by one row first so that a window never contains the current row's own sales.
# `transform` returns a result aligned to X's index, so the assignment does not depend on
# how `groupby.apply` shapes or indexes its output.
sales_by_series = X.groupby(['store_nbr', 'family'])['sales']
for period, days in rolling_periods.items():
    for function in aggregate_functions:
        X['rolling_{0}_{1}'.format(period, function)] = sales_by_series.transform(
            lambda series, days=days, function=function: series.shift().rolling(days).agg(function)
        )

In [None]:
# Drop every row with a NaN in any column (e.g. rows without a full rolling window).
X = X.dropna()

In [None]:
# Copy the target into y, then re-append it to X so that 'sales' ends up as the last
# column. X therefore still contains 'sales' after this cell.
y = X['sales'].copy()
X = X.drop(columns=['sales'])
X['sales'] = y

In [None]:
# Fixed LightGBM settings used for feature selection in this experiment.
hyper_params = {
    'max_depth': 6, 
    'n_estimators': 300,
    'num_leaves': 40,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1
}

In [None]:
# Keep the rows dated within 365 days of the last date in X for feature selection.
last_day = pd.to_datetime(X['date'].unique()[-1])
first_day_of_last_year = last_day - timedelta(days=365)
# The cutoff is compared as a 'YYYY-MM-DD' string (date part of the timestamp's text form).
cutoff_date = str(first_day_of_last_year).split(' ')[0]
X_train = X[X['date'] >= cutoff_date]
indexer = X_train.index
y_train = y.loc[indexer]

In [None]:
# Per family: run Boruta feature selection, print the confirmed ('green') and tentative
# ('blue') features, then fit the model on all candidate features and show a SHAP bar plot.
best_features = {family:{} for family in X_train['family'].unique()}
for current_family in X_train['family'].unique():
    print(current_family)
    current_family_indices_train = X_train[X_train['family'] == current_family].index
    # 'sales' is still a column of X at this point, so it is dropped with the other
    # non-feature columns.
    X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['date', 'family', 'sales'])
    y_train_current_family = y_train.loc[current_family_indices_train]
    
    model = lgb.LGBMRegressor(objective='regression', **hyper_params)
    
    boruta = BorutaPy(estimator=model, n_estimators='auto', max_iter=100)
    boruta.fit(np.array(X_train_current_family), np.array(y_train_current_family))
    best_features[current_family]['green_area'] = X_train_current_family.columns[boruta.support_].to_list()
    best_features[current_family]['blue_area'] = X_train_current_family.columns[boruta.support_weak_].to_list()
    print('green: ', X_train_current_family.columns[boruta.support_].to_list())
    print('blue: ', X_train_current_family.columns[boruta.support_weak_].to_list())
    
    # NOTE(review): the same `model` object was handed to BorutaPy with n_estimators='auto';
    # confirm whether that changed its n_estimators before this fit.
    model.fit(X_train_current_family, y_train_current_family)
    explainer = shap.TreeExplainer(model, X_train_current_family)
    shap_values = explainer(X_train_current_family)
    shap.plots.bar(shap_values, max_display=25)
    
    print('\n\n')

In [None]:
# Hand-curated feature list per product family (family name -> feature column names).
# Every list must be free of duplicates: a repeated name would select the same column twice.
final_best_features = {
    'automotive': ['dcoilwtico', 'lag_1', 'lag_6', 'lag_7', 'lag_14', 'rolling_year_mean', 'rolling_6m_max',
                   'rolling_7d_mean', 'rolling_3m_mean', 'rolling_1m_mean', 'rolling_1m_median',
                   'rolling_6m_median', 'rolling_6m_mean', 'rolling_10d_mean', 'rolling_year_median',
                   'rolling_16d_mean'],
    'baby care': ['dcoilwtico', 'rolling_year_mean', 'rolling_6m_mean', 'rolling_3m_mean', 'rolling_1m_mean',
                  'rolling_16d_mean', 'rolling_10d_mean'],
    'beauty': ['dcoilwtico', 'lag_1', 'lag_6', 'lag_7', 'lag_14', 'rolling_year_mean', 'rolling_3m_mean',
               'rolling_1m_mean', 'rolling_16d_mean', 'rolling_7d_mean', 'rolling_5d_max', 'rolling_3m_min',
               'rolling_7d_min', 'rolling_1m_median'],
    'beverages': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_1m_min',
                  'rolling_16d_mean', 'rolling_16d_max', 'rolling_10d_min', 'rolling_5d_min', 'rolling_3d_median',
                  'rolling_3d_max', 'rolling_3d_min', 'rolling_7d_median', 'rolling_7d_mean', 'rolling_5d_median',
                  'rolling_10d_median', 'rolling_7d_min', 'rolling_year_mean', 'rolling_16d_median'],
    'books': ['rolling_year_mean', 'rolling_3m_mean', 'rolling_1m_mean', 'rolling_16d_mean', 'rolling_16d_max',
              'rolling_10d_mean', 'rolling_7d_mean', 'rolling_5d_mean', 'rolling_10d_max'],
    # 'rolling_5d_min' used to be listed twice here; it is kept once.
    'bread/bakery': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_3m_max',
                     'rolling_3m_min', 'rolling_16d_max', 'rolling_5d_max', 'rolling_5d_min', 'rolling_3d_median',
                     'rolling_3d_min', 'rolling_7d_mean', 'rolling_7d_median'],
    'celebration': ['dcoilwtico', 'lag_1', 'lag_6', 'lag_7', 'lag_14', 'rolling_year_mean', 'rolling_3m_mean',
                    'rolling_1m_median', 'rolling_1m_min', 'rolling_16d_mean', 'rolling_7d_median', 'rolling_7d_mean',
                    'rolling_7d_min', 'rolling_5d_mean', 'rolling_3d_mean', 'rolling_3m_median', 'rolling_1m_mean',
                    'rolling_6m_mean', 'rolling_year_median', 'lag_4', 'rolling_3d_max'],
    'cleaning': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_6m_max', 'rolling_3m_max',
                 'rolling_1m_max', 'rolling_16d_median', 'rolling_16d_mean', 'rolling_16d_max', 'rolling_10d_max',
                 'rolling_7d_mean', 'rolling_7d_max', 'rolling_5d_median', 'rolling_5d_max', 'rolling_5d_min',
                 'rolling_3d_median', 'rolling_3d_max', 'rolling_3m_mean', 'rolling_year_mean', 'rolling_10d_median',
                 'rolling_10d_mean', 'rolling_1m_median'],
    'dairy': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_6m_min', 'rolling_1m_min',
              'rolling_16d_max', 'rolling_5d_median', 'rolling_5d_max', 'rolling_5d_min', 'rolling_3d_median',
              'rolling_3d_mean', 'rolling_3d_max', 'rolling_3d_min', 'rolling_7d_mean', 'rolling_7d_median',
              'rolling_6m_max', 'rolling_16d_median', 'rolling_year_max'],
    'deli': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_6m_min', 'rolling_3m_min',
             'rolling_16d_median', 'rolling_16d_max', 'rolling_16d_min', 'rolling_10d_min', 'rolling_5d_min',
             'rolling_3d_median', 'rolling_3d_max', 'rolling_3d_min', 'rolling_3m_median', 'rolling_year_median',
             'rolling_7d_mean', 'rolling_3m_mean', 'rolling_7d_median'],
    'eggs': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_year_max', 'rolling_1m_max',
             'rolling_16d_max', 'rolling_10d_max', 'rolling_7d_mean', 'rolling_3d_max', 'rolling_3d_min',
             'rolling_3m_median'],
    'frozen foods': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_year_max',
                     'rolling_16d_median', 'rolling_16d_max', 'rolling_10d_max', 'rolling_7d_median', 'rolling_7d_mean',
                     'rolling_7d_max', 'rolling_5d_max', 'rolling_5d_min', 'rolling_3d_min', 'rolling_10d_mean'],
    'grocery i': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_6m_min', 'rolling_1m_max',
                  'rolling_16d_median', 'rolling_16d_mean', 'rolling_16d_max', 'rolling_5d_max', 'rolling_5d_min',
                  'rolling_3d_max', 'rolling_3d_min', 'rolling_year_mean', 'rolling_7d_mean', 'rolling_5d_median',
                  'rolling_year_median'],
    'grocery ii': ['lag_1', 'lag_7', 'lag_14', 'rolling_7d_mean', 'rolling_10d_median', 'rolling_3m_median',
                   'rolling_3m_min'],
    'hardware': ['dcoilwtico', 'rolling_year_mean', 'rolling_3m_mean', 'rolling_1m_mean'],
    'home and kitchen i': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_6', 'lag_7', 'lag_14', 'rolling_3m_median',
                           'rolling_16d_median', 'rolling_16d_min', 'rolling_10d_median', 'rolling_7d_median',
                           'rolling_7d_mean', 'rolling_5d_median', 'rolling_5d_min', 'rolling_3d_median', 'rolling_3d_mean',
                           'rolling_3d_min', 'rolling_6m_median', 'rolling_16d_mean', 'lag_4'],
    'home and kitchen ii': ['dcoilwtico', 'lag_1', 'lag_6', 'lag_7', 'lag_14', 'rolling_10d_max', 'rolling_7d_mean',
                            'rolling_7d_median', 'rolling_3d_median', 'rolling_16d_mean', 'rolling_3m_mean',
                            'rolling_10d_mean', 'rolling_1m_median'],
    'home appliances': ['dcoilwtico', 'rolling_year_mean', 'rolling_6m_mean', 'rolling_3m_mean', 'rolling_1m_mean',
                        'rolling_16d_mean'],
    'home care': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_year_mean',
                  'rolling_6m_min', 'rolling_16d_median', 'rolling_16d_mean', 'rolling_16d_max', 'rolling_16d_min',
                  'rolling_7d_mean', 'rolling_7d_min', 'rolling_5d_median', 'rolling_5d_max', 'rolling_5d_min',
                  'rolling_3d_median', 'rolling_3d_max', 'rolling_3d_min', 'rolling_3m_median', 'rolling_6m_mean',
                  'rolling_10d_median'],
    'ladieswear': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_year_mean',
                   'rolling_3m_max', 'rolling_16d_median', 'rolling_16d_mean', 'rolling_7d_mean', 'rolling_5d_max',
                   'rolling_3d_mean', 'rolling_3m_mean', 'rolling_6m_mean', 'rolling_year_max', 'rolling_6m_median'],
    'lawn and garden': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_6', 'lag_7', 'lag_14', 'rolling_1m_median', 'rolling_1m_mean',
                        'rolling_7d_median', 'rolling_7d_mean', 'rolling_5d_max', 'rolling_3d_median', 'rolling_3d_mean',
                        'rolling_3d_max', 'rolling_3d_min', 'rolling_7d_min', 'rolling_10d_median'],
    'lingerie': ['dcoilwtico', 'lag_1', 'lag_7', 'lag_14', 'rolling_year_mean', 'rolling_16d_mean', 'rolling_7d_mean',
                 'rolling_3d_mean', 'rolling_year_median', 'rolling_16d_median', 'rolling_1m_mean'],
    'liquor,wine,beer': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_6m_min',
                         'rolling_10d_max', 'rolling_7d_min', 'rolling_year_mean'],
    'magazines': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_year_mean',
                  'rolling_3m_mean', 'rolling_16d_mean', 'rolling_10d_mean', 'rolling_7d_mean', 'rolling_6m_mean',
                  'rolling_10d_min'],
    'meats': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_7', 'lag_14', 'rolling_1m_max', 'rolling_7d_max', 'rolling_5d_min',
              'rolling_1m_mean', 'rolling_16d_median', 'rolling_5d_max', 'rolling_6m_median', 'rolling_3m_median',
              'rolling_6m_mean'],
    'personal care': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_16d_median',
                      'rolling_16d_mean', 'rolling_16d_max', 'rolling_10d_min', 'rolling_5d_max', 'rolling_5d_min',
                      'rolling_3d_median', 'rolling_3d_max', 'rolling_3d_min', 'rolling_3m_median', 'rolling_7d_mean',
                      'rolling_3m_mean', 'rolling_year_median', 'rolling_7d_median', 'rolling_year_mean', 'rolling_7d_min'],
    'pet supplies': ['dcoilwtico', 'lag_1', 'lag_6', 'lag_7', 'lag_14', 'rolling_year_mean', 'rolling_3m_mean',
                     'rolling_1m_mean', 'rolling_16d_mean', 'rolling_7d_mean', 'rolling_5d_mean', 'rolling_5d_max',
                     'rolling_16d_median'],
    'players and electronics': ['dcoilwtico', 'lag_1', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_year_mean',
                                'rolling_7d_mean', 'rolling_16d_mean', 'rolling_1m_mean', 'rolling_3m_mean',
                                'rolling_1m_median'],
    'poultry': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_6m_max', 'rolling_3m_min',
                'rolling_1m_max', 'rolling_16d_mean', 'rolling_16d_max', 'rolling_10d_min', 'rolling_7d_median',
                'rolling_7d_mean', 'rolling_7d_max', 'rolling_5d_mean', 'rolling_5d_min', 'rolling_3d_median',
                'rolling_3d_max', 'rolling_3d_min', 'rolling_10d_mean', 'rolling_10d_max', 'rolling_6m_mean',
                'rolling_3m_median', 'rolling_year_mean', 'rolling_year_median'],
    'prepared foods': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_1m_max', 'rolling_1m_min',
                       'rolling_7d_mean', 'rolling_16d_mean', 'rolling_1m_median', 'rolling_year_max', 'rolling_3m_median',
                       'rolling_16d_median'],
    'produce': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_6m_min', 'rolling_1m_min',
                'rolling_16d_max', 'rolling_16d_min', 'rolling_10d_min', 'rolling_7d_median', 'rolling_5d_median',
                'rolling_5d_min', 'rolling_3d_median', 'rolling_3d_max', 'rolling_3d_min', 'rolling_7d_mean',
                'rolling_3m_mean', 'rolling_3m_median', 'rolling_year_max', 'rolling_7d_max'],
    'school and office supplies': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_year_mean',
                                   'rolling_16d_median', 'rolling_10d_mean', 'rolling_7d_median', 'rolling_7d_mean',
                                   'rolling_7d_min', 'rolling_5d_mean', 'rolling_5d_min', 'rolling_3d_median',
                                   'rolling_3d_mean', 'rolling_3d_max', 'rolling_3d_min', 'rolling_7d_max'],
    'seafood': ['dcoilwtico', 'lag_1', 'lag_2', 'lag_4', 'lag_6', 'lag_7', 'lag_14', 'rolling_year_median', 'rolling_1m_max',
                'rolling_16d_mean', 'rolling_16d_max', 'rolling_7d_min', 'rolling_5d_max', 'rolling_3d_max', 'rolling_3d_min',
                'rolling_10d_min', 'rolling_3d_median']
}

In [None]:
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y):
    """Optuna objective: time-series cross-validated RMSLE of an L1 LightGBM regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters and to report pruning values.
    X : pandas.DataFrame
        Feature rows; sliced positionally with ``.iloc`` by the splitter.
    y : pandas.Series
        Target values aligned positionally with ``X``.

    Returns
    -------
    tuple
        ``(mean RMSLE over the folds, dict of per-fold lists for RMSLE / RMSE / MAE / R2)``.
    """
    param_grid = {
        'n_estimators': trial.suggest_categorical('n_estimators', [100, 500, 1000, 5000, 10000]),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
        'num_leaves': trial.suggest_int('num_leaves', 20, 3000, step=20),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 200, 10000, step=100),
        'lambda_l1': trial.suggest_int('lambda_l1', 0, 100, step=5),
        'lambda_l2': trial.suggest_int('lambda_l2', 0, 100, step=5),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 15),
    }

    # Window sizes are expressed in rows: presumably 54 rows (stores) per day, i.e. up to
    # 365 days of training data and a 16-day test window per fold — TODO confirm.
    tscv = TimeSeriesSplit(gap=0, max_train_size=365 * 54, n_splits=4, test_size=16 * 54)

    cv_scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}
    for train_indices, test_indices in tscv.split(X, y):
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

        model = lgb.LGBMRegressor(objective='regression_l1', **param_grid)
        # NOTE(review): the same trial receives reports from every fold, so iteration
        # numbers repeat across folds — confirm Optuna treats the repeated steps as intended.
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric='rmse',
            callbacks=[
                LightGBMPruningCallback(trial, 'rmse'),  # lets Optuna prune unpromising trials
                lgb.early_stopping(stopping_rounds=100),
            ],
        )

        # Negative predictions are clipped to zero; the log-based metric rejects negative values.
        y_pred = np.clip(model.predict(X_test), 0, None)

        cv_scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
        # Square root taken explicitly: the `squared=False` flag is not accepted by
        # recent scikit-learn releases, while this form works on every version.
        cv_scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
        cv_scores['MAE'].append(mean_absolute_error(y_test, y_pred))
        cv_scores['R2'].append(r2_score(y_test, y_pred))

    return np.mean(cv_scores['RMSLE']), cv_scores

In [None]:
# Hyper-parameter search: one Optuna study per product family.
# `best_params` ends up mapping family -> best hyper-parameters,
# `best_values` mapping family -> best objective value (mean cross-validated RMSLE).
families = X['family'].unique()
best_params = {family: {} for family in families}
best_values = dict.fromkeys(families, 0)

for current_family in families:
    # Family name surrounded by two blank lines on each side.
    print(f'\n\n{current_family}\n\n')

    # Keep only the features selected for this family, plus the store identifier.
    kept_columns = set(final_best_features[current_family]) | {'store_nbr'}
    columns_to_drop = [feature for feature in X if feature not in kept_columns]

    family_rows = X[X['family'] == current_family]
    X_current_family = family_rows.drop(columns=columns_to_drop)
    y_current_family = y.loc[family_rows.index]

    study = optuna.create_study(direction='minimize', study_name='LGBM Regressor')
    study.optimize(
        # objective() returns (mean RMSLE, per-fold scores); Optuna only needs the first item.
        lambda trial: objective(trial, X_current_family, y_current_family)[0],
        n_trials=20,
    )

    best_params[current_family].update(study.best_params)
    best_values[current_family] = study.best_value

In [None]:
# Print the tuned hyper-parameters, one block per family, separated by a blank line.
for family, all_params in best_params.items():
    print(family)
    for param, value in all_params.items():
        print(f'{param}:', value)
    print()

In [None]:
# Notes from earlier runs (presumably the median value computed below — TODO confirm):
# objective='regression', eval_metric='l2' -> ~0.47
# objective='regression_l1', eval_metric='rmsle' -> ~0.39

# Best objective value per family, keyed as '<family>:', and the median across families.
rmsle_errors = {f'{family}:': best_value for family, best_value in best_values.items()}
median(rmsle_errors.values())

In [None]:
# Splitter for the evaluation over all rows. Sizes are row counts.
rows_per_day = 33 * 54  # presumably 33 product families x 54 stores per day — TODO confirm
tscv = TimeSeriesSplit(
    n_splits=4,
    gap=0,
    max_train_size=365 * rows_per_day,
    test_size=16 * rows_per_day,
)

In [None]:
# Walk-forward evaluation of the per-family LightGBM models with recursive forecasting.
#
# For every split of `tscv`, one model per family is fitted on the training window and the
# test window is predicted one day at a time:
#   1. rolling features of the day are recomputed from the training sales followed by the
#      predictions made so far for the test window;
#   2. the day is predicted and negative values are clipped to zero;
#   3. the predictions are written into the matching `lag_<n>` columns of later test days.
# The fold prediction is the day-by-day forecast itself. It is deliberately NOT re-predicted
# from the whole test frame afterwards: the recomputed rolling features only exist on the
# per-day copy, so a re-prediction would silently fall back to the original rolling columns.
#
# Note: X_train / y_train / X_test / y_test remain defined with the last fold's values.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Explicit copy: the 'sales' column of the test fold is replaced by predictions below.
    X_test = X.iloc[test_indices].copy()
    y_test = y.iloc[test_indices]

    X_test['sales'] = 0.0
    for current_family in X['family'].unique():
        print(current_family)
        columns_to_drop = [feature for feature in X
                           if feature not in final_best_features[current_family]
                           and feature not in ('date', 'sales', 'store_nbr')]
        X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=columns_to_drop)
        y_train_current_family = y_train.loc[X_train_current_family.index]

        # 'store_nbr' is a model feature only when it was selected in either area. Each list
        # is tested separately: `'x' in a or b` would be true for any non-empty `b`.
        selected_features = best_features[current_family]
        use_store_nbr = ('store_nbr' in selected_features['green_area']
                         or 'store_nbr' in selected_features['blue_area'])
        # 'date' and 'sales' are bookkeeping columns and never passed to the model.
        non_feature_columns = ['date', 'sales'] if use_store_nbr else ['date', 'sales', 'store_nbr']
        fit_kwargs = {'categorical_feature': ['store_nbr']} if use_store_nbr else {}

        model = lgb.LGBMRegressor(objective='regression_l1', **best_params[current_family])
        model.fit(X_train_current_family.drop(columns=non_feature_columns), y_train_current_family, **fit_kwargs)

        current_family_indices_test = X_test[X_test['family'] == current_family].index
        X_test_current_family = X_test[X_test['family'] == current_family].drop(columns=columns_to_drop)

        current_days_to_shift = [int(column[len('lag_'):]) for column in X_train_current_family.columns
                                 if column.startswith('lag_')]
        rolling_features = [column for column in X_test_current_family.columns if column.startswith('rolling')]
        test_days = X_test_current_family['date'].unique()

        for current_day_index, current_day in enumerate(test_days):
            day_mask = X_test_current_family['date'] == current_day
            X_test_for_current_day = X_test_current_family[day_mask].copy()
            current_day_indices = X_test_for_current_day.index

            if rolling_features:
                # Rebuilt every day because 'sales' of the test rows changes after each prediction.
                X_all_rows = pd.concat([X_train_current_family, X_test_current_family])
                for current_rolling_feature in rolling_features:
                    # Column names look like 'rolling_<period>_<aggregate>'.
                    name_parts = current_rolling_feature.split('_')
                    current_period, current_aggregate = name_parts[1], name_parts[2]
                    recomputed = X_all_rows.groupby(['store_nbr'])['sales'].apply(
                        lambda x: x.shift().rolling(rolling_periods[current_period]).agg({'sales': current_aggregate})
                    )
                    X_test_for_current_day[current_rolling_feature] = recomputed.loc[current_day_indices]

            predictions = model.predict(X_test_for_current_day.drop(columns=non_feature_columns))
            predictions[predictions < 0] = 0
            X_test_current_family.loc[day_mask, 'sales'] = predictions

            # Today's forecast is the lag-n value of the day n positions ahead, as long as
            # that day is still inside the test window.
            # Assumes rows of every day are stored in the same (store) order — TODO confirm.
            for current_lag in current_days_to_shift:
                target_position = current_day_index + current_lag
                if target_position < len(test_days):
                    target_mask = X_test_current_family['date'] == test_days[target_position]
                    X_test_current_family.loc[target_mask, 'lag_{}'.format(current_lag)] = predictions

        # Already clipped at zero day by day.
        y_pred_current_family = X_test_current_family['sales'].to_numpy()
        X_test.loc[current_family_indices_test, 'sales'] = y_pred_current_family

    y_pred = X_test['sales'].copy()
    X_test = X_test.drop(columns=['sales'])

    fold_rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    print(fold_rmsle)
    scores['RMSLE'].append(fold_rmsle)
    # Explicit square root instead of `squared=False`, which recent scikit-learn rejects.
    scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

print()
for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# RMSLE of each cross-validation fold, in split order; labelled so the figure stands alone.
fig, ax = plt.subplots()
ax.plot(scores['RMSLE'], marker='o')
ax.set(title='RMSLE per cross-validation fold', xlabel='fold', ylabel='RMSLE');

In [None]:
# Submission template; its 'sales' column is filled with the forecasts further down.
sample_submission_path = DATA_ROOT / 'sample_submission.csv'
submission = pd.read_csv(sample_submission_path)

In [None]:
# Create every 'rolling_<period>_<function>' column on the test frame with a 0 placeholder.
for period in rolling_periods:
    for function in aggregate_functions:
        test_data[f'rolling_{period}_{function}'] = 0

In [None]:
# Recursive forecast of the competition test period, one LightGBM model per family.
#
# Each family model is fitted on X_train / y_train and the test days are predicted one at a
# time: rolling features of the day are recomputed from training sales followed by the
# predictions made so far, the day is predicted (negatives clipped to zero), and the
# predictions are written into the `lag_<n>` columns of later test days.
# The submitted values are these day-by-day forecasts. They are deliberately NOT re-predicted
# from the whole test frame afterwards: the recomputed rolling features only exist on the
# per-day copy, so a re-prediction would see the 0 placeholders in the rolling columns.
#
# NOTE(review): X_train / y_train are whatever earlier cells last assigned (the last
# cross-validation fold when the notebook is run top to bottom) — confirm that refitting on
# that window is intended.
test_data['sales'] = 0.0
for current_family in X_train['family'].unique():
    print(current_family)
    selected_features = best_features[current_family]
    columns_to_drop = [feature for feature in X_train
                       if feature not in selected_features['green_area']
                       and feature not in selected_features['blue_area']
                       and feature not in ('date', 'sales', 'store_nbr')]
    X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=columns_to_drop)
    y_train_current_family = y_train.loc[X_train_current_family.index]

    # 'store_nbr' is a model feature only when it was selected in either area. Each list
    # is tested separately: `'x' in a or b` would be true for any non-empty `b`.
    use_store_nbr = ('store_nbr' in selected_features['green_area']
                     or 'store_nbr' in selected_features['blue_area'])
    # 'date' and 'sales' are bookkeeping columns and never passed to the model.
    non_feature_columns = ['date', 'sales'] if use_store_nbr else ['date', 'sales', 'store_nbr']
    fit_kwargs = {'categorical_feature': ['store_nbr']} if use_store_nbr else {}

    model = lgb.LGBMRegressor(objective='regression_l1', **best_params[current_family])
    model.fit(X_train_current_family.drop(columns=non_feature_columns), y_train_current_family, **fit_kwargs)

    X_test_current_family = test_data[test_data['family'] == current_family].drop(columns=columns_to_drop)
    current_family_indices_test = X_test_current_family.index

    current_days_to_shift = [int(column[len('lag_'):]) for column in X_train_current_family.columns
                             if column.startswith('lag_')]
    rolling_features = [column for column in X_test_current_family.columns if column.startswith('rolling')]
    test_days = X_test_current_family['date'].unique()

    for current_day_index, current_day in enumerate(test_days):
        day_mask = X_test_current_family['date'] == current_day
        X_test_for_current_day = X_test_current_family[day_mask].copy()
        current_day_indices = X_test_for_current_day.index

        if rolling_features:
            # Rebuilt every day because 'sales' of the test rows changes after each prediction.
            # Assumes test_data's index labels do not collide with X_train's, otherwise the
            # `.loc[current_day_indices]` lookup below would also match training rows — TODO confirm.
            X_all_rows = pd.concat([X_train_current_family, X_test_current_family])
            for current_rolling_feature in rolling_features:
                # Column names look like 'rolling_<period>_<aggregate>'.
                name_parts = current_rolling_feature.split('_')
                current_period, current_aggregate = name_parts[1], name_parts[2]
                recomputed = X_all_rows.groupby(['store_nbr'])['sales'].apply(
                    lambda x: x.shift().rolling(rolling_periods[current_period]).agg({'sales': current_aggregate})
                )
                X_test_for_current_day[current_rolling_feature] = recomputed.loc[current_day_indices]

        predictions = model.predict(X_test_for_current_day.drop(columns=non_feature_columns))
        predictions[predictions < 0] = 0
        X_test_current_family.loc[day_mask, 'sales'] = predictions

        # Today's forecast is the lag-n value of the day n positions ahead, as long as
        # that day is still inside the test period.
        # Assumes rows of every day are stored in the same (store) order — TODO confirm.
        for current_lag in current_days_to_shift:
            target_position = current_day_index + current_lag
            if target_position < len(test_days):
                target_mask = X_test_current_family['date'] == test_days[target_position]
                X_test_current_family.loc[target_mask, 'lag_{}'.format(current_lag)] = predictions

    # Already clipped at zero day by day. The submission rows are addressed by test_data's
    # index labels, as in the original cell.
    y_pred_current_family = X_test_current_family['sales'].to_numpy()
    submission.loc[current_family_indices_test, 'sales'] = y_pred_current_family

In [None]:
# Write the filled-in submission, without the index column.
submission_output_path = './data/kaggle/store-sales-time-series-forecasting/lgbm_boruta_optuna.csv'
submission.to_csv(submission_output_path, index=False)