In [None]:
import os
# Work from the project root so the relative data paths used below resolve.
# Raises KeyError when PROJECT_ROOT is not set in the environment.
project_root = os.environ['PROJECT_ROOT']
os.chdir(project_root)

In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import sklearn
import pdpipe as pdp
import statsmodels.api as sm
from pandas.core.common import SettingWithCopyWarning
from sklearn.base import BaseEstimator, RegressorMixin, MetaEstimatorMixin, TransformerMixin, clone
from datetime import timedelta
from statistics import median, mean, stdev
from pdpipe import df
from pathlib import Path
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression, mutual_info_regression, RFE
from sklearn.model_selection import cross_validate, cross_val_score, TimeSeriesSplit, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from mentorship.ml.models.reg import PositiveRegressor
from mentorship.ml.models.common import SplitPipeline
from mentorship.ml.models.kaggle.storesales.linear import PipelineLinearV1
from mentorship.ml.models.kaggle.storesales.ridge import PipelineRidgeV1
from mentorship.ml.models.kaggle.storesales.lasso import PipelineLassoV1
from mentorship.ml.models.kaggle.storesales.elasticnet import PipelineElasticNetV1
from mentorship.features.kaggle.storesales.etl import ETLTransformer
from mentorship.ml.cv.split import DateTimeSeriesSplit
from mentorship.ml.cv.util import print_cv_test_scores


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Globally silence FutureWarning and pandas' SettingWithCopyWarning.
# NOTE(review): several cells below assign columns on filtered frames; with
# this filter active pandas will not flag them, so review those spots by hand.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
# Scoring names in scikit-learn's "higher is better" convention.
# 'neg_mean_absolute_percentage_error' is currently left out of the list.
CV_METRICS = ['neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_absolute_error', 'r2']

In [None]:
# Competition data directory, relative to the current working directory.
DATA_ROOT = Path('data') / 'kaggle' / 'store-sales-time-series-forecasting'

In [None]:
# Load the raw training table and preview its first rows.
train = pd.read_csv(DATA_ROOT.joinpath('train.csv'))
train.head()

# 1. Linear Regression — features: `store_nbr`, `dcoilwtico`, target lags (shifts of 1, 2, 4, 6, 7 and 14 days, filled recursively at prediction time) and rolling-window features, selected with `SelectFromModel`

In [None]:
# Run the project ETL on the training table without the 'onpromotion' column.
# `drop` already returns a new frame, so `train` itself is left untouched.
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(train.drop(columns='onpromotion'))[0]

In [None]:
# Lag offsets (in rows per store/family series) used for the target lag features.
days_to_shift = [
    1, 2, 4,
    6, 7, 14,
]

In [None]:
# Keep only rows that have an oil price and renumber the index from zero.
has_oil_price = X['dcoilwtico'].notna()
X = X.loc[has_oil_price].reset_index(drop=True)

In [None]:
# Load the test table, run it through the same ETL as the training data and
# create the lag_<k> columns on both frames.
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]

# The distinct dates do not change inside the loops, so compute them once.
train_unique_dates = X['date'].unique()
test_unique_dates = test_data['date'].unique()

for current_lag in days_to_shift:
    lag_column = 'lag_{}'.format(current_lag)
    # Training lags: shift sales within each (store, family) series.
    X.loc[:, lag_column] = X.groupby(['store_nbr', 'family'])['sales'].shift(current_lag)
    # Float placeholder so the later float assignments do not upcast an int column;
    # test days beyond the known history keep 0 until they are filled recursively.
    test_data.loc[:, lag_column] = 0.0

    # The first `current_lag` test days can still look back into the training
    # period: test day i takes the sales of the (current_lag - i)-th last training day.
    # NOTE(review): values are copied positionally, so both frames must list the
    # store/family series in the same order within each date.
    for i in range(current_lag):
        test_rows = test_data.index[test_data['date'] == test_unique_dates[i]]
        train_rows = X.index[X['date'] == train_unique_dates[-(current_lag - i)]]
        test_data.loc[test_rows, lag_column] = X.loc[train_rows, 'sales'].tolist()

In [None]:
# Restrict to the last 365 calendar days of training data.
first_day_of_last_year = pd.to_datetime(X['date'].unique()[-1]) - timedelta(days=365)
last_year_cutoff = str(first_day_of_last_year).split(' ')[0]
X_train = X[X['date'] >= last_year_cutoff]
indexer = X_train.index

# Placeholder 'sales' for the test period: the last-year median of each
# (store, family) series. The medians are the same for every test date, so
# they are computed once instead of once per date.
# NOTE(review): values are assigned positionally; this relies on each test date
# listing the series in the groupby's sorted (store_nbr, family) order.
median_sales_per_series = X_train.groupby(['store_nbr', 'family'])['sales'].median().tolist()

test_data.loc[:, 'sales'] = 0.0
for date in test_data['date'].unique():
    test_data.loc[test_data['date'] == date, 'sales'] = median_sales_per_series

In [None]:
# Move 'sales' to the last column of the training frame.
X_sales_copy = X['sales'].copy()
X = X.drop(columns=['sales']).assign(sales=X_sales_copy)

# Stack train and test rows so rolling windows can run across the boundary.
test_dates = test_data['date'].unique()
X_all_rows = pd.concat([X, test_data]).reset_index(drop=True)

In [None]:
# Rolling window lengths (in rows of each store/family series), keyed by label.
rolling_periods = {
    'year': 365,
    '6m': 183,
    '3m': 92,
    '1m': 31,
    '16d': 16,
    '10d': 10,
    '7d': 7,
    '5d': 5,
    '3d': 3,
}
# Aggregations computed over every window.
aggregate_functions = ['median', 'mean', 'sum', 'max', 'min']

In [None]:
# Rolling statistics of past sales per (store, family) series.
# `transform` returns a result aligned with X_all_rows' index, and the
# aggregation is passed by name instead of through a {'sales': fn} dict, which
# is a column-renaming form that does not apply to a Series window.
sales_by_series = X_all_rows.groupby(['store_nbr', 'family'])['sales']

for period, days in rolling_periods.items():
    for function in aggregate_functions:
        # shift() keeps the current row's own sales out of its window.
        X_all_rows['rolling_{0}_{1}'.format(period, function)] = sales_by_series.transform(
            lambda series: series.shift().rolling(days).agg(function)
        )

In [None]:
# Columns that are min-max scaled: the oil price plus the median/mean/sum/max
# rolling features of every window. The rolling 'min' columns are not listed.
features_to_scale = ['dcoilwtico'] + [
    'rolling_{0}_{1}'.format(window_label, statistic)
    for window_label in ('year', '6m', '3m', '1m', '16d', '10d', '7d', '5d', '3d')
    for statistic in ('median', 'mean', 'sum', 'max')
]

In [None]:
# Every scaled feature except the first entry ('dcoilwtico').
features_to_drop = [name for position, name in enumerate(features_to_scale) if position > 0]

In [None]:
# Independent copy of the target, taken before 'sales' is dropped from X.
y = X.loc[:, 'sales'].copy()

In [None]:
# Split the stacked frame back into test and train parts, drop the target and
# one-hot encode the store number (first level dropped).
is_test_row = X_all_rows['date'].isin(test_dates)

test_data = pd.get_dummies(
    X_all_rows[is_test_row].reset_index(drop=True).drop(columns=['sales']),
    columns=['store_nbr'],
    drop_first=True,
)
X = pd.get_dummies(
    X_all_rows[~is_test_row].drop(columns=['sales']),
    columns=['store_nbr'],
    drop_first=True,
)

In [None]:
# Per-family feature selection on the last year of training data using the
# absolute coefficients of a plain linear regression.
last_year_rows = X['date'] >= str(first_day_of_last_year).split(' ')[0]
X_train = X[last_year_rows]
y_train = y.loc[indexer]

families = X_train['family'].unique()
best_features = dict.fromkeys(families, 0)
for current_family in families:
    family_rows = X_train['family'] == current_family
    current_family_indices_train = X_train[family_rows].index
    X_train_current_family = X_train[family_rows].drop(columns=['family'])
    y_train_current_family = y_train.loc[current_family_indices_train]

    # Each listed feature gets its own scaler, fitted on this family's rows only.
    scalers = []
    for current_feature in features_to_scale:
        current_scaler = MinMaxScaler()
        scaled_column = current_scaler.fit_transform(X_train_current_family[[current_feature]])
        X_train_current_family[[current_feature]] = scaled_column
        scalers.append(current_scaler)

    design_matrix = X_train_current_family.drop(columns=['date'])
    model = LinearRegression()
    model.fit(design_matrix, y_train_current_family)

    # Threshold sits 0.01 above the second-largest |coefficient|, so a feature
    # is kept only if its |coefficient| is at least that large.
    importance = np.abs(model.coef_)
    threshold = np.sort(importance)[-2] + 0.01
    sfm = SelectFromModel(model, threshold=threshold).fit(design_matrix, y_train_current_family)
    best_features[current_family] = design_matrix.columns[sfm.get_support()]

In [None]:
# Four folds; sizes are row counts written as days * 33 * 54.
# NOTE(review): 33 * 54 presumably is families * stores (rows per date) — confirm.
tscv = TimeSeriesSplit(
    n_splits=4,
    max_train_size=365 * 33 * 54,
    test_size=16 * 33 * 54,
    gap=0,
)

In [None]:
# Day-index boundaries for the recursive forecast: 16 minus each lag, listed
# from the largest lag to the smallest (i.e. ascending boundaries).
ends = [16 - lag for lag in reversed(days_to_shift)]

In [None]:
# Time-series cross-validation of one linear model per product family, with
# the lag features of each test fold filled recursively from the model's own
# predictions.
# NOTE(review): the rolling_* columns were computed from the true 'sales'
# before this split, so inside a test fold they still reflect actual sales of
# earlier test days; only the lag_* columns are replaced by predictions.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Explicit copy: the fold gets a scratch 'pred' column written into it.
    X_test = X.iloc[test_indices].copy()
    y_test = y.iloc[test_indices]

    # Float placeholder so assigning predictions does not upcast an int column.
    X_test['pred'] = 0.0
    for current_family in X['family'].unique():
        current_family_indices_train = X_train[X_train['family'] == current_family].index
        # Drop the scaled features that were not selected for this family.
        columns_to_drop = [x for x in features_to_scale if x not in best_features[current_family]]
        columns_to_drop.append('family')
        X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=columns_to_drop)
        y_train_current_family = y_train.loc[current_family_indices_train]

        # Scalers are fitted on the training fold and reused on the test fold.
        all_scalers = {}
        for current_feature in [x for x in features_to_scale if x in best_features[current_family]]:
            current_scaler = MinMaxScaler()
            X_train_current_family[[current_feature]] = current_scaler.fit_transform(X_train_current_family[[current_feature]])
            all_scalers[current_feature] = current_scaler

        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

        current_family_indices_test = X_test[X_test['family'] == current_family].index
        X_test_current_family = X_test[X_test['family'] == current_family].drop(columns=columns_to_drop)
        for current_feature in [x for x in features_to_scale if x in best_features[current_family]]:
            X_test_current_family[[current_feature]] = all_scalers[current_feature].transform(X_test_current_family[[current_feature]])

        # Recursive strategy: predict one test day at a time and write the
        # prediction into the lag_<k> column of the day k positions later.
        # After each boundary in `ends` the largest remaining lag is dropped,
        # because its target day would fall outside the 16-day fold.
        days_to_shift_copy = days_to_shift.copy()
        start = 0
        current_day_index = 0
        for end in ends:
            for current_day in X_test_current_family['date'].unique()[start:end]:
                current_day_plus_x = {}
                for current_lag in days_to_shift_copy:
                    current_day_plus_x[current_lag] = X_test_current_family['date'].unique()[current_day_index + current_lag]

                X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date', 'pred'])
                predictions = model.predict(X_test_for_current_day)

                for current_lag in days_to_shift_copy:
                    X_test_current_family.loc[X_test_current_family[X_test_current_family['date'] == current_day_plus_x[current_lag]].index, 'lag_{}'.format(current_lag)] = predictions

                current_day_index += 1

            days_to_shift_copy = days_to_shift_copy[:-1]
            start = end

        # Final pass over the whole fold with all lag columns filled in.
        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        X_test.loc[current_family_indices_test, 'pred'] = y_pred_current_family

    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])

    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): that keyword
    # is deprecated and absent from newer scikit-learn releases.
    scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# RMSLE of each cross-validation fold, in fold order, with labelled axes so
# the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(range(1, len(scores['RMSLE']) + 1), scores['RMSLE'], marker='o')
ax.set(title='RMSLE per CV fold (SelectFromModel features)', xlabel='CV fold', ylabel='RMSLE');

# 2. Linear Regression — features: `store_nbr`, `dcoilwtico`, target lags (shifts of 1, 2, 4, 6, 7 and 14 days, filled recursively at prediction time) and rolling-window features, selected with `SelectKBest`

In [None]:
# Rebuild the training frame from the raw table for this experiment:
# drop 'onpromotion' (returns a new frame) and run the project ETL.
train_without_promo = train.drop(columns='onpromotion')
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(train_without_promo)[0]

In [None]:
# Lag offsets for the target lag features (same set as in experiment 1).
days_to_shift = [
    1,
    2,
    4,
    6,
    7,
    14,
]

In [None]:
# Discard rows without an oil price, then renumber the index from zero.
X = X.loc[X['dcoilwtico'].notna()]
X = X.reset_index(drop=True)

In [None]:
# Load the test table, run it through the same ETL as the training data and
# create the lag_<k> columns on both frames.
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]

# The distinct dates do not change inside the loops, so compute them once.
train_unique_dates = X['date'].unique()
test_unique_dates = test_data['date'].unique()

for current_lag in days_to_shift:
    lag_column = 'lag_{}'.format(current_lag)
    # Training lags: shift sales within each (store, family) series.
    X.loc[:, lag_column] = X.groupby(['store_nbr', 'family'])['sales'].shift(current_lag)
    # Float placeholder so the later float assignments do not upcast an int column;
    # test days beyond the known history keep 0 until they are filled recursively.
    test_data.loc[:, lag_column] = 0.0

    # The first `current_lag` test days can still look back into the training
    # period: test day i takes the sales of the (current_lag - i)-th last training day.
    # NOTE(review): values are copied positionally, so both frames must list the
    # store/family series in the same order within each date.
    for i in range(current_lag):
        test_rows = test_data.index[test_data['date'] == test_unique_dates[i]]
        train_rows = X.index[X['date'] == train_unique_dates[-(current_lag - i)]]
        test_data.loc[test_rows, lag_column] = X.loc[train_rows, 'sales'].tolist()

In [None]:
# Restrict to the last 365 calendar days of training data.
first_day_of_last_year = pd.to_datetime(X['date'].unique()[-1]) - timedelta(days=365)
last_year_cutoff = str(first_day_of_last_year).split(' ')[0]
X_train = X[X['date'] >= last_year_cutoff]
indexer = X_train.index

# Placeholder 'sales' for the test period: the last-year median of each
# (store, family) series. The medians are the same for every test date, so
# they are computed once instead of once per date.
# NOTE(review): values are assigned positionally; this relies on each test date
# listing the series in the groupby's sorted (store_nbr, family) order.
median_sales_per_series = X_train.groupby(['store_nbr', 'family'])['sales'].median().tolist()

test_data.loc[:, 'sales'] = 0.0
for date in test_data['date'].unique():
    test_data.loc[test_data['date'] == date, 'sales'] = median_sales_per_series

In [None]:
# Re-append 'sales' so it becomes the last column of the training frame.
X_sales_copy = X['sales'].copy()
X = X.drop(columns=['sales'])
X = X.assign(sales=X_sales_copy)

# Remember which dates belong to the test period, then stack both frames
# under a fresh 0..n-1 index.
test_dates = test_data['date'].unique()
X_all_rows = pd.concat([X, test_data])
X_all_rows = X_all_rows.reset_index(drop=True)

In [None]:
# Window label -> window length in rows; insertion order defines column order.
rolling_periods = dict([
    ('year', 365), ('6m', 183), ('3m', 92),
    ('1m', 31), ('16d', 16), ('10d', 10),
    ('7d', 7), ('5d', 5), ('3d', 3),
])
# Statistics computed for each window.
aggregate_functions = [
    'median',
    'mean',
    'sum',
    'max',
    'min',
]

In [None]:
# Rolling statistics of past sales per (store, family) series.
# `transform` returns a result aligned with X_all_rows' index, and the
# aggregation is passed by name instead of through a {'sales': fn} dict, which
# is a column-renaming form that does not apply to a Series window.
sales_by_series = X_all_rows.groupby(['store_nbr', 'family'])['sales']

for period, days in rolling_periods.items():
    for function in aggregate_functions:
        # shift() keeps the current row's own sales out of its window.
        X_all_rows['rolling_{0}_{1}'.format(period, function)] = sales_by_series.transform(
            lambda series: series.shift().rolling(days).agg(function)
        )

In [None]:
# Columns that are min-max scaled: the oil price followed by the
# median/mean/sum/max rolling features of each window (no 'min' columns).
features_to_scale = ['dcoilwtico']
for window_label in ['year', '6m', '3m', '1m', '16d', '10d', '7d', '5d', '3d']:
    features_to_scale.extend(
        'rolling_' + window_label + '_' + statistic
        for statistic in ['median', 'mean', 'sum', 'max']
    )

In [None]:
# Target vector, copied so that dropping 'sales' from X later cannot affect it.
y = X.loc[:, 'sales'].copy()

In [None]:
# Separate the stacked frame into test and train rows by date, remove the
# target and one-hot encode 'store_nbr' with the first level dropped.
in_test_period = X_all_rows['date'].isin(test_dates)

test_rows = X_all_rows[in_test_period].reset_index(drop=True).drop(columns=['sales'])
test_data = pd.get_dummies(test_rows, columns=['store_nbr'], drop_first=True)

train_rows = X_all_rows[~in_test_period].drop(columns=['sales'])
X = pd.get_dummies(train_rows, columns=['store_nbr'], drop_first=True)

In [None]:
# Last-year slice of the features and the matching targets (via `indexer`).
last_year_start = str(first_day_of_last_year).split(' ')[0]
X_train = X.loc[X['date'] >= last_year_start]
y_train = y.loc[indexer]

In [None]:
# Per-family selection of the 10 features with the highest mutual information
# with the target, computed on the last year of training data.
MUTUAL_INFO_SEED = 42


def seeded_mutual_info(features, target):
    """Score features with mutual_info_regression under a fixed random_state.

    The estimator is randomised, so without a seed the selected features can
    change from one run of the notebook to the next.
    """
    return mutual_info_regression(features, target, random_state=MUTUAL_INFO_SEED)


best_features = {family: 0 for family in X_train['family'].unique()}
for current_family in X_train['family'].unique():
    print(current_family)
    current_family_indices_train = X_train[X_train['family'] == current_family].index
    X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['family'])
    y_train_current_family = y_train.loc[current_family_indices_train]

    # Scale each listed feature on this family's rows; the fitted scalers are
    # not needed afterwards, so they are not collected.
    for current_feature in features_to_scale:
        current_scaler = MinMaxScaler()
        X_train_current_family[[current_feature]] = current_scaler.fit_transform(X_train_current_family[[current_feature]])

    selector = SelectKBest(seeded_mutual_info, k=10)
    selector.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)
    best_features[current_family] = selector.get_feature_names_out()

In [None]:
# Four-fold time-series splitter; sizes are row counts (days * 33 * 54).
# NOTE(review): 33 * 54 presumably is families * stores (rows per date) — confirm.
cv_split_options = dict(gap=0, max_train_size=365 * 33 * 54, n_splits=4, test_size=16 * 33 * 54)
tscv = TimeSeriesSplit(**cv_split_options)

In [None]:
# Time-series cross-validation of one linear model per product family, with
# the lag features of each test fold filled recursively from the model's own
# predictions.
# NOTE(review): the rolling_* columns were computed from the true 'sales'
# before this split, so inside a test fold they still reflect actual sales of
# earlier test days; only the lag_* columns are replaced by predictions.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

# Day-index boundaries for the recursive forecast (16 minus each lag, ascending).
# Defined here so this experiment does not depend on a variable left over from
# an earlier section of the notebook.
ends = [(16 - x) for x in days_to_shift]
ends.reverse()

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Explicit copy: the fold gets a scratch 'pred' column written into it.
    X_test = X.iloc[test_indices].copy()
    y_test = y.iloc[test_indices]

    # Float placeholder so assigning predictions does not upcast an int column.
    X_test['pred'] = 0.0
    for current_family in X['family'].unique():
        current_family_indices_train = X_train[X_train['family'] == current_family].index
        # Drop the scaled features that were not selected for this family.
        columns_to_drop = [x for x in features_to_scale if x not in best_features[current_family]]
        columns_to_drop.append('family')
        X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=columns_to_drop)
        y_train_current_family = y_train.loc[current_family_indices_train]

        # Scalers are fitted on the training fold and reused on the test fold.
        all_scalers = {}
        for current_feature in [x for x in features_to_scale if x in best_features[current_family]]:
            current_scaler = MinMaxScaler()
            X_train_current_family[[current_feature]] = current_scaler.fit_transform(X_train_current_family[[current_feature]])
            all_scalers[current_feature] = current_scaler

        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

        current_family_indices_test = X_test[X_test['family'] == current_family].index
        X_test_current_family = X_test[X_test['family'] == current_family].drop(columns=columns_to_drop)
        for current_feature in [x for x in features_to_scale if x in best_features[current_family]]:
            X_test_current_family[[current_feature]] = all_scalers[current_feature].transform(X_test_current_family[[current_feature]])

        # Recursive strategy: predict one test day at a time and write the
        # prediction into the lag_<k> column of the day k positions later.
        # After each boundary in `ends` the largest remaining lag is dropped,
        # because its target day would fall outside the 16-day fold.
        days_to_shift_copy = days_to_shift.copy()
        start = 0
        current_day_index = 0
        for end in ends:
            for current_day in X_test_current_family['date'].unique()[start:end]:
                current_day_plus_x = {}
                for current_lag in days_to_shift_copy:
                    current_day_plus_x[current_lag] = X_test_current_family['date'].unique()[current_day_index + current_lag]

                X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date', 'pred'])
                predictions = model.predict(X_test_for_current_day)

                for current_lag in days_to_shift_copy:
                    X_test_current_family.loc[X_test_current_family[X_test_current_family['date'] == current_day_plus_x[current_lag]].index, 'lag_{}'.format(current_lag)] = predictions

                current_day_index += 1

            days_to_shift_copy = days_to_shift_copy[:-1]
            start = end

        # Final pass over the whole fold with all lag columns filled in.
        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        X_test.loc[current_family_indices_test, 'pred'] = y_pred_current_family

    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])

    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): that keyword
    # is deprecated and absent from newer scikit-learn releases.
    scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

# 3. Linear Regression — features: `store_nbr`, `dcoilwtico`, target lags (shifts of 1, 2, 4, 6, 7 and 14 days, filled recursively at prediction time) and rolling-window features, selected with L1 (Lasso) regularisation

In [None]:
# Fresh training frame for the L1-selection experiment: remove 'onpromotion'
# (a new frame is returned, `train` is unchanged) and apply the project ETL.
train_transformer = ETLTransformer(date_column='date', id_column='id')
etl_output = train_transformer.transform(train.drop(columns='onpromotion'))
X = etl_output[0]

In [None]:
# Candidate lag offsets; the Lasso step below decides which lags are kept per family.
days_to_shift = [
    1, 2,
    4, 6,
    7, 14,
]

In [None]:
# Remove rows with a missing oil price and reset to a clean 0..n-1 index.
oil_price_known = X['dcoilwtico'].notna()
X = X[oil_price_known].reset_index(drop=True)

In [None]:
# Load the test table, run it through the same ETL as the training data and
# create the lag_<k> columns on both frames.
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]

# The distinct dates do not change inside the loops, so compute them once.
train_unique_dates = X['date'].unique()
test_unique_dates = test_data['date'].unique()

for current_lag in days_to_shift:
    lag_column = 'lag_{}'.format(current_lag)
    # Training lags: shift sales within each (store, family) series.
    X.loc[:, lag_column] = X.groupby(['store_nbr', 'family'])['sales'].shift(current_lag)
    # Float placeholder so the later float assignments do not upcast an int column;
    # test days beyond the known history keep 0 until they are filled recursively.
    test_data.loc[:, lag_column] = 0.0

    # The first `current_lag` test days can still look back into the training
    # period: test day i takes the sales of the (current_lag - i)-th last training day.
    # NOTE(review): values are copied positionally, so both frames must list the
    # store/family series in the same order within each date.
    for i in range(current_lag):
        test_rows = test_data.index[test_data['date'] == test_unique_dates[i]]
        train_rows = X.index[X['date'] == train_unique_dates[-(current_lag - i)]]
        test_data.loc[test_rows, lag_column] = X.loc[train_rows, 'sales'].tolist()

In [None]:
# Use the 'sales' column stored in linreg_best_lags_only_v2.csv as placeholder
# sales for the test period (feeds the rolling features computed below).
# NOTE(review): the column is aligned on index labels, so test_data must carry
# the same 0..n-1 index as the file — confirm against the ETL output.
test_data_sales = pd.read_csv(DATA_ROOT.joinpath('linreg_best_lags_only_v2.csv'))
placeholder_sales = test_data_sales['sales'].copy()
test_data['sales'] = placeholder_sales

In [None]:
# Put 'sales' at the end of the training frame's columns.
X_sales_copy = X['sales'].copy()
X = X.drop(columns=['sales']).assign(sales=X_sales_copy)

# Record the test dates and stack train + test rows under a new 0..n-1 index.
test_dates = test_data['date'].unique()
stacked_frames = [X, test_data]
X_all_rows = pd.concat(stacked_frames).reset_index(drop=True)

In [None]:
# Rolling windows: label -> number of rows in the window.
rolling_periods = dict(zip(
    ['year', '6m', '3m', '1m', '16d', '10d', '7d', '5d', '3d'],
    [365, 183, 92, 31, 16, 10, 7, 5, 3],
))
# Aggregations evaluated on every window.
aggregate_functions = 'median mean sum max min'.split()

In [None]:
# Rolling statistics of past sales per (store, family) series.
# `transform` returns a result aligned with X_all_rows' index, and the
# aggregation is passed by name instead of through a {'sales': fn} dict, which
# is a column-renaming form that does not apply to a Series window.
sales_by_series = X_all_rows.groupby(['store_nbr', 'family'])['sales']

for period, days in rolling_periods.items():
    for function in aggregate_functions:
        # shift() keeps the current row's own sales out of its window.
        X_all_rows['rolling_{0}_{1}'.format(period, function)] = sales_by_series.transform(
            lambda series: series.shift().rolling(days).agg(function)
        )

In [None]:
# Min-max scaled columns: oil price, then median/mean/sum/max of each rolling
# window. The rolling 'min' columns are not part of this list.
scaled_window_labels = ('year', '6m', '3m', '1m', '16d', '10d', '7d', '5d', '3d')
scaled_statistics = ('median', 'mean', 'sum', 'max')
features_to_scale = ['dcoilwtico', *(
    f'rolling_{window_label}_{statistic}'
    for window_label in scaled_window_labels
    for statistic in scaled_statistics
)]

In [None]:
# Copy of the training target, taken before X loses its 'sales' column.
y = X.loc[:, 'sales'].copy()

In [None]:
# Split the stacked frame by date into test and train parts, drop the target
# and one-hot encode 'store_nbr' (first level dropped).
test_period_mask = X_all_rows['date'].isin(test_dates)
dummy_options = dict(columns=['store_nbr'], drop_first=True)

test_data = X_all_rows[test_period_mask].reset_index(drop=True).drop(columns=['sales'])
test_data = pd.get_dummies(test_data, **dummy_options)

X = X_all_rows[~test_period_mask].drop(columns=['sales'])
X = pd.get_dummies(X, **dummy_options)

In [None]:
# Remove every row with at least one missing feature, and the same index
# labels from the target.
has_missing_value = X.isna().any(axis=1)
nan_ind = X.index[has_missing_value]
X = X[~has_missing_value]
y = y.drop(index=nan_ind)

In [None]:
# Last 365 calendar days of the cleaned training data, with matching targets.
last_training_date = pd.to_datetime(X['date'].unique()[-1])
first_day_of_last_year = last_training_date - timedelta(days=365)
last_year_start = str(first_day_of_last_year).split(' ')[0]

X_train = X[X['date'] >= last_year_start]
indexer = X_train.index
y_train = y.loc[indexer]

In [None]:
# Per-family L1-based selection: fit a Lasso on the last-year rows and keep
# the columns that SelectFromModel marks as supported.
best_features = {}
for current_family in X_train['family'].unique():
    family_rows = X_train['family'] == current_family
    current_family_indices_train = X_train[family_rows].index
    X_train_current_family = X_train[family_rows].drop(columns=['family'])
    y_train_current_family = y_train.loc[current_family_indices_train]

    # Scale each listed feature independently on this family's rows.
    for current_feature in features_to_scale:
        feature_values = X_train_current_family[[current_feature]]
        X_train_current_family[[current_feature]] = MinMaxScaler().fit_transform(feature_values)

    design_matrix = X_train_current_family.drop(columns=['date'])
    lss = Lasso(alpha=0.15)
    lss.fit(design_matrix, y_train_current_family)
    model_select = SelectFromModel(lss, prefit=True)
    selected_columns = design_matrix.columns[model_select.get_support()]
    best_features[current_family] = selected_columns.tolist()

In [None]:
# Four-fold time-series splitter; sizes are row counts (days * 33 * 54).
# NOTE(review): 33 * 54 presumably is families * stores (rows per date) — confirm.
tscv = TimeSeriesSplit(
    n_splits=4,
    gap=0,
    test_size=16 * 33 * 54,
    max_train_size=365 * 33 * 54,
)

In [None]:
# Time-series cross-validation of one linear model per product family using
# only the Lasso-selected features; selected lag features are filled
# recursively from the model's own predictions inside each test fold.
# NOTE(review): the rolling_* columns were computed before this split, so
# inside a test fold they are not replaced by predictions; only lag_* are.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Explicit copy: the fold gets a scratch 'pred' column written into it.
    X_test = X.iloc[test_indices].copy()
    y_test = y.iloc[test_indices]

    # Float placeholder so assigning predictions does not upcast an int column.
    X_test['pred'] = 0.0
    for current_family in X['family'].unique():
        current_family_indices_train = X_train[X_train['family'] == current_family].index
        X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['family'])
        y_train_current_family = y_train.loc[current_family_indices_train]

        # Keep only the columns selected for this family (this also removes 'date').
        X_train_current_family = X_train_current_family.drop(columns=[x for x in X_train_current_family.columns if x not in best_features[current_family]])

        # Scalers are fitted on the training fold and reused on the test fold.
        all_scalers = {}
        for current_feature in [x for x in features_to_scale if x in best_features[current_family]]:
            current_scaler = MinMaxScaler()
            X_train_current_family[[current_feature]] = current_scaler.fit_transform(X_train_current_family[[current_feature]])
            all_scalers[current_feature] = current_scaler

        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family, y_train_current_family)

        current_family_indices_test = X_test[X_test['family'] == current_family].index
        X_test_current_family = X_test[X_test['family'] == current_family]
        # Same column selection as for training, but 'date' and 'pred' are kept
        # because the recursive loop below needs them.
        X_test_current_family = X_test_current_family.drop(
            columns=[x for x in X_test_current_family.columns
                     if x not in best_features[current_family] and x != 'date' and x != 'pred']
        )
        for current_feature in [x for x in features_to_scale if x in best_features[current_family]]:
            X_test_current_family[[current_feature]] = all_scalers[current_feature].transform(X_test_current_family[[current_feature]])

        # Lags that survived selection for this family, parsed from 'lag_<k>' names,
        # and the matching day-index boundaries (16 minus each lag, reversed).
        current_days_to_shift = [int(x[4:]) for x in best_features[current_family] if x[:3] == 'lag']
        ends = [(16 - x) for x in current_days_to_shift]
        ends.reverse()

        # Recursive strategy: predict one test day at a time and write the
        # prediction into the lag_<k> column of the day k positions later; after
        # each boundary the last lag in the list is dropped.
        start = 0
        current_day_index = 0
        for end in ends:
            for current_day in X_test_current_family['date'].unique()[start:end]:
                current_day_plus_x = {}
                for current_lag in current_days_to_shift:
                    current_day_plus_x[current_lag] = X_test_current_family['date'].unique()[current_day_index + current_lag]

                X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date', 'pred'])
                predictions = model.predict(X_test_for_current_day)

                for current_lag in current_days_to_shift:
                    X_test_current_family.loc[X_test_current_family[X_test_current_family['date'] == current_day_plus_x[current_lag]].index, 'lag_{}'.format(current_lag)] = predictions

                current_day_index += 1

            current_days_to_shift = current_days_to_shift[:-1]
            start = end

        # Final pass over the whole fold with all lag columns filled in.
        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        X_test.loc[current_family_indices_test, 'pred'] = y_pred_current_family

    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])

    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): that keyword
    # is deprecated and absent from newer scikit-learn releases.
    scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# RMSLE of each cross-validation fold, in fold order, with labelled axes so
# the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(range(1, len(scores['RMSLE']) + 1), scores['RMSLE'], marker='o')
ax.set(title='RMSLE per CV fold (L1-selected features)', xlabel='CV fold', ylabel='RMSLE');

In [None]:
# Submission template; its 'sales' column is filled with predictions below.
submission = pd.read_csv(DATA_ROOT.joinpath('sample_submission.csv'))

In [None]:
# Training data for the final models: rows on or after the last-year cut-off.
last_year_start = str(first_day_of_last_year).split(' ')[0]
X_train = X[X['date'] >= last_year_start]
indexer = X_train.index
y_train = y.loc[indexer]

In [None]:
# Final models: for every product family, fit on the last year of training
# data with the Lasso-selected features, forecast the test period recursively
# and write the predictions into the submission frame.
# NOTE(review): the rolling_* features of test_data were computed from the
# placeholder 'sales' loaded from linreg_best_lags_only_v2.csv, not from this
# model's own predictions.
for current_family in X_train['family'].unique():
    current_family_indices_train = X_train[X_train['family'] == current_family].index
    X_train_current_family = X_train[X_train['family'] == current_family]
    # Keep only the selected columns (this removes 'family' and 'date' too).
    X_train_current_family = X_train_current_family.drop(columns=[x for x in X_train_current_family.columns \
                                                                  if x not in best_features[current_family]])
    y_train_current_family = y_train.loc[current_family_indices_train]
                    
    # One scaler per selected feature, fitted on training rows and reused on test rows.
    all_scalers = {}
    for current_feature in [x for x in features_to_scale if x in best_features[current_family]]:
        current_scaler = MinMaxScaler()
        X_train_current_family[[current_feature]] = current_scaler.fit_transform(X_train_current_family[[current_feature]])
        all_scalers[current_feature] = current_scaler
        
    model = PositiveRegressor(LinearRegression())
    model.fit(X_train_current_family, y_train_current_family)    
    
    # Same column selection for the test rows, keeping 'date' for the day-by-day loop.
    X_test_current_family = test_data[test_data['family'] == current_family]
    X_test_current_family = X_test_current_family.drop(columns=[x for x in X_test_current_family.columns \
                                                                if x not in best_features[current_family] and x != 'date'])
    for current_feature in [x for x in features_to_scale if x in best_features[current_family]]:
        X_test_current_family[[current_feature]] = all_scalers[current_feature].transform(X_test_current_family[[current_feature]])
    
    # Lags selected for this family (parsed from 'lag_<k>' names) and the
    # day-index boundaries derived from them (16 minus each lag, reversed).
    current_days_to_shift = [int(x[4:]) for x in best_features[current_family] if x[:3] == 'lag']
    ends = [(16 - x) for x in current_days_to_shift]
    ends.reverse()
        
    # Recursive forecast: predict one day, then copy that prediction into the
    # lag_<k> column of the day k positions later.
    start = 0
    current_day_index = 0
    for end in ends:
        for current_day in X_test_current_family['date'].unique()[start:end]:
            # Target date for each remaining lag, relative to the current day.
            current_day_plus_x = {}
            for current_lag in current_days_to_shift:
                current_day_plus_x[current_lag] = X_test_current_family['date'].unique()[current_day_index + current_lag]
                
            X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date'])
            predictions = model.predict(X_test_for_current_day)
            
            # Positional assignment: relies on every date listing the stores in the same order.
            for current_lag in current_days_to_shift:
                X_test_current_family.loc[X_test_current_family[X_test_current_family['date'] == current_day_plus_x[current_lag]].index, 'lag_{}'.format(current_lag)] = predictions
                
            current_day_index += 1
                
        # Past this boundary the last lag in the list is no longer written.
        current_days_to_shift = current_days_to_shift[:-1]
        start = end
            
        
    # Final prediction for all test days of this family with the lags filled in.
    y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date']))
    
    # NOTE(review): assumes test_data's index labels match the row labels of
    # the submission template — confirm both follow the same row order.
    test_indices = test_data[test_data['family'] == current_family].index
    submission.loc[test_indices, 'sales'] = y_pred_current_family

In [None]:
# Write the submission next to the other competition files. The path is built
# from DATA_ROOT instead of repeating the directory as a hard-coded string;
# it resolves to the same location relative to the working directory.
submission.to_csv(DATA_ROOT / 'linreg_best_features_l1_fs.csv', index=False)