In [1]:
# Switch the working directory to the project root so that the relative data
# paths used below resolve. Raises KeyError if PROJECT_ROOT is not set.
import os
os.chdir(os.environ['PROJECT_ROOT'])

In [2]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import sklearn
import pdpipe as pdp
import statsmodels.api as sm
from pandas.core.common import SettingWithCopyWarning
from sklearn.base import BaseEstimator, RegressorMixin, MetaEstimatorMixin, TransformerMixin, clone
from datetime import timedelta
from statistics import median, mean, stdev
from pdpipe import df
from pathlib import Path
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression, mutual_info_regression, RFE
from sklearn.model_selection import cross_validate, cross_val_score, TimeSeriesSplit, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from mentorship.ml.models.reg import PositiveRegressor
from mentorship.ml.models.common import SplitPipeline
from mentorship.ml.models.kaggle.storesales.linear import PipelineLinearV1
from mentorship.ml.models.kaggle.storesales.ridge import PipelineRidgeV1
from mentorship.ml.models.kaggle.storesales.lasso import PipelineLassoV1
from mentorship.ml.models.kaggle.storesales.elasticnet import PipelineElasticNetV1
from mentorship.features.kaggle.storesales.etl import ETLTransformer
from mentorship.ml.cv.split import DateTimeSeriesSplit
from mentorship.ml.cv.util import print_cv_test_scores


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence FutureWarning and pandas' SettingWithCopyWarning for the whole session.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)



In [3]:
# scikit-learn scorer names for the cross-validation metrics.
# NOTE(review): not referenced by any cell visible in this notebook; the CV
# loops below compute their metrics by hand. Confirm whether it is still needed.
CV_METRICS = [
    'neg_mean_squared_log_error',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_error',
    # 'neg_mean_absolute_percentage_error',
    'r2'
]

In [4]:
# Directory (relative to the working directory) holding the competition CSV files.
DATA_ROOT = Path('data') / 'kaggle' / 'store-sales-time-series-forecasting'

In [5]:
# Read the raw training set and preview its first rows.
train = pd.read_csv(DATA_ROOT.joinpath('train.csv'))
train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [6]:
# Sizes used for the CV folds and the training window (see the TimeSeriesSplit cells).
N_HORIZONS = 16
DAYS_IN_YEAR = 365

# Number of distinct stores / families, and their product.
# NOTE(review): N_TIME_SERIES assumes every store carries every family — confirm.
N_STORES, N_FAMILIES = (train[column].nunique() for column in ('store_nbr', 'family'))
N_TIME_SERIES = N_STORES * N_FAMILIES

 # 1. Linear Regression, features: 'store_nbr', 'dcoilwtico', 'lags' for target (1, 2, 4, 6, 7, 14 days shift) (recursive strategy), 'rolling' features (SelectFromModel)

In [17]:
# Run the project's ETL over the training data (without 'onpromotion') and keep
# the first element of what the transformer returns.
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(train.copy().drop(columns='onpromotion'))[0]

In [18]:
# Offsets (in rows per store/family series) of the 'lag_<n>' target features.
days_to_shift = [1, 2, 4, 6, 7, 14]

In [19]:
# Push the test set through the same ETL and remember which dates belong to it.
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(
    pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
)[0]
test_dates = test_data['date'].unique()

# Stack train and test rows on a fresh 0..n-1 index and replace every NaN with 0
# (presumably this includes the test rows' missing 'sales' — verify).
X_all_rows = pd.concat([X, test_data]).reset_index(drop=True).fillna(0)

In [20]:
# Window lengths (in rows per store/family series) and the aggregations computed
# over each window for the 'sales_rolling_<n>d_<agg>' features.
rolling_periods = [365, 183, 92, 31, 16, 10, 7, 5, 3]
aggregate_functions = ['median', 'mean', 'sum', 'max', 'min', 'std']

In [38]:
# The rolling statistics are taken over 'lag_1' (sales shifted by one row within
# each store/family series), so a window never contains the row it describes.
series_keys = ['store_nbr', 'family']
X_all_rows['lag_1'] = X_all_rows.groupby(series_keys)['sales'].shift(1)
for window in rolling_periods:
    window_columns = [f'sales_rolling_{window}d_{name}' for name in aggregate_functions]
    X_all_rows[window_columns] = (
        X_all_rows
        .groupby(series_keys)['lag_1']
        .rolling(window)
        .agg(aggregate_functions)
        # Drop the two group levels so the result aligns with X_all_rows' index.
        .reset_index(level=[0, 1], drop=True)
    )

In [40]:
# Inspect the combined frame after 'lag_1' and the rolling features were added.
X_all_rows

Unnamed: 0,date,store_nbr,family,sales,dcoilwtico,lag_1,sales_rolling_3d_median,sales_rolling_3d_mean,sales_rolling_3d_sum,sales_rolling_3d_max,...,sales_rolling_183d_sum,sales_rolling_183d_max,sales_rolling_183d_min,sales_rolling_183d_std,sales_rolling_365d_median,sales_rolling_365d_mean,sales_rolling_365d_sum,sales_rolling_365d_max,sales_rolling_365d_min,sales_rolling_365d_std
0,2013-01-01,1,automotive,0.0,0.00,,,,,,...,,,,,,,,,,
1,2013-01-01,1,baby care,0.0,0.00,,,,,,...,,,,,,,,,,
2,2013-01-01,1,beauty,0.0,0.00,,,,,,...,,,,,,,,,,
3,2013-01-01,1,beverages,0.0,0.00,,,,,,...,,,,,,,,,,
4,2013-01-01,1,books,0.0,0.00,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3029395,2017-08-31,9,poultry,0.0,47.26,0.0,0.0,5.343281e-12,1.602984e-11,0.0,...,74573.600930,857.480,0.0,179.301057,423.294,443.502353,161878.359010,2198.853,0.0,195.506387
3029396,2017-08-31,9,prepared foods,0.0,47.26,0.0,0.0,5.286438e-12,1.585931e-11,0.0,...,20754.545002,311.147,0.0,49.555923,113.978,115.765726,42254.489993,311.147,0.0,43.048422
3029397,2017-08-31,9,produce,0.0,47.26,0.0,0.0,6.063298e-12,1.818989e-11,0.0,...,280142.216800,2861.572,0.0,649.904719,1524.197,1622.782758,592315.706600,4059.907,0.0,623.674176
3029398,2017-08-31,9,school and office supplies,0.0,47.26,0.0,0.0,6.063298e-12,1.818989e-11,0.0,...,2947.000000,203.000,0.0,42.542740,2.000,12.890411,4705.000000,326.000,0.0,39.463839


In [41]:
# The first offset (lag_1) was already created together with the rolling
# features, so only the remaining offsets are added here.
sales_by_series = X_all_rows.groupby(['store_nbr', 'family'])['sales']
for lag in days_to_shift[1:]:
    X_all_rows[f'lag_{lag}'] = sales_by_series.shift(lag)

In [42]:
# Columns that get min-max scaled: the oil price and every rolling statistic.
features_to_scale = [
    column
    for column in X_all_rows.columns
    if column == 'dcoilwtico' or 'rolling' in column
]

In [43]:
# Target: the 'sales' column of X, which at this point is still the ETL output
# of the training data (X is rebuilt from X_all_rows in the next cell).
y = X['sales'].copy()

In [44]:
# Separate the rows on test dates from the rest, drop the target, and one-hot
# encode the store number (first level dropped) in each part.
in_test_period = X_all_rows['date'].isin(test_dates)

test_data = pd.get_dummies(
    X_all_rows[in_test_period].reset_index(drop=True).drop(columns=['sales']),
    columns=['store_nbr'],
    drop_first=True,
)
X = pd.get_dummies(
    X_all_rows[~in_test_period].drop(columns=['sales']),
    columns=['store_nbr'],
    drop_first=True,
)

In [45]:
# Feature selection is fitted on the last DAYS_IN_YEAR days of training data only.
last_train_date = pd.to_datetime(X['date'].unique()[-1])
first_day_of_last_year = last_train_date - timedelta(days=DAYS_IN_YEAR)
cutoff_date = str(first_day_of_last_year).split(' ')[0]
X_train = X[X['date'] >= cutoff_date]
indexer = X_train.index
y_train = y.loc[indexer]

# One linear model per product family; features are ranked by |coefficient|.
best_features = {}
for current_family in X_train['family'].unique():
    X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['family'])
    current_family_indices_train = X_train_current_family.index
    y_train_current_family = y_train.loc[current_family_indices_train]

    scaler = MinMaxScaler()
    X_train_current_family[features_to_scale] = scaler.fit_transform(X_train_current_family[features_to_scale])

    design_matrix = X_train_current_family.drop(columns=['date'])
    model = LinearRegression()
    model.fit(design_matrix, y_train_current_family)

    importance = np.abs(model.coef_)
    # NOTE(review): the threshold sits 0.01 above the second-largest |coef|, so at
    # most one feature per family passes (none when the top two are within 0.01
    # of each other) — confirm this is intended.
    threshold = np.sort(importance)[-2] + 0.01
    # Reuse the model fitted above instead of refitting an identical clone.
    sfm = SelectFromModel(model, threshold=threshold, prefit=True)
    best_features[current_family] = design_matrix.columns[sfm.get_support()]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
automotive
baby care
beauty
beverages
books
bread/bakery
celebration
cleaning
dairy
deli
eggs
frozen foods
grocery i
grocery ii
hardware
home and kitchen i
home and kitchen ii
home appliances
home care
ladieswear
lawn and garden
lingerie
liquor,wine,beer
magazines
meats
personal care
pet supplies
players and electronics
poultry
prepared foods
produce
school and office supplies
seafood


In [None]:
# 4 folds: each test fold spans N_HORIZONS * N_TIME_SERIES rows and each training
# window is capped at DAYS_IN_YEAR * N_TIME_SERIES rows.
# NOTE(review): folds line up with whole days only if rows are sorted by date and
# every day holds exactly N_TIME_SERIES rows — confirm.
tscv = TimeSeriesSplit(gap=0, max_train_size=DAYS_IN_YEAR * N_TIME_SERIES, n_splits=4, test_size=N_HORIZONS * N_TIME_SERIES)

In [None]:
# For each lag L (largest first), N_HORIZONS - L is the exclusive index of the
# last test day whose prediction can still be written forward as 'lag_L' inside
# an N_HORIZONS-day window. The recursive loop uses these as segment boundaries.
ends = [N_HORIZONS - lag for lag in reversed(days_to_shift)]

In [None]:
# Cross-validation with one model per product family. Inside each test fold the
# lag features are filled recursively: the prediction for day d is written into
# 'lag_k' of day d + k before later days are predicted.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Explicit copy with a float column: the predictions are written into it below.
    X_test = X.iloc[test_indices].copy()
    y_test = y.iloc[test_indices]

    X_test['pred'] = 0.0
    for current_family in X['family'].unique():
        print(current_family)
        train_mask = X_train['family'] == current_family
        current_family_indices_train = X_train[train_mask].index
        columns_to_drop = [x for x in features_to_scale if x not in best_features[current_family]]
        columns_to_drop.append('family')
        X_train_current_family = X_train[train_mask].drop(columns=columns_to_drop)
        y_train_current_family = y_train.loc[current_family_indices_train]

        scaler = MinMaxScaler()
        current_features_to_scale = [x for x in features_to_scale if x in best_features[current_family]]
        # MinMaxScaler rejects an empty column selection, which happens when no
        # scalable feature was selected for this family.
        if current_features_to_scale:
            X_train_current_family[current_features_to_scale] = scaler.fit_transform(X_train_current_family[current_features_to_scale])

        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

        test_mask = X_test['family'] == current_family
        current_family_indices_test = X_test[test_mask].index
        X_test_current_family = X_test[test_mask].drop(columns=columns_to_drop)
        if current_features_to_scale:
            X_test_current_family[current_features_to_scale] = scaler.transform(X_test_current_family[current_features_to_scale])

        # The dates do not change during the recursion, so list them once.
        test_days = X_test_current_family['date'].unique()
        days_to_shift_copy = days_to_shift.copy()
        start = 0
        current_day_index = 0
        for end in ends:
            for current_day in test_days[start:end]:
                # Resolve the target days first so an out-of-window lag fails before predicting.
                current_day_plus_x = {
                    current_lag: test_days[current_day_index + current_lag]
                    for current_lag in days_to_shift_copy
                }

                X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date', 'pred'])
                predictions = model.predict(X_test_for_current_day)

                for current_lag in days_to_shift_copy:
                    target_rows = X_test_current_family['date'] == current_day_plus_x[current_lag]
                    X_test_current_family.loc[target_rows, 'lag_{}'.format(current_lag)] = predictions

                current_day_index += 1

            # Past this boundary the largest remaining lag would point outside the window.
            days_to_shift_copy = days_to_shift_copy[:-1]
            start = end

        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        X_test.loc[current_family_indices_test, 'pred'] = y_pred_current_family

    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])

    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    # np.sqrt(MSE) instead of the deprecated `squared=False` argument.
    scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# RMSLE of each cross-validation fold, in fold order.
fig, ax = plt.subplots()
ax.plot(scores['RMSLE'], marker='o')
ax.set(title='RMSLE per CV fold', xlabel='Fold index', ylabel='RMSLE');

# 2. Linear Regression, features: 'store_nbr', 'dcoilwtico', 'lags' for target (1, 2, 4, 6, 7, 14 days shift) (recursive strategy), 'rolling' features (SelectKBest)

In [None]:
# Rebuild X from the raw training data: ETL without 'onpromotion', keeping the
# first element of what the transformer returns.
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(train.copy().drop(columns='onpromotion'))[0]

In [None]:
# Offsets (in rows per store/family series) of the 'lag_<n>' target features.
days_to_shift = [1, 2, 4, 6, 7, 14]

In [None]:
# ETL the test set and record its dates.
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(
    pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
)[0]
test_dates = test_data['date'].unique()

# Train rows followed by test rows on a fresh 0..n-1 index, with NaNs set to 0.
X_all_rows = pd.concat([X, test_data]).reset_index(drop=True).fillna(0)

In [None]:
# Window lengths (in rows per store/family series) and the aggregations computed
# over each window for the 'sales_rolling_<n>d_<agg>' features.
rolling_periods = [365, 183, 92, 31, 16, 10, 7, 5, 3]
aggregate_functions = ['median', 'mean', 'sum', 'max', 'min', 'std']

In [None]:
# Rolling statistics over 'lag_1' (sales shifted by one row per store/family
# series), so a window never contains the row it describes.
series_keys = ['store_nbr', 'family']
X_all_rows['lag_1'] = X_all_rows.groupby(series_keys)['sales'].shift(1)
for window in rolling_periods:
    window_columns = [f'sales_rolling_{window}d_{name}' for name in aggregate_functions]
    X_all_rows[window_columns] = (
        X_all_rows
        .groupby(series_keys)['lag_1']
        .rolling(window)
        .agg(aggregate_functions)
        # Drop the two group levels so the result aligns with X_all_rows' index.
        .reset_index(level=[0, 1], drop=True)
    )

In [None]:
# Add the lag columns for every offset after the first (lag_1 already exists).
sales_by_series = X_all_rows.groupby(['store_nbr', 'family'])['sales']
for lag in days_to_shift[1:]:
    X_all_rows[f'lag_{lag}'] = sales_by_series.shift(lag)

In [None]:
# Columns that get min-max scaled: the oil price and every rolling statistic.
features_to_scale = [
    column
    for column in X_all_rows.columns
    if column == 'dcoilwtico' or 'rolling' in column
]

In [None]:
# Target: the 'sales' column of X, which at this point is still the ETL output
# of the training data (X is rebuilt from X_all_rows in the next cell).
y = X['sales'].copy()

In [None]:
# Rows on test dates vs. the rest; the target is dropped and the store number is
# one-hot encoded (first level dropped) in both parts.
in_test_period = X_all_rows['date'].isin(test_dates)

test_data = pd.get_dummies(
    X_all_rows[in_test_period].reset_index(drop=True).drop(columns=['sales']),
    columns=['store_nbr'],
    drop_first=True,
)
X = pd.get_dummies(
    X_all_rows[~in_test_period].drop(columns=['sales']),
    columns=['store_nbr'],
    drop_first=True,
)

In [None]:
# Recompute the one-year cutoff from this section's X rather than relying on the
# value an earlier section left in the kernel.
first_day_of_last_year = pd.to_datetime(X['date'].unique()[-1]) - timedelta(days=DAYS_IN_YEAR)
X_train = X[X['date'] >= str(first_day_of_last_year).split(' ')[0]]
y_train = y.loc[X_train.index]

In [None]:
# mutual_info_regression is randomised (it adds small noise to continuous
# variables), so its seed is pinned to make the selected features reproducible.
MI_RANDOM_STATE = 0


def _seeded_mutual_info(features, target):
    """Score `features` against `target` with a fixed-seed mutual_info_regression."""
    return mutual_info_regression(features, target, random_state=MI_RANDOM_STATE)


# Keep the 10 highest-scoring features for each product family.
best_features = {}
for current_family in X_train['family'].unique():
    print(current_family)
    X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['family'])
    current_family_indices_train = X_train_current_family.index
    y_train_current_family = y_train.loc[current_family_indices_train]
    scaler = MinMaxScaler()
    X_train_current_family[features_to_scale] = scaler.fit_transform(X_train_current_family[features_to_scale])

    selector = SelectKBest(_seeded_mutual_info, k=10)
    selector.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)
    best_features[current_family] = selector.get_feature_names_out()

In [None]:
# 4 folds: each test fold spans N_HORIZONS * N_TIME_SERIES rows and each training
# window is capped at DAYS_IN_YEAR * N_TIME_SERIES rows.
# NOTE(review): folds line up with whole days only if rows are sorted by date and
# every day holds exactly N_TIME_SERIES rows — confirm.
tscv = TimeSeriesSplit(gap=0, max_train_size=DAYS_IN_YEAR * N_TIME_SERIES, n_splits=4, test_size=N_HORIZONS * N_TIME_SERIES)

In [None]:
# Cross-validation with one model per product family. Inside each test fold the
# lag features are filled recursively: the prediction for day d is written into
# 'lag_k' of day d + k before later days are predicted.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

# Segment boundaries of the recursion, computed here so this cell does not depend
# on a value left in the kernel by another section. For each lag (largest first)
# this is the exclusive index of the last test day that can still feed that lag.
ends = [N_HORIZONS - lag for lag in reversed(days_to_shift)]

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Explicit copy with a float column: the predictions are written into it below.
    X_test = X.iloc[test_indices].copy()
    y_test = y.iloc[test_indices]

    X_test['pred'] = 0.0
    for current_family in X['family'].unique():
        train_mask = X_train['family'] == current_family
        current_family_indices_train = X_train[train_mask].index
        columns_to_drop = [x for x in features_to_scale if x not in best_features[current_family]]
        columns_to_drop.append('family')
        X_train_current_family = X_train[train_mask].drop(columns=columns_to_drop)
        y_train_current_family = y_train.loc[current_family_indices_train]

        scaler = MinMaxScaler()
        current_features_to_scale = [x for x in features_to_scale if x in best_features[current_family]]
        # MinMaxScaler rejects an empty column selection, which happens when no
        # scalable feature was selected for this family.
        if current_features_to_scale:
            X_train_current_family[current_features_to_scale] = scaler.fit_transform(X_train_current_family[current_features_to_scale])

        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

        test_mask = X_test['family'] == current_family
        current_family_indices_test = X_test[test_mask].index
        X_test_current_family = X_test[test_mask].drop(columns=columns_to_drop)
        if current_features_to_scale:
            X_test_current_family[current_features_to_scale] = scaler.transform(X_test_current_family[current_features_to_scale])

        # The dates do not change during the recursion, so list them once.
        test_days = X_test_current_family['date'].unique()
        days_to_shift_copy = days_to_shift.copy()
        start = 0
        current_day_index = 0
        for end in ends:
            for current_day in test_days[start:end]:
                # Resolve the target days first so an out-of-window lag fails before predicting.
                current_day_plus_x = {
                    current_lag: test_days[current_day_index + current_lag]
                    for current_lag in days_to_shift_copy
                }

                X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date', 'pred'])
                predictions = model.predict(X_test_for_current_day)

                for current_lag in days_to_shift_copy:
                    target_rows = X_test_current_family['date'] == current_day_plus_x[current_lag]
                    X_test_current_family.loc[target_rows, 'lag_{}'.format(current_lag)] = predictions

                current_day_index += 1

            # Past this boundary the largest remaining lag would point outside the window.
            days_to_shift_copy = days_to_shift_copy[:-1]
            start = end

        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        X_test.loc[current_family_indices_test, 'pred'] = y_pred_current_family

    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])

    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    # np.sqrt(MSE) instead of the deprecated `squared=False` argument.
    scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

# 3. Linear Regression, features: 'store_nbr', 'dcoilwtico', 'lags' for target (1, 2, 4, 6, 7, 14 days shift) (recursive strategy), 'rolling' features (L1-based selection)

In [None]:
# Rebuild X from the raw training data: ETL without 'onpromotion', keeping the
# first element of what the transformer returns.
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(train.copy().drop(columns='onpromotion'))[0]

In [None]:
# Offsets (in rows per store/family series) of the 'lag_<n>' target features.
days_to_shift = [1, 2, 4, 6, 7, 14]

In [None]:
# ETL the test set and record its dates.
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(
    pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
)[0]
test_dates = test_data['date'].unique()

# Train rows followed by test rows on a fresh 0..n-1 index, with NaNs set to 0.
X_all_rows = pd.concat([X, test_data]).reset_index(drop=True).fillna(0)

In [None]:
# Window lengths (in rows per store/family series) and the aggregations computed
# over each window for the 'sales_rolling_<n>d_<agg>' features.
rolling_periods = [365, 183, 92, 31, 16, 10, 7, 5, 3]
aggregate_functions = ['median', 'mean', 'sum', 'max', 'min', 'std']

In [None]:
# Rolling statistics over 'lag_1' (sales shifted by one row per store/family
# series), so a window never contains the row it describes.
series_keys = ['store_nbr', 'family']
X_all_rows['lag_1'] = X_all_rows.groupby(series_keys)['sales'].shift(1)
for window in rolling_periods:
    window_columns = [f'sales_rolling_{window}d_{name}' for name in aggregate_functions]
    X_all_rows[window_columns] = (
        X_all_rows
        .groupby(series_keys)['lag_1']
        .rolling(window)
        .agg(aggregate_functions)
        # Drop the two group levels so the result aligns with X_all_rows' index.
        .reset_index(level=[0, 1], drop=True)
    )

In [None]:
# Add the lag columns for every offset after the first (lag_1 already exists).
sales_by_series = X_all_rows.groupby(['store_nbr', 'family'])['sales']
for lag in days_to_shift[1:]:
    X_all_rows[f'lag_{lag}'] = sales_by_series.shift(lag)

In [None]:
# Columns that get min-max scaled: the oil price and every rolling statistic.
features_to_scale = [
    column
    for column in X_all_rows.columns
    if column == 'dcoilwtico' or 'rolling' in column
]

In [None]:
# Target: the 'sales' column of X, which at this point is still the ETL output
# of the training data (X is rebuilt from X_all_rows in the next cell).
y = X['sales'].copy()

In [None]:
# Rows on test dates vs. the rest; the target is dropped and the store number is
# one-hot encoded (first level dropped) in both parts.
in_test_period = X_all_rows['date'].isin(test_dates)

test_data = pd.get_dummies(
    X_all_rows[in_test_period].reset_index(drop=True).drop(columns=['sales']),
    columns=['store_nbr'],
    drop_first=True,
)
X = pd.get_dummies(
    X_all_rows[~in_test_period].drop(columns=['sales']),
    columns=['store_nbr'],
    drop_first=True,
)

In [None]:
# Remove every feature row that contains a NaN and drop the same index labels
# from the target so X and y keep matching labels.
rows_with_nan = X.isna().any(axis=1)
nan_ind = X.loc[rows_with_nan].index
X = X.dropna()
y = y.drop(index=nan_ind)

In [None]:
# Training subset for feature selection: rows dated within 365 days of the last
# date in X.
last_train_date = pd.to_datetime(X['date'].unique()[-1])
first_day_of_last_year = last_train_date - timedelta(days=365)
cutoff_date = str(first_day_of_last_year).split(' ')[0]
X_train = X[X['date'] >= cutoff_date]
indexer = X_train.index
y_train = y.loc[indexer]

In [None]:
# L1-based selection: per family, fit a Lasso on the scaled features and keep the
# columns SelectFromModel retains from the fitted model.
best_features = {}
for current_family in X_train['family'].unique():
    belongs_to_family = X_train['family'] == current_family
    X_train_current_family = X_train.loc[belongs_to_family].drop(columns=['family'])
    current_family_indices_train = X_train_current_family.index
    y_train_current_family = y_train.loc[current_family_indices_train]

    scaler = MinMaxScaler()
    X_train_current_family[features_to_scale] = scaler.fit_transform(X_train_current_family[features_to_scale])

    candidate_features = X_train_current_family.drop(columns=['date'])
    lss = Lasso(alpha=0.15)
    lss.fit(candidate_features, y_train_current_family)
    model_select = SelectFromModel(lss, prefit=True)
    best_features[current_family] = candidate_features.columns[model_select.get_support()].tolist()

In [None]:
# 4 folds: each test fold spans N_HORIZONS * N_TIME_SERIES rows and each training
# window is capped at DAYS_IN_YEAR * N_TIME_SERIES rows.
# NOTE(review): folds line up with whole days only if rows are sorted by date and
# every day holds exactly N_TIME_SERIES rows — confirm.
tscv = TimeSeriesSplit(gap=0, max_train_size=DAYS_IN_YEAR * N_TIME_SERIES, n_splits=4, test_size=N_HORIZONS * N_TIME_SERIES)

In [None]:
# Cross-validation with one model per product family, trained only on that
# family's selected features. Inside each test fold the selected lag features are
# filled recursively: the prediction for day d is written into 'lag_k' of day
# d + k before later days are predicted.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Explicit copy with a float column: the predictions are written into it below.
    X_test = X.iloc[test_indices].copy()
    y_test = y.iloc[test_indices]

    X_test['pred'] = 0.0
    for current_family in X['family'].unique():
        selected = best_features[current_family]

        X_train_current_family = X_train[X_train['family'] == current_family].drop(columns=['family'])
        current_family_indices_train = X_train_current_family.index
        y_train_current_family = y_train.loc[current_family_indices_train]

        # Keep only the selected columns (this also removes 'date').
        X_train_current_family = X_train_current_family.drop(columns=[x for x in X_train_current_family.columns if x not in selected])

        scaler = MinMaxScaler()
        current_features_to_scale = [x for x in features_to_scale if x in selected]
        # MinMaxScaler rejects an empty column selection, which happens when no
        # scalable feature was selected for this family.
        if current_features_to_scale:
            X_train_current_family[current_features_to_scale] = scaler.fit_transform(X_train_current_family[current_features_to_scale])

        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family, y_train_current_family)

        test_mask = X_test['family'] == current_family
        current_family_indices_test = X_test[test_mask].index
        X_test_current_family = X_test[test_mask]
        # 'date' and 'pred' are kept for bookkeeping and dropped before predicting.
        X_test_current_family = X_test_current_family.drop(columns=[x for x in X_test_current_family.columns
                                                                    if x not in selected and x != 'date' and x != 'pred'])
        if current_features_to_scale:
            X_test_current_family[current_features_to_scale] = scaler.transform(X_test_current_family[current_features_to_scale])

        # Lag offsets selected for this family; the segment logic below needs them
        # in ascending order, so sort explicitly.
        current_days_to_shift = sorted(int(x[4:]) for x in selected if x[:3] == 'lag')
        ends = [N_HORIZONS - lag for lag in reversed(current_days_to_shift)]

        # The dates do not change during the recursion, so list them once.
        test_days = X_test_current_family['date'].unique()
        start = 0
        current_day_index = 0
        for end in ends:
            for current_day in test_days[start:end]:
                # Resolve the target days first so an out-of-window lag fails before predicting.
                current_day_plus_x = {
                    current_lag: test_days[current_day_index + current_lag]
                    for current_lag in current_days_to_shift
                }

                X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date', 'pred'])
                predictions = model.predict(X_test_for_current_day)

                for current_lag in current_days_to_shift:
                    target_rows = X_test_current_family['date'] == current_day_plus_x[current_lag]
                    X_test_current_family.loc[target_rows, 'lag_{}'.format(current_lag)] = predictions

                current_day_index += 1

            # Past this boundary the largest remaining lag would point outside the window.
            current_days_to_shift = current_days_to_shift[:-1]
            start = end

        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        X_test.loc[current_family_indices_test, 'pred'] = y_pred_current_family

    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])

    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    # np.sqrt(MSE) instead of the deprecated `squared=False` argument.
    scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# RMSLE of each cross-validation fold, in fold order.
fig, ax = plt.subplots()
ax.plot(scores['RMSLE'], marker='o')
ax.set(title='RMSLE per CV fold', xlabel='Fold index', ylabel='RMSLE');

In [None]:
# Template whose 'sales' column is filled with the final predictions below.
submission = pd.read_csv(DATA_ROOT.joinpath('sample_submission.csv'))

In [None]:
# Final training data: the rows on or after the one-year cutoff computed earlier.
cutoff_date = str(first_day_of_last_year).split(' ')[0]
X_train = X[X['date'] >= cutoff_date]
indexer = X_train.index
y_train = y.loc[indexer]

In [None]:
# Fit one model per product family on its selected features and forecast the test
# period recursively: the prediction for day d is written into 'lag_k' of day
# d + k before later days are predicted.
for current_family in X_train['family'].unique():
    selected = best_features[current_family]

    X_train_current_family = X_train[X_train['family'] == current_family]
    current_family_indices_train = X_train_current_family.index
    # Keep only the selected columns (this also removes 'family' and 'date').
    X_train_current_family = X_train_current_family.drop(columns=[x for x in X_train_current_family.columns
                                                                  if x not in selected])
    y_train_current_family = y_train.loc[current_family_indices_train]

    scaler = MinMaxScaler()
    current_features_to_scale = [x for x in features_to_scale if x in selected]
    # MinMaxScaler rejects an empty column selection, which happens when no
    # scalable feature was selected for this family.
    if current_features_to_scale:
        X_train_current_family[current_features_to_scale] = scaler.fit_transform(X_train_current_family[current_features_to_scale])

    model = PositiveRegressor(LinearRegression())
    model.fit(X_train_current_family, y_train_current_family)

    test_family_mask = test_data['family'] == current_family
    X_test_current_family = test_data[test_family_mask]
    # 'date' is kept for bookkeeping and dropped before predicting.
    X_test_current_family = X_test_current_family.drop(columns=[x for x in X_test_current_family.columns
                                                                if x not in selected and x != 'date'])
    if current_features_to_scale:
        X_test_current_family[current_features_to_scale] = scaler.transform(X_test_current_family[current_features_to_scale])

    # Lag offsets selected for this family; the segment logic below needs them in
    # ascending order, so sort explicitly.
    current_days_to_shift = sorted(int(x[4:]) for x in selected if x[:3] == 'lag')
    ends = [N_HORIZONS - lag for lag in reversed(current_days_to_shift)]

    # The dates do not change during the recursion, so list them once.
    test_days = X_test_current_family['date'].unique()
    start = 0
    current_day_index = 0
    for end in ends:
        for current_day in test_days[start:end]:
            # Resolve the target days first so an out-of-window lag fails before predicting.
            current_day_plus_x = {
                current_lag: test_days[current_day_index + current_lag]
                for current_lag in current_days_to_shift
            }

            X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == current_day].drop(columns=['date'])
            predictions = model.predict(X_test_for_current_day)

            for current_lag in current_days_to_shift:
                target_rows = X_test_current_family['date'] == current_day_plus_x[current_lag]
                X_test_current_family.loc[target_rows, 'lag_{}'.format(current_lag)] = predictions

            current_day_index += 1

        # Past this boundary the largest remaining lag would point outside the window.
        current_days_to_shift = current_days_to_shift[:-1]
        start = end

    y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date']))

    # NOTE(review): this relies on test_data's 0..n-1 index matching the row order
    # of the sample submission — confirm against the ETL output ordering.
    test_indices = test_data[test_family_mask].index
    submission.loc[test_indices, 'sales'] = y_pred_current_family

In [None]:
# Write the submission next to the input data, reusing DATA_ROOT instead of
# repeating the directory as a hard-coded string.
submission.to_csv(DATA_ROOT / 'linreg_best_features_l1_fs.csv', index=False)