# Experiments with linear models for each family with different strategies for lags

In [None]:
import os

# Switch the working directory to the path stored in the PROJECT_ROOT
# environment variable; the relative data paths used below are resolved
# against the working directory.
project_root = os.environ['PROJECT_ROOT']
os.chdir(project_root)

In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import sklearn
import pdpipe as pdp
import statsmodels.api as sm
from pandas.core.common import SettingWithCopyWarning
from sklearn.base import BaseEstimator, RegressorMixin, MetaEstimatorMixin, TransformerMixin, clone
from datetime import timedelta
from statistics import median, mean, stdev
from pdpipe import df
from pathlib import Path
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, cross_val_score, TimeSeriesSplit, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from mentorship.ml.models.reg import PositiveRegressor
from mentorship.ml.models.common import SplitPipeline
from mentorship.ml.models.kaggle.storesales.linear import PipelineLinearV1
from mentorship.ml.models.kaggle.storesales.ridge import PipelineRidgeV1
from mentorship.ml.models.kaggle.storesales.lasso import PipelineLassoV1
from mentorship.ml.models.kaggle.storesales.elasticnet import PipelineElasticNetV1
from mentorship.features.kaggle.storesales.etl import ETLTransformer
from mentorship.ml.cv.split import DateTimeSeriesSplit
from mentorship.ml.cv.util import print_cv_test_scores


%matplotlib inline
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
# Names of the scoring metrics kept for cross-validation runs.
# 'neg_mean_absolute_percentage_error' is currently disabled.
CV_METRICS = [
    "neg_mean_squared_log_error",
    "neg_root_mean_squared_error",
    "neg_mean_absolute_error",
    "r2",
]

In [None]:
# Directory (relative to the working directory) that holds the competition CSV files.
DATA_ROOT = Path('data') / 'kaggle' / 'store-sales-time-series-forecasting'

In [None]:
# Load the raw training table and preview its first rows.
train = pd.read_csv(DATA_ROOT.joinpath('train.csv'))
train.head()

# 1. Linear Regression with 'store_nbr', 'dcoilwtico' and 'lag' feature (1 day) (recursive strategy)

In [None]:
# Run the project ETL on the training table without the 'onpromotion' column.
# `drop` returns a new frame, so `train` itself is left untouched.
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train.drop(columns='onpromotion')
# Only the first element of the transformer output is used here.
X = train_transformer.transform(X)[0]

In [None]:
# Previous-row sales within each (store, family) series become the 'lag_1' feature.
X['lag_1'] = X.groupby(['store_nbr', 'family'])['sales'].shift(1)

# Rows without an oil price are removed from both the features and the target,
# after which both are renumbered from zero so they stay aligned.
missing_oil = X['dcoilwtico'].isna()
inds = X.index[missing_oil]
y = X['sales'].drop(labels=inds).reset_index(drop=True)
X = X.loc[~missing_oil].drop(columns='sales').reset_index(drop=True)

In [None]:
# One-hot encode the store number (first level dropped).
X = pd.get_dummies(X, columns=['store_nbr'], drop_first=True)
# Remove 'lag_1' and append it again so that it ends up as the last column.
lag_1 = X.pop('lag_1')
X['lag_1'] = lag_1
X.head()

In [None]:
# Fold sizes are given in rows. 33 * 54 is presumably the number of rows per
# calendar day (product families x stores) -- TODO confirm against the data.
rows_per_day = 33 * 54
tscv = TimeSeriesSplit(
    n_splits=4,
    test_size=16 * rows_per_day,
    max_train_size=365 * rows_per_day,
    gap=0,
)

In [None]:
# Cross-validate the per-family linear model with a one-day lag.
# For every fold a separate model is fitted per product family and the test
# window is forecast recursively: the prediction made for one day is written
# into 'lag_1' of the following day before that day is predicted.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}
families = X['family'].unique()

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Explicit copy: the fold gets a scratch 'pred' column that must never be
    # written through a slice of X.
    X_test = X.iloc[test_indices].copy()
    y_test = y.iloc[test_indices]

    # Float placeholder so the float predictions match the column dtype.
    X_test['pred'] = 0.0
    for current_family in families:
        train_rows = X_train['family'] == current_family
        X_train_current_family = X_train.loc[train_rows].drop(columns=['family'])
        y_train_current_family = y_train.loc[train_rows]
        # The scaler is fitted on the training rows of this family only.
        scaler = MinMaxScaler()
        X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])
        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

        test_rows = X_test['family'] == current_family
        # 'pred' is bookkeeping, not a feature, so it is dropped right away.
        X_test_current_family = X_test.loc[test_rows].drop(columns=['family', 'pred'])
        X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])

        test_days = X_test_current_family['date'].unique()
        for previous_day, current_day in zip(test_days[:-1], test_days[1:]):
            previous_day_features = X_test_current_family[
                X_test_current_family['date'] == previous_day
            ].drop(columns=['date'])
            # Negative sales are clipped to zero; np.asarray makes the
            # assignment below positional rather than index-aligned.
            predictions = np.maximum(np.asarray(model.predict(previous_day_features)), 0)
            X_test_current_family.loc[X_test_current_family['date'] == current_day, 'lag_1'] = predictions

        # Final pass over the whole window with the recursively filled lags.
        y_pred_current_family = np.maximum(
            np.asarray(model.predict(X_test_current_family.drop(columns=['date']))), 0
        )
        X_test.loc[test_rows, 'pred'] = y_pred_current_family

    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])
    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    # np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn
    # releases no longer accept.
    scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# RMSLE per forecast-horizon day (1..16) for the recursive one-day-lag model.
#
# The forecast for a fold is produced once per family over the whole test
# window, feeding each day's prediction into 'lag_1' of the next day, and is
# only afterwards scored day by day. Scoring single-day slices separately
# would leave the ground-truth 'lag_1' in place for every day and would refit
# identical models for each day.
keys = np.arange(1, 17)
cv_scores = {key: [] for key in keys}
families = X['family'].unique()

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Explicit copy: the fold gets a scratch 'pred' column.
    X_test = X.iloc[test_indices].copy()
    y_test = y.iloc[test_indices]

    X_test['pred'] = 0.0
    for current_family in families:
        train_rows = X_train['family'] == current_family
        X_train_current_family = X_train.loc[train_rows].drop(columns=['family'])
        y_train_current_family = y_train.loc[train_rows]
        scaler = MinMaxScaler()
        X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])

        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

        test_rows = X_test['family'] == current_family
        X_test_current_family = X_test.loc[test_rows].drop(columns=['family', 'pred'])
        X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])

        test_days = X_test_current_family['date'].unique()
        for previous_day, next_day in zip(test_days[:-1], test_days[1:]):
            previous_day_features = X_test_current_family[
                X_test_current_family['date'] == previous_day
            ].drop(columns=['date'])
            # Clip negatives to zero; np.asarray keeps the assignment positional.
            predictions = np.maximum(np.asarray(model.predict(previous_day_features)), 0)
            X_test_current_family.loc[X_test_current_family['date'] == next_day, 'lag_1'] = predictions

        y_pred_current_family = np.maximum(
            np.asarray(model.predict(X_test_current_family.drop(columns=['date']))), 0
        )
        X_test.loc[test_rows, 'pred'] = y_pred_current_family

    # Score every day of the window separately; position i is horizon day i + 1.
    for i, current_day in enumerate(X_test['date'].unique()):
        day_rows = X_test['date'] == current_day
        day_rmsle = np.sqrt(mean_squared_log_error(y_test.loc[day_rows], X_test.loc[day_rows, 'pred']))
        cv_scores[i + 1].append(day_rmsle)

# Report the per-day scores collected in this cell.
for day_number, day_scores in cv_scores.items():
    print(f'Day {day_number} RMSLE: {mean(day_scores):.3f} ± {stdev(day_scores):.3f}')

In [None]:
# Plot the mean RMSLE across folds for every forecast-horizon day.
# Dedicated names are used so that the feature frame `X` and the target `y`
# are not replaced by plain lists; later cells still index `X['date']`.
plt.figure(figsize=(10, 10))
horizon_days = list(cv_scores.keys())
mean_rmsle_per_day = [mean(cv_scores[day]) for day in horizon_days]
plt.plot(horizon_days, mean_rmsle_per_day)

In [None]:
# Keep only the rows from the 365 days preceding the last available date
# (the last date itself included) as the final training set.
last_date = pd.to_datetime(X['date'].unique()[-1])
first_day_of_last_year = last_date - timedelta(days=365)
# The date part of the timestamp's string form is used for the comparison.
cutoff = str(first_day_of_last_year).split(' ')[0]
X_train = X[X['date'] >= cutoff]
indexer = X_train.index
y_train = y.loc[indexer]

In [None]:
# Build the test feature table the same way as the training one.
raw_test = pd.read_csv(DATA_ROOT / 'test.csv')
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(raw_test.drop(columns=['onpromotion']))[0]
test_data = pd.get_dummies(test_data, columns=['store_nbr'], drop_first=True)

# 'lag_1' starts at zero everywhere; only the first test day can be seeded
# with known sales, taken from the last training day. The values are assigned
# positionally, so both sides are assumed to list the series in the same
# order -- TODO confirm.
test_data['lag_1'] = 0
first_test_day = test_data['date'].unique()[0]
last_train_day = X_train['date'].unique()[-1]
first_day_index = test_data[test_data['date'] == first_test_day].index
test_data.loc[first_day_index, 'lag_1'] = y_train.loc[X_train['date'] == last_train_day].tolist()

In [None]:
# Sample submission file; its 'sales' column is overwritten by the forecasts below.
submission = pd.read_csv(DATA_ROOT.joinpath('sample_submission.csv'))

In [None]:
# Fit one model per product family on the last-year training data and fill the
# submission with a recursive forecast of the test window.
for current_family in X['family'].unique():
    is_train_family = X_train['family'] == current_family
    X_train_current_family = X_train[is_train_family].drop(columns=['family'])
    y_train_current_family = y_train.loc[X_train_current_family.index]

    # Oil price is scaled with the range seen in this family's training rows.
    scaler = MinMaxScaler()
    X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])

    model = PositiveRegressor(LinearRegression())
    model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

    is_test_family = test_data['family'] == current_family
    X_test_current_family = test_data[is_test_family].drop(columns=['family'])
    X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])

    # Walk through consecutive day pairs: the clipped prediction for the
    # earlier day becomes 'lag_1' of the later day.
    forecast_days = X_test_current_family['date'].unique()
    for previous_day, current_day in zip(forecast_days[:-1], forecast_days[1:]):
        previous_day_rows = X_test_current_family[X_test_current_family['date'] == previous_day]
        predictions = model.predict(previous_day_rows.drop(columns=['date']))
        predictions[predictions < 0] = 0
        current_day_index = X_test_current_family[X_test_current_family['date'] == current_day].index
        X_test_current_family.loc[current_day_index, 'lag_1'] = predictions

    # Predict the whole window once all lags are filled in.
    y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date']))
    y_pred_current_family[y_pred_current_family < 0] = 0

    # Written by index label; assumes test_data and submission share row labels -- TODO confirm.
    test_indices = test_data[is_test_family].index
    submission.loc[test_indices, 'sales'] = y_pred_current_family

In [None]:
# Save the one-day-lag forecast as a CSV file without the index column.
submission.to_csv('./data/kaggle/store-sales-time-series-forecasting/linreg_dcoilwtico_and_lag_one.csv', index = False)

In [None]:
# Partial autocorrelation of the daily mean sales of every product family over
# the last year of the raw training data, one plot per family.
X = train.copy()

last_date = pd.to_datetime(X['date'].unique()[-1])
first_day_of_last_year = last_date - timedelta(days=365)
cutoff = str(first_day_of_last_year).split(' ')[0]
X_train = X[X['date'] >= cutoff]
indexer = X_train.index

# Mean sales per (family, date).
X_train = X_train.groupby(['family', 'date'])['sales'].mean()

for current_family in X['family'].unique():
    # Sales series of this family with the dates replaced by a plain 0..n-1 index.
    family_daily_sales = X_train.loc[current_family].reset_index(drop=True)
    sm.graphics.tsa.plot_pacf(family_daily_sales, lags=16)
    plt.title(current_family)
    plt.show()

# 2. Linear Regression with 'store_nbr', 'dcoilwtico' and 'lag' feature (1, 2, 6, 7 and 14 days) (recursive strategy)

In [None]:
# Run the project ETL on the training table without the 'onpromotion' column.
# `drop` returns a new frame, so `train` itself is left untouched.
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train.drop(columns='onpromotion')
# Only the first element of the transformer output is used here.
X = train_transformer.transform(X)[0]

In [None]:
# Sales shifted by 1, 2, 6, 7 and 14 rows within each (store, family) series.
for lag in (1, 2, 6, 7, 14):
    X[f'lag_{lag}'] = X.groupby(['store_nbr', 'family'])['sales'].shift(lag)

# Drop rows without an oil price and renumber the remaining rows from zero.
X = X.loc[X['dcoilwtico'].notna()].reset_index(drop=True)

In [None]:
# One-hot encode the store number (first level dropped).
X = pd.get_dummies(X, columns=['store_nbr'], drop_first=True)

# Move the lag features to the end of the DataFrame: take all of them out
# first, then append them again in ascending lag order.
lag_columns = ['lag_1', 'lag_2', 'lag_6', 'lag_7', 'lag_14']
lag_1, lag_2, lag_6, lag_7, lag_14 = (X.pop(column) for column in lag_columns)
for column, values in zip(lag_columns, (lag_1, lag_2, lag_6, lag_7, lag_14)):
    X[column] = values
X.head()

In [None]:
# Separate the target: `pop` removes 'sales' from X and returns it.
y = X.pop('sales')

In [None]:
# Fold sizes are given in rows. 33 * 54 is presumably the number of rows per
# calendar day (product families x stores) -- TODO confirm against the data.
rows_per_day = 33 * 54
tscv = TimeSeriesSplit(
    n_splits=4,
    test_size=16 * rows_per_day,
    max_train_size=365 * rows_per_day,
    gap=0,
)

In [None]:
# Cross-validate the per-family linear model with lags of 1, 2, 6, 7 and 14 days.
# The test window is forecast recursively: the prediction for the day at
# position p is written into 'lag_k' of the day at position p + k for every
# lag k that still falls inside the window. Positions are always taken
# relative to the start of the window.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}
lag_days = (1, 2, 6, 7, 14)
families = X['family'].unique()

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Explicit copy: the fold gets a scratch 'pred' column that must never be
    # written through a slice of X.
    X_test = X.iloc[test_indices].copy()
    y_test = y.iloc[test_indices]

    # Float placeholder so the float predictions match the column dtype.
    X_test['pred'] = 0.0
    for current_family in families:
        train_rows = X_train['family'] == current_family
        X_train_current_family = X_train.loc[train_rows].drop(columns=['family'])
        y_train_current_family = y_train.loc[train_rows]
        scaler = MinMaxScaler()
        X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])

        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

        test_rows = X_test['family'] == current_family
        # 'pred' is bookkeeping, not a feature, so it is dropped right away.
        X_test_current_family = X_test.loc[test_rows].drop(columns=['family', 'pred'])
        X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])

        test_days = X_test_current_family['date'].unique()
        # The last day feeds no later day, so it needs no intermediate prediction.
        for day_position, current_day in enumerate(test_days[:-1]):
            current_day_features = X_test_current_family[
                X_test_current_family['date'] == current_day
            ].drop(columns=['date'])
            # Clip negatives to zero; np.asarray keeps the assignments positional.
            predictions = np.maximum(np.asarray(model.predict(current_day_features)), 0)
            for lag in lag_days:
                target_position = day_position + lag
                if target_position >= len(test_days):
                    # This lag points beyond the end of the test window.
                    continue
                target_rows = X_test_current_family['date'] == test_days[target_position]
                X_test_current_family.loc[target_rows, f'lag_{lag}'] = predictions

        # Final pass over the whole window with the recursively filled lags.
        y_pred_current_family = np.maximum(
            np.asarray(model.predict(X_test_current_family.drop(columns=['date']))), 0
        )
        X_test.loc[test_rows, 'pred'] = y_pred_current_family

    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])

    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    # np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn
    # releases no longer accept.
    scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# Keep only the rows from the 365 days preceding the last available date
# (the last date itself included) as the final training set.
last_date = pd.to_datetime(X['date'].unique()[-1])
first_day_of_last_year = last_date - timedelta(days=365)
# The date part of the timestamp's string form is used for the comparison.
cutoff = str(first_day_of_last_year).split(' ')[0]
X_train = X[X['date'] >= cutoff]
indexer = X_train.index
y_train = y.loc[indexer]

In [None]:
# Build the test feature table the same way as the training one.
raw_test = pd.read_csv(DATA_ROOT / 'test.csv')
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(raw_test.drop(columns=['onpromotion']))[0]
test_data = pd.get_dummies(test_data, columns=['store_nbr'], drop_first=True)

lag_days = (1, 2, 6, 7, 14)
# Every lag column starts at zero; the later test days are filled during forecasting.
for lag in lag_days:
    test_data.loc[:, f'lag_{lag}'] = 0

# For lag k, the first k test days can be seeded from the last k training
# days: test day i receives the sales of the training day k - i positions from
# the end. The values are assigned positionally, so both sides are assumed to
# list the series in the same order -- TODO confirm.
test_days = test_data['date'].unique()
train_days = X_train['date'].unique()
for lag in lag_days:
    for i in range(lag):
        test_day_index = test_data[test_data['date'] == test_days[i]].index
        known_sales = y_train.loc[X_train['date'] == train_days[-(lag - i)]].tolist()
        test_data.loc[test_day_index, f'lag_{lag}'] = known_sales

In [None]:
# Sample submission file; its 'sales' column is overwritten by the forecasts below.
submission = pd.read_csv(DATA_ROOT.joinpath('sample_submission.csv'))

In [None]:
# Fit one model per product family on the last-year training data and fill the
# submission with a recursive forecast using lags of 1, 2, 6, 7 and 14 days.
# The prediction for the day at position p is written into 'lag_k' of the day
# at position p + k for every lag k that still falls inside the test window.
# Positions are always taken relative to the start of the window.
lag_days = (1, 2, 6, 7, 14)

for current_family in X['family'].unique():
    train_rows = X_train['family'] == current_family
    X_train_current_family = X_train.loc[train_rows].drop(columns=['family'])
    y_train_current_family = y_train.loc[X_train_current_family.index]
    scaler = MinMaxScaler()
    X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])

    model = PositiveRegressor(LinearRegression())
    model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

    is_test_family = test_data['family'] == current_family
    X_test_current_family = test_data[is_test_family].drop(columns=['family'])
    X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])

    test_days = X_test_current_family['date'].unique()
    # The last day feeds no later day, so it needs no intermediate prediction.
    for day_position, current_day in enumerate(test_days[:-1]):
        current_day_features = X_test_current_family[
            X_test_current_family['date'] == current_day
        ].drop(columns=['date'])
        # Clip negatives to zero; np.asarray keeps the assignments positional.
        predictions = np.maximum(np.asarray(model.predict(current_day_features)), 0)
        for lag in lag_days:
            target_position = day_position + lag
            if target_position >= len(test_days):
                # This lag points beyond the end of the test window.
                continue
            target_rows = X_test_current_family['date'] == test_days[target_position]
            X_test_current_family.loc[target_rows, f'lag_{lag}'] = predictions

    # Predict the whole window once all lags are filled in.
    y_pred_current_family = np.maximum(
        np.asarray(model.predict(X_test_current_family.drop(columns=['date']))), 0
    )

    # Written by index label; assumes test_data and submission share row labels -- TODO confirm.
    test_indices = test_data[is_test_family].index
    submission.loc[test_indices, 'sales'] = y_pred_current_family

In [None]:
# Save the multi-lag forecast as a CSV file without the index column.
submission.to_csv('./data/kaggle/store-sales-time-series-forecasting/linreg_dcoilwtico_and_best_lags.csv', index = False)

In [None]:
#cc = pd.read_csv(DATA_ROOT / 'linreg_dcoilwtico_and_best_lags.csv')
#(cc['sales'] - submission['sales']).unique()

# 3. Linear Regression with 'store_nbr', 'dcoilwtico' and 'lag' features (1 - 16 days) (recursive strategy)

In [None]:
# Build the feature frame from a copy of the raw training data: drop 'onpromotion',
# then run the project ETL step and keep the first element of what it returns.
train_transformer = ETLTransformer(date_column='date', id_column='id')
train_without_promo = train.copy().drop(columns='onpromotion')
X = train_transformer.transform(train_without_promo)[0]

In [None]:
# Add lag_1 ... lag_16: sales shifted by 1..16 rows within each (store, family) group.
sales_per_series = X.groupby(['store_nbr', 'family'])['sales']
for shift_by in range(1, 17):
    X['lag_{}'.format(shift_by)] = sales_per_series.shift(shift_by)

# Keep only rows that have an oil price and renumber the index from 0.
has_oil_price = X['dcoilwtico'].notna()
X = X[has_oil_price].reset_index(drop=True)

In [None]:
# One-hot encode the store number (first level dropped), then move the sixteen
# lag columns to the end of the frame so they follow the store dummies.
X = pd.get_dummies(X, columns=['store_nbr'], drop_first=True)

lag_names = ['lag_{}'.format(k) for k in range(1, 17)]
# pop() removes each column from X and hands back its values.
lags = [X.pop(name) for name in lag_names]

# Re-attaching the columns appends them after everything else.
for name, values in zip(lag_names, lags):
    X[name] = values

X.head()

In [None]:
# Separate the target: 'sales' is removed from the feature frame and kept as y.
y = X.pop('sales')

In [None]:
# Four folds; test windows of 16 * 33 * 54 rows, training windows capped at 365 * 33 * 54 rows.
rows_per_day = 33 * 54  # presumably families x stores per calendar day - TODO confirm
tscv = TimeSeriesSplit(
    n_splits=4,
    gap=0,
    max_train_size=365 * rows_per_day,
    test_size=16 * rows_per_day,
)

In [None]:
# Time-series cross-validation of one linear model per product family. Inside each test
# window the forecast is recursive: the prediction for a day is written into the sales-lag
# columns of the following days before those days are predicted.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}
n_lags = 16  # lag_1 ... lag_16 are the sales-lag feature columns of X

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Copy so the helper 'pred' column is added to an independent frame, not to a slice of X.
    X_test = X.iloc[test_indices].copy()
    y_test = y.iloc[test_indices]

    # Float placeholder: the forecasts stored below are floats.
    X_test['pred'] = 0.0
    for current_family in X['family'].unique():
        train_rows = X_train['family'] == current_family
        X_train_current_family = X_train[train_rows].drop(columns=['family'])
        y_train_current_family = y_train.loc[train_rows]
        scaler = MinMaxScaler()
        X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])

        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

        test_rows = X_test['family'] == current_family
        X_test_current_family = X_test[test_rows].drop(columns=['family'])
        X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])

        # The set of forecast days is fixed for the fold, so it is computed once.
        test_dates = X_test_current_family['date'].unique()
        # The last day's forecast is never used as a lag, so it is not computed here.
        for i, current_day in enumerate(test_dates[:-1]):
            day_rows = X_test_current_family['date'] == current_day
            predictions = model.predict(X_test_current_family[day_rows].drop(columns=['date', 'pred']))
            predictions[predictions < 0] = 0

            # The forecast for day i is the j-day lag of day i + j.
            for j, next_day in zip(range(1, n_lags + 1), test_dates[i + 1:]):
                next_day_rows = X_test_current_family['date'] == next_day
                X_test_current_family.loc[next_day_rows, f'lag_{j}'] = predictions

        # With every lag replaced by a forecast, predict the whole window in one call.
        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        y_pred_current_family[y_pred_current_family < 0] = 0
        X_test.loc[test_rows, 'pred'] = y_pred_current_family

    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])

    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases deprecate and remove.
    scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

# Mean and sample standard deviation of every metric across the folds.
for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# Training subset for the final model: rows dated within 365 days of the last date in X.
last_date = pd.to_datetime(X['date'].unique()[-1])
first_day_of_last_year = last_date - timedelta(days=365)
# Only the date part of the timestamp string is used for the comparison.
cutoff_date = str(first_day_of_last_year).split(' ')[0]

X_train = X[X['date'] >= cutoff_date]
indexer = X_train.index
y_train = y.loc[indexer]

In [None]:
# Load and transform the test set the same way as the training data, then seed its lag
# columns with the sales already known from the end of the training period.
raw_test = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = pd.get_dummies(test_transformer.transform(raw_test)[0], columns=['store_nbr'], drop_first=True)

test_days = test_data['date'].unique()
train_days = X_train['date'].unique()
for lag in range(1, 17):
    lag_column = 'lag_{}'.format(lag)
    # Default 0; entries for later test days are overwritten during recursive forecasting.
    test_data.loc[:, lag_column] = 0
    # Test day number `day_pos` looks `lag` days back, i.e. at training day (day_pos - lag)
    # counted from the end of the training dates.
    for day_pos in range(lag):
        known_sales = y_train.loc[X_train['date'] == train_days[day_pos - lag]].tolist()
        target_index = test_data[test_data['date'] == test_days[day_pos]].index
        test_data.loc[target_index, lag_column] = known_sales

In [None]:
# Start from the sample submission file; its 'sales' column is filled in later.
sample_submission_path = DATA_ROOT / 'sample_submission.csv'
submission = pd.read_csv(sample_submission_path)

In [None]:
# Fit one linear model per product family on X_train / y_train and fill the submission.
# The forecast is recursive: each day's prediction is written into the sales-lag columns
# of the following test days before those days are predicted.
n_lags = 16  # lag_1 ... lag_16 are the sales-lag feature columns

for current_family in X['family'].unique():
    train_rows = X_train['family'] == current_family
    X_train_current_family = X_train[train_rows].drop(columns=['family'])
    y_train_current_family = y_train.loc[train_rows]
    scaler = MinMaxScaler()
    X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])

    model = PositiveRegressor(LinearRegression())
    model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

    X_test_current_family = test_data[test_data['family'] == current_family].drop(columns=['family'])
    X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])

    # The forecast days do not change while lags are overwritten, so list them once.
    test_dates = X_test_current_family['date'].unique()
    # The last day's forecast is never used as a lag, so it is not computed here.
    for i, current_day in enumerate(test_dates[:-1]):
        day_rows = X_test_current_family['date'] == current_day
        predictions = model.predict(X_test_current_family[day_rows].drop(columns=['date']))
        predictions[predictions < 0] = 0

        # The forecast for day i is the j-day lag of day i + j.
        for j, next_day in zip(range(1, n_lags + 1), test_dates[i + 1:]):
            next_day_rows = X_test_current_family['date'] == next_day
            X_test_current_family.loc[next_day_rows, f'lag_{j}'] = predictions

    # With every lag in place, predict the whole horizon for this family in one call.
    y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date']))
    y_pred_current_family[y_pred_current_family < 0] = 0

    # Rows of this family keep the index labels they have in test_data.
    test_indices = X_test_current_family.index
    submission.loc[test_indices, 'sales'] = y_pred_current_family

In [None]:
# Write the current submission frame to CSV; the DataFrame index is not exported.
submission.to_csv(
    './data/kaggle/store-sales-time-series-forecasting/linreg_dcoilwtico_and_lags.csv',
    index=False,
)

# 4. Linear Regression with 'store_nbr', 'dcoilwtico' and 'lag' features (1 - 16, 30 days) (recursive strategy)

In [None]:
# Build the feature frame from a copy of the raw training data: drop 'onpromotion',
# then run the project ETL step and keep the first element of what it returns.
train_transformer = ETLTransformer(date_column='date', id_column='id')
train_without_promo = train.copy().drop(columns='onpromotion')
X = train_transformer.transform(train_without_promo)[0]

In [None]:
# Add lag_1 ... lag_16 and lag_30: sales shifted within each (store, family) group.
sales_per_series = X.groupby(['store_nbr', 'family'])['sales']
for shift_by in list(range(1, 17)) + [30]:
    X['lag_{}'.format(shift_by)] = sales_per_series.shift(shift_by)

# Keep only rows that have an oil price and renumber the index from 0.
has_oil_price = X['dcoilwtico'].notna()
X = X[has_oil_price].reset_index(drop=True)

In [None]:
# One-hot encode the store number (first level dropped), then move the lag columns
# (lag_1 ... lag_16, lag_30) to the end of the frame so they follow the store dummies.
X = pd.get_dummies(X, columns=['store_nbr'], drop_first=True)

lag_names = ['lag_{}'.format(k) for k in range(1, 17)] + ['lag_30']
# pop() removes each column from X and hands back its values.
lags = [X.pop(name) for name in lag_names]

# Re-attaching the columns appends them after everything else, in the same order.
for name, values in zip(lag_names, lags):
    X[name] = values

X.head()

In [None]:
# Separate the target: 'sales' is removed from the feature frame and kept as y.
y = X.pop('sales')

In [None]:
# Four folds; test windows of 16 * 33 * 54 rows, training windows capped at 365 * 33 * 54 rows.
rows_per_day = 33 * 54  # presumably families x stores per calendar day - TODO confirm
tscv = TimeSeriesSplit(
    n_splits=4,
    gap=0,
    max_train_size=365 * rows_per_day,
    test_size=16 * rows_per_day,
)

In [None]:
# Time-series cross-validation of one linear model per product family. Inside each test
# window the forecast is recursive: the prediction for a day is written into the
# lag_1 ... lag_16 columns of the following days before those days are predicted.
# 'lag_30' is left as it is in X: this loop only rewrites lag_1 ... lag_16.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}
n_lags = 16  # number of consecutive sales-lag columns that the recursion updates

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Copy so the helper 'pred' column is added to an independent frame, not to a slice of X.
    X_test = X.iloc[test_indices].copy()
    y_test = y.iloc[test_indices]

    # Float placeholder: the forecasts stored below are floats.
    X_test['pred'] = 0.0
    for current_family in X['family'].unique():
        train_rows = X_train['family'] == current_family
        X_train_current_family = X_train[train_rows].drop(columns=['family'])
        y_train_current_family = y_train.loc[train_rows]
        scaler = MinMaxScaler()
        X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])

        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

        test_rows = X_test['family'] == current_family
        X_test_current_family = X_test[test_rows].drop(columns=['family'])
        X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])

        # The set of forecast days is fixed for the fold, so it is computed once.
        test_dates = X_test_current_family['date'].unique()
        # The last day's forecast is never used as a lag, so it is not computed here.
        for i, current_day in enumerate(test_dates[:-1]):
            day_rows = X_test_current_family['date'] == current_day
            predictions = model.predict(X_test_current_family[day_rows].drop(columns=['date', 'pred']))
            predictions[predictions < 0] = 0

            # The forecast for day i is the j-day lag of day i + j.
            for j, next_day in zip(range(1, n_lags + 1), test_dates[i + 1:]):
                next_day_rows = X_test_current_family['date'] == next_day
                X_test_current_family.loc[next_day_rows, f'lag_{j}'] = predictions

        # With every lag replaced by a forecast, predict the whole window in one call.
        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        y_pred_current_family[y_pred_current_family < 0] = 0
        X_test.loc[test_rows, 'pred'] = y_pred_current_family

    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])

    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases deprecate and remove.
    scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

# Mean and sample standard deviation of every metric across the folds.
for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# Training subset for the final model: rows dated within 365 days of the last date in X.
last_date = pd.to_datetime(X['date'].unique()[-1])
first_day_of_last_year = last_date - timedelta(days=365)
# Only the date part of the timestamp string is used for the comparison.
cutoff_date = str(first_day_of_last_year).split(' ')[0]

X_train = X[X['date'] >= cutoff_date]
indexer = X_train.index
y_train = y.loc[indexer]

In [None]:
# Load and transform the test set the same way as the training data, then seed its lag
# columns (lag_1 ... lag_16 and lag_30) with sales already known from the training period.
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]
test_data = pd.get_dummies(test_data, columns=['store_nbr'], drop_first=True)

# Both date lists are loop-invariant, so they are computed once.
test_dates = test_data['date'].unique()
train_dates = X_train['date'].unique()

for i in range(16):
    lag_column = 'lag_{}'.format(i + 1)
    # Float default; entries for later test days are overwritten during recursive forecasting.
    test_data.loc[:, lag_column] = 0.0
    # Test day j looks (i + 1) days back, i.e. at training day -(i + 1 - j) from the end.
    for j in range(i + 1):
        known_sales = y_train.loc[X_train['date'] == train_dates[-(i + 1 - j)]].tolist()
        test_data.loc[test_data['date'] == test_dates[j], lag_column] = known_sales

test_data.loc[:, 'lag_30'] = 0.0
# Test day j looks 30 days back, i.e. at training day -(30 - j) from the end. The loop
# variable is used consistently and is bounded by the number of test days, so every test
# day within 30 days of the training period receives its own value.
for j in range(min(30, len(test_dates))):
    known_sales = y_train.loc[X_train['date'] == train_dates[-(30 - j)]].tolist()
    test_data.loc[test_data['date'] == test_dates[j], 'lag_30'] = known_sales

In [None]:
# Start from the sample submission file; its 'sales' column is filled in later.
sample_submission_path = DATA_ROOT / 'sample_submission.csv'
submission = pd.read_csv(sample_submission_path)

In [None]:
# Fit one linear model per product family on X_train / y_train and fill the submission.
# The forecast is recursive: each day's prediction is written into the lag_1 ... lag_16
# columns of the following test days before those days are predicted. 'lag_30' keeps the
# values it already has in test_data.
n_lags = 16  # number of consecutive sales-lag columns that the recursion updates

for current_family in X['family'].unique():
    train_rows = X_train['family'] == current_family
    X_train_current_family = X_train[train_rows].drop(columns=['family'])
    y_train_current_family = y_train.loc[train_rows]
    scaler = MinMaxScaler()
    X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])

    model = PositiveRegressor(LinearRegression())
    model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

    X_test_current_family = test_data[test_data['family'] == current_family].drop(columns=['family'])
    X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])

    # The forecast days do not change while lags are overwritten, so list them once.
    test_dates = X_test_current_family['date'].unique()
    # The last day's forecast is never used as a lag, so it is not computed here.
    for i, current_day in enumerate(test_dates[:-1]):
        day_rows = X_test_current_family['date'] == current_day
        predictions = model.predict(X_test_current_family[day_rows].drop(columns=['date']))
        predictions[predictions < 0] = 0

        # The forecast for day i is the j-day lag of day i + j.
        for j, next_day in zip(range(1, n_lags + 1), test_dates[i + 1:]):
            next_day_rows = X_test_current_family['date'] == next_day
            X_test_current_family.loc[next_day_rows, f'lag_{j}'] = predictions

    # With every lag in place, predict the whole horizon for this family in one call.
    y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date']))
    y_pred_current_family[y_pred_current_family < 0] = 0

    # Rows of this family keep the index labels they have in test_data.
    test_indices = X_test_current_family.index
    submission.loc[test_indices, 'sales'] = y_pred_current_family

In [None]:
# Write the current submission frame to CSV; the DataFrame index is not exported.
submission.to_csv(
    './data/kaggle/store-sales-time-series-forecasting/linreg_dcoilwtico_and_lags_v30.csv',
    index=False,
)

# (not ready) 5. Linear Regression with 'store_nbr', 'dcoilwtico' and 'lag' features (1 - 16 days shift) (direct strategy)

In [None]:
# Build the feature frame from a copy of the raw training data: drop 'onpromotion',
# then run the project ETL step and keep the first element of what it returns.
train_transformer = ETLTransformer(date_column='date', id_column='id')
train_without_promo = train.copy().drop(columns='onpromotion')
X = train_transformer.transform(train_without_promo)[0]

In [None]:
# For every step 1..16 add, per (store, family) group, the sales shifted back by that many
# rows ('lag_<step>') and the sales shifted forward by that many rows ('-<step>').
sales_per_series = X.groupby(['store_nbr', 'family'])['sales']
for step in range(1, 17):
    X['lag_{}'.format(step)] = sales_per_series.shift(step)
    X['-{}'.format(step)] = sales_per_series.shift(-step)

# Keep rows with an oil price, then drop every row that still has a missing value
# in any column, and renumber the index from 0.
has_oil_price = X['dcoilwtico'].notna()
X = X[has_oil_price].dropna().reset_index(drop=True)

In [None]:
# One-hot encode the store number (first level dropped). The forward-shifted columns
# '-1' ... '-16' are taken out of X into `targets`; the lag columns are taken out and
# appended again so that they end up as the last columns of the frame.
X = pd.get_dummies(X, columns=['store_nbr'], drop_first=True)

lag_names = ['lag_{}'.format(step) for step in range(1, 17)]
target_names = ['-{}'.format(step) for step in range(1, 17)]

# pop() removes each column from X and hands back its values.
lags = [X.pop(name) for name in lag_names]
targets = [X.pop(name) for name in target_names]

for name, values in zip(lag_names, lags):
    X[name] = values

X.head()

In [None]:
# Separate the target: 'sales' is removed from the feature frame and kept as y.
y = X.pop('sales')

In [None]:
# Four folds; test windows of 16 * 33 * 54 rows, training windows capped at 365 * 33 * 54 rows.
rows_per_day = 33 * 54  # presumably families x stores per calendar day - TODO confirm
tscv = TimeSeriesSplit(
    n_splits=4,
    gap=0,
    max_train_size=365 * rows_per_day,
    test_size=16 * rows_per_day,
)

In [None]:
# Unfinished cross-validation scaffold (the section is marked "not ready"): for every fold
# and product family it fits a model and scales the test features, but it does not yet
# produce forecasts or append anything to `scores`.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

for train_indices, test_indices in tscv.split(X, y):
    X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
    X_test, y_test = X.iloc[test_indices], y.iloc[test_indices]

    X_test.loc[:, 'pred'] = 0
    for current_family in X['family'].unique():
        # Training rows of this family, with the oil price scaled to the fold's range.
        family_train_rows = X_train[X_train['family'] == current_family]
        current_family_indices_train = family_train_rows.index
        X_train_current_family = family_train_rows.drop(columns=['family'])
        y_train_current_family = y_train.loc[current_family_indices_train]
        scaler = MinMaxScaler()
        X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])

        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

        # Test rows of this family, scaled with the scaler fitted on the training rows.
        family_test_rows = X_test[X_test['family'] == current_family]
        current_family_indices_test = family_test_rows.index
        X_test_current_family = family_test_rows.drop(columns=['family'])
        y_test_current_family = y_test.loc[current_family_indices_test]
        X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])

# 6. Linear Regression, features: 'store_nbr', 'dcoilwtico' with its lag (1 day shift) and 'lags' for target (1, 2, 6, 7, 14 days shift) (recursive strategy)

In [None]:
# Build the feature frame from a copy of the raw training data: drop 'onpromotion',
# then run the project ETL step and keep the first element of what it returns.
train_transformer = ETLTransformer(date_column='date', id_column='id')
train_without_promo = train.copy().drop(columns='onpromotion')
X = train_transformer.transform(train_without_promo)[0]

In [None]:
# Keep only rows that have an oil price and renumber the index from 0.
has_oil_price = X['dcoilwtico'].notna()
X = X[has_oil_price].reset_index(drop=True)
X.head()

In [None]:
# Subset used for the oil-price analysis: rows dated within 365 days of the last date in X.
last_date = pd.to_datetime(X['date'].unique()[-1])
first_day_of_last_year = last_date - timedelta(days=365)
# Only the date part of the timestamp string is used for the comparison.
cutoff_date = str(first_day_of_last_year).split(' ')[0]

X_train = X[X['date'] >= cutoff_date]
indexer = X_train.index

In [None]:
# Partial autocorrelation (up to 16 lags) of the per-date mean oil price in X_train.
daily_oil_price = X_train.groupby('date')['dcoilwtico'].mean()
# Replace the date index with a plain positional index before plotting.
oil_price = daily_oil_price.reset_index(drop=True)
sm.graphics.tsa.plot_pacf(oil_price, lags=16)
plt.show()

In [None]:
# Add the selected sales lags (1, 2, 6, 7 and 14 rows back) within each (store, family) group.
sales_per_series = X.groupby(['store_nbr', 'family'])['sales']
for shift_by in (1, 2, 6, 7, 14):
    X['lag_{}'.format(shift_by)] = sales_per_series.shift(shift_by)

In [None]:
# One-hot encode the store number (first level dropped), then move the five lag
# columns to the end of the frame so they follow the store dummies.
X = pd.get_dummies(X, columns=['store_nbr'], drop_first=True)

lag_names = ['lag_1', 'lag_2', 'lag_6', 'lag_7', 'lag_14']
# pop() removes each column from X and hands back its values.
lag_1, lag_2, lag_6, lag_7, lag_14 = [X.pop(name) for name in lag_names]

# Re-attaching the columns appends them after everything else, in the same order.
for name, values in zip(lag_names, (lag_1, lag_2, lag_6, lag_7, lag_14)):
    X[name] = values
X.head()

In [None]:
# Separate the target: 'sales' is removed from the feature frame and kept as y.
y = X.pop('sales')

In [None]:
# 'lag_1_oil': the per-date mean oil price taken from the previous date present in X
# (shifted by one row). The first date has no previous row, so its value is NaN.
daily_oil_price = X.groupby(['date'])['dcoilwtico'].mean()
lag_1_oil = daily_oil_price.shift(1).rename('lag_1_oil').to_frame()

# Attach the lagged price to every row of the matching date.
X = X.merge(lag_1_oil, on='date', how='left')
X.head()

In [None]:
# Four folds; test windows of 16 * 33 * 54 rows, training windows capped at 365 * 33 * 54 rows.
rows_per_day = 33 * 54  # presumably families x stores per calendar day - TODO confirm
tscv = TimeSeriesSplit(
    n_splits=4,
    gap=0,
    max_train_size=365 * rows_per_day,
    test_size=16 * rows_per_day,
)

In [None]:
# Time-series cross-validation of one linear model per product family with the sales lags
# 1, 2, 6, 7 and 14 and a 1-day oil-price lag. Inside each test window the forecast is
# recursive: the prediction for a day is written into the matching lag column of the later
# days that look back at it, before those days are predicted.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

# A forecast for day d becomes 'lag_<k>' of day d + k for every k listed here.
sales_lag_offsets = (1, 2, 6, 7, 14)

for train_indices, test_indices in tscv.split(X, y):
    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    # Copy so the helper 'pred' column is added to an independent frame, not to a slice of X.
    X_test = X.iloc[test_indices].copy()
    y_test = y.iloc[test_indices]

    # Float placeholder: the forecasts stored below are floats.
    X_test['pred'] = 0.0
    for current_family in X['family'].unique():
        train_rows = X_train['family'] == current_family
        X_train_current_family = X_train[train_rows].drop(columns=['family'])
        y_train_current_family = y_train.loc[train_rows]

        # One scaler, fitted on the training oil price, is reused for 'lag_1_oil'. It is fed
        # bare arrays because scikit-learn checks DataFrame column names between fit and
        # transform, and the two columns are named differently.
        scaler = MinMaxScaler()
        X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']].to_numpy())
        X_train_current_family[['lag_1_oil']] = scaler.transform(X_train_current_family[['lag_1_oil']].to_numpy())

        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

        test_rows = X_test['family'] == current_family
        X_test_current_family = X_test[test_rows].drop(columns=['family'])
        X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']].to_numpy())
        X_test_current_family[['lag_1_oil']] = scaler.transform(X_test_current_family[['lag_1_oil']].to_numpy())

        # The set of forecast days is fixed for the fold, so it is computed once.
        test_dates = X_test_current_family['date'].unique()
        # `i` is the absolute position of the day in test_dates, so test_dates[i + offset]
        # really is the day `offset` days later. The last day's forecast feeds no later day
        # and is therefore not computed here.
        for i, current_day in enumerate(test_dates[:-1]):
            day_rows = X_test_current_family['date'] == current_day
            predictions = model.predict(X_test_current_family[day_rows].drop(columns=['date', 'pred']))
            predictions[predictions < 0] = 0

            for offset in sales_lag_offsets:
                # Days beyond the end of the test window have no row to update.
                if i + offset >= len(test_dates):
                    continue
                target_rows = X_test_current_family['date'] == test_dates[i + offset]
                X_test_current_family.loc[target_rows, f'lag_{offset}'] = predictions

        # With the lags replaced by forecasts, predict the whole window in one call.
        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        y_pred_current_family[y_pred_current_family < 0] = 0
        X_test.loc[test_rows, 'pred'] = y_pred_current_family

    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])

    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases deprecate and remove.
    scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

# Mean and sample standard deviation of every metric across the folds.
for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')