# Experiments with linear models for each family

In [None]:
import os
# Switch the working directory to the project root so the relative DATA_ROOT path used below
# resolves; raises KeyError when the PROJECT_ROOT environment variable is not set.
os.chdir(os.environ['PROJECT_ROOT'])

In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import sklearn
import pdpipe as pdp
from pandas.core.common import SettingWithCopyWarning
from sklearn.base import BaseEstimator, RegressorMixin, MetaEstimatorMixin, TransformerMixin, clone
from datetime import timedelta
from statistics import median, mean, stdev
from pdpipe import df
from pathlib import Path
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, cross_val_score, TimeSeriesSplit, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from mentorship.ml.models.reg import PositiveRegressor
from mentorship.ml.models.common import SplitPipeline
from mentorship.ml.models.kaggle.storesales.linear import PipelineLinearV1
from mentorship.ml.models.kaggle.storesales.ridge import PipelineRidgeV1
from mentorship.ml.models.kaggle.storesales.lasso import PipelineLassoV1
from mentorship.ml.models.kaggle.storesales.elasticnet import PipelineElasticNetV1
from mentorship.features.kaggle.storesales.etl import ETLTransformer
from mentorship.ml.cv.split import DateTimeSeriesSplit
from mentorship.ml.cv.util import print_cv_test_scores


%matplotlib inline
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
# Scorer names passed as `scoring=` to every cross_validate call in this notebook.
CV_METRICS = [
    'neg_mean_squared_log_error',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_error',
    # 'neg_mean_absolute_percentage_error',  (currently disabled)
    'r2'
]

In [None]:
DATA_ROOT = Path('data', 'kaggle', 'store-sales-time-series-forecasting')

In [None]:
train = pd.read_csv(DATA_ROOT / 'train.csv')
train.head()

# 1. Linear regression with 'store_nbr', 'dcoilwtico' and 'onpromotion' features

<p><b>Preparing train data</b>

In [None]:
X = train.copy()
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(X)[0]

In [None]:
y = X['sales'].copy()
X = X.drop(columns='sales')
X.head()

In [None]:
X.columns[X.isna().any()].tolist()

In [None]:
inds = X.loc[pd.isna(X['dcoilwtico']), :].index

In [None]:
X = X[X['dcoilwtico'].notna()]

In [None]:
X = X.reset_index(drop=True)

In [None]:
y = y.drop(labels=inds)
y = y.reset_index(drop=True)

In [None]:
X.head()

In [None]:
X.shape, y.shape

In [None]:
categorical_columns = ['store_nbr', 'family']
numerical_columns = X.columns.drop(['date', 'store_nbr', 'family'])

<p><b>Cross-validation process</b>

In [None]:
splitter = DateTimeSeriesSplit()

In [None]:
base_pipeline = PipelineLinearV1(num_columns=list(numerical_columns), cat_columns='store_nbr')
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
scores = cross_validate(
        modelling_pipeline, X, y,
        cv=splitter, scoring=CV_METRICS, return_estimator=True)
print_cv_test_scores(scores)

In [None]:
# Row-count based splitter for the manual per-day evaluation below: 4 folds, each tested on
# 16 * 54 * 33 rows and trained on at most 365 * 54 * 33 rows, with no gap between them.
# NOTE(review): presumably 54 stores x 33 families = rows per calendar day, i.e. a 16-day test
# window and a one-year training window — confirm against the data.
tscv = TimeSeriesSplit(gap=0, max_train_size=365 * 54 * 33, n_splits=4,
                               test_size=16 * 54 * 33)

In [None]:
X_copy = X.copy()
X_copy = pd.get_dummies(X_copy, columns=['store_nbr'], drop_first=True)

In [None]:
# Per-day RMSLE of one linear model per product family, collected over the tscv folds.
# cv_scores[d] ends up with one RMSLE per fold for the d-th distinct date of the test window.
keys = np.arange(1, 17)
cv_scores = {key: [] for key in keys}
families = X['family'].unique()
# Model inputs: every column except the grouping key and the raw date.
feature_columns = [column for column in X_copy.columns if column not in ('family', 'date')]

for train_indices, test_indices in tscv.split(X_copy, y):
    X_train = X_copy.iloc[train_indices]
    y_train = y.iloc[train_indices]
    X_test = X_copy.iloc[test_indices]
    y_test = y.iloc[test_indices]

    # The training rows are identical for every test day of a fold, so each family's scalers
    # and model are fitted once per fold rather than once per (day, family) pair.
    # NOTE(review): assumes PositiveRegressor.predict scores rows independently — confirm.
    fold_pred = pd.Series(0.0, index=X_test.index)
    for current_family in families:
        X_train_current_family = X_train.loc[X_train['family'] == current_family, feature_columns].copy()
        y_train_current_family = y_train.loc[X_train_current_family.index]
        scaler_oil = MinMaxScaler()
        scaler_promo = MinMaxScaler()
        X_train_current_family[['dcoilwtico']] = scaler_oil.fit_transform(X_train_current_family[['dcoilwtico']])
        X_train_current_family[['onpromotion']] = scaler_promo.fit_transform(X_train_current_family[['onpromotion']])
        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family, y_train_current_family)

        # Test rows are scaled with the scalers fitted on the training rows only.
        X_test_current_family = X_test.loc[X_test['family'] == current_family, feature_columns].copy()
        X_test_current_family[['dcoilwtico']] = scaler_oil.transform(X_test_current_family[['dcoilwtico']])
        X_test_current_family[['onpromotion']] = scaler_promo.transform(X_test_current_family[['onpromotion']])

        y_pred_current_family = model.predict(X_test_current_family)
        # RMSLE is undefined for negative values, so negative predictions are clipped to zero.
        y_pred_current_family[y_pred_current_family < 0] = 0
        fold_pred.loc[X_test_current_family.index] = y_pred_current_family

    # Score each distinct test date separately, in order of appearance.
    for i, current_day in enumerate(X_test['date'].unique()):
        current_day_indices_test = X_test[X_test['date'] == current_day].index
        y_test_current_day = y_test.loc[current_day_indices_test]
        y_pred = fold_pred.loc[current_day_indices_test]
        cv_scores[i + 1].append(np.sqrt(mean_squared_log_error(y_test_current_day, y_pred)))

In [None]:
# RMSLE for every day in the test set

# The plot data gets its own names: rebinding X / y here would replace the training frame and
# target that the later submission cells of this section still read (X['date'], y.loc[...]).
plt.figure(figsize=(10, 10))
days = list(cv_scores.keys())
mean_rmsle = [mean(cv_scores[day]) for day in days]
plt.plot(days, mean_rmsle)

<p><b>Test data preparing</b>

In [None]:
test_data = pd.read_csv(DATA_ROOT / 'test.csv')
test_transformer = ETLTransformer(date_column='date', id_column='id')

In [None]:
test_data = test_transformer.transform(test_data)[0]

In [None]:
test_data.head()

<p><b>Training models and saving predictions</b>

In [None]:
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')

In [None]:
base_pipeline = PipelineLinearV1(num_columns=list(numerical_columns), cat_columns='store_nbr')
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
# Training window for the submission model: rows dated on or after the cut-off that lies
# 365 days before the last entry of X['date'].unique().
last_date = pd.to_datetime(X['date'].unique()[-1])
first_day_of_last_year = last_date - timedelta(days=365)
# Only the date part of the timestamp string is compared against the 'date' column.
cutoff = str(first_day_of_last_year).split(' ')[0]
X_train = X[X['date'] >= cutoff]
indexer = X_train.index
y_train = y.loc[indexer]

In [None]:
modelling_pipeline.fit(X_train, y_train)

In [None]:
submission['sales'] = modelling_pipeline.predict(test_data)

In [None]:
submission.to_csv('./data/kaggle/store-sales-time-series-forecasting/linreg_for_every_family_last_year_26_08.csv', index = False)

# 2. Linear regression with 'store_nbr' and 'time-step' features

<p><b>Train data preparing</b>

In [None]:
X = train.copy().drop(columns=['onpromotion'])

In [None]:
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(X)[0]
X = X.drop(columns='dcoilwtico')

In [None]:
# Time-step feature: number the distinct dates 0, 1, 2, ... in order of first appearance
# and attach that counter to every row as the 'time' column.
X_dates = pd.DataFrame(data={'date': X['date'].unique()})
X_dates['time'] = np.arange(len(X_dates.index))
X = X.merge(X_dates, on='date', how='left')

In [None]:
y = X['sales'].copy()
X = X.drop(columns='sales')

In [None]:
first_day_of_last_year = pd.to_datetime(X['date'].unique()[-1]) - timedelta(days=365)
indexer = X[X['date'] >= str(first_day_of_last_year).split(' ')[0]].index
X_train = X[X['date'] >= str(first_day_of_last_year).split(' ')[0]]
y_train = y.loc[indexer]

<p><b>Cross-validation process</b>

In [None]:
splitter = DateTimeSeriesSplit()
base_pipeline = PipelineLinearV1(num_columns=None, cat_columns=['store_nbr'])
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
scores = cross_validate(
        modelling_pipeline, X, y,
        cv=splitter, scoring=CV_METRICS, return_estimator=True)

In [None]:
print_cv_test_scores(scores)

<p><b>Test data preparing</b>

In [None]:
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
test_transformer = ETLTransformer(date_column='date', id_column='id')

In [None]:
test_data = test_transformer.transform(test_data)[0]
test_data = test_data.drop(columns=['dcoilwtico'])

In [None]:
# Continue the training counter: test dates are numbered from len(X_dates) onward, so this
# cell depends on X_dates built in this section's train-preparation cell.
test_dates = pd.DataFrame(data={'date': test_data['date'].unique()})
test_dates['time'] = np.arange(len(X_dates.index), len(X_dates.index) + len(test_dates.index))
test_data = test_data.merge(test_dates, on='date', how='left')
test_data.head()

<p><b>Training models and saving predictions</b>

In [None]:
base_pipeline = PipelineLinearV1(num_columns=None, cat_columns=['store_nbr'])
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
modelling_pipeline.fit(X_train, y_train)

In [None]:
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')

In [None]:
submission['sales'] = modelling_pipeline.predict(test_data)

In [None]:
submission.to_csv('./data/kaggle/store-sales-time-series-forecasting/linreg_time_step_only.csv', index = False)

# 3. Linear regression with 'store_nbr' and 'dcoilwtico' features

<p><b>Train data preparing</b>

In [None]:
X = train.copy().drop(columns=['onpromotion'])

In [None]:
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(X)[0]

In [None]:
y = X['sales'].copy()
X = X.drop(columns='sales')

In [None]:
inds = X.loc[pd.isna(X['dcoilwtico']), :].index

In [None]:
X = X[X['dcoilwtico'].notna()]
X = X.reset_index(drop=True)

In [None]:
y = y.drop(labels=inds)
y = y.reset_index(drop=True)

In [None]:
X.head()

<p><b>Cross-validation process</b>

In [None]:
splitter = DateTimeSeriesSplit()
base_pipeline = PipelineLinearV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr'])
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
scores = cross_validate(
        modelling_pipeline, X, y,
        cv=splitter, scoring=CV_METRICS, return_estimator=True)

In [None]:
print_cv_test_scores(scores)

In [None]:
tscv = TimeSeriesSplit(gap=0, max_train_size=365 * 54 * 33, n_splits=4,
                               test_size=16 * 54 * 33)

In [None]:
X_copy = X.copy()
X_copy = pd.get_dummies(X_copy, columns=['store_nbr'], drop_first=True)

In [None]:
# Per-day RMSLE of one linear model per product family, collected over the tscv folds.
# cv_scores[d] ends up with one RMSLE per fold for the d-th distinct date of the test window.
keys = np.arange(1, 17)
cv_scores = {key: [] for key in keys}
families = X['family'].unique()
# Model inputs: every column except the grouping key and the raw date.
feature_columns = [column for column in X_copy.columns if column not in ('family', 'date')]

for train_indices, test_indices in tscv.split(X_copy, y):
    X_train = X_copy.iloc[train_indices]
    y_train = y.iloc[train_indices]
    X_test = X_copy.iloc[test_indices]
    y_test = y.iloc[test_indices]

    # The training rows are identical for every test day of a fold, so each family's scaler
    # and model are fitted once per fold rather than once per (day, family) pair.
    # NOTE(review): assumes PositiveRegressor.predict scores rows independently — confirm.
    fold_pred = pd.Series(0.0, index=X_test.index)
    for current_family in families:
        X_train_current_family = X_train.loc[X_train['family'] == current_family, feature_columns].copy()
        y_train_current_family = y_train.loc[X_train_current_family.index]
        scaler_oil = MinMaxScaler()
        X_train_current_family[['dcoilwtico']] = scaler_oil.fit_transform(X_train_current_family[['dcoilwtico']])
        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family, y_train_current_family)

        # Test rows are scaled with the scaler fitted on the training rows only.
        X_test_current_family = X_test.loc[X_test['family'] == current_family, feature_columns].copy()
        X_test_current_family[['dcoilwtico']] = scaler_oil.transform(X_test_current_family[['dcoilwtico']])

        y_pred_current_family = model.predict(X_test_current_family)
        # RMSLE is undefined for negative values, so negative predictions are clipped to zero.
        y_pred_current_family[y_pred_current_family < 0] = 0
        fold_pred.loc[X_test_current_family.index] = y_pred_current_family

    # Score each distinct test date separately, in order of appearance.
    for i, current_day in enumerate(X_test['date'].unique()):
        current_day_indices_test = X_test[X_test['date'] == current_day].index
        y_test_current_day = y_test.loc[current_day_indices_test]
        y_pred = fold_pred.loc[current_day_indices_test]
        cv_scores[i + 1].append(np.sqrt(mean_squared_log_error(y_test_current_day, y_pred)))

In [None]:
# RMSLE for every day in the test set

# The plot data gets its own names: rebinding X / y here would replace the training frame and
# target that the later submission cells of this section still read (X['date'], y.loc[...]).
plt.figure(figsize=(10, 10))
days = list(cv_scores.keys())
mean_rmsle = [mean(cv_scores[day]) for day in days]
plt.plot(days, mean_rmsle)

<p><b>Test data preparing</b>

In [None]:
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]
test_data.head()

<p><b>Training models and saving predictions</b>

In [None]:
base_pipeline = PipelineLinearV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr'])
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
# Training window for the submission model: rows dated on or after the cut-off that lies
# 365 days before the last entry of X['date'].unique().
last_date = pd.to_datetime(X['date'].unique()[-1])
first_day_of_last_year = last_date - timedelta(days=365)
# Only the date part of the timestamp string is compared against the 'date' column.
cutoff = str(first_day_of_last_year).split(' ')[0]
X_train = X[X['date'] >= cutoff]
indexer = X_train.index
y_train = y.loc[indexer]

In [None]:
modelling_pipeline.fit(X_train, y_train)

In [None]:
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')

In [None]:
submission['sales'] = modelling_pipeline.predict(test_data)

In [None]:
submission.to_csv('./data/kaggle/store-sales-time-series-forecasting/linreg_dcoilwtico_only.csv', index = False)

# 4. Linear regression with 'store_nbr', 'dcoilwtico' and 'time-step' features

In [None]:
X = train.copy().drop(columns='onpromotion')
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(X)[0]

In [None]:
y = X['sales'].copy()
X = X.drop(columns='sales')

In [None]:
# Drop every observation without an oil price from both the features and the target,
# then renumber both from zero so they stay positionally aligned.
oil_is_missing = X['dcoilwtico'].isna()
inds = X[oil_is_missing].index
X = X[~oil_is_missing].reset_index(drop=True)
y = y.drop(labels=inds).reset_index(drop=True)

In [None]:
X_dates = pd.DataFrame(data={'date': X['date'].unique()})
X_dates['time'] = np.arange(len(X_dates.index))
X = X.merge(X_dates, on='date', how='left')

In [None]:
X.head()

In [None]:
splitter = DateTimeSeriesSplit()
base_pipeline = PipelineLinearV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr'])
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
scores = cross_validate(
        modelling_pipeline, X, y,
        cv=splitter, scoring=CV_METRICS, return_estimator=True)

In [None]:
print_cv_test_scores(scores)

# 5. Linear Regression with 'store_nbr', 'dcoilwtico' and 'is_holiday' binary feature

In [None]:
def adding_is_holiday_feature(data):
    """Return a copy of *data* with holiday information and an ``is_holiday`` flag attached.

    The holiday records are read from ``DATA_ROOT / 'holidays_events.csv'`` and left-joined
    on the ``date`` column. ``is_holiday`` is 1 for every row whose date has at least one
    holiday record and 0 otherwise; the value of the record's ``transferred`` flag does not
    matter (both True and False count as a holiday, as before).
    NOTE(review): transferred=True may mark a holiday that was moved to another date —
    confirm that such days should still be flagged.

    The result has exactly one row per input row, in the same order and with the same index,
    so ``result['is_holiday']`` can be assigned back to *data* by index. The remaining columns
    of the holiday file (taken from the first record of each date) are kept; ``transferred``
    is dropped and ``is_holiday`` is the first column.

    :param data: DataFrame with a ``date`` column comparable to the ``date`` column of the CSV.
    :return: new DataFrame; *data* itself is not modified.
    """
    holidays_data = pd.read_csv(DATA_ROOT / 'holidays_events.csv')
    # A date can carry several holiday records; keeping one per date stops the left join
    # from duplicating input rows (which would misalign any index-based assignment).
    holidays_data = holidays_data.drop_duplicates(subset='date')

    data_copy = data.merge(holidays_data, on='date', how='left')
    # merge() renumbers the rows; a left join on unique keys keeps the left order and length,
    # so the caller's index can be restored one-to-one.
    data_copy.index = data.index

    # Rows without a matching holiday record have NaN in every holiday column.
    is_holiday = data_copy['transferred'].notna().astype(int)
    data_copy = data_copy.drop(columns='transferred')
    data_copy.insert(0, 'is_holiday', is_holiday)
    return data_copy

In [None]:
X = train.copy().drop(columns='onpromotion')
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(X)[0]

In [None]:
# The assignment aligns on the index, so it is only correct when the helper returns exactly
# one row per row of X under the same index labels.
# NOTE(review): verify this for dates that have several records in holidays_events.csv.
X.loc[:, 'is_holiday'] = adding_is_holiday_feature(X)['is_holiday']

In [None]:
y = X['sales'].copy()
X = X.drop(columns='sales')

In [None]:
inds = X.loc[pd.isna(X['dcoilwtico']), :].index
X = X[X['dcoilwtico'].notna()]
X = X.reset_index(drop=True)
y = y.drop(labels=inds)
y = y.reset_index(drop=True)

In [None]:
X.head()

In [None]:
splitter = DateTimeSeriesSplit()
base_pipeline = PipelineLinearV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr'])
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
scores = cross_validate(
        modelling_pipeline, X, y,
        cv=splitter, scoring=CV_METRICS, return_estimator=True)

In [None]:
print_cv_test_scores(scores)

In [None]:
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]
test_data.loc[:, 'is_holiday'] = adding_is_holiday_feature(test_data)['is_holiday']
test_data.head()

In [None]:
test_data['is_holiday'].value_counts()

In [None]:
first_day_of_last_year = pd.to_datetime(X['date'].unique()[-1]) - timedelta(days=365)
indexer = X[X['date'] >= str(first_day_of_last_year).split(' ')[0]].index
X_train = X[X['date'] >= str(first_day_of_last_year).split(' ')[0]]
y_train = y.loc[indexer]

In [None]:
base_pipeline = PipelineLinearV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr'])
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
modelling_pipeline.fit(X_train, y_train)

In [None]:
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')
submission['sales'] = modelling_pipeline.predict(test_data)

In [None]:
# Section 5 gets its own file name: 'linreg_dcoilwtico_only.csv' is the file section 3 writes,
# so saving under that name here silently overwrote the section 3 submission.
submission.to_csv('./data/kaggle/store-sales-time-series-forecasting/linreg_dcoilwtico_is_holiday.csv', index = False)

# 6. Linear Regression with 'store_nbr', 'dcoilwtico' and 'store_type' features 

In [None]:
X = train.copy().drop(columns='onpromotion')
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(X)[0]

In [None]:
stores_data = pd.read_csv(DATA_ROOT / 'stores.csv').drop(columns=['city', 'state', 'cluster'])
stores_data = stores_data.rename(columns={'type': 'store_type'})
X = X.merge(stores_data, on='store_nbr', how='left')

In [None]:
y = X['sales'].copy()
X = X.drop(columns='sales')
inds = X.loc[pd.isna(X['dcoilwtico']), :].index
X = X[X['dcoilwtico'].notna()]
X = X.reset_index(drop=True)
y = y.drop(labels=inds)
y = y.reset_index(drop=True)

In [None]:
X.head()

In [None]:
splitter = DateTimeSeriesSplit()
base_pipeline = PipelineLinearV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr', 'store_type'])
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
scores = cross_validate(
        modelling_pipeline, X, y,
        cv=splitter, scoring=CV_METRICS, return_estimator=True)

In [None]:
print_cv_test_scores(scores)

# 7. Linear Regression with 'store_nbr', 'dcoilwtico' and 'lag' feature (16 days)

In [None]:
# Build the 16-step lag of sales on train and test stacked together, so that test rows can
# look back into the end of the training period.
X = train.copy().drop(columns='onpromotion')
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
dates_test_data = test_data['date'].unique()
# concat keeps each frame's own index labels (no ignore_index), which the next cell relies on
# to align the test part of the lag with test_data.
all_data = pd.concat([X, test_data])
# shift(16) looks 16 rows back inside each (store_nbr, family) group.
# NOTE(review): that equals 16 calendar days only if every group has one row per consecutive
# day and the rows are in date order — confirm.
all_data['lag_16'] = all_data.groupby(['store_nbr', 'family'])['sales'].shift(16)

In [None]:
# Test rows take their lag from the stacked frame; the assignment aligns on index labels.
# NOTE(review): assumes no training row carries one of the test dates — confirm.
test_data['lag_16'] = all_data[all_data['date'].isin(dates_test_data)]['lag_16']
# Training rows use the train-only lag; the first 16 rows of every (store_nbr, family) group
# therefore have NaN in lag_16.
# NOTE(review): the pipelines in this section are built with num_columns=['dcoilwtico'] only —
# check whether lag_16 actually reaches the model.
X['lag_16'] = X.groupby(['store_nbr', 'family'])['sales'].shift(16)

In [None]:
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(X)[0]

In [None]:
y = X['sales'].copy()
X = X.drop(columns='sales')
inds = X.loc[pd.isna(X['dcoilwtico']), :].index
X = X[X['dcoilwtico'].notna()]
X = X.reset_index(drop=True)
y = y.drop(labels=inds)
y = y.reset_index(drop=True)

In [None]:
X.head()

In [None]:
splitter = DateTimeSeriesSplit()
base_pipeline = PipelineLinearV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr'])
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
scores = cross_validate(
        modelling_pipeline, X, y,
        cv=splitter, scoring=CV_METRICS, return_estimator=True)

In [None]:
print_cv_test_scores(scores)

In [None]:
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]
test_data.head()

In [None]:
test_data.isna().sum()

In [None]:
first_day_of_last_year = pd.to_datetime(X['date'].unique()[-1]) - timedelta(days=365)
indexer = X[X['date'] >= str(first_day_of_last_year).split(' ')[0]].index
X_train = X[X['date'] >= str(first_day_of_last_year).split(' ')[0]]
y_train = y.loc[indexer]

In [None]:
base_pipeline = PipelineLinearV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr'])
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
modelling_pipeline.fit(X_train, y_train)

In [None]:
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')
submission['sales'] = modelling_pipeline.predict(test_data)

In [None]:
submission.to_csv('./data/kaggle/store-sales-time-series-forecasting/linreg_dcoilwtico_lag_sixteen.csv', index = False)

# 8. Ridge Regression with 'store_nbr', 'dcoilwtico' and 'lag' feature (16 days), params: alpha

In [None]:
X = train.copy().drop(columns='onpromotion')
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
dates_test_data = test_data['date'].unique()
all_data = pd.concat([X, test_data])
all_data['lag_16'] = all_data.groupby(['store_nbr', 'family'])['sales'].shift(16)

test_data['lag_16'] = all_data[all_data['date'].isin(dates_test_data)]['lag_16']
X['lag_16'] = X.groupby(['store_nbr', 'family'])['sales'].shift(16)

In [None]:
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(X)[0]

In [None]:
y = X['sales'].copy()
X = X.drop(columns='sales')
inds = X.loc[pd.isna(X['dcoilwtico']), :].index
X = X[X['dcoilwtico'].notna()]
X = X.reset_index(drop=True)
y = y.drop(labels=inds)
y = y.reset_index(drop=True)
X.head()

In [None]:
hyper_params = {'alpha': np.array([0.01, 0.1, 1, 10, 25, 50, 100])}

In [None]:
# Manual grid search over the Ridge penalty: cross-validate one SplitPipeline per alpha
# and print its test scores, separated by two blank lines.
splitter = DateTimeSeriesSplit()
for alpha in hyper_params['alpha']:
    base_pipeline = PipelineRidgeV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr'], best_params=alpha)
    modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

    scores = cross_validate(
        modelling_pipeline, X, y,
        cv=splitter, scoring=CV_METRICS, return_estimator=True)

    print('alpha: ', alpha)
    print_cv_test_scores(scores)
    print()
    print()

In [None]:
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]
test_data.head()

In [None]:
first_day_of_last_year = pd.to_datetime(X['date'].unique()[-1]) - timedelta(days=365)
indexer = X[X['date'] >= str(first_day_of_last_year).split(' ')[0]].index
X_train = X[X['date'] >= str(first_day_of_last_year).split(' ')[0]]
y_train = y.loc[indexer]

In [None]:
base_pipeline = PipelineRidgeV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr'], best_params=25)
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
modelling_pipeline.fit(X_train, y_train)

In [None]:
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')
submission['sales'] = modelling_pipeline.predict(test_data)

In [None]:
submission.to_csv('./data/kaggle/store-sales-time-series-forecasting/ridgereg_dcoilwtico_lag_sixteen.csv', index = False)

# 9. Lasso Regression with 'store_nbr', 'dcoilwtico' and 'lag' feature (16 days), params: alpha

In [None]:
X = train.copy().drop(columns='onpromotion')
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
dates_test_data = test_data['date'].unique()
all_data = pd.concat([X, test_data])
all_data['lag_16'] = all_data.groupby(['store_nbr', 'family'])['sales'].shift(16)

test_data['lag_16'] = all_data[all_data['date'].isin(dates_test_data)]['lag_16']
X['lag_16'] = X.groupby(['store_nbr', 'family'])['sales'].shift(16)

train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(X)[0]

y = X['sales'].copy()
X = X.drop(columns='sales')
inds = X.loc[pd.isna(X['dcoilwtico']), :].index
X = X[X['dcoilwtico'].notna()]
X = X.reset_index(drop=True)
y = y.drop(labels=inds)
y = y.reset_index(drop=True)
X.head()

In [None]:
hyper_params = {'alpha': np.array([0.001, 0.01, 0.1, 1])}

In [None]:
# Manual grid search over the Lasso penalty: cross-validate one SplitPipeline per alpha
# and print its test scores, separated by two blank lines.
splitter = DateTimeSeriesSplit()
for alpha in hyper_params['alpha']:
    base_pipeline = PipelineLassoV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr'], best_params=alpha)
    modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

    scores = cross_validate(
        modelling_pipeline, X, y,
        cv=splitter, scoring=CV_METRICS, return_estimator=True)

    print('alpha: ', alpha)
    print_cv_test_scores(scores)
    print()
    print()

In [None]:
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]
test_data.head()

In [None]:
first_day_of_last_year = pd.to_datetime(X['date'].unique()[-1]) - timedelta(days=365)
indexer = X[X['date'] >= str(first_day_of_last_year).split(' ')[0]].index
X_train = X[X['date'] >= str(first_day_of_last_year).split(' ')[0]]
y_train = y.loc[indexer]

In [None]:
base_pipeline = PipelineLassoV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr'], best_params=0.001)
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
modelling_pipeline.fit(X_train, y_train)

In [None]:
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')
submission['sales'] = modelling_pipeline.predict(test_data)

In [None]:
submission.to_csv('./data/kaggle/store-sales-time-series-forecasting/lassoreg_dcoilwtico_lag_sixteen.csv', index = False)

# 10. ElasticNet Regression with 'store_nbr', 'dcoilwtico' and 'lag' feature (16 days), params: alpha, l1_ratio

In [None]:
X = train.copy().drop(columns='onpromotion')
test_data = pd.read_csv(DATA_ROOT / 'test.csv').drop(columns=['onpromotion'])
dates_test_data = test_data['date'].unique()
all_data = pd.concat([X, test_data])
all_data['lag_16'] = all_data.groupby(['store_nbr', 'family'])['sales'].shift(16)

test_data['lag_16'] = all_data[all_data['date'].isin(dates_test_data)]['lag_16']
X['lag_16'] = X.groupby(['store_nbr', 'family'])['sales'].shift(16)

train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(X)[0]

y = X['sales'].copy()
X = X.drop(columns='sales')
inds = X.loc[pd.isna(X['dcoilwtico']), :].index
X = X[X['dcoilwtico'].notna()]
X = X.reset_index(drop=True)
y = y.drop(labels=inds)
y = y.reset_index(drop=True)
X.head()

In [None]:
hyper_params = {'alpha': np.array([0.001, 0.01, 0.1, 1]), 'l1_ratio': np.array([0.05, 0.1, 0.15, 0.2, 0.3])}

In [None]:
# Manual grid search for ElasticNet: cross-validate one SplitPipeline per (alpha, l1_ratio)
# pair and print its test scores, separated by two blank lines.
splitter = DateTimeSeriesSplit()
for alpha in hyper_params['alpha']:
    for l1_ratio in hyper_params['l1_ratio']:
        base_pipeline = PipelineElasticNetV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr'], alpha=alpha, l1_ratio=l1_ratio)
        modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

        scores = cross_validate(
            modelling_pipeline, X, y,
            cv=splitter, scoring=CV_METRICS, return_estimator=True)

        print('alpha: ', alpha, 'l1_ratio: ', l1_ratio)
        print_cv_test_scores(scores)
        print()
        print()

In [None]:
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]
test_data.head()

In [None]:
first_day_of_last_year = pd.to_datetime(X['date'].unique()[-1]) - timedelta(days=365)
indexer = X[X['date'] >= str(first_day_of_last_year).split(' ')[0]].index
X_train = X[X['date'] >= str(first_day_of_last_year).split(' ')[0]]
y_train = y.loc[indexer]

In [None]:
base_pipeline = PipelineElasticNetV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr'], alpha=0.001, l1_ratio=0.15)
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
modelling_pipeline.fit(X_train, y_train)

In [None]:
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')
submission['sales'] = modelling_pipeline.predict(test_data)

In [None]:
submission.to_csv('./data/kaggle/store-sales-time-series-forecasting/elasticnet_dcoilwtico_lag_sixteen.csv', index = False)

# 11. Linear Regression with 'store_nbr', 'dcoilwtico', 'onpromotion' and 'lag' features (16 days)

In [None]:
X = train.copy()
test_data = pd.read_csv(DATA_ROOT / 'test.csv')
dates_test_data = test_data['date'].unique()
all_data = pd.concat([X, test_data])
all_data['lag_16'] = all_data.groupby(['store_nbr', 'family'])['sales'].shift(16)

In [None]:
test_data['lag_16'] = all_data[all_data['date'].isin(dates_test_data)]['lag_16']
X['lag_16'] = X.groupby(['store_nbr', 'family'])['sales'].shift(16)

In [None]:
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(X)[0]

In [None]:
families_with_promo = ['beauty', 'beverages', 'home and kitchen ii', 'home care', 'produce', 'school and office supplies']

In [None]:
y = X['sales'].copy()
X = X.drop(columns='sales')
inds = X.loc[pd.isna(X['dcoilwtico']), :].index
X = X[X['dcoilwtico'].notna()]
X = X.reset_index(drop=True)
y = y.drop(labels=inds)
y = y.reset_index(drop=True)

In [None]:
# Subset of rows (and matching targets) whose family is listed in families_with_promo.
is_promo_family = X['family'].isin(families_with_promo)
X_with_promo = X[is_promo_family]
indices_promo = X_with_promo.index
y_with_promo = y.loc[indices_promo]

In [None]:
splitter = DateTimeSeriesSplit()
base_pipeline = PipelineLinearV1(num_columns=['dcoilwtico', 'onpromotion'], cat_columns=['store_nbr'])
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
scores = cross_validate(
        modelling_pipeline, X_with_promo, y_with_promo,
        cv=splitter, scoring=CV_METRICS, return_estimator=True)

In [None]:
print_cv_test_scores(scores)

In [None]:
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]
test_data_with_promo_indices = test_data[test_data['family'].isin(families_with_promo)].index
test_data_with_promo = test_data[test_data['family'].isin(families_with_promo)]
test_data_with_promo

In [None]:
# Training window for the promo-family model: rows dated on or after the cut-off that lies
# 365 days before the last entry of X_with_promo['date'].unique().
last_date = pd.to_datetime(X_with_promo['date'].unique()[-1])
first_day_of_last_year = last_date - timedelta(days=365)
# Only the date part of the timestamp string is compared against the 'date' column.
cutoff = str(first_day_of_last_year).split(' ')[0]
X_train_with_promo = X_with_promo[X_with_promo['date'] >= cutoff]
indexer = X_train_with_promo.index
y_train_with_promo = y_with_promo.loc[indexer]

In [None]:
base_pipeline = PipelineLinearV1(num_columns=['dcoilwtico', 'onpromotion'], cat_columns=['store_nbr'])
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
modelling_pipeline.fit(X_train_with_promo, y_train_with_promo)

In [None]:
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')

In [None]:
submission.loc[test_data_with_promo_indices, 'sales'] = modelling_pipeline.predict(test_data_with_promo)

In [None]:
# Complement of the promo subset: rows (and matching targets) of all remaining families.
is_other_family = ~X['family'].isin(families_with_promo)
X_without_promo = X[is_other_family]
indices_without_promo = X_without_promo.index
y_without_promo = y.loc[indices_without_promo]

In [None]:
splitter = DateTimeSeriesSplit()
base_pipeline = PipelineLinearV1(num_columns=['dcoilwtico'], cat_columns=['store_nbr'])
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
scores = cross_validate(
        modelling_pipeline, X_without_promo, y_without_promo,
        cv=splitter, scoring=CV_METRICS, return_estimator=True)

In [None]:
print_cv_test_scores(scores)

In [None]:
# Test rows of the families that are not in families_with_promo, plus their index.
test_without_promo_mask = ~test_data['family'].isin(families_with_promo)
test_data_without_promo = test_data[test_without_promo_mask]
test_data_without_promo_indices = test_data_without_promo.index
test_data_without_promo

In [None]:
# Same last-365-days window as for the promo families (reuses
# first_day_of_last_year), applied to the non-promo subset.
window_start = str(first_day_of_last_year).split(' ')[0]
X_train_without_promo = X_without_promo[X_without_promo['date'] >= window_start]
indexer = X_train_without_promo.index
y_train_without_promo = y_without_promo.loc[indexer]

In [None]:
# Fresh (unfitted) pipeline for the final fit on the non-promo families.
base_pipeline = PipelineLinearV1(
    num_columns=['dcoilwtico'],
    cat_columns=['store_nbr'],
)
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
# Fit on the last-365-days window of the non-promo data.
modelling_pipeline.fit(X_train_without_promo, y_train_without_promo)

In [None]:
# Fill in the remaining (non-promo) rows of the submission.
# NOTE(review): relies on `submission` and `test_data` sharing the same row index — confirm.
submission.loc[test_data_without_promo_indices, 'sales'] = modelling_pipeline.predict(test_data_without_promo)

In [None]:
# Persist the submission; index=False keeps the row index out of the CSV.
submission.to_csv(
    './data/kaggle/store-sales-time-series-forecasting/linreg_dcoilwtico_lag_sixteen_promo.csv',
    index=False,
)

# 12. Linear Regression with 'store_nbr', 'dcoilwtico', 'store_city' and 'lag' feature (16 days)

In [None]:
# Work on a copy of the raw training data without 'onpromotion' and run it
# through the ETL step; element [0] of the transform result is the frame used below.
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train.copy().drop(columns='onpromotion')
X = train_transformer.transform(X)[0]

In [None]:
# Store metadata: drop 'type', 'state' and 'cluster', expose 'city' as
# 'store_city', and attach it to every training row via a left join on store_nbr.
stores_data = (
    pd.read_csv(DATA_ROOT / 'stores.csv')
    .drop(columns=['type', 'state', 'cluster'])
    .rename(columns={'city': 'store_city'})
)
X = X.merge(stores_data, on='store_nbr', how='left')

In [None]:
# Split off the target, then discard every row without an oil price from both
# X and y and renumber them so they stay aligned.
y = X['sales'].copy()
X = X.drop(columns='sales')

oil_missing = X['dcoilwtico'].isna()
inds = X.index[oil_missing]
X = X.loc[~oil_missing].reset_index(drop=True)
y = y.drop(labels=inds).reset_index(drop=True)
X.head()

In [None]:
# CV setup for experiment 12: the store's city joins the store id as a
# categorical feature; the oil price is the only numeric feature.
splitter = DateTimeSeriesSplit()
base_pipeline = PipelineLinearV1(
    num_columns=['dcoilwtico'],
    cat_columns=['store_nbr', 'store_city'],
)
modelling_pipeline = SplitPipeline(base_pipeline=base_pipeline)

In [None]:
# Cross-validate on the full data set; fitted fold estimators are kept
# in the result (return_estimator=True).
scores = cross_validate(
    modelling_pipeline,
    X,
    y,
    cv=splitter,
    scoring=CV_METRICS,
    return_estimator=True,
)

In [None]:
# Report the test scores of the 'store_city' experiment.
print_cv_test_scores(scores)

# 13. Linear Regression with 'store_nbr', 'dcoilwtico', **'onpromotion'** and 'lag' feature (1 day)

In [None]:
# Experiment 13 starts again from a copy of the raw training data (this time
# keeping 'onpromotion'); element [0] of the ETL result is the frame used below.
train_transformer = ETLTransformer(date_column='date', id_column='id')
X = train_transformer.transform(train.copy())[0]

In [None]:
# 1-step lag of sales inside every (store, family) series.
# NOTE(review): shift() takes the previous ROW of the group, so this is
# "yesterday's sales" only if rows are ordered by date — confirm.
X['lag_1'] = X.groupby(['store_nbr', 'family'])['sales'].shift()

# Separate the target from the features.
y = X['sales'].copy()
X = X.drop(columns='sales')

# Remove rows without an oil price from X and y alike, then renumber both.
oil_missing = X['dcoilwtico'].isna()
inds = X.index[oil_missing]
X = X.loc[~oil_missing].reset_index(drop=True)
y = y.drop(labels=inds).reset_index(drop=True)

In [None]:
# One-hot encode the store id (first level dropped as the reference category).
X = pd.get_dummies(X, columns=['store_nbr'], drop_first=True)

# Re-append 'lag_1' so that it ends up as the last column, after the dummies.
lag_1 = X.pop('lag_1')
X['lag_1'] = lag_1
X.head()

In [None]:
# Families modelled WITH the 'onpromotion' feature; every other family present
# in X is modelled without it.
families_with_promo = [
    'beauty',
    'beverages',
    'home and kitchen ii',
    'home care',
    'produce',
    'school and office supplies',
]
families_without_promo = [
    family
    for family in X['family'].unique()
    if family not in families_with_promo
]

In [None]:
# Row-count based splitter for the promo subset: 4 folds, each testing on 16
# "days" and training on at most 365 "days".
# NOTE(review): assumes every date contributes exactly one row per
# (promo family, store) pair so that row counts map to whole days — confirm.
n_stores = train['store_nbr'].nunique()
rows_per_day_promo = len(families_with_promo) * n_stores
tscv_promo = TimeSeriesSplit(
    gap=0,
    max_train_size=365 * rows_per_day_promo,
    n_splits=4,
    test_size=16 * rows_per_day_promo,
)

In [None]:
# Independent copies of the promo-family features and their targets.
promo_mask = X['family'].isin(families_with_promo)
indices_promo = X[promo_mask].index
X_with_promo = X[promo_mask].copy()
y_with_promo = y.loc[indices_promo].copy()

In [None]:
# Manual time-series cross-validation for the promo families.
# Per fold and per family: fit MinMax scalers and a PositiveRegressor(LinearRegression)
# on the training rows, then forecast the test window recursively - the clipped
# predictions of one day become the 'lag_1' feature of the next day, so true
# test-period sales are only used (via 'lag_1') on the first test day.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

for train_indices, test_indices in tscv_promo.split(X_with_promo, y_with_promo):
    X_train = X_with_promo.iloc[train_indices]
    y_train = y_with_promo.iloc[train_indices]
    # Explicit copy: the fold receives a scratch 'pred' column and must never
    # alias X_with_promo (this is what SettingWithCopyWarning would flag).
    X_test = X_with_promo.iloc[test_indices].copy()
    y_test = y_with_promo.iloc[test_indices]

    # Float placeholder, so storing float predictions does not change the column dtype.
    X_test['pred'] = 0.0
    for current_family in families_with_promo:
        # --- training rows of this family ---
        train_family_mask = X_train['family'] == current_family
        current_family_indices_train = X_train.index[train_family_mask]
        X_train_current_family = X_train[train_family_mask].drop(columns=['family'])
        y_train_current_family = y_train.loc[current_family_indices_train]

        # Scalers are fitted on the training rows only and reused for the test rows.
        scaler_oil = MinMaxScaler()
        scaler_promo = MinMaxScaler()
        X_train_current_family[['dcoilwtico']] = scaler_oil.fit_transform(X_train_current_family[['dcoilwtico']])
        X_train_current_family[['onpromotion']] = scaler_promo.fit_transform(X_train_current_family[['onpromotion']])
        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

        # --- test rows of this family ---
        test_family_mask = X_test['family'] == current_family
        current_family_indices_test = X_test.index[test_family_mask]
        X_test_current_family = X_test[test_family_mask].drop(columns=['family'])
        X_test_current_family[['dcoilwtico']] = scaler_oil.transform(X_test_current_family[['dcoilwtico']])
        X_test_current_family[['onpromotion']] = scaler_promo.transform(X_test_current_family[['onpromotion']])

        # Recursive forecast: walk over consecutive day pairs and overwrite the
        # next day's 'lag_1' with the prediction made for the previous day.
        # NOTE(review): the positional assignment assumes every day holds the
        # same stores in the same row order - confirm.
        test_days = X_test_current_family['date'].unique()
        for previous_day, current_day in zip(test_days[:-1], test_days[1:]):
            X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == previous_day].drop(columns=['pred', 'date'])
            predictions = model.predict(X_test_for_current_day)
            predictions[predictions < 0] = 0  # sales cannot be negative
            X_test_current_family.loc[X_test_current_family['date'] == current_day, 'lag_1'] = predictions

        # Final prediction for the whole window, now based on the recursive lags.
        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        y_pred_current_family[y_pred_current_family < 0] = 0
        X_test.loc[current_family_indices_test, 'pred'] = y_pred_current_family

    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])
    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
    # flag is deprecated in scikit-learn 1.4 and removed in 1.6, while this
    # form works on every version (and mirrors the RMSLE line above).
    scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

# Mean ± sample standard deviation of each metric over the folds.
for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# Row-count based splitter for the non-promo subset: 4 folds, each testing on
# 16 "days" and training on at most 365 "days".
# NOTE(review): assumes every date contributes exactly one row per
# (non-promo family, store) pair so that row counts map to whole days — confirm.
n_stores = len(train['store_nbr'].unique())
rows_per_day_without_promo = len(families_without_promo) * n_stores
tscv_without_promo = TimeSeriesSplit(
    gap=0,
    max_train_size=365 * rows_per_day_without_promo,
    n_splits=4,
    test_size=16 * rows_per_day_without_promo,
)

In [None]:
# Non-promo families: independent copies of features (without the
# 'onpromotion' column) and targets.
without_promo_mask = X['family'].isin(families_without_promo)
indices_without_promo = X[without_promo_mask].index
X_without_promo = X[without_promo_mask].copy().drop(columns=['onpromotion'])
y_without_promo = y.loc[indices_without_promo].copy()

In [None]:
# Manual time-series cross-validation for the families without promotions.
# Per fold and per family: fit a MinMax scaler for the oil price and a
# PositiveRegressor(LinearRegression) on the training rows, then forecast the
# test window recursively - the clipped predictions of one day become the
# 'lag_1' feature of the next day, so true test-period sales are only used
# (via 'lag_1') on the first test day.
scores = {'RMSLE': [], 'RMSE': [], 'MAE': [], 'R2': []}

for train_indices, test_indices in tscv_without_promo.split(X_without_promo, y_without_promo):
    X_train = X_without_promo.iloc[train_indices]
    y_train = y_without_promo.iloc[train_indices]
    # Explicit copy: the fold receives a scratch 'pred' column and must never
    # alias X_without_promo (this is what SettingWithCopyWarning would flag).
    X_test = X_without_promo.iloc[test_indices].copy()
    y_test = y_without_promo.iloc[test_indices]

    # Float placeholder, so storing float predictions does not change the column dtype.
    X_test['pred'] = 0.0
    for current_family in families_without_promo:
        # --- training rows of this family ---
        train_family_mask = X_train['family'] == current_family
        current_family_indices_train = X_train.index[train_family_mask]
        X_train_current_family = X_train[train_family_mask].drop(columns=['family'])
        y_train_current_family = y_train.loc[current_family_indices_train]

        # The scaler is fitted on the training rows only and reused for the test rows.
        scaler = MinMaxScaler()
        X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])
        model = PositiveRegressor(LinearRegression())
        model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

        # --- test rows of this family ---
        test_family_mask = X_test['family'] == current_family
        current_family_indices_test = X_test.index[test_family_mask]
        X_test_current_family = X_test[test_family_mask].drop(columns=['family'])
        X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])

        # Recursive forecast: walk over consecutive day pairs and overwrite the
        # next day's 'lag_1' with the prediction made for the previous day.
        # NOTE(review): the positional assignment assumes every day holds the
        # same stores in the same row order - confirm.
        test_days = X_test_current_family['date'].unique()
        for previous_day, current_day in zip(test_days[:-1], test_days[1:]):
            X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == previous_day].drop(columns=['date', 'pred'])
            predictions = model.predict(X_test_for_current_day)
            predictions[predictions < 0] = 0  # sales cannot be negative
            X_test_current_family.loc[X_test_current_family['date'] == current_day, 'lag_1'] = predictions

        # Final prediction for the whole window, now based on the recursive lags.
        y_pred_current_family = model.predict(X_test_current_family.drop(columns=['date', 'pred']))
        y_pred_current_family[y_pred_current_family < 0] = 0
        X_test.loc[current_family_indices_test, 'pred'] = y_pred_current_family

    y_pred = X_test['pred'].copy()
    X_test = X_test.drop(columns=['pred'])
    scores['RMSLE'].append(np.sqrt(mean_squared_log_error(y_test, y_pred)))
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
    # flag is deprecated in scikit-learn 1.4 and removed in 1.6, while this
    # form works on every version (and mirrors the RMSLE line above).
    scores['RMSE'].append(np.sqrt(mean_squared_error(y_test, y_pred)))
    scores['MAE'].append(mean_absolute_error(y_test, y_pred))
    scores['R2'].append(r2_score(y_test, y_pred))

# Mean ± sample standard deviation of each metric over the folds.
for metric_name, metric_values in scores.items():
    print(f'{metric_name}: {mean(metric_values):.3f} ± {stdev(metric_values):.3f}')

In [None]:
# Final training window for the promo families: the last 365 days, counted
# back from the last date present in X.
last_available_day = pd.to_datetime(X['date'].unique()[-1])
first_day_of_last_year = last_available_day - timedelta(days=365)

# Date-only string ('YYYY-MM-DD') used as the lower bound of the window.
window_start = str(first_day_of_last_year).split(' ')[0]
X_train_promo = X_with_promo[X_with_promo['date'] >= window_start]
indexer = X_train_promo.index
y_train_promo = y_with_promo.loc[indexer]

In [None]:
# Prepare the test set the same way as the training data (ETL, one-hot encoded
# store id, 'lag_1' as the last column) and seed the first test day's lag.
test_data = pd.read_csv(DATA_ROOT / 'test.csv')
test_transformer = ETLTransformer(date_column='date', id_column='id')
test_data = test_transformer.transform(test_data)[0]
test_data = pd.get_dummies(test_data, columns=['store_nbr'], drop_first=True)
# Float placeholder: the column later receives float sales / predictions, so
# starting with an int column would force a dtype change on assignment.
test_data['lag_1'] = 0.0

# Explicit copy: 'lag_1' of this subset is modified below, which must not be a
# write into a slice of test_data (what SettingWithCopyWarning would flag).
test_data_with_promo = test_data[test_data['family'].isin(families_with_promo)].copy()

# The lag of the first test day is the true sales of the last training day.
# NOTE(review): values are assigned by position (.tolist()), which assumes the
# last training day and the first test day list the same (store, family) rows
# in the same order - confirm.
first_test_day = test_data_with_promo['date'].unique()[0]
last_train_day = X_train_promo['date'].unique()[-1]
last_train_day_sales = y_train_promo.loc[X_train_promo['date'] == last_train_day].tolist()
test_data_with_promo.loc[test_data_with_promo['date'] == first_test_day, 'lag_1'] = last_train_day_sales

In [None]:
# Reload the sample submission; its 'sales' column is filled per family by the loops below.
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')

In [None]:
# Final models for the promo families: fit one model per family on the
# last-365-days window, forecast the test period recursively (each day's
# clipped prediction feeds the next day's 'lag_1') and store the result in
# `submission`.
for current_family in families_with_promo:
    # --- training rows of this family ---
    train_family_mask = X_train_promo['family'] == current_family
    X_train_current_family = X_train_promo[train_family_mask].drop(columns=['family'])
    current_family_indices_train = X_train_promo[train_family_mask].index
    y_train_current_family = y_train_promo.loc[current_family_indices_train]

    # Scalers are fitted on the training rows only and reused for the test rows.
    scaler_oil, scaler_promo = MinMaxScaler(), MinMaxScaler()
    X_train_current_family[['dcoilwtico']] = scaler_oil.fit_transform(X_train_current_family[['dcoilwtico']])
    X_train_current_family[['onpromotion']] = scaler_promo.fit_transform(X_train_current_family[['onpromotion']])

    model = PositiveRegressor(LinearRegression())
    model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

    # --- test rows of this family ---
    test_family_mask = test_data_with_promo['family'] == current_family
    test_indices = test_data_with_promo[test_family_mask].index
    X_test_current_family = test_data_with_promo[test_family_mask].drop(columns=['family'])
    X_test_current_family[['dcoilwtico']] = scaler_oil.transform(X_test_current_family[['dcoilwtico']])
    X_test_current_family[['onpromotion']] = scaler_promo.transform(X_test_current_family[['onpromotion']])

    # Walk over consecutive day pairs: predict the earlier day and use the
    # (non-negative) result as 'lag_1' of the later day.
    test_days = X_test_current_family['date'].unique()
    for previous_day, current_day in zip(test_days[:-1], test_days[1:]):
        X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == previous_day].drop(columns=['date'])
        predictions = model.predict(X_test_for_current_day)
        predictions[predictions < 0] = 0
        current_day_rows = X_test_current_family[X_test_current_family['date'] == current_day].index
        X_test_current_family.loc[current_day_rows, 'lag_1'] = predictions

    # Predict the full window with the recursively filled lags and clip at zero.
    X_test_current_family = X_test_current_family.drop(columns=['date'])
    y_pred_current_family = model.predict(X_test_current_family)
    y_pred_current_family[y_pred_current_family < 0] = 0

    submission.loc[test_indices, 'sales'] = y_pred_current_family

In [None]:
# Last-365-days training window for the non-promo families (reuses
# first_day_of_last_year).
window_start = str(first_day_of_last_year).split(' ')[0]
X_train_without_promo = X_without_promo[X_without_promo['date'] >= window_start]
indexer = X_train_without_promo.index
y_train_without_promo = y_without_promo.loc[indexer]

In [None]:
# Test rows of the non-promo families, without the 'onpromotion' column.
without_promo_rows = test_data['family'].isin(families_without_promo)
test_data_without_promo = test_data[without_promo_rows].drop(columns=['onpromotion'])

# Seed 'lag_1' of the first test day with the sales of the last training day.
# NOTE(review): assigned by position (.tolist()) — assumes both days list the
# same (store, family) rows in the same order; confirm.
first_test_day = test_data_without_promo['date'].unique()[0]
first_test_day_rows = test_data_without_promo[test_data_without_promo['date'] == first_test_day].index
last_train_day = X_train_without_promo['date'].unique()[-1]
test_data_without_promo.loc[first_test_day_rows, 'lag_1'] = y_train_without_promo.loc[X_train_without_promo['date'] == last_train_day].tolist()

In [None]:
# Final models for the non-promo families: fit one model per family on the
# last-365-days window, forecast the test period recursively (each day's
# clipped prediction feeds the next day's 'lag_1') and store the result in
# `submission`.
for current_family in families_without_promo:
    # --- training rows of this family ---
    train_family_mask = X_train_without_promo['family'] == current_family
    X_train_current_family = X_train_without_promo[train_family_mask].drop(columns=['family'])
    current_family_indices_train = X_train_without_promo[train_family_mask].index
    y_train_current_family = y_train_without_promo.loc[current_family_indices_train]

    # The scaler is fitted on the training rows only and reused for the test rows.
    scaler = MinMaxScaler()
    X_train_current_family[['dcoilwtico']] = scaler.fit_transform(X_train_current_family[['dcoilwtico']])
    model = PositiveRegressor(LinearRegression())
    model.fit(X_train_current_family.drop(columns=['date']), y_train_current_family)

    # --- test rows of this family ---
    test_family_mask = test_data_without_promo['family'] == current_family
    test_indices = test_data_without_promo[test_family_mask].index
    X_test_current_family = test_data_without_promo[test_family_mask].drop(columns=['family'])
    X_test_current_family[['dcoilwtico']] = scaler.transform(X_test_current_family[['dcoilwtico']])

    # Walk over consecutive day pairs: predict the earlier day and use the
    # (non-negative) result as 'lag_1' of the later day.
    test_days = X_test_current_family['date'].unique()
    for previous_day, current_day in zip(test_days[:-1], test_days[1:]):
        X_test_for_current_day = X_test_current_family[X_test_current_family['date'] == previous_day].drop(columns=['date'])
        predictions = model.predict(X_test_for_current_day)
        predictions[predictions < 0] = 0
        current_day_rows = X_test_current_family[X_test_current_family['date'] == current_day].index
        X_test_current_family.loc[current_day_rows, 'lag_1'] = predictions

    # Predict the full window with the recursively filled lags and clip at zero.
    X_test_current_family = X_test_current_family.drop(columns=['date'])
    y_pred_current_family = model.predict(X_test_current_family)
    y_pred_current_family[y_pred_current_family < 0] = 0

    submission.loc[test_indices, 'sales'] = y_pred_current_family

In [None]:
# Persist the submission; index=False keeps the row index out of the CSV.
submission.to_csv(
    './data/kaggle/store-sales-time-series-forecasting/linreg_dcoilwtico_lag_one_and_promo.csv',
    index=False,
)