# Experiments with linear models for each family

In [None]:
import os
# Work from the project root so the relative data paths used in this notebook resolve.
# Raises KeyError if the PROJECT_ROOT environment variable is not set.
os.chdir(os.environ['PROJECT_ROOT'])

In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import sklearn
import pdpipe as pdp
from datetime import timedelta
from statistics import median
from pdpipe import df
from pathlib import Path
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, cross_val_score, TimeSeriesSplit, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

%matplotlib inline
# Silence every FutureWarning for the rest of the session.
# NOTE: this also hides deprecation notices emitted by the libraries used below.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Directory (relative to the working directory) holding the competition CSV files.
DATA_ROOT = Path('data') / 'kaggle' / 'store-sales-time-series-forecasting'

In [None]:
# Load the prepared training table. low_memory=False makes pandas read the file in one
# pass rather than in chunks, avoiding mixed-type column inference.
train = pd.read_csv(DATA_ROOT / 'prepared_train.csv', low_memory=False)
# Trailing expression: displays the first rows in the notebook.
train.head()

In [None]:
# Keep the grouping key (family), the features and the target (sales) in one frame.
X = train.loc[:, ['store_nbr', 'family', 'onpromotion', 'dcoilwtico', 'sales']]
print(X.shape)

In [None]:
categorical_columns = ['store_nbr', 'family']
# Numerical features are whatever is left once the categoricals and the target are removed.
numerical_columns = X.columns.drop([*categorical_columns, 'sales'])

In [None]:
# Preprocessing pipeline: min-max scaling followed by one-hot encoding of 'store_nbr'.
# 'family' is not encoded here; elsewhere in this notebook it is used to split the data
# into one model per family and is dropped before fitting.
# NOTE(review): the meaning of the second positional argument of pdp.Scale appears to
# differ between pdpipe releases (columns to scale vs. columns to exclude) — confirm
# against the installed version that `numerical_columns` is what gets scaled.
transform_pipeline = pdp.PdPipeline([
        pdp.Scale('MinMaxScaler', numerical_columns),
        pdp.OneHotEncode('store_nbr'),
    ])

In [None]:
# Transform the training frame with the pipeline.
# NOTE(review): presumably this first application also fits the stages (scaler range,
# one-hot categories) that later calls reuse on the test data — confirm in the pdpipe docs.
X = transform_pipeline.apply(X)
# Trailing expression: displays the shape after encoding.
X.shape

In [None]:
# Time-ordered cross-validation splitter shared by all experiments below.
# Sizes are expressed in rows; presumably 54 is the number of stores, which would make
# each training window at most 365 days and each validation window 15 days of a single
# family — TODO confirm against the data.
ts_cv = TimeSeriesSplit(
    n_splits=4,
    gap=0,
    max_train_size=365*54,
    test_size=15*54,
)

In [None]:
# Oil price table; it is joined onto the test rows by 'date' further down.
oil_data = pd.read_csv(DATA_ROOT / 'oil.csv')

In [None]:
# Build the test feature table: attach oil prices, order rows, then apply the same
# preprocessing pipeline that was used on the training frame.
test_data = pd.read_csv(DATA_ROOT / 'test.csv', index_col='id')
# NOTE: a column-on-column merge ignores the existing index, so the 'id' index is not
# carried over; from here on rows are addressed by a fresh positional index.
test_data = test_data.merge(oil_data, on='date', how='left')
# Dates without an oil price inherit the last known one. Series.ffill() replaces the
# deprecated fillna(method='ffill') and behaves identically.
test_data['dcoilwtico'] = test_data['dcoilwtico'].ffill()
test_data = test_data.sort_values(by=['date', 'store_nbr'], ascending=True, ignore_index=True)

# Untransformed copy (still containing 'date') kept for the later experiments.
test_data_copy = test_data.copy()

test_data = test_data.drop('date', axis=1)
test_data = transform_pipeline(test_data)
# Lower-case the family labels; presumably this matches the labels in the prepared
# training table — verify.
test_data['family'] = test_data['family'].str.lower()
# Trailing expression: displays the resulting shape.
test_data.shape

In [None]:
# Sample submission used as the output frame; its 'sales' column is overwritten with
# model predictions by the experiments below.
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')

In [None]:
def training_algorithm(X, test_data, model=None):
    """Fit one regressor per product family, print validation metrics, fill ``submission``.

    For every distinct value of ``X['family']`` the rows of that family are used to

    1. score an unfitted clone of ``model`` on the last split produced by the
       notebook-level ``ts_cv`` (negative validation predictions are set to 0), and
    2. fit ``model`` on all rows of the family and write its predictions for the
       matching ``test_data`` rows into the notebook-level ``submission['sales']``.

    Finally the median over families of each validation metric is printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Training table containing a ``family`` column, the ``sales`` target and the
        feature columns.
    test_data : pandas.DataFrame
        Test table containing a ``family`` column and at least the feature columns of
        ``X``. Its index labels are used to address rows of ``submission``.
    model : scikit-learn estimator, optional
        Regressor to validate and fit. Defaults to a new ``LinearRegression``. It is
        fitted in place, so after the call it holds the fit of the last family.

    Raises
    ------
    KeyError
        If ``test_data`` lacks one of the feature columns present in ``X``.
    statistics.StatisticsError
        If ``X`` contains no family (nothing to take a median of).

    Notes
    -----
    Reads the globals ``ts_cv`` and ``submission`` and mutates ``submission``.
    Predictions written to ``submission`` are not clipped here.
    """
    # Local import keeps the function usable without touching the notebook's import cell.
    from sklearn.base import clone

    # A fresh default per call instead of one estimator instance shared by every call.
    if model is None:
        model = LinearRegression()

    scores_RMSLE = []
    scores_RMSE = []
    scores_MAE = []
    scores_MAPE = []
    scores_R2 = []
    for family in X['family'].unique():
        family_rows = X[X['family'] == family]
        X_train = family_rows.drop(columns=['sales', 'family'])
        y_train = family_rows['sales']

        # Only the last (most recent) split contributes to the reported scores, so it is
        # the only one that needs to be fitted.
        *_, (train_index, val_index) = ts_cv.split(X_train)
        # Validate the estimator that was actually requested, not a hard-coded one.
        model_cv = clone(model)
        model_cv.fit(X_train.iloc[train_index], y_train.iloc[train_index])
        y_val_cv = y_train.iloc[val_index]
        y_val_cv_pred = model_cv.predict(X_train.iloc[val_index])
        # Sales cannot be negative, and the log-based metric rejects negative values.
        y_val_cv_pred[y_val_cv_pred < 0] = 0

        scores_RMSLE.append(np.sqrt(mean_squared_log_error(y_val_cv, y_val_cv_pred)))
        # np.sqrt(MSE) instead of the deprecated `squared=False` argument.
        scores_RMSE.append(np.sqrt(mean_squared_error(y_val_cv, y_val_cv_pred)))
        scores_MAE.append(mean_absolute_error(y_val_cv, y_val_cv_pred))
        scores_MAPE.append(mean_absolute_percentage_error(y_val_cv, y_val_cv_pred))
        scores_R2.append(r2_score(y_val_cv, y_val_cv_pred))

        model.fit(X_train, y_train)
        X_test = test_data[test_data['family'] == family].drop('family', axis=1)
        # Feed the columns in exactly the order used for fitting; a test frame whose
        # columns are ordered differently would otherwise be rejected or mis-read.
        X_test = X_test[X_train.columns]
        submission.loc[X_test.index, 'sales'] = model.predict(X_test)
    print(f"Root Mean Squared Log Error: {median(scores_RMSLE)}\n"
          f"Root Mean Squared Error: {median(np.abs(scores_RMSE))}\n"
          f"Mean Absolute Error: {median(np.abs(scores_MAE))}\n"
          f"Mean Absolute Percentage Error: {median(np.abs(scores_MAPE))}\n"
          f"R-2: {median(scores_R2)}")

# Simple Linear Regression for each family

In [None]:
# Baseline run: default regressor, features as prepared above.
training_algorithm(X, test_data)

In [None]:
# Sales cannot be negative: floor the predictions at zero.
submission['sales'] = submission['sales'].clip(lower=0)

In [None]:
# Write the submission next to the competition data (same location as before, built from DATA_ROOT).
submission.to_csv(DATA_ROOT / 'linear_regression_for_every_family.csv', index=False)

# Linear Regression with is_holiday binary feature

In [None]:
def adding_is_holiday_feature(data):
    """Return a copy of ``data`` with ``holiday_transferred`` replaced by ``is_holiday``.

    ``is_holiday`` is 1 for every row whose ``holiday_transferred`` value is present
    (``True`` or ``False``) and 0 where it is missing. The new column is placed first;
    all other columns keep their order. ``data`` itself is not modified.

    Unlike a dummy-encoding round trip, this works when the input contains only
    holiday rows or only non-holiday rows.

    NOTE(review): a holiday flagged as transferred (``True``) is counted as a holiday
    just like a non-transferred one; this keeps the existing behaviour — confirm it is
    the intended definition.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``holiday_transferred`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with a leading integer ``is_holiday`` column and without
        ``holiday_transferred``.

    Raises
    ------
    KeyError
        If ``data`` has no ``holiday_transferred`` column.
    """
    data_copy = data.copy()
    # Presence of any value means a holiday record was attached to the row.
    is_holiday = data_copy.pop('holiday_transferred').notna().astype(int)
    data_copy.insert(0, 'is_holiday', is_holiday)
    return data_copy

In [None]:
# Slice of the raw training table that still carries the holiday marker.
train_data_copy = train.loc[:, ['store_nbr', 'family', 'onpromotion', 'dcoilwtico', 'sales', 'holiday_transferred']]

# Transformed features plus the holiday flag; values are attached by position (via a
# plain list), so X and train must have the same row order.
X_copy = X.assign(is_holiday=adding_is_holiday_feature(train_data_copy)['is_holiday'].tolist())

In [None]:
# Trailing expression: displays how many training rows fall on each is_holiday value.
X_copy['is_holiday'].value_counts()

In [None]:
# Holiday calendar; it is joined onto the test rows by 'date' in the following step.
holidays_events_data = pd.read_csv(DATA_ROOT / 'holidays_events.csv')
# Trailing expression: displays the first rows.
holidays_events_data.head()

In [None]:
# Left join keeps every test row and attaches holiday columns where the date matches.
# NOTE(review): if the holiday table has more than one row for a date in the test
# period, this join duplicates test rows and the row index no longer lines up with
# `submission` — verify that each test date matches at most one holiday row.
test_data_copy_with_holiday = test_data_copy.merge(holidays_events_data, on='date', how='left')

In [None]:
# Trailing expression: displays the first rows of the joined frame.
test_data_copy_with_holiday.head()

In [None]:
# Keep only the 'transferred' marker from the holiday table, under the column name
# that adding_is_holiday_feature expects.
test_data_copy_with_holiday = (
    test_data_copy_with_holiday
    .rename(columns={'transferred': 'holiday_transferred'})
    .drop(columns=['type', 'locale', 'locale_name', 'description'])
)
test_data_copy_with_holiday.head()

In [None]:
# Replace the holiday marker of the test rows with the is_holiday flag.
test_copy = adding_is_holiday_feature(test_data_copy_with_holiday)
# Row counts per is_holiday value, integer-divided by the number of distinct stores and
# families in train; presumably this turns row counts into day counts — TODO confirm.
test_copy['is_holiday'].value_counts() // train['store_nbr'].nunique() // train['family'].nunique()

In [None]:
# Same preparation as for the baseline test frame: no 'date', pipeline applied,
# family labels lower-cased.
test_copy = transform_pipeline(test_copy.drop(columns='date'))
test_copy['family'] = test_copy['family'].str.lower()
test_copy.shape

In [None]:
# Holiday experiment: same procedure, frames extended with the is_holiday flag.
training_algorithm(X_copy, test_copy)

In [None]:
# Sales cannot be negative: floor the predictions at zero.
submission['sales'] = submission['sales'].clip(lower=0)

In [None]:
# Write the submission next to the competition data (same location as before, built from DATA_ROOT).
submission.to_csv(DATA_ROOT / 'linear_regression_for_every_family_with_holiday.csv', index=False)

# Linear Regression with lag feature (1 day shift)

In [None]:
# Apply the pipeline to the test copy that still carries 'date'; the lag experiment
# below uses that column to walk the test period day by day.
test_data_copy = transform_pipeline(test_data_copy)
test_data_copy['family'] = test_data_copy['family'].str.lower()

In [None]:
# Parse the 'date' column into pandas datetimes.
test_data_copy['date'] = pd.to_datetime(test_data_copy['date'])

In [None]:
# Trailing expression: displays the first rows of the prepared frame.
test_data_copy.head()

In [None]:
# Per-family linear regression with a one-step sales lag ("Lag_1").
#
# Training: Lag_1 is the sales value `n_stores` rows earlier within the family.
# NOTE(review): this is a one-day lag only if each family's rows are ordered by date
# with exactly one row per store per day — TODO confirm against prepared_train.csv.
#
# Test: future sales are unknown, so Lag_1 is filled recursively. The first test day
# uses the last `n_stores` training sales; every later day uses the model's own
# prediction for the previous day.
#
# Reads the notebook globals X, train, ts_cv and test_data_copy and writes predictions
# into submission['sales'] (not clipped here).
n_stores = train['store_nbr'].nunique()
test_days = test_data_copy['date'].unique()

scores_RMSLE = []
scores_RMSE = []
scores_MAE = []
scores_MAPE = []
scores_R2 = []
for family in X['family'].unique():
    model = LinearRegression()
    # Work on an explicit copy: adding a column to / dropping rows from a filtered
    # slice of X triggers pandas' chained-assignment warnings.
    X_train = X[X['family'] == family].copy()
    X_train['Lag_1'] = X_train['sales'].shift(n_stores)
    # The first `n_stores` rows have no lag value and are discarded.
    X_train = X_train.dropna()
    y_train = X_train['sales']
    X_train = X_train.drop(columns=['sales', 'family'])
    feature_columns = X_train.columns

    # Only the last (most recent) split contributes to the reported scores, so it is
    # the only one that needs to be fitted.
    *_, (train_index, val_index) = ts_cv.split(X_train)
    model_cv = LinearRegression()
    model_cv.fit(X_train.iloc[train_index], y_train.iloc[train_index])
    y_val_cv = y_train.iloc[val_index]
    y_val_cv_pred = model_cv.predict(X_train.iloc[val_index])
    # Sales cannot be negative, and the log-based metric rejects negative values.
    y_val_cv_pred[y_val_cv_pred < 0] = 0
    scores_RMSLE.append(np.sqrt(mean_squared_log_error(y_val_cv, y_val_cv_pred)))
    # np.sqrt(MSE) instead of the deprecated `squared=False` argument.
    scores_RMSE.append(np.sqrt(mean_squared_error(y_val_cv, y_val_cv_pred)))
    scores_MAE.append(mean_absolute_error(y_val_cv, y_val_cv_pred))
    scores_MAPE.append(mean_absolute_percentage_error(y_val_cv, y_val_cv_pred))
    scores_R2.append(r2_score(y_val_cv, y_val_cv_pred))

    model.fit(X_train, y_train)

    X_current_test = test_data_copy[test_data_copy['family'] == family].drop('family', axis=1)
    # Float placeholder so the float lag values below do not change the column dtype.
    X_current_test['Lag_1'] = 0.0
    X_current_test.loc[X_current_test['date'] == test_days[0], 'Lag_1'] = y_train.tail(n_stores).tolist()
    for previous_day, day in zip(test_days[:-1], test_days[1:]):
        # Selecting `feature_columns` drops 'date' and fixes the column order used in fit.
        X_previous_day = X_current_test.loc[X_current_test['date'] == previous_day, feature_columns]
        # The prediction for the previous day becomes the lag of the current day.
        X_current_test.loc[X_current_test['date'] == day, 'Lag_1'] = model.predict(X_previous_day)

    # With every lag filled in, predict the whole test period for this family.
    X_current_test = X_current_test[feature_columns]
    submission.loc[X_current_test.index, 'sales'] = model.predict(X_current_test)
print(f"Root Mean Squared Log Error: {median(scores_RMSLE)}\n"
      f"Root Mean Squared Error: {median(np.abs(scores_RMSE))}\n"
      f"Mean Absolute Error: {median(np.abs(scores_MAE))}\n"
      f"Mean Absolute Percentage Error: {median(np.abs(scores_MAPE))}\n"
      f"R-2: {median(scores_R2)}")

In [None]:
# Sales cannot be negative: floor the predictions at zero.
submission['sales'] = submission['sales'].clip(lower=0)

In [None]:
# Write the submission next to the competition data (same location as before, built from DATA_ROOT).
submission.to_csv(DATA_ROOT / 'linear_regression_for_every_family_with_lag.csv', index=False)