# Experiments with linear models on the full training data

In [None]:
import os
# Work from the project root so the relative paths used below resolve;
# raises KeyError if PROJECT_ROOT is not set in the environment.
os.chdir(os.environ['PROJECT_ROOT'])

In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import sklearn
import pdpipe as pdp
from pdpipe import df
from pathlib import Path
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, cross_val_score, TimeSeriesSplit, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

%matplotlib inline
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Directory (relative to the working directory) holding the competition files.
DATA_ROOT = Path('data') / 'kaggle' / 'store-sales-time-series-forecasting'

In [None]:
# Load the prepared training table. low_memory=False parses the file in a
# single pass, avoiding mixed-dtype columns from chunked type inference.
train = pd.read_csv(DATA_ROOT / 'prepared_train.csv', low_memory=False)
train.head()

In [None]:
# Split the training table into the predictor columns and the 'sales' target.
feature_columns = ['store_nbr', 'family', 'onpromotion', 'dcoilwtico']
X, y = train[feature_columns], train['sales']
print(X.shape, y.shape)

In [None]:
# Columns passed to the one-hot encoder; every other feature column is
# treated as numerical.
categorical_columns = ['store_nbr', 'family']
# Index.drop keeps the remaining labels in their original column order.
numerical_columns = X.columns.drop(categorical_columns)

In [None]:
# Preprocessing pipeline: a MinMaxScaler-based Scale stage followed by
# one-hot encoding of the categorical columns.
# NOTE(review): numerical_columns is passed to pdp.Scale positionally. Confirm
# that the second positional parameter of Scale in the installed pdpipe
# version selects the columns to scale; if it is an exclusion list or a flag,
# the numerical columns are not the ones being scaled.
transform_pipeline = pdp.PdPipeline([
        pdp.Scale('MinMaxScaler', numerical_columns),
        pdp.OneHotEncode(categorical_columns),
    ])

In [None]:
# Rebind X to the output of the preprocessing pipeline.
# presumably this first apply() also fits the stages on the training
# features — confirm against the pdpipe documentation
X = transform_pipeline.apply(X)
X.shape

In [None]:
# Time-ordered CV splitter. Fold sizes are given in rows, as multiples of
# 54 * 33 (presumably stores x product families, i.e. the number of rows per
# calendar day — TODO confirm against the data).
rows_per_day = 54 * 33
ts_cv = TimeSeriesSplit(
    n_splits=4,
    gap=0,
    max_train_size=365 * rows_per_day,
    test_size=15 * rows_per_day,
)

In [None]:
def cv_algorithm(X=X, model_cv=None):
    """Cross-validate a regressor over the ``ts_cv`` splits and print the scores.

    For every (train, validation) split produced by the module-level
    ``ts_cv`` splitter, the model is fitted on the training rows, its
    validation predictions are floored at zero, and RMSLE, RMSE, MAE, MAPE
    and R-2 are computed against the module-level target ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, row-aligned with ``y``. The default is the object
        that ``X`` referred to when this function was defined.
    model_cv : estimator with ``fit`` and ``predict``, optional
        Model to evaluate; it is refitted in place on every fold. A fresh
        ``LinearRegression`` is created when omitted, so no estimator
        instance is shared between calls.

    Returns
    -------
    None
        The per-fold scores are printed, not returned.
    """
    if model_cv is None:
        model_cv = LinearRegression()

    scores_RMSLE = []
    scores_RMSE = []
    scores_MAE = []
    scores_MAPE = []
    scores_R2 = []
    for train_index, val_index in ts_cv.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        # The splitter yields positions, so select the target positionally as
        # well instead of relying on y's index labels matching them.
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        model_cv.fit(X_train, y_train)
        # mean_squared_log_error rejects negative values, so floor the
        # predictions at zero before scoring.
        y_val_pred = np.clip(model_cv.predict(X_val), 0, None)
        scores_RMSLE.append(np.sqrt(mean_squared_log_error(y_val, y_val_pred)))
        # Take the root explicitly: the `squared=False` argument of
        # mean_squared_error is deprecated in recent scikit-learn releases.
        scores_RMSE.append(np.sqrt(mean_squared_error(y_val, y_val_pred)))
        scores_MAE.append(mean_absolute_error(y_val, y_val_pred))
        scores_MAPE.append(mean_absolute_percentage_error(y_val, y_val_pred))
        scores_R2.append(r2_score(y_val, y_val_pred))
    print(f"Root Mean Squared Log Error: {scores_RMSLE}\n"
          f"Root Mean Squared Error: {np.abs(scores_RMSE)}\n"
          f"Mean Absolute Error: {np.abs(scores_MAE)}\n"
          f"Mean Absolute Percentage Error: {np.abs(scores_MAPE)}\n"
          f"R-2: {scores_R2}")

# Linear Regression

In [None]:
# Per-fold metrics for a plain linear regression on the default feature matrix.
cv_algorithm()

In [None]:
# Linear model reused below for the built-in scorer run and the final fit.
model = LinearRegression()

In [None]:
# Score the linear model with scikit-learn's built-in scorers. cross_validate
# evaluates all metrics in one pass, so the model is fitted once per split
# rather than once per split and metric.
cv_results = cross_validate(
    model,
    X,
    y,
    scoring={
        'rmse': 'neg_root_mean_squared_error',
        'mae': 'neg_mean_absolute_error',
        'mape': 'neg_mean_absolute_percentage_error',
        'r2': 'r2',
    },
    cv=ts_cv,
)
# Results are keyed as 'test_<name>'. The "neg_" scorers return negated
# errors, which is why the sign is flipped when printing.
scores_RMSE = cv_results['test_rmse']
scores_MAE = cv_results['test_mae']
scores_MAPE = cv_results['test_mape']
scores_R2 = cv_results['test_r2']
print(
    f"Mean Absolute Error:     {-scores_MAE}\n"
    f"Root Mean Squared Error: {-scores_RMSE}\n"
    f"Mean Absolute Percentage Error: {-scores_MAPE}\n"
    f"R-2: {scores_R2}"
)

In [None]:
# Refit on the full training set and inspect the learned coefficients.
model.fit(X, y)
model.coef_

In [None]:
# Contents of oil.csv; merged onto the test rows by 'date' below.
oil_data = pd.read_csv(DATA_ROOT / 'oil.csv')

In [None]:
# Build the test feature table and push it through the preprocessing pipeline.
test_data = pd.read_csv(DATA_ROOT / 'test.csv', index_col='id')
# Left join keeps every test row; rows whose date has no oil value get NaN.
test_data = test_data.merge(oil_data, on='date', how='left')
# Carry the last known value forward. Series.ffill() replaces the deprecated
# fillna(method='ffill') spelling and behaves the same.
test_data['dcoilwtico'] = test_data['dcoilwtico'].ffill()
test_data = test_data.drop('date', axis=1)
test_data = transform_pipeline(test_data)
test_data.shape

In [None]:
# Predict the test rows with the model fitted on the full training set.
predictions = model.predict(test_data)

In [None]:
# Floor negative predictions at zero (in place), matching what cv_algorithm
# does with its validation predictions.
predictions[predictions < 0] = 0

In [None]:
# Write the submission file. The output path is built from DATA_ROOT, like
# the input paths, so it cannot drift from the data directory.
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')
# Assigned by position: assumes the predictions follow the row order of
# sample_submission.csv — TODO confirm.
submission['sales'] = predictions
submission.to_csv(DATA_ROOT / 'my_submission_03_08_2022.csv', index=False)

# Ridge Regression

In [None]:
# Candidate Ridge penalties: 10 evenly spaced alphas from 63 to 66 inclusive.
hyper_params = [{'alpha': np.linspace(63, 66, 10)}]

In [None]:
# Grid-search Ridge's alpha over the time-ordered splits. No `scoring` is
# passed, so candidates are ranked by the estimator's own `score` method.
model_ridge_grid = GridSearchCV(
    Ridge(),
    hyper_params,
    cv=ts_cv,
    verbose=1,
    return_train_score=True,
)
model_ridge_grid.fit(X, y)
model_ridge_grid.best_params_

In [None]:
# Per-fold metrics for Ridge at the alpha selected by the grid search.
cv_algorithm(model_cv=Ridge(alpha=model_ridge_grid.best_params_['alpha']))

In [None]:
# Final Ridge model: selected alpha, trained on the full training set.
best_ridge_alpha = model_ridge_grid.best_params_['alpha']
model_ridge = Ridge(alpha=best_ridge_alpha)
model_ridge.fit(X, y)
model_ridge.coef_

In [None]:
# Predict the test rows with the Ridge model, then floor negative
# predictions at zero in place.
predictions_ridge = model_ridge.predict(test_data)
predictions_ridge[predictions_ridge < 0] = 0

In [None]:
# Write the Ridge submission file. The output path is built from DATA_ROOT,
# like the input paths, so it cannot drift from the data directory.
submission = pd.read_csv(DATA_ROOT / 'sample_submission.csv')
# Assigned by position: assumes the predictions follow the row order of
# sample_submission.csv — TODO confirm.
submission['sales'] = predictions_ridge
submission.to_csv(DATA_ROOT / 'my_submission_ridge_regression.csv', index=False)

# Lasso Regression

In [None]:
# Candidate Lasso penalties: 10 evenly spaced alphas from 0.45 to 0.7 inclusive.
hyper_params_lasso = [{'alpha': np.linspace(0.45, 0.7, 10)}]

In [None]:
# Grid-search Lasso's alpha over the time-ordered splits. No `scoring` is
# passed, so candidates are ranked by the estimator's own `score` method.
model_lasso_grid = GridSearchCV(
    Lasso(),
    hyper_params_lasso,
    cv=ts_cv,
    verbose=1,
    return_train_score=True,
)
model_lasso_grid.fit(X, y)
model_lasso_grid.best_params_

In [None]:
# Per-fold metrics for Lasso at the alpha selected by the grid search.
cv_algorithm(model_cv=Lasso(alpha=model_lasso_grid.best_params_['alpha']))

# Linear Regression + time step feature

In [None]:
# Work on a copy so the feature matrix X itself is left unchanged.
X_copy = X.copy()

In [None]:
# Add a running row counter as the "time step" feature.
# NOTE(review): the counter advances once per row; if the table holds several
# rows per date, rows of the same date get different Time values — confirm
# this is intended.
X_copy['Time'] = np.arange(len(X_copy.index))
X_copy.head()

In [None]:
# Per-fold metrics for a plain linear regression on the extended feature matrix.
cv_algorithm(X=X_copy)