In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_squared_error,r2_score

from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from constants import BASE_DIR

%matplotlib inline

In [2]:
# Helper functions

import pickle

def save_model(fname, model):
    """Serialize ``model`` to the file ``fname`` with pickle.

    Parameters
    ----------
    fname : str or path-like
        Destination file. It is opened in binary write mode, so an existing
        file at that location is overwritten.
    model : object
        Any picklable object.

    Raises
    ------
    OSError
        If ``fname`` cannot be opened for writing.
    """
    # The context manager closes the handle even if pickle.dump raises.
    with open(fname, 'wb') as file:
        pickle.dump(model, file)

def load_model(fname):
    """Load and return the object pickled in the file ``fname``.

    Parameters
    ----------
    fname : str or path-like
        File to read; it is opened in binary read mode.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Raises
    ------
    OSError
        If ``fname`` cannot be opened for reading.
    """
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # come from a trusted source.
    # The context manager closes the handle even if pickle.load raises.
    with open(fname, 'rb') as file:
        return pickle.load(file)

def train_model(X_train, y_train, model_list, cv):
    """Cross-validate every estimator in ``model_list`` and collect mean scores.

    Each estimator is passed to ``cross_validate`` with R2 and negative RMSE
    scoring, train scores included. The estimator's class name is printed and
    its per-fold results are displayed as a table with an extra ``'ave'`` row
    that holds the column means.

    Parameters
    ----------
    X_train, y_train
        Training features and target, forwarded to ``cross_validate``.
    model_list : iterable
        Estimators to evaluate.
    cv
        Cross-validation setting, forwarded to ``cross_validate``.

    Returns
    -------
    dict
        Maps each estimator's class name to the ``'ave'`` row of its score
        table. Estimators that share a class name overwrite one another.
    """
    scoring_metrics = ('r2', 'neg_root_mean_squared_error')
    averages_by_name = {}

    for estimator in model_list:
        cv_result = cross_validate(
            estimator,
            X_train,
            y_train,
            cv=cv,
            scoring=scoring_metrics,
            return_train_score=True,
        )

        estimator_name = type(estimator).__name__
        print(estimator_name)

        fold_table = pd.DataFrame(cv_result)
        # The means are computed from the fold rows before the row is appended.
        fold_table.loc['ave'] = fold_table.mean()
        display(fold_table)

        averages_by_name[estimator_name] = fold_table.loc['ave']

    return averages_by_name

In [3]:
# CONFIGS FOR RE-TRAINING AND SAVING

# When True, the cross-validated comparison of the candidate linear models is run.
MODEL_SELECTION = True
# When True, the fitted scalers and the tuned model are pickled under BASE_DIR / 'models'.
BEST_MODEL_SAVE = True

### I. Initializing Processed Data

In [4]:
# Read the preprocessed dataset and use its 'Id' column as the row index.
processed_csv_path = BASE_DIR / 'data' / "processed_data.csv"
df_input = pd.read_csv(processed_csv_path).set_index('Id', drop=True)
df_input.head()

Unnamed: 0_level_0,Store,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday,CompetitionDistance,Promo2,No_Promo,...,public_holiday,easter_holiday,christmas,basic,extra,extended,store_type_a,store_type_b,store_type_c,store_type_d
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(1, '2015-04-30')",1,4,6228,650,1,1,0,1270,0,1,...,0,0,0,1,0,0,0,0,1,0
"(1, '2015-04-29')",1,3,5775,579,1,1,0,1270,0,1,...,0,0,0,1,0,0,0,0,1,0
"(1, '2015-04-28')",1,2,5199,552,1,1,0,1270,0,1,...,0,0,0,1,0,0,0,0,1,0
"(1, '2015-04-27')",1,1,5575,574,1,1,0,1270,0,1,...,0,0,0,1,0,0,0,0,1,0
"(1, '2015-04-26')",1,7,0,0,0,0,0,1270,0,1,...,0,0,0,1,0,0,0,0,1,0


### II. Test-Train Split and Scaling

In [5]:
# 'Sales' is the regression target; every other column is a feature.
X_all = df_input.drop('Sales', axis=1)
y_all = df_input['Sales']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.3,
    random_state=42,
)

# Working copies that receive the scaled columns; the unscaled splits stay intact.
X_train_tr = X_train.copy()
X_test_tr = X_test.copy()

In [6]:
# Scale the ordinal / calendar-like columns with MinMaxScaler().
# The scaler is fitted on the training split only and then applied to both splits.

minmax_str = ['Store','Year',"Week","Month", "DayOfWeek"]
scaler = MinMaxScaler().fit(X_train[minmax_str])
if BEST_MODEL_SAVE:
    save_model(BASE_DIR / 'models' / 'transform_minmax.pkl', scaler)

# Replace whole columns (df[cols] = ...) instead of writing through .loc[:, cols].
# .loc sets values in place, and putting the scaler's float output into
# integer-typed columns (e.g. Store, DayOfWeek) is a deprecated upcast in recent pandas.
X_train_tr[minmax_str] = scaler.transform(X_train[minmax_str])
X_test_tr[minmax_str] = scaler.transform(X_test[minmax_str])

In [7]:
# Standardize the numerical columns with StandardScaler().
# The scaler is fitted on the training split only and then applied to both splits.

std_str = ['Customers', 'CompetitionDistance', 'CompetitionOpenSinceDuration', 'Promo2SinceDuration']
scaler_std = StandardScaler().fit(X_train[std_str])
if BEST_MODEL_SAVE:
    save_model(BASE_DIR / 'models' / 'transform_std.pkl', scaler_std)

# Replace whole columns (df[cols] = ...) instead of writing through .loc[:, cols].
# .loc sets values in place, and putting the scaler's float output into
# integer-typed columns (e.g. Customers, CompetitionDistance) is a deprecated upcast
# in recent pandas.
X_train_tr[std_str] = scaler_std.transform(X_train[std_str])
X_test_tr[std_str] = scaler_std.transform(X_test[std_str])

### III. Model Selection Using 10-Fold Cross-Validation

In [8]:
if MODEL_SELECTION:
    # Candidate linear regressors, each created with its default arguments.
    model_list = [
        LinearRegression(),
        ElasticNet(),
        Ridge(),
        Lasso(),
    ]
    # Compare the candidates with 10-fold cross-validation on the scaled training split.
    linear_dict = train_model(X_train_tr, y_train, model_list, 10)

LinearRegression


Unnamed: 0,fit_time,score_time,test_r2,train_r2,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
0,1.175668,0.024001,0.901513,0.901209,-1197.849123,-1208.935691
1,1.310876,0.029999,0.902389,0.901125,-1194.003223,-1209.288007
2,1.337837,0.024973,0.900244,0.901369,-1223.269032,-1205.997006
3,1.148746,0.024028,0.900854,0.9013,-1216.239918,-1206.789711
4,1.262241,0.037035,0.900215,0.901366,-1205.910119,-1207.949063
5,1.17501,0.024004,0.902185,0.901151,-1200.603254,-1208.531377
6,1.184341,0.025032,0.900883,0.901296,-1207.431981,-1207.769217
7,1.157054,0.024032,0.900814,0.9013,-1202.905274,-1208.293449
8,1.167621,0.025033,0.900581,0.901327,-1223.520366,-1205.991953
9,1.209993,0.025,0.902721,0.901089,-1206.10664,-1207.920409


ElasticNet


Unnamed: 0,fit_time,score_time,test_r2,train_r2,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
0,0.654995,0.025002,0.778858,0.778313,-1794.93501,-1810.989768
1,0.58,0.024999,0.780061,0.777936,-1792.289423,-1812.276173
2,0.905967,0.025002,0.778595,0.777902,-1822.413119,-1809.71873
3,0.574004,0.024001,0.777273,0.77801,-1822.917552,-1809.835211
4,0.583614,0.024,0.778257,0.77832,-1797.661408,-1810.920576
5,0.570966,0.025002,0.777781,0.778004,-1809.617793,-1811.110951
6,0.652763,0.024001,0.776825,0.778301,-1811.805874,-1810.087039
7,0.649465,0.026,0.776422,0.778459,-1806.014942,-1810.255707
8,0.551999,0.027002,0.775727,0.778109,-1837.661991,-1808.488584
9,0.578848,0.025,0.780911,0.777469,-1810.02797,-1811.805961


Ridge


Unnamed: 0,fit_time,score_time,test_r2,train_r2,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
0,0.397968,0.032737,0.901528,0.901225,-1197.758143,-1208.841474
1,0.382764,0.028001,0.902387,0.901131,-1194.017944,-1209.250616
2,0.450294,0.025999,0.900244,0.901369,-1223.274534,-1205.995146
3,0.437803,0.026001,0.900857,0.9013,-1216.220362,-1206.788668
4,0.462767,0.025746,0.900219,0.901368,-1205.886648,-1207.940303
5,0.386766,0.023998,0.902186,0.901152,-1200.5982,-1208.523872
6,0.394321,0.025971,0.900881,0.901297,-1207.441837,-1207.76749
7,0.417031,0.024999,0.900817,0.901303,-1202.887964,-1208.27305
8,0.404033,0.025003,0.90059,0.90133,-1223.466279,-1205.973476
9,0.374014,0.024033,0.90272,0.90109,-1206.110441,-1207.915591


Lasso


Unnamed: 0,fit_time,score_time,test_r2,train_r2,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
0,2.084683,0.024998,0.901431,0.901176,-1198.346649,-1209.14241
1,2.10627,0.025002,0.902391,0.901081,-1193.99426,-1209.555451
2,2.640703,0.027001,0.900212,0.901318,-1223.468594,-1206.305275
3,2.634422,0.026003,0.900778,0.901252,-1216.705369,-1207.081824
4,2.689772,0.025002,0.900202,0.901318,-1205.98846,-1208.24468
5,3.374043,0.026406,0.90216,0.901102,-1200.758343,-1208.830864
6,2.143552,0.023999,0.900794,0.901248,-1207.976467,-1208.063555
7,2.40388,0.027998,0.900764,0.901253,-1203.207383,-1208.578095
8,5.419896,0.024,0.900502,0.901281,-1224.005606,-1206.277683
9,1.797209,0.024,0.902701,0.901039,-1206.230958,-1208.225553


### IV. Model Training and Hyperparameter Tuning

In [9]:
# Ridge is the estimator carried into hyperparameter tuning. In the cross-validation
# tables above it reaches a test R2 of about 0.90, on par with LinearRegression and
# Lasso and well above ElasticNet (about 0.78).
best_model = Ridge()

In [10]:
# This tuning method applies to Lasso, ElasticNet, or Ridge. Other ML algorithms
# need a different hyperparameter-tuning procedure.
alpha_grid = [1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10]
best_model_cv = RidgeCV(alphas=alpha_grid, cv=10)

# Choose the regularization strength by 10-fold cross-validation on the training split.
tuned_cv = best_model_cv.fit(X_train_tr, y_train)
print(f'Optimal Ridge at alpha = {tuned_cv.alpha_}')

# Refit the selected estimator with that alpha, then predict on the held-out split.
# set_params and fit both return the estimator, so tuned_model is best_model itself.
best_model.set_params(alpha=tuned_cv.alpha_)
tuned_model = best_model.fit(X_train_tr, y_train)
y_pred = tuned_model.predict(X_test_tr)

print(f'RMSE: {np.sqrt(mean_squared_error(y_test,y_pred))}')
print(f'R2: {r2_score(y_test,y_pred)}')

Optimal Ridge at alpha = 1.0
RMSE: 1204.7650499667538
R2: 0.9015777850926334


In [11]:
# Pickle the tuned model when saving is enabled.
if BEST_MODEL_SAVE:
    tuned_model_path = BASE_DIR / 'models' / 'tuned_model.pkl'
    save_model(tuned_model_path, tuned_model)

### MISCELLANEOUS: Feature Selection Using an Embedded Method and Evaluating Its Effect on Model Performance

In [12]:
# lasso_cv_model = LassoCV(cv = 10).fit(X_train_tr,y_train)
# print(lasso_cv_model.alpha_)
# print(lasso_cv_model.score(X_train_tr,y_train))

In [13]:
# lasso_cv_model.score(X_test_tr,y_test)
# lasso_tuned = Lasso(alpha = lasso_cv_model.alpha_).fit(X_train_tr,y_train)

# y_pred_tuned = lasso_tuned.predict(X_test_tr)

# print(np.sqrt(mean_squared_error(y_test,y_pred_tuned)))
# print(r2_score(y_test,y_pred_tuned))

In [14]:
# # We know that the coefficients not used in Lasso Regression are set to zero.
# features_coef = pd.Series(lasso_tuned.coef_, index = X_train.columns)
# features_coef

In [15]:
# features_coef_order = features_coef.index

In [16]:
# plt.figure(figsize=(8,10))
# features_coef = features_coef.sort_values(ascending=False)
# sns.barplot(y=features_coef.index, x=features_coef)
# plt.show()

In [17]:
# features_coef = features_coef.apply(lambda x: np.abs(x))
# features_coef = features_coef[features_coef_order]

In [18]:
# features_names = features_coef[features_coef > 0.0000001]
# features_names

In [19]:
# X_train_new = X_train_tr[features_names.index]
# X_test_new = X_test_tr[features_names.index]

In [20]:
# linear_dict = train_model(X_train_new,y_train, model_list, 10)

In [21]:
# ridge_model_new = RidgeCV(alphas=[1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10], cv=10).fit(X_train_new,y_train)
# ridge_model_new.score(X_test_new, y_test)

# ridge_tuned_new = Ridge(alpha = ridge_model_new.alpha_).fit(X_train_new,y_train)
# y_pred_new = ridge_tuned_new.predict(X_test_new)

# print(f'Optimal Ridge at alpha = {ridge_model_new.alpha_}')
# print(f'RMSE: {np.sqrt(mean_squared_error(y_test,y_pred_new))}')
# print(f'R2: {r2_score(y_test,y_pred_new)}')

### MISCELLANEOUS: Training of Common Non-linear Regression Models

In [22]:
# # Decision Tree 
# information_models = [DecisionTreeRegressor()]
# information_models_dict = train_model(X_train,y_train, information_models, 10)

In [23]:
# # KNN
# other_models = [KNeighborsRegressor()]
# other_models_dict = train_model(X_train_tr,y_train, other_models, 10)

In [24]:
# # SVM
# other_models = [KNeighborsRegressor(), SVR()]
# other_models_dict = train_model(X_train_tr,y_train, other_models, 10)

In [25]:
# # Random Forest
# information_models = [DecisionTreeRegressor(), RandomForestRegressor()]
# information_models_dict = train_model(X_train,y_train, information_models, 10)