In [9]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import tensorflow.keras.backend as K
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
# from scikeras.wrappers import KerasRegressor
import warnings
warnings.filterwarnings('ignore')

Reading the data and filtering only the values where purchase = 1

In [10]:
df = pd.read_csv('../data/HW3_new.csv')
values = df.values
values = values[np.where(values[:,-2]==1)]
X = values[:, 1:-2]
Y = values[:, -1]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state = 29)

1-b

In [11]:
# linear regression, k-NN, regression tree, SVM regreesion and Neural Network and ensembling models
model_names = []
clf1 = ElasticNet()
p_grid1 = [{'alpha': np.power(10, range(0,6)), 'normalize': [True, False], 'l1_ratio': [0,0.5,1]},
{'alpha': [0],'normalize': [True, False], 'l1_ratio': np.arange(0,1,0.3)}]
model_names.append('ElasticNet')

clf2 = KNeighborsRegressor()
p_grid2 = [{'clf2__n_neighbors': range(2,8), 'clf2__weights': ['uniform', 'distance'], 'clf2__p' : [1,2]}]
pipe2 = Pipeline([('std', StandardScaler()), ('clf2', clf2)])
model_names.append('KNN')

clf3 = DecisionTreeRegressor()
p_grid3 = [{'criterion': ['absolute_error','squared_error'], 'max_depth': range(2,6)}]
model_names.append('Decision Tree')

# epsilon ????
clf4 = SVR()
p_grid4 = [{'clf4__kernel': ['rbf', 'poly', 'sigmoid'], 'clf4__degree': range(1,4), 'clf4__gamma': ['scale', 'auto'], 
'clf4__C': np.power(10, range(0,4)), 'clf4__epsilon': np.arange(0.1, 0.5, 0.1)}]
pipe4 = Pipeline([('std', StandardScaler()), ('clf4', clf4)])
model_names.append('SVR')

# clf5 = KerasRegressor(build_fn=create_model, epochs=20, batch_size = 400, verbose=0)
# p_grid5 = [{'clf5__activation': ['relu', 'tanh', 'sigmoid', 'linear'], 'clf5__nb_hidden': [11,12,22], 'clf5__layers':range(0,4),
# 'clf5__epochs': [20, 50], 'clf5__batch_size': [200, 400]}]
# pipe5 = Pipeline([('std', StandardScaler()), ('clf5', clf5)])

clf5 = MLPRegressor(random_state=29)
p_grid5 = [{'clf5__activation': ['relu', 'tanh'], 'clf5__solver': ['lbfgs', 'adam'], 'clf5__hidden_layer_sizes': [(22),(22,22),(12,12)]}]
pipe5 = Pipeline([('std', StandardScaler()), ('clf5', clf5)])
model_names.append('Neural Net')

clf6 = RandomForestRegressor(random_state=29)
p_grid6 = [{'n_estimators': [100, 200, 300]}]
model_names.append('Random Forest')

inner_cv = KFold(n_splits=4, shuffle=True, random_state=29)
outer_cv = KFold(n_splits=4, shuffle=True, random_state=29)
grid_cv = []

for p_grid, est in zip((p_grid1, p_grid2, p_grid3, p_grid4, p_grid5, p_grid6), (clf1, pipe2, clf3, pipe4, pipe5, clf6)):
    gs = GridSearchCV(estimator=est, param_grid=p_grid, scoring='neg_root_mean_squared_error', cv=inner_cv)
    grid_cv.append(gs)

Getting the negative RMSE for multiple models

In [12]:
nested_scores = []
for i in range(len(grid_cv)):
    nested_score = cross_val_score(grid_cv[i], X=x_train, y=y_train, cv=outer_cv, scoring='neg_root_mean_squared_error')
    nested_scores.append((model_names[i], nested_score.mean(), nested_score.std()))
nested_scores

[('ElasticNet', -171.4879749722166, 20.39120129193363),
 ('KNN', -196.26660353704352, 12.741058252795098),
 ('Decision Tree', -183.67006001072227, 14.618268763998925),
 ('SVR', -178.6550862538711, 22.6560615418886),
 ('Neural Net', -178.75225932954584, 16.67739600504313),
 ('Random Forest', -169.0931000295526, 17.3370975418357)]

Tuning the hyperparameters for the best model - Random Forest

In [13]:
hp_model = GridSearchCV(estimator=clf6, param_grid=p_grid6, scoring='neg_root_mean_squared_error', cv=inner_cv)
hp_model.fit(x_train, y_train)

GridSearchCV(cv=KFold(n_splits=4, random_state=29, shuffle=True),
             estimator=RandomForestRegressor(random_state=29),
             param_grid=[{'n_estimators': [100, 200, 300]}],
             scoring='neg_root_mean_squared_error')

Evaluating the MSE performance of the best model on the test data

In [18]:
best_model = hp_model.best_estimator_
pred = best_model.predict(pd.DataFrame(x_test))
print(mean_squared_error(y_test, pred))
hp_model.best_params_

18921.39791300229


{'n_estimators': 300}

1-c)<br>
The model performance trained on the subset data with purchase = 1 is better than the one <br>
trained on the full dataset. This is expected because the complete dataset has a lot of puchase = 0 values. <br>
This skews model predictions towards spending values of 0. And since there are a lot of values in the test with <br>
spending = 0 and since there is a gap in spending values of purchase = 0 and 1, the error would be greater. <br>
In case of the model trained and tested only on the filtered dataset with purchase = 1, the model can do predictions <br>
better because most of the spending values are non-zero. Thereby allowing the model to both train and predict <br>
non-zero spending values more accurately.  