In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
import warnings
warnings.filterwarnings('ignore')
# from scikeras.wrappers import KerasRegressor

Reading the data

In [5]:
df = pd.read_csv('../data/HW3_new.csv')
values = df.values
X = values[:, 1:-2]
Y = values[:, -1]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state = 29)

In [6]:
# def create_model(activation, nb_hidden, layers):
#     model = Sequential()
#     model.add(Dense(nb_hidden, input_dim=np.shape(X)[1], activation=activation))
#     for i in range(layers):
#         model.add(Dense(nb_hidden, input_dim=nb_hidden, activation=activation))
#     model.add(Dense(1))
#     model.compile(loss='mean_squared_error', optimizer='adam')
#     return model

1-a

In [7]:
# linear regression, k-NN, regression tree, SVM regreesion and Neural Network and ensembling models
model_names = []
clf1 = ElasticNet()
p_grid1 = [{'alpha': np.power(10, range(0,6)), 'normalize': [True, False], 'l1_ratio': [0,0.5,1]},
{'alpha': [0],'normalize': [True, False], 'l1_ratio': np.arange(0,1,0.3)}]
model_names.append('ElasticNet')

clf2 = KNeighborsRegressor()
p_grid2 = [{'clf2__n_neighbors': range(2,8), 'clf2__weights': ['uniform', 'distance'], 'clf2__p' : [1,2]}]
pipe2 = Pipeline([('std', StandardScaler()), ('clf2', clf2)])
model_names.append('KNN')

clf3 = DecisionTreeRegressor()
p_grid3 = [{'criterion': ['absolute_error','squared_error'], 'max_depth': range(2,6)}]
model_names.append('Decision Tree')

# epsilon ????
clf4 = SVR()
p_grid4 = [{'clf4__kernel': ['rbf', 'poly', 'sigmoid'], 'clf4__degree': range(1,4), 
'clf4__gamma': ['scale', 'auto'], 'clf4__C': np.power(10, range(0,4)), 'clf4__epsilon': np.arange(0.1, 0.5, 0.1)}]
pipe4 = Pipeline([('std', StandardScaler()), ('clf4', clf4)])
model_names.append('SVR')

# clf5 = KerasRegressor(build_fn=create_model, epochs=20, batch_size = 400, verbose=0)
# p_grid5 = [{'clf5__activation': ['relu', 'tanh', 'sigmoid', 'linear'], 'clf5__nb_hidden': [11,12,22], 'clf5__layers':range(0,4),
# 'clf5__epochs': [20, 50], 'clf5__batch_size': [200, 400]}]
# pipe5 = Pipeline([('std', StandardScaler()), ('clf5', clf5)])

clf5 = MLPRegressor(random_state=29)
p_grid5 = [{'clf5__activation': ['relu', 'tanh'], 'clf5__solver': ['lbfgs', 'adam'], 
'clf5__hidden_layer_sizes': [(22),(22,22),(12,12)]}]
pipe5 = Pipeline([('std', StandardScaler()), ('clf5', clf5)])
model_names.append('Neural Net')

clf6 = RandomForestRegressor(random_state=29)
p_grid6 = [{'n_estimators': [100, 200, 300]}]
model_names.append('Random Forest')

inner_cv = KFold(n_splits=4, shuffle=True, random_state=29)
outer_cv = KFold(n_splits=4, shuffle=True, random_state=29)
grid_cv = []

for p_grid, est in zip((p_grid1, p_grid2, p_grid3, p_grid4, p_grid5, p_grid6), (clf1, pipe2, clf3, pipe4, pipe5, clf6)):
    gs = GridSearchCV(estimator=est, param_grid=p_grid, scoring='neg_root_mean_squared_error', cv=inner_cv)
    grid_cv.append(gs)

Getting the negative RMSE for multiple models

In [8]:
nested_scores = []
for i in range(len(grid_cv)):
    nested_score = cross_val_score(grid_cv[i], X=x_train, y=y_train, cv=outer_cv, scoring='neg_root_mean_squared_error')
    nested_scores.append((model_names[i], nested_score.mean(), nested_score.std()))
nested_scores

[('ElasticNet', -120.20231399897135, 6.295501022882377),
 ('KNN', -136.08393809265482, 11.289811353130084),
 ('Decision Tree', -136.89370044209645, 4.59889300505023),
 ('SVR', -127.92333381824372, 6.97373817181843),
 ('Neural Net', -118.25348389259082, 7.813668025294545),
 ('Random Forest', -124.18568393978822, 8.084968788435381)]

Tuning the hyperparameters for the best model - Neural Network

In [9]:
hp_model = GridSearchCV(estimator=pipe5, param_grid=p_grid5, scoring='neg_root_mean_squared_error', cv=inner_cv)
hp_model.fit(x_train, y_train)

GridSearchCV(cv=KFold(n_splits=4, random_state=29, shuffle=True),
             estimator=Pipeline(steps=[('std', StandardScaler()),
                                       ('clf5',
                                        MLPRegressor(random_state=29))]),
             param_grid=[{'clf5__activation': ['relu', 'tanh'],
                          'clf5__hidden_layer_sizes': [22, (22, 22), (12, 12)],
                          'clf5__solver': ['lbfgs', 'adam']}],
             scoring='neg_root_mean_squared_error')

Evaluating the MSE performance of the best model on the test data

In [15]:
best_model = hp_model.best_estimator_
pred = best_model.predict(pd.DataFrame(x_test))
print(mean_squared_error(y_test, pred))
hp_model.best_params_

23110.084690587


{'clf5__activation': 'relu',
 'clf5__hidden_layer_sizes': (22, 22),
 'clf5__solver': 'adam'}