In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import mean_squared_error,r2_score,roc_curve,auc,precision_recall_curve
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedKFold
from sklearn.ensemble import VotingRegressor, VotingClassifier, StackingRegressor, StackingClassifier, GradientBoostingRegressor,GradientBoostingClassifier, BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
import itertools as it
import time as time
import xgboost as xgb
from pyearth import Earth
import warnings
warnings.filterwarnings('ignore')

In [2]:
price_train = pd.read_csv("house_price_train.csv")
price_test = pd.read_csv("house_price_test.csv")
feature_train = pd.read_csv("house_feature_train.csv")
feature_test = pd.read_csv("house_feature_test.csv")

train = pd.merge(price_train, feature_train)
test = pd.merge(price_test, feature_test)

X = train.drop(columns = ['house_price', 'house_id'])
Xtest = test.drop(columns =  ['house_price', 'house_id'])
y = train['house_price']
ytest = test['house_price']

# Q1 {-}
Tune and develop the following models on the data *house_feature_train.csv* to predict *house_price*. Use all the predictors except *house_id*:

a) Random forest (*you can pick up your tuned random forest model from assignment 3 - Q1b, don't need to re-tune it*) \
b) AdaBoost \
c) Gradient boost with *huber* loss \
d) XGBoost \
e) MARS

Find the RMSE for each of the models on test data (*house_feature_test.csv*).

*(5 points for tuning each model, 1 point for RMSE of each model - total 30 points)*

In [3]:
#Random Forest Model
model = RandomForestRegressor(n_estimators=100, random_state=1,min_samples_leaf=3,min_samples_split=6,
                        oob_score=True,n_jobs=-1, max_features=4).fit(X, y)
#RMSE on test data
pred = model.predict(Xtest)
np.sqrt(mean_squared_error(test.house_price, pred))

328.8766835818962

In [4]:
model = AdaBoostRegressor(random_state=1)
grid = dict()
grid['n_estimators'] = [10, 50, 100,200]
grid['learning_rate'] = [0.0001, 0.001, 0.01,0.1, 1.0]
grid['base_estimator'] = [DecisionTreeRegressor(max_depth=3), DecisionTreeRegressor(max_depth=5), 
                          DecisionTreeRegressor(max_depth=10),DecisionTreeRegressor(max_depth=15)]
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='neg_mean_squared_error')
# execute the grid search
grid_result = grid_search.fit(X, y)
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
#for mean, stdev, param in zip(means, stds, params):
#    print("%f (%f) with: %r" % (mean, stdev, param)

Best: -270700.669349 using {'base_estimator': DecisionTreeRegressor(max_depth=5), 'learning_rate': 0.0001, 'n_estimators': 200}


In [5]:
#AdaBoost Model
model = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=5),n_estimators=200,learning_rate=0.0001,random_state=1).fit(X,y)

#RMSE on test data
pred = model.predict(Xtest)
np.sqrt(mean_squared_error(test.house_price, pred))

335.2751478786161

In [6]:
model = GradientBoostingRegressor(random_state=1,loss='huber')
grid = dict()
grid['n_estimators'] = [10, 50, 100,200,500]
grid['learning_rate'] = [0.0001, 0.001, 0.01,0.1, 1.0]
grid['max_depth'] = [3,5,8,10,12,15]
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='neg_mean_squared_error')
# execute the grid search
grid_result = grid_search.fit(X, y)
# summarize the best score and configuration
print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

KeyboardInterrupt: 

In [7]:
#Gradient Boost with Huber Loss Model
model = GradientBoostingRegressor(max_depth=3,n_estimators=100,learning_rate=0.1, random_state=1,loss='huber').fit(X,y)

#RMSE on test data
pred = model.predict(Xtest)
np.sqrt(mean_squared_error(test.house_price, pred))

358.3355886973277

In [8]:
#K-fold cross validation to find optimal parameters for XGBoost
param_grid = {'max_depth': [5,6,7],
              'learning_rate': [0.01,0.05,0.1,0.2],
               'reg_lambda':[0,0.01,0.001],
                'n_estimators':[150,175,250,500,1000]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
optimal_params = GridSearchCV(estimator=xgb.XGBRegressor(random_state=1),                                                       
                             param_grid = param_grid,                             
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv)
optimal_params.fit(X,y)
print("Optimal parameter values =", optimal_params.best_params_)
print("Optimal cross validation R-squared = ",optimal_params.best_score_)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
Optimal parameter values = {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 250, 'reg_lambda': 0.01}
Optimal cross validation R-squared =  0.7159891687438152


In [9]:
#XGBoost Model
model = xgb.XGBRegressor(random_state=1,max_depth=5,n_estimators=250,
                                         learning_rate = 0.01,reg_lambda=0.01)
model.fit(X,y)
np.sqrt(mean_squared_error(model.predict(Xtest),ytest))

325.40441816082824

In [27]:
n = {}
for i in range(1,20):
    model = Earth(max_terms=500, max_degree=i)
    model.fit(X,y)
    n[i] = np.sqrt(mean_squared_error(model.predict(Xtest),ytest))
    
n

{1: 387.6692610337935,
 2: 388.9160524789007,
 3: 388.4535600285385,
 4: 421.7251605168303,
 5: 406.09330716015495,
 6: 384.5930202801327,
 7: 384.5930202801327,
 8: 384.5930202801327,
 9: 384.5930202801327,
 10: 384.5930202801327,
 11: 384.5930202801327,
 12: 384.5930202801327,
 13: 384.5930202801327,
 14: 384.5930202801327,
 15: 384.5930202801327,
 16: 384.5930202801327,
 17: 384.5930202801327,
 18: 384.5930202801327,
 19: 384.5930202801327}

In [28]:
# MARS Model
model = Earth(max_terms=500, max_degree=6) # note, terms in brackets are the hyperparameters 
model.fit(X,y)

pred = model.predict(Xtest)
np.sqrt(mean_squared_error(test.house_price, pred))

384.5930202801327

# Q2 {-}
Use the following ensembling methods to create an ensemble models based on all the tuned models from *Q1*:

a) Voting ensemble *(5 points for code, 1 point for RMSE)* \
b) Stacking ensemble *(5 points for code, 1 point for RMSE)*

Find the RMSE of each of the ensemble models on test data (*house_feature_test.csv*). 

The RMSEs of both the ensemble models are likely to be lower than the RMSEs of each of the individual models in *Q1*.



In [29]:
#Voting Ensemble
m1 = xgb.XGBRegressor(random_state=1,max_depth=5,n_estimators=250,
                                         learning_rate = 0.01,reg_lambda=0.01)

m2 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=5),n_estimators=200,learning_rate=0.0001,random_state=1).fit(X,y)

m3 = RandomForestRegressor(n_estimators=100, random_state=1,min_samples_leaf=3,min_samples_split=6,
                        oob_score=True,n_jobs=-1, max_features=4).fit(X, y)

m4 = GradientBoostingRegressor(max_depth=3,n_estimators=100,learning_rate=0.1, random_state=1,loss='huber').fit(X,y)

m5 = Earth(max_terms=500, max_degree=6)

en=VotingRegressor(estimators = [('xgb',m1),('ada',m2),('rf',m3),('gb',m4), ('mars', m5)])
en.fit(X,y)
print("Voting Ensemble Model RMSE = ", np.sqrt(mean_squared_error(en.predict(Xtest),ytest)))

Voting Ensemble Model RMSE =  313.2734963193911


In [30]:
# Stacking Ensemble
en=StackingRegressor(estimators = [('xgb',m1),('ada',m2),('rf',m3),('gb',m4), ('mars', m5)],
                     final_estimator=LinearRegression(),                                          
                    cv = KFold(n_splits = 5, shuffle = True, random_state=1))
en.fit(X,y)
print("Stacking Ensemble Model RMSE = ", np.sqrt(mean_squared_error(en.predict(Xtest),ytest)))

Stacking Ensemble Model RMSE =  320.14455366735916


# Q3 {-}
Which are the most important and the least important models in the stacked ensemble model developed in *Q2(b)*? 

*(2 points for code, 1 point for answer)*

In [31]:
en.final_estimator_.coef_

array([0.05826766, 0.21547914, 0.47445034, 0.19809522, 0.05277925])

The stacked ensemble model gives the highest weight to the random forest regressor model, and the lowest weight to the MARS model.