# About this notebook

This notebook presents ensemble models for all pre-processed features on all training data.

The codes follow Geron's book (chapters 7).

## Retrieve Dataset

In [5]:
# setup random indicies
import numpy as np
import pandas as pd

df = pd.read_csv('../data/train_model.csv')
df.drop(['Unnamed: 0', 'Id'], axis=1, inplace = True)

# remove outliers and high-leverage point (124)
df.drop([907, 465, 124], inplace=True)

# log transform SalePrice
df['SalePrice'] = np.log(df.SalePrice)

# log-transform continuous variables
df['LotFrontage'] = np.log(df.LotFrontage)
df['LotArea'] = np.log(df.LotArea)

#modify zero YearBuilt
def impute(x):
    if x == 0:
        return np.log(1)
    else:
        return np.log(x)
df['YearBuilt'] = df['YearBuilt'].apply(lambda x: impute(x))
#df['YearBuilt'] = np.log(df.YearBuilt)

df['GrLivArea'] = np.log(df.GrLivArea)
df['YrSold'] = np.log(df.YrSold)

df.head(5)

Unnamed: 0,LotFrontage,LotArea,YearBuilt,GrLivArea,YrSold,SalePrice,MSZoning_RL,MSZoning_RMH,Alley_NoAccess,LotShape_Regular,...,OpenPorchSF_OpenPorch_Yes,Fence_no_fence,MoSold_peak_months,SaleType_non_deed,SaleCondition_not_normal,IsVinyl_yes,FullBaths_<=1FullBath,FullBaths_>=3FullBaths,HalfBaths_>=1HalfBaths,Enclosed_combined_EnclosedPorch_Yes
0,4.174387,9.041922,1.94591,7.444249,7.604894,12.247694,1,0,1,1,...,1,1,0,0,0,1,0,1,1,0
1,4.382027,9.169518,3.526361,7.140453,7.604396,12.109011,1,0,1,1,...,0,1,1,0,0,0,0,0,1,0
2,4.219508,9.328123,2.197225,7.487734,7.604894,12.317167,1,0,1,0,...,1,1,0,0,0,1,0,1,1,0
3,4.094345,9.164296,4.553877,7.448334,7.603898,11.849398,1,0,1,0,...,1,1,0,0,1,0,0,0,0,1
4,4.430817,9.565214,2.302585,7.695303,7.604894,12.429216,1,0,1,0,...,1,1,0,0,0,1,0,1,1,0


In [2]:
df.rename(columns={'BedroomAbvGr_=3Bedr': 'BedroomAbvGr_3','BedroomAbvGr_>=4Bedr': 'BedroomAbvGr_atleast4',\
                   'KitchenAbvGr_>=2Ktchn':'KitchenAbvGr_atleast2', \
                   'TotRmsAbvGrd_<=4TotRms': 'TotRmsAbvGrd_atmost4','TotRmsAbvGrd_>=8TotRms': 'TotRmsAbvGrd_atleast8',\
                   'FullBaths_>=3FullBaths':'FullBaths_atleast3','FullBaths_<=1FullBath': 'FullBaths_atmost1',\
                   'HalfBaths_>=1HalfBaths': 'HalfBaths_atleast1'}, inplace=True)
features = df.drop(['SalePrice'], axis = 1)
logsaleprice = df.SalePrice
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, logsaleprice, random_state=42)

# 1. Bagging Ensembles

In [3]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

bag_reg = BaggingRegressor(
    DecisionTreeRegressor(random_state=42), n_estimators=500,
    max_samples=800, bootstrap=True, n_jobs=-1, random_state=42)
bag_reg.fit(X_train, y_train)
y_pred = bag_reg.predict(X_test)

In [4]:
from sklearn.metrics import mean_squared_error

print("MSE: ", round(mean_squared_error(y_test, y_pred), 3))
print("RMSE: ", round(np.sqrt(mean_squared_error(y_test, y_pred)), 3))

MSE:  0.023
RMSE:  0.152


### - one tree

In [5]:
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)
y_pred_tree = tree_reg.predict(X_test)
print("MSE: ", round(mean_squared_error(y_test, y_pred_tree), 3))
print("RMSE: ", round(np.sqrt(mean_squared_error(y_test, y_pred_tree)), 3))

MSE:  0.051
RMSE:  0.225


# 2. Random Forests

In [6]:
bag_reg = BaggingRegressor(
    DecisionTreeRegressor(splitter="random", max_leaf_nodes=16, random_state=42),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)

In [7]:
bag_reg.fit(X_train, y_train)
y_pred = bag_reg.predict(X_test)

In [8]:
from sklearn.ensemble import RandomForestRegressor

rnd_reg = RandomForestRegressor(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)
rnd_reg.fit(X_train, y_train)

y_pred_rf = rnd_reg.predict(X_test)

In [9]:
pd.DataFrame({'preds_rf': y_pred_rf, 'preds': y_pred, 'diff': y_pred_rf-y_pred}).head()

Unnamed: 0,diff,preds,preds_rf
0,-0.012145,11.779618,11.767472
1,0.057967,12.051952,12.109919
2,-0.039911,12.195841,12.155929
3,-0.018831,12.152024,12.133193
4,0.071926,12.447184,12.519111


In [10]:
print("MSE: ", round(mean_squared_error(y_test, y_pred), 3))
print("RMSE: ", round(np.sqrt(mean_squared_error(y_test, y_pred)), 3))

MSE:  0.033
RMSE:  0.181


# 3. AdaBoost

### a.

In [11]:
from sklearn.ensemble import AdaBoostRegressor

ada_reg = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=1), n_estimators=200,
    #algorithm="SAMME.R", learning_rate=0.5, random_state=42)
    learning_rate=0.5, random_state=42)
ada_reg.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=1, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
         learning_rate=0.5, loss='linear', n_estimators=200,
         random_state=42)

In [12]:
y_pred = ada_reg.predict(X_test)
print("MSE: ", round(mean_squared_error(y_test, y_pred), 3))
print("RMSE: ", round(np.sqrt(mean_squared_error(y_test, y_pred)), 3))

MSE:  0.076
RMSE:  0.276


### b. 

In [13]:
from sklearn.ensemble import AdaBoostRegressor

ada_reg = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=1), n_estimators=400,
    #algorithm="SAMME.R", learning_rate=0.5, random_state=42)
    learning_rate=0.1, random_state=42)
ada_reg.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=1, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
         learning_rate=0.1, loss='linear', n_estimators=400,
         random_state=42)

In [14]:
y_pred = ada_reg.predict(X_test)
print("MSE: ", round(mean_squared_error(y_test, y_pred), 3))
print("RMSE: ", round(np.sqrt(mean_squared_error(y_test, y_pred)), 3))

MSE:  0.056
RMSE:  0.236


In [15]:
pd.DataFrame({'diff': np.exp(y_test)-np.exp(y_pred), 'y_test': np.exp(y_test), 'preds': np.exp(y_pred) }).head(10)

Unnamed: 0,diff,preds,y_test
499,22189.901539,97810.098461,120000.0
1265,-30542.340346,214442.340346,183900.0
412,-13501.648527,235501.648527,222000.0
1050,29691.528896,146793.471104,176485.0
1037,30456.452899,256543.547101,287000.0
615,34675.431524,102824.568476,137500.0
219,38201.600563,129038.399437,167240.0
1398,-2460.184507,140460.184507,138000.0
1195,35539.815493,140460.184507,176000.0
887,19894.704306,115605.295694,135500.0


# 4. Gradient Boosting

### - large learning rate (1)

In [16]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42)
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=1.0, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=3, presort='auto',
             random_state=42, subsample=1.0, verbose=0, warm_start=False)

In [17]:
gbrt_pred = gbrt.predict(X_test)
print("MSE: ", round(mean_squared_error(y_test, gbrt_pred), 3))
print("RMSE: ", round(np.sqrt(mean_squared_error(y_test, gbrt_pred)), 3))

MSE:  0.052
RMSE:  0.228


### - small learning rate (0.1) 

In [18]:
gbrt_slow = GradientBoostingRegressor(max_depth=2, n_estimators=200, learning_rate=0.1, random_state=42)
gbrt_slow.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=200,
             presort='auto', random_state=42, subsample=1.0, verbose=0,
             warm_start=False)

In [19]:
gbrt_slow_pred = gbrt_slow.predict(X_test)
print("MSE: ", round(mean_squared_error(y_test, gbrt_slow_pred), 3))
print("RMSE: ", round(np.sqrt(mean_squared_error(y_test, gbrt_slow_pred)), 3))

MSE:  0.017
RMSE:  0.132


In [20]:
gbrt_slow = GradientBoostingRegressor(max_depth=10, n_estimators=500, learning_rate=0.5, random_state=42)
gbrt_slow.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.5, loss='ls', max_depth=10, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=500,
             presort='auto', random_state=42, subsample=1.0, verbose=0,
             warm_start=False)

In [21]:
gbrt_slow_pred = gbrt_slow.predict(X_test)
print("MSE: ", round(mean_squared_error(y_test, gbrt_slow_pred), 3))
print("RMSE: ", round(np.sqrt(mean_squared_error(y_test, gbrt_slow_pred)), 3))

MSE:  0.034
RMSE:  0.186


# 5. Gradient Boosting with Early Stopping

In [22]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(features, logsaleprice, random_state=49)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=2,n_estimators=bst_n_estimators, random_state=42)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=119,
             presort='auto', random_state=42, subsample=1.0, verbose=0,
             warm_start=False)

In [23]:
gbrt_best_pred = gbrt_best.predict(X_val)
print("MSE: ", round(mean_squared_error(y_val, gbrt_best_pred), 3))
print("RMSE: ", round(np.sqrt(mean_squared_error(y_val, gbrt_best_pred)), 3))

MSE:  0.022
RMSE:  0.149


# 6. XGBoost

In [24]:
from xgboost import XGBRegressor

In [25]:
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [26]:
xgb_pred = xgb.predict(X_test)
print("MSE: ", round(mean_squared_error(y_test, xgb_pred), 3))
print("RMSE: ", round(np.sqrt(mean_squared_error(y_test, xgb_pred)), 3))

MSE:  0.012
RMSE:  0.108


In [28]:
pd.DataFrame({'diff': np.exp(y_test)-np.exp(xgb_pred),'y_test': np.exp(y_test),'preds': np.exp(xgb_pred)}).head(20)

Unnamed: 0,diff,preds,y_test
499,-7073.570312,127073.570312,120000.0
1265,5896.984375,178003.015625,183900.0
412,-9457.359375,231457.359375,222000.0
1050,-5445.078125,181930.078125,176485.0
1037,18646.34375,268353.65625,287000.0
615,5287.125,132212.875,137500.0
219,-3082.546875,170322.546875,167240.0
1398,-1424.671875,139424.671875,138000.0
1195,10960.234375,165039.765625,176000.0
887,-6185.796875,141685.796875,135500.0
