# About this notebook

This notebook presents decision trees model for all pre-processed features on all training data.

The codes follow Geron's book.

## Retrieve Dataset

In [1]:
# setup random indicies
import numpy as np

#np.random.seed(9)

#house_ids = range(1,1461)
#random_id = np.random.choice(house_ids, 1460, replace = False)


# load df
import pandas as pd
df = pd.read_csv('./data/train_model.csv')

# randomized index
df.drop(['Unnamed: 0', 'Id'], axis=1, inplace = True)
#df['random_id'] = random_id
#df.set_index('random_id', inplace = True)
#df.sort_index(inplace=True)
df.head(5)

# remove outliers
df.drop([907, 465], inplace=True)

# log transform SalePrice
df['SalePrice'] = np.log(df.SalePrice)

df.head(5)

Unnamed: 0,LotFrontage,LotArea,YearBuilt,GrLivArea,YrSold,SalePrice,MSZoning_RL,MSZoning_RMH,Alley_NoAccess,LotShape_Regular,...,OpenPorchSF_OpenPorch_Yes,Fence_no_fence,MoSold_peak_months,SaleType_non_deed,SaleCondition_not_normal,IsVinyl_yes,FullBaths_<=1FullBath,FullBaths_>=3FullBaths,HalfBaths_>=1HalfBaths,Enclosed_combined_EnclosedPorch_Yes
0,65.0,8450,7,1710,2008,12.247694,1,0,1,1,...,1,1,0,0,0,1,0,1,1,0
1,80.0,9600,34,1262,2007,12.109011,1,0,1,1,...,0,1,1,0,0,0,0,0,1,0
2,68.0,11250,9,1786,2008,12.317167,1,0,1,0,...,1,1,0,0,0,1,0,1,1,0
3,60.0,9550,95,1717,2006,11.849398,1,0,1,0,...,1,1,0,0,1,0,0,0,0,1
4,84.0,14260,10,2198,2008,12.429216,1,0,1,0,...,1,1,0,0,0,1,0,1,1,0


In [2]:
features = df.drop(['SalePrice'], axis = 1)
logsaleprice = df.SalePrice

# 1. Linear Regression

In [3]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

lin_reg = linear_model.LinearRegression()
lin_scores = cross_val_score(lin_reg, features, logsaleprice, scoring = "neg_mean_squared_error", cv=5)
lin_rmse_scores = np.sqrt(-lin_scores)

In [4]:
def display_scores(scores):
    print("Scores: ", scores,3)
    print("Mean: ", round(scores.mean(), 3))
    print("Standard Deviation: ", round(scores.std(),3))

In [5]:
display_scores(lin_rmse_scores)

Scores:  [0.12598109 0.15937176 0.13833349 0.137612   0.16601185] 3
Mean:  0.145
Standard Deviation:  0.015


# 2. Trees

In [6]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
scores = cross_val_score(tree_reg, features, logsaleprice, scoring = "neg_mean_squared_error", cv=5)
tree_rmse_scores = np.sqrt(-scores)

In [7]:
display_scores(tree_rmse_scores)

Scores:  [0.21687861 0.24666911 0.21273425 0.20859767 0.20787722] 3
Mean:  0.219
Standard Deviation:  0.014


### - from chapter 6, exercise 7

In [8]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, logsaleprice, test_size=0.2, random_state=42)

In [9]:
from sklearn.model_selection import GridSearchCV

params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
grid_search_cv = GridSearchCV(DecisionTreeRegressor(random_state=42), params, n_jobs=-1, verbose=1)

grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=-1)]: Done 882 out of 882 | elapsed:    2.1s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=42,
           splitter='best'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 'min_samples_split': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [10]:
grid_search_cv.best_estimator_

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=38, min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=42, splitter='best')

In [11]:
#from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

y_pred = grid_search_cv.predict(X_test)
#accuracy_score(y_test, y_pred)
print("MSE: ", round(mean_squared_error(y_test, y_pred), 3))
print("RMSE: ", round(np.sqrt(mean_squared_error(y_test, y_pred)), 3))

MSE:  0.042
RMSE:  0.205


In [12]:
pd.DataFrame({'test': y_test, 'preds': y_pred, 'diff': y_test-y_pred}).head()

Unnamed: 0,diff,preds,test
1322,0.074871,12.079909,12.154779
837,-0.137977,11.650903,11.512925
413,-0.099115,11.751802,11.652687
523,-1.027702,13.154461,12.126759
1037,0.053251,12.513986,12.567237


In [13]:
pd.DataFrame({'test': np.exp(y_test), 'preds': np.exp(y_pred), 'diff': np.exp(y_test)-np.exp(y_pred)}).head()

Unnamed: 0,diff,preds,test
1322,13705.964104,176294.035896,190000.0
837,-14794.939577,114794.939577,100000.0
413,-11982.222045,126982.222045,115000.0
523,-331558.940878,516308.940878,184750.0
1037,14883.257766,272116.742234,287000.0


# 3. Random Forest

In [14]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_scores = cross_val_score(forest_reg, features, logsaleprice, scoring = "neg_mean_squared_error", cv=5)
forest_rmse_scores = np.sqrt(-forest_scores)

In [15]:
display_scores(forest_rmse_scores)

Scores:  [0.15080789 0.17248488 0.16712688 0.15220764 0.15535659] 3
Mean:  0.16
Standard Deviation:  0.009


### - from chapter 6, exercise 8

In [16]:
len(X_train)

1166

In [17]:
from sklearn.model_selection import ShuffleSplit

n_trees = 500
n_instances = 10

mini_sets = []

rs = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42)
for mini_train_index, mini_test_index in rs.split(X_train):
    X_mini_train = X_train.loc[mini_train_index] #X_train[mini_train_index]
    y_mini_train = y_train[mini_train_index]
    mini_sets.append((X_mini_train, y_mini_train))

In [18]:
from sklearn.base import clone

forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)]

accuracy_scores = []

for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(X_mini_train, y_mini_train)
    
    y_pred = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

np.mean(accuracy_scores)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [85]:
mini_train_index in list(rs.split(X_train))[0][1]

  """Entry point for launching an IPython kernel.


False

In [77]:
mini_train_index

array([1072,  107,  733,  238,  230,  886,  645,   82, 1069, 1143])

In [76]:
sum(X_train.index == 107)

0

In [19]:
X_mini_train

Unnamed: 0,LotFrontage,LotArea,YearBuilt,GrLivArea,YrSold,MSZoning_RL,MSZoning_RMH,Alley_NoAccess,LotShape_Regular,LandContour_Unflat,...,OpenPorchSF_OpenPorch_Yes,Fence_no_fence,MoSold_peak_months,SaleType_non_deed,SaleCondition_not_normal,IsVinyl_yes,FullBaths_<=1FullBath,FullBaths_>=3FullBaths,HalfBaths_>=1HalfBaths,Enclosed_combined_EnclosedPorch_Yes
1123,,,,,,,,,,,...,,,,,,,,,,
87,40.0,3951.0,1.0,1224.0,2009.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
330,70.049958,10624.0,46.0,1728.0,2007.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
466,85.0,10628.0,40.0,1277.0,2007.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
121,50.0,6060.0,71.0,1123.0,2007.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1044,80.0,9600.0,29.0,2524.0,2009.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1095,,,,,,,,,,,...,,,,,,,,,,
1130,,,,,,,,,,,...,,,,,,,,,,
860,55.0,7642.0,92.0,1426.0,2007.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1126,53.0,3684.0,3.0,1555.0,2009.0,1.0,0.0,1.0,1.0,0.0,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [72]:
X_train

Unnamed: 0,LotFrontage,LotArea,YearBuilt,GrLivArea,YrSold,MSZoning_RL,MSZoning_RMH,Alley_NoAccess,LotShape_Regular,LandContour_Unflat,...,OpenPorchSF_OpenPorch_Yes,Fence_no_fence,MoSold_peak_months,SaleType_non_deed,SaleCondition_not_normal,IsVinyl_yes,FullBaths_<=1FullBath,FullBaths_>=3FullBaths,HalfBaths_>=1HalfBaths,Enclosed_combined_EnclosedPorch_Yes
254,70.000000,8400,53,1314,2010,1,0,1,1,0,...,0,1,1,0,0,0,0,0,0,0
1067,80.000000,9760,46,1611,2008,1,0,1,1,0,...,1,1,1,0,0,0,0,0,1,1
865,70.049958,8750,40,1002,2009,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
799,60.000000,7200,73,1768,2007,1,0,1,1,0,...,0,0,1,0,0,0,0,0,1,1
380,50.000000,5000,86,1691,2010,1,0,0,1,0,...,0,1,1,0,0,0,0,0,0,1
303,70.000000,9800,38,894,2006,1,0,1,1,0,...,0,0,1,0,1,1,0,0,0,0
86,122.000000,11911,5,1560,2009,1,0,1,0,0,...,1,1,0,0,0,1,0,0,1,0
1385,40.000000,5436,88,1154,2010,0,1,1,1,0,...,1,0,1,0,0,1,0,0,0,0
265,78.000000,12090,29,1422,2008,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
793,76.000000,9158,3,1496,2007,1,0,1,1,0,...,1,1,1,1,1,0,0,0,0,0


### - fine-tuning random forests

In [14]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(features, response)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [15]:
#The best hyperparameter combination found
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [16]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=6, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [17]:
#Check scores of hyperparameter combinations tested during grid search
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.21568097213241097 {'max_features': 2, 'n_estimators': 3}
0.17959547331040177 {'max_features': 2, 'n_estimators': 10}
0.16899891584566248 {'max_features': 2, 'n_estimators': 30}
0.21054024135435345 {'max_features': 4, 'n_estimators': 3}
0.17347486098144738 {'max_features': 4, 'n_estimators': 10}
0.1597423270221992 {'max_features': 4, 'n_estimators': 30}
0.19294958450530328 {'max_features': 6, 'n_estimators': 3}
0.16583701315534802 {'max_features': 6, 'n_estimators': 10}
0.15416358164636332 {'max_features': 6, 'n_estimators': 30}
0.19781785392070947 {'max_features': 8, 'n_estimators': 3}
0.1680079761507098 {'max_features': 8, 'n_estimators': 10}
0.15540185003441429 {'max_features': 8, 'n_estimators': 30}
0.2199207219266737 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
0.17706749168234392 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
0.20639672406024 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
0.17050605086371218 {'bootstrap': False, 'max_featur

In [18]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_bootstrap,param_max_features,param_n_estimators,params,rank_test_score,split0_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.006235,0.000851,-0.046518,-0.01193561,,2,3,"{'max_features': 2, 'n_estimators': 3}",17,-0.039916,...,-0.04645,-0.0113512,-0.049833,-0.01228712,-0.04856,-0.01152011,0.001436,9.9e-05,0.003478,0.0007946442
1,0.016913,0.001733,-0.032255,-0.006519447,,2,10,"{'max_features': 2, 'n_estimators': 10}",11,-0.029927,...,-0.030109,-0.006643156,-0.031073,-0.006671455,-0.031496,-0.006471046,0.002329,0.000433,0.003259,0.0002053938
2,0.036535,0.002736,-0.028561,-0.004728377,,2,30,"{'max_features': 2, 'n_estimators': 30}",7,-0.025567,...,-0.028928,-0.004425522,-0.026208,-0.005049019,-0.0287,-0.005031232,0.001398,0.000133,0.00276,0.0002600703
3,0.005261,0.000625,-0.044327,-0.01120561,,4,3,"{'max_features': 4, 'n_estimators': 3}",16,-0.03723,...,-0.049107,-0.01117693,-0.040203,-0.009886719,-0.049952,-0.01106679,0.000307,3.2e-05,0.004951,0.0008560115
4,0.016168,0.001342,-0.030094,-0.006134088,,4,10,"{'max_features': 4, 'n_estimators': 10}",9,-0.027701,...,-0.033774,-0.00641158,-0.025442,-0.005913036,-0.033333,-0.005944275,0.001226,9.7e-05,0.003207,0.0001855924
5,0.055448,0.002975,-0.025518,-0.004342397,,4,30,"{'max_features': 4, 'n_estimators': 30}",3,-0.021802,...,-0.028988,-0.004279294,-0.023409,-0.004259396,-0.026663,-0.004443236,0.005648,0.000377,0.002572,6.695352e-05
6,0.005863,0.000647,-0.03723,-0.01071088,,6,3,"{'max_features': 6, 'n_estimators': 3}",12,-0.03839,...,-0.040135,-0.009399002,-0.036325,-0.01033643,-0.041109,-0.01159931,0.000112,2e-05,0.003878,0.001049603
7,0.016486,0.001116,-0.027502,-0.005473881,,6,10,"{'max_features': 6, 'n_estimators': 10}",5,-0.023755,...,-0.030642,-0.005217558,-0.026685,-0.005809345,-0.029529,-0.005759223,0.000446,2.5e-05,0.00241,0.000256109
8,0.048292,0.002536,-0.023766,-0.00406651,,6,30,"{'max_features': 6, 'n_estimators': 30}",1,-0.021739,...,-0.025752,-0.003924621,-0.022532,-0.00424396,-0.023473,-0.004241844,0.002867,2.7e-05,0.001557,0.0001549928
9,0.006742,0.000756,-0.039132,-0.009619299,,8,3,"{'max_features': 8, 'n_estimators': 3}",14,-0.037429,...,-0.036373,-0.009700268,-0.038527,-0.0104666,-0.040422,-0.008910684,0.000928,0.000161,0.002315,0.0005088871
