### House Prices: Advanced Regression Techniques

Predict sales prices and practice feature engineering, RFs, and gradient boosting

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [4]:
train = pd.read_csv('train_clean.csv')
train = train.set_index('Id')
test = pd.read_csv('test.csv')
test = test.set_index('Id')

In [5]:
train.head()

Unnamed: 0_level_0,Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,EnclosedPorch,3SsnPorch,ScreenPorch,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,60,RL,65.0,8450,Pave,No Alley,Reg,Lvl,AllPub,...,0,0,0,No Fence,0,2,2008,WD,Normal,208500
2,1,20,RL,80.0,9600,Pave,No Alley,Reg,Lvl,AllPub,...,0,0,0,No Fence,0,5,2007,WD,Normal,181500
3,2,60,RL,68.0,11250,Pave,No Alley,IR1,Lvl,AllPub,...,0,0,0,No Fence,0,9,2008,WD,Normal,223500
4,3,70,RL,60.0,9550,Pave,No Alley,IR1,Lvl,AllPub,...,272,0,0,No Fence,0,2,2006,WD,Abnorml,140000
5,4,60,RL,84.0,14260,Pave,No Alley,IR1,Lvl,AllPub,...,0,0,0,No Fence,0,12,2008,WD,Normal,250000


### Cleaning Dataset and Removing Missingness

In [16]:
#Putting training and testing dataset together to evaluate NA's and Engineer Features
test['SalePrice'] = -1
full_set = pd.concat([train,test],axis=0)
full_set = full_set[train.columns]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
#Columns with at least one NA value
nas_train = np.sum(train.isna()).reset_index()
nas_train.columns = ['feature', 'NAs_Train']
nas_train.set_index('feature', inplace=True)

nas_test = np.sum(test.isna()).reset_index()
nas_test.columns = ['feature', 'NAs_Test']
nas_test.set_index('feature', inplace=True)

nas_total = pd.concat([nas_train, nas_test], axis='columns')
nas_total['Total'] = nas_total['NAs_Test']+nas_total['NAs_Train']
nas_total = nas_total[nas_total['Total']>0].sort_values('Total',ascending=False)
print(nas_train)

               NAs_Train
feature                 
Unnamed: 0             0
MSSubClass             0
MSZoning               0
LotFrontage          259
LotArea                0
...                  ...
MoSold                 0
YrSold                 0
SaleType               0
SaleCondition          0
SalePrice              0

[75 rows x 1 columns]


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


I will not use columns with plenty of NA's since they don't provide a lot of information.
Variables with a very small proportion, specially in the testing dataset I will fill them up with the most common value (median).

In [18]:
train.columns

Index(['Unnamed: 0', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea',
       'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCars', 'GarageArea',
       'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'En

In [19]:
columns_to_remove = nas_total.index[0:6]
columns_to_fill = nas_total.index[6:]
print("Columns to Remove: ",columns_to_remove)
print("\nColumns to Fill: ",columns_to_fill)

Columns to Remove:  Index(['Alley', 'Fence', 'FireplaceQu', 'LotFrontage', 'GarageCond',
       'GarageQual'],
      dtype='object')

Columns to Fill:  Index(['GarageFinish', 'GarageType', 'BsmtCond', 'BsmtQual', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'MSZoning', 'Functional', 'Utilities',
       'BsmtHalfBath', 'BsmtFullBath', 'GarageArea', 'GarageCars',
       'BsmtFinSF2', 'Exterior2nd', 'Exterior1st', 'BsmtUnfSF', 'KitchenQual',
       'SaleType', 'TotalBsmtSF', 'BsmtFinSF1'],
      dtype='object')


__Final dataframe with cleaned data.__

In [20]:
clean_df = full_set.drop(columns_to_remove, axis=1)
for feature in columns_to_fill:
    mode_value = clean_df[feature].mode()[0]
    clean_df[feature].fillna(value=mode_value,inplace=True)
print(clean_df.shape)
print(full_set.shape)

(2919, 69)
(2919, 75)


__Final dataframe with cleaned data and hot encoding.__

In [21]:
numeric_columns = ['LotFrontage','LotArea','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','GarageYrBlt','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal']
categorical_columns = ['MSSubClass','MSZoning','Street','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC','CentralAir','Electrical','KitchenQual','Functional','GarageType','GarageFinish','PavedDrive','SaleType','SaleCondition']
data = pd.get_dummies(clean_df, prefix_sep='_', columns=categorical_columns, drop_first=False)


#removed "['GarageQual', 'MasVnrType', 'GarageCond']"

### Training Set and Testing Set

In [22]:
yTr = data.SalePrice[data['SalePrice']>=0]
xTr = data[data['SalePrice']>=0].drop(['SalePrice'], axis=1)
xTe = data[data['SalePrice']==-1].drop(['SalePrice'], axis=1)

#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state = 0)

### 1. Basic Linear Regression Model

In [23]:
ols = linear_model.LinearRegression()
area_columns = ['GrLivArea','LotArea','TotalBsmtSF','OverallQual']
x = xTr[area_columns]
y = yTr[:]
ols.fit(x, y)
print("Accuracy: ",round(ols.score(x, y),3))
print("MSE: ",round(mean_squared_error(ols.predict(x),y)**.5))
ols_scores = -cross_val_score(ols, x, y, scoring='neg_mean_squared_error', cv = 10)
print("Cross-Val Score =",round(np.mean(ols_scores**.5),3))

Accuracy:  0.748
MSE:  39856.0
Cross-Val Score = 39534.779


### 2. Saturated Linear Regression Model

In [24]:
ols = linear_model.LinearRegression()
x = xTr
y = yTr[:]
ols.fit(x, y)
ols_scores = -cross_val_score(ols, x, y, scoring='neg_mean_squared_error', cv = 10)
print("Accuracy: ",round(ols.score(x, y),3))
print("MSE: ",round(mean_squared_error(ols.predict(x),y)**.5))
print("Cross-Val Score =",round(np.mean(ols_scores**.5),3))

Accuracy:  0.928
MSE:  21278.0
Cross-Val Score = 69997436.307


### 3. Ridge Regression Model

In [26]:
x = xTr[:]
y = yTr[:]
ridge = linear_model.RidgeCV(alphas=np.arange(0.05,4,0.01),normalize=True)
ridge.fit(x, y)
print("Best alpha = ",ridge.alpha_)
ridge = linear_model.Ridge(ridge.alpha_,normalize=True)
ridge.fit(x, y)
ridge_scores = -cross_val_score(ridge, x, y, scoring='neg_mean_squared_error', cv = 10)
print("Accuracy: ",round(ridge.score(x, y),3))
print("MSE: ",round(mean_squared_error(ridge.predict(x),y)**.5))
print("Cross-Val Score =",round(np.mean(ridge_scores**.5),3))

Best alpha =  0.4000000000000001
Accuracy:  0.909
MSE:  23924.0
Cross-Val Score = 29158.599


### 4. Lasso Regression Model

In [28]:
x = xTr[:]
y = yTr[:]

lasso = linear_model.LassoCV(eps=0.001, n_alphas=100, cv=5, normalize=True)

lasso.fit(x, y)
print("Best alpha = ",lasso.alpha_)
alpha=lasso.alpha_
lasso = linear_model.Lasso(alpha=lasso.alpha_, normalize=True)
lasso.fit(x, y)
lasso_scores = -cross_val_score(lasso, x, y, scoring='neg_mean_squared_error', cv = 10)
print("Accuracy: ",round(lasso.score(x, y),3))
print("MSE: ",round(mean_squared_error(lasso.predict(x),y)**.5))
print("Cross-Val Score =",round(np.mean(lasso_scores**.5),3))

Best alpha =  33.03120314016188
Accuracy:  0.909
MSE:  23999.0
Cross-Val Score = 29678.702


### 5. CART Regression Tree

In [29]:
x = xTr[:]
y = yTr[:]

In [30]:
for i in range(3,16):
    tree = DecisionTreeRegressor(max_depth=i)
    tree.fit(x, y)
    tree_scores = -cross_val_score(tree, x, y, scoring='neg_mean_squared_error', cv = 10)
    print("Tree Depth = ",i,"\tAccuracy: ",round(tree.score(x, y),3),"\tMSE: ",round(mean_squared_error(tree.predict(x),y)**.5),"\tCross-Val Score =",round(np.mean(tree_scores**.5),3))

Tree Depth =  3 	Accuracy:  0.744 	MSE:  40142.0 	Cross-Val Score = 42869.114
Tree Depth =  4 	Accuracy:  0.81 	MSE:  34632.0 	Cross-Val Score = 40096.592
Tree Depth =  5 	Accuracy:  0.862 	MSE:  29524.0 	Cross-Val Score = 39962.158
Tree Depth =  6 	Accuracy:  0.903 	MSE:  24686.0 	Cross-Val Score = 37793.677
Tree Depth =  7 	Accuracy:  0.936 	MSE:  20053.0 	Cross-Val Score = 39026.151
Tree Depth =  8 	Accuracy:  0.958 	MSE:  16179.0 	Cross-Val Score = 38483.251
Tree Depth =  9 	Accuracy:  0.973 	MSE:  13019.0 	Cross-Val Score = 37697.522
Tree Depth =  10 	Accuracy:  0.983 	MSE:  10368.0 	Cross-Val Score = 37929.305
Tree Depth =  11 	Accuracy:  0.989 	MSE:  8317.0 	Cross-Val Score = 37601.332
Tree Depth =  12 	Accuracy:  0.993 	MSE:  6779.0 	Cross-Val Score = 39721.782
Tree Depth =  13 	Accuracy:  0.995 	MSE:  5430.0 	Cross-Val Score = 39006.23
Tree Depth =  14 	Accuracy:  0.997 	MSE:  4437.0 	Cross-Val Score = 37518.186
Tree Depth =  15 	Accuracy:  0.998 	MSE:  3414.0 	Cross-Val Score

### 6. Gradient Boosting Regression

In [32]:
x = xTr[:]
y = yTr[:]
boost = GradientBoostingRegressor(n_estimators=500)
boost.fit(x, y)
boost_scores = -cross_val_score(boost, x, y, scoring='neg_mean_squared_error', cv = 10)
print("Accuracy: ",round(boost.score(x, y),3))
print("MSE: ",round(mean_squared_error(boost.predict(x),y)**.5))
print("Cross-Val Score =",round(np.mean(boost_scores**.5),3))

Accuracy:  0.992
MSE:  6997.0
Cross-Val Score = 24033.923


__GridSearch for Random Forest Regressor__

In [34]:
param_grid = [
    {'learning_rate':[0.05,0.1],'max_depth':[3,4,5,6,7,8]}
            ]
boost = GradientBoostingRegressor()
grid_search = GridSearchCV(boost,param_grid, cv=5, scoring='neg_mean_squared_error',return_train_score=True)
grid_search.fit(x,y)
print(grid_search.best_params_)

{'learning_rate': 0.1, 'max_depth': 4}


In [36]:
#Best parameters
grid_search.best_estimator_

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=4,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [37]:
cvres = grid_search.cv_results_
for mean_score,params in zip(cvres["mean_test_score"],cvres["params"]):
    print((-mean_score)**.5,params)

28186.750429624317 {'learning_rate': 0.05, 'max_depth': 3}
27253.430662397477 {'learning_rate': 0.05, 'max_depth': 4}
28699.55005318201 {'learning_rate': 0.05, 'max_depth': 5}
29674.946220692185 {'learning_rate': 0.05, 'max_depth': 6}
30315.019470375577 {'learning_rate': 0.05, 'max_depth': 7}
31671.990068568597 {'learning_rate': 0.05, 'max_depth': 8}
26268.429650054844 {'learning_rate': 0.1, 'max_depth': 3}
26171.231300810883 {'learning_rate': 0.1, 'max_depth': 4}
28707.943079087054 {'learning_rate': 0.1, 'max_depth': 5}
29586.506202916513 {'learning_rate': 0.1, 'max_depth': 6}
30329.827710869893 {'learning_rate': 0.1, 'max_depth': 7}
32082.405809831704 {'learning_rate': 0.1, 'max_depth': 8}


### 7. Random Forest Regressor

In [39]:
x = xTr[:]
y = yTr[:]
forest = RandomForestRegressor(n_estimators=100)
forest.fit(x, y)
forest_scores = -cross_val_score(forest, x, y, scoring='neg_mean_squared_error', cv = 10)
print("Accuracy: ",round(forest.score(x, y),3))
print("MSE: ",round(mean_squared_error(forest.predict(x),y)**.5))
print("Cross-Val Score =",round(np.mean(forest_scores**.5),3))

Accuracy:  0.981
MSE:  10995.0
Cross-Val Score = 28669.239


__GridSearch for Random Forest Regressor__

In [40]:
param_grid = [
    {'n_estimators':[25,30,35,40,45,50],'max_features':[6,8,10,12,14,16,18,20]},
    {'bootstrap':[False],'n_estimators':[30,35,40,45,50,55,60,65,70],'max_features':[5,7,9,11,13,15,17,19]}
            ]
forest = RandomForestRegressor()
grid_search = GridSearchCV(forest,param_grid, cv=5, scoring='neg_mean_squared_error',return_train_score=True)
grid_search.fit(x,y)
print(grid_search.best_params_)

{'bootstrap': False, 'max_features': 19, 'n_estimators': 70}


In [41]:
#Best parameters
grid_search.best_estimator_

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
                      max_features=19, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=70,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [42]:
cvres = grid_search.cv_results_
for mean_score,params in zip(cvres["mean_test_score"],cvres["params"]):
    print((-mean_score)**.5,params)

34290.10729536872 {'max_features': 6, 'n_estimators': 25}
34104.24005078565 {'max_features': 6, 'n_estimators': 30}
33885.391932792605 {'max_features': 6, 'n_estimators': 35}
33053.275624849484 {'max_features': 6, 'n_estimators': 40}
33567.16786415951 {'max_features': 6, 'n_estimators': 45}
33698.779162502346 {'max_features': 6, 'n_estimators': 50}
32981.157092264366 {'max_features': 8, 'n_estimators': 25}
32488.46371951676 {'max_features': 8, 'n_estimators': 30}
32913.05643747934 {'max_features': 8, 'n_estimators': 35}
32738.34956442204 {'max_features': 8, 'n_estimators': 40}
33137.84538421493 {'max_features': 8, 'n_estimators': 45}
32533.955222202152 {'max_features': 8, 'n_estimators': 50}
32246.386297746918 {'max_features': 10, 'n_estimators': 25}
32092.140243782796 {'max_features': 10, 'n_estimators': 30}
31489.443530333483 {'max_features': 10, 'n_estimators': 35}
31964.64679985951 {'max_features': 10, 'n_estimators': 40}
31277.67238701812 {'max_features': 10, 'n_estimators': 45}
3