> Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.

> With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.

In [1]:
%pwd

'/home/lao/notebook/HousePrices'

In [2]:
path = '/home/lao/notebook/HousePrices'

In [3]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt

In [4]:
#housing_data = pd.read_csv(path + '/train.csv', keep_default_na = False)
housing_data = pd.read_csv(path + '/train.csv')

In [5]:
housing_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
housing_data.drop('Id', axis = 1, inplace = True)

In [7]:
housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-

In [8]:
housing_data.shape

(1460, 80)

In [9]:
housing_data.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [10]:
housing_data.isnull().sum().sort_values(ascending = False)

PoolQC           1453
MiscFeature      1406
Alley            1369
Fence            1179
FireplaceQu       690
LotFrontage       259
GarageType         81
GarageCond         81
GarageFinish       81
GarageQual         81
GarageYrBlt        81
BsmtFinType2       38
BsmtExposure       38
BsmtQual           37
BsmtCond           37
BsmtFinType1       37
MasVnrArea          8
MasVnrType          8
Electrical          1
RoofMatl            0
Exterior1st         0
RoofStyle           0
ExterQual           0
Exterior2nd         0
YearBuilt           0
ExterCond           0
Foundation          0
YearRemodAdd        0
SalePrice           0
OverallCond         0
                 ... 
GarageArea          0
PavedDrive          0
WoodDeckSF          0
OpenPorchSF         0
3SsnPorch           0
BsmtUnfSF           0
ScreenPorch         0
PoolArea            0
MiscVal             0
MoSold              0
YrSold              0
SaleType            0
Functional          0
TotRmsAbvGrd        0
KitchenQua

In [11]:
#[(vnr_area, vnr_type) for vnr_area, vnr_type in zip(housing_data['MasVnrArea'], housing_data['MasVnrType'])]

In [12]:
housing_data['MSSubClass'] = housing_data['MSSubClass'].astype(np.str)

In [13]:
lf_median = housing_data['LotFrontage'].median()
gyb_median = housing_data['GarageYrBlt'].median()

values = {'LotFrontage': lf_median, 'GarageYrBlt': gyb_median, 'MasVnrArea': 0, 'Electrical': 'SBrkr'  }

In [14]:
housing_data.fillna(value=values, inplace = True)
housing_data.fillna(value = 'None', inplace = True)

In [15]:
#housing_data['Electrical'].value_counts()
#housing_data[housing_data['Electrical'].isnull()]

In [16]:
numeric_col = housing_data.describe().columns
numeric_col

Index(['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [17]:
#housing_data_num = housing_data[numeric_col]
housing_data_num = housing_data.select_dtypes(include = [np.number])
housing_data_cat = housing_data.select_dtypes(include = ['object'])

In [18]:
numeric_corr = housing_data_num.corr()['SalePrice'].sort_values(ascending = False)
numeric_corr.head(30)

SalePrice       1.000000
OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
MasVnrArea      0.472614
Fireplaces      0.466929
GarageYrBlt     0.466754
BsmtFinSF1      0.386420
LotFrontage     0.334771
WoodDeckSF      0.324413
2ndFlrSF        0.319334
OpenPorchSF     0.315856
HalfBath        0.284108
LotArea         0.263843
BsmtFullBath    0.227122
BsmtUnfSF       0.214479
BedroomAbvGr    0.168213
ScreenPorch     0.111447
PoolArea        0.092404
MoSold          0.046432
3SsnPorch       0.044584
BsmtFinSF2     -0.011378
BsmtHalfBath   -0.016844
Name: SalePrice, dtype: float64

In [19]:
housing_num = housing_data_num.drop('SalePrice', axis = 1)
housing_labels = housing_data_num['SalePrice'].copy()

In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
housing_scaled = scaler.fit_transform(housing_num)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [21]:
housing_cat_encoded = pd.get_dummies(housing_data_cat)

In [22]:
housing_cat_encoded.values

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0]], dtype=uint8)

In [23]:
#Concatenate (Column-Bind) Processed Columns Back Together
housing = np.concatenate([housing_scaled, housing_cat_encoded.values], axis=1)

In [None]:
import xgboost as xgb
data_dmatrix = xgb.DMatrix(data = housing, label = housing_labels )

In [178]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor(random_state = 42)

In [179]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [180]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [181]:
display_scores(tree_rmse_scores)

Scores: [36577.29941554 40538.50231866 34557.3058155  45052.84463112
 38228.23044468 27850.243369   46328.49164317 35562.60256243
 55956.29273181 37232.8973488 ]
Mean: 39788.47102807136
Standard deviation: 7344.846960301038


In [187]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha = 5, max_iter = 4000)

scores = cross_val_score(lasso_reg, housing, housing_labels, scoring="neg_mean_squared_error", cv=10)
lasso_rmse_scores = np.sqrt(-scores)



In [188]:
display_scores(lasso_rmse_scores)

Scores: [22101.71602702 31950.28561764 23063.21028781 41646.54860571
 29698.92986311 42137.15798717 23614.04421014 22552.21584071
 66541.55695732 22767.92933428]
Mean: 32607.359473091812
Standard deviation: 13465.525663227902


In [189]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(random_state = 42)

scores = cross_val_score(forest_reg, housing, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)



In [190]:
display_scores(forest_rmse_scores)

Scores: [26196.61314624 26694.09072819 25684.62603144 38617.69285994
 36056.61358608 28168.1261545  28200.53760257 24783.76486138
 45971.43823458 29717.15796   ]
Mean: 31009.066116490227
Standard deviation: 6585.985642815913


In [191]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [203]:
param_grid = [ 
    {'n_estimators' : [3, 6, 9 , 10, 15, 20, 30, 50], 'max_features': [2, 4, 6, 8, 10, 12, 14, 15]},
    {'bootstrap': [False], 'n_estimators' : [3, 6, 9 , 10, 15, 20, 30, 50], 'max_features':[2, 4, 6, 8, 10, 12, 14, 15] },
    
]

grid_search = GridSearchCV(forest_reg, param_grid, cv = 10, scoring = 'neg_mean_squared_error')

grid_search.fit(housing, housing_labels)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 6, 9, 10, 15, 20, 30, 50], 'max_features': [2, 4, 6, 8, 10, 12, 14, 15]}, {'bootstrap': [False], 'n_estimators': [3, 6, 9, 10, 15, 20, 30, 50], 'max_features': [2, 4, 6, 8, 10, 12, 14, 15]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [204]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 15, 'n_estimators': 50}

In [205]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

48232.79300257081 {'max_features': 2, 'n_estimators': 3}
43133.12450835211 {'max_features': 2, 'n_estimators': 6}
40745.817536448194 {'max_features': 2, 'n_estimators': 9}
40335.06939777563 {'max_features': 2, 'n_estimators': 10}
39306.65726631843 {'max_features': 2, 'n_estimators': 15}
38577.87718335806 {'max_features': 2, 'n_estimators': 20}
38102.479663711376 {'max_features': 2, 'n_estimators': 30}
36741.70247793934 {'max_features': 2, 'n_estimators': 50}
46926.69405263236 {'max_features': 4, 'n_estimators': 3}
41321.19456743308 {'max_features': 4, 'n_estimators': 6}
40020.77664439357 {'max_features': 4, 'n_estimators': 9}
39399.105831669964 {'max_features': 4, 'n_estimators': 10}
37937.42381754504 {'max_features': 4, 'n_estimators': 15}
37438.4521762263 {'max_features': 4, 'n_estimators': 20}
36216.47492503696 {'max_features': 4, 'n_estimators': 30}
35127.55290110366 {'max_features': 4, 'n_estimators': 50}
42470.909417671755 {'max_features': 6, 'n_estimators': 3}
38254.95216264214 

In [199]:
param_grid = {'n_estimators' : list(range(2,20)) , 'max_features': list(range(2, 50)), 'bootstrap': [False]}

grid_search = RandomizedSearchCV(forest_reg, param_grid, cv = 10, n_iter = 30, scoring = 'neg_mean_squared_error')

grid_search.fit(housing, housing_labels)

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=30, n_jobs=None,
          param_distributions={'n_estimators': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 'max_features': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 'bootstrap': [False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_er

In [200]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 48, 'n_estimators': 11}

In [201]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

31037.69466170645 {'n_estimators': 7, 'max_features': 29, 'bootstrap': False}
31849.324438277887 {'n_estimators': 10, 'max_features': 28, 'bootstrap': False}
29386.803591067524 {'n_estimators': 19, 'max_features': 39, 'bootstrap': False}
29175.906581022467 {'n_estimators': 11, 'max_features': 48, 'bootstrap': False}
31585.811120012626 {'n_estimators': 11, 'max_features': 18, 'bootstrap': False}
32369.080589836158 {'n_estimators': 17, 'max_features': 7, 'bootstrap': False}
30841.608273353027 {'n_estimators': 11, 'max_features': 19, 'bootstrap': False}
31547.62658995039 {'n_estimators': 13, 'max_features': 18, 'bootstrap': False}
37347.81084351622 {'n_estimators': 13, 'max_features': 3, 'bootstrap': False}
31407.19256867478 {'n_estimators': 14, 'max_features': 28, 'bootstrap': False}
31120.648277481476 {'n_estimators': 14, 'max_features': 20, 'bootstrap': False}
30199.602149313472 {'n_estimators': 17, 'max_features': 37, 'bootstrap': False}
32352.7501361007 {'n_estimators': 4, 'max_featu

In [206]:
attributes = housing_num + housing_cat_encoded
feature_importances = grid_search.best_estimator_.feature_importances_
sorted(zip(feature_importances, attributes), reverse=True)

[(0.07736735996647816, 'BsmtFinType1_ALQ'),
 (0.07686354827279249, 'BsmtCond_None'),
 (0.06388745505310976, '3SsnPorch'),
 (0.05287612546436549, 'BldgType_Twnhs'),
 (0.047786348595794136, 'BldgType_TwnhsE'),
 (0.034337092928569686, 'BsmtFinType1_BLQ'),
 (0.03186350807092819, 'Alley_None'),
 (0.028853362161031515, 'LotConfig_Corner'),
 (0.027663377863865028, 'BsmtCond_Fa'),
 (0.024080892726134492, 'Neighborhood_ClearCr'),
 (0.020998053929453767, 'BldgType_1Fam'),
 (0.020607690642129983, 'KitchenQual_Ex'),
 (0.019942289123002832, 'BsmtExposure_Av'),
 (0.019740527437247347, 'Alley_Pave'),
 (0.017275926276007682, 'BsmtFinSF1'),
 (0.017177789954459836, 'Neighborhood_Edwards'),
 (0.016564874542191993, '2ndFlrSF'),
 (0.015619460879652807, 'BsmtFinSF2'),
 (0.015444841705844523, '1stFlrSF'),
 (0.014496649117163682, 'BsmtExposure_None'),
 (0.01434681321946598, 'Neighborhood_SWISU'),
 (0.013794980704955266, 'BsmtFinType1_LwQ'),
 (0.012505236149431047, 'LandSlope_Gtl'),
 (0.011785135918269922, 'Ho

In [None]:
param_grid = {'n_estimators' : list(range(3, 50)), 'max_features': list(range(2,30)), 'bootstrap': [True, False]}

grid_search = RandomizedSearchCV(forest_reg, param_grid, cv = 10, n_iter = 50, scoring = 'neg_mean_squared_error')

grid_search.fit(housing_scaled, housing_labels)

In [184]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 8, 'n_estimators': 34}

In [185]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

28747.29960551998 {'n_estimators': 34, 'max_features': 8, 'bootstrap': False}
