First, I import Kaggle's train and test sets. I clean up the train set by filling in missing numeric values with either 0 or the variable's median. I chose 0 for variables that describe an attribute the house doesn't have, such as the year the garage was built for a house without a garage. I also generate binary (dummy) variables for most of the qualitative variables. Then I obtain a labeled data set from the train set: I label the sale price as y, and I include most of the other variables in X. I leave out several variables that have very little variation. After this is done, I make sure that all variables are numeric instead of strings. Then I use scikit-learn's StandardScaler to standardize X. Finally, I divide X and y into a training set and a test set.

In [None]:
import pandas as pd
import numpy as np

# Load the Kaggle training data (features plus SalePrice) from the working directory.
# NOTE(review): the recorded output shows an 'Unnamed: 0' column, i.e. the CSV carries a
# saved row index that is read in as an ordinary column.
train = pd.read_csv(filepath_or_buffer='housing_train.csv')
train.head(n=20)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Econ 484/auxiliaries


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,...,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,...,2008,WD,Normal,208500
1,2,20,RL,80.0,...,2007,WD,Normal,181500
2,3,60,RL,68.0,...,2008,WD,Normal,223500
3,4,70,RL,60.0,...,2006,WD,Abnorml,140000
4,5,60,RL,84.0,...,2008,WD,Normal,250000


In [None]:
# Summary statistics of the training frame, used to eyeball ranges and missing counts.
train.describe()

In [None]:
# Load the Kaggle submission set; it has the same feature columns as train but no SalePrice.
test = pd.read_csv(filepath_or_buffer='housing_test.csv')
test.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,...,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,...,6,2010,WD,Normal
1,1462,20,RL,81.0,...,6,2010,WD,Normal
2,1463,60,RL,74.0,...,3,2010,WD,Normal
3,1464,60,RL,78.0,...,6,2010,WD,Normal
4,1465,120,RL,43.0,...,1,2010,WD,Normal


In [None]:
# Summary statistics of the Kaggle test frame, for comparison with the training frame.
test.describe()

In [None]:
# Impute the numeric gaps in the training frame:
#   LotFrontage -> column median
#   MasVnrArea / GarageYrBlt -> 0 (treated as "the house does not have this feature")
#
# The result is assigned back to the column instead of calling
# `train[col].fillna(..., inplace=True)`. The in-place form operates on an intermediate
# Series selected from the frame; pandas 2.x warns about it and with copy-on-write enabled
# it no longer updates `train` at all.
train['LotFrontage'] = train['LotFrontage'].fillna(train['LotFrontage'].median())
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(0)

In [None]:
def one_hot_shared_levels(frame, other, column):
    """One-hot encode ``frame[column]``, keeping only levels that also occur in ``other[column]``.

    ``pd.get_dummies`` emits its columns in sorted level order, so the columns are named
    from the data (``<column>_<level>``) rather than relabelled by hand: a hand-typed
    label list in any other order silently attaches the wrong name to each indicator,
    and a later drop-by-name then removes the wrong level.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to encode (here: the training data).
    other : pandas.DataFrame
        Frame whose levels decide what is kept (here: the Kaggle test data). A level
        that never occurs in ``other`` gets no indicator column, because the model
        could never see it at prediction time.
    column : str
        Name of the qualitative column present in both frames.

    Returns
    -------
    pandas.DataFrame
        One integer 0/1 column per shared level, in sorted level order, sharing
        ``frame``'s index. Rows whose value is missing or not a shared level are all 0.
    """
    shared_levels = sorted(set(frame[column].dropna()) & set(other[column].dropna()))
    as_categorical = frame[column].astype(pd.CategoricalDtype(categories=shared_levels))
    return pd.get_dummies(as_categorical, prefix=column, dtype=int)


# One indicator frame per qualitative variable. The variable names are the ones the
# concat cell below expects. Levels that the test set never contains are left out
# automatically by one_hot_shared_levels, replacing the hand-written drops.
# The prefix keeps names unique: several variables share level names such as 'Gd' or 'TA'.
MSZoningdummy = one_hot_shared_levels(train, test, 'MSZoning')
LotShapedummy = one_hot_shared_levels(train, test, 'LotShape')
LotConfigdummy = one_hot_shared_levels(train, test, 'LotConfig')
Neighborhooddummy = one_hot_shared_levels(train, test, 'Neighborhood')
BldgTypedummy = one_hot_shared_levels(train, test, 'BldgType')
HouseStyledummy = one_hot_shared_levels(train, test, 'HouseStyle')
RoofStyledummy = one_hot_shared_levels(train, test, 'RoofStyle')
Exterior1stdummy = one_hot_shared_levels(train, test, 'Exterior1st')
Exterior2nddummy = one_hot_shared_levels(train, test, 'Exterior2nd')
MasVnrTypedummy = one_hot_shared_levels(train, test, 'MasVnrType')
Foundationdummy = one_hot_shared_levels(train, test, 'Foundation')
BsmtQualdummy = one_hot_shared_levels(train, test, 'BsmtQual')
BsmtConddummy = one_hot_shared_levels(train, test, 'BsmtCond')
BsmtExposuredummy = one_hot_shared_levels(train, test, 'BsmtExposure')
HeatingQCdummy = one_hot_shared_levels(train, test, 'HeatingQC')
Electricaldummy = one_hot_shared_levels(train, test, 'Electrical')
KitchenQualdummy = one_hot_shared_levels(train, test, 'KitchenQual')
GarageTypedummy = one_hot_shared_levels(train, test, 'GarageType')
GarageFinishdummy = one_hot_shared_levels(train, test, 'GarageFinish')
GarageQualdummy = one_hot_shared_levels(train, test, 'GarageQual')
PavedDrivedummy = one_hot_shared_levels(train, test, 'PavedDrive')
SaleTypedummy = one_hot_shared_levels(train, test, 'SaleType')
SaleConditiondummy = one_hot_shared_levels(train, test, 'SaleCondition')

In [None]:
#Appends all of the binary variables to train and drops the original qualitative variables that were used to generate them
train_dummy_frames = [
    MSZoningdummy, LotShapedummy, LotConfigdummy, Neighborhooddummy, BldgTypedummy,
    HouseStyledummy, RoofStyledummy, Exterior1stdummy, Exterior2nddummy, MasVnrTypedummy,
    Foundationdummy, BsmtQualdummy, BsmtConddummy, BsmtExposuredummy, HeatingQCdummy,
    Electricaldummy, KitchenQualdummy, GarageTypedummy, GarageFinishdummy, GarageQualdummy,
    PavedDrivedummy, SaleTypedummy, SaleConditiondummy,
]
# Column-wise concatenation: the indicator frames are placed to the right of train, in list order.
train = pd.concat([train, *train_dummy_frames], axis=1)

# The raw qualitative columns are now redundant with their indicator columns.
train_encoded_sources = [
    'MSZoning', 'LotShape', 'LotConfig', 'Neighborhood', 'BldgType', 'HouseStyle',
    'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'HeatingQC', 'Electrical', 'KitchenQual', 'GarageType',
    'GarageFinish', 'GarageQual', 'PavedDrive', 'SaleType', 'SaleCondition',
]
train = train.drop(columns=train_encoded_sources)

train.head()


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,...,AdjLand,Alloca,Family,Partial
0,1,60,65.0,8450,...,0,0,1,0
1,2,20,80.0,9600,...,0,0,1,0
2,3,60,68.0,11250,...,0,0,1,0
3,4,70,60.0,9550,...,0,0,0,0
4,5,60,84.0,14260,...,0,0,1,0


In [None]:
#Creates labeled data set
# Target: the sale price.
y = train['SalePrice']

# Features: everything except the target, the Id, and the columns left out of the model.
# NOTE(review): 'Unnamed: 0' is not in this list, and the recorded output shows it is still
# a column of X, so the saved row index is used as a feature. Removing it needs the matching
# change on the Kaggle test frame so both feature sets keep the same columns.
columns_left_out = [
    'SalePrice', 'Id', 'Street', 'Alley', 'LandContour', 'Utilities', 'LandSlope',
    'Condition1', 'Condition2', 'RoofMatl', 'ExterQual', 'ExterCond', 'BsmtFinType1',
    'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'Heating', 'CentralAir', 'LowQualFinSF',
    'Functional', 'FireplaceQu', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
]
X = train.drop(columns=columns_left_out)
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,...,AdjLand,Alloca,Family,Partial
0,60,65.0,8450,7,...,0,0,1,0
1,20,80.0,9600,6,...,0,0,1,0
2,60,68.0,11250,7,...,0,0,1,0
3,70,60.0,9550,7,...,0,0,0,0
4,60,84.0,14260,8,...,0,0,1,0


In [None]:
from sklearn.preprocessing import StandardScaler

# Force every feature column, and the target, to a numeric dtype; anything that cannot be
# parsed becomes NaN instead of raising.
X = X.apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(y, errors='coerce')

# Standardise the features. `scaler` keeps the fitted statistics; `x_scaled` is the
# standardised array that the train/test split uses.
# NOTE(review): the scaler is fitted on all of X, i.e. before the train/test split.
scaler = StandardScaler()
scaler.fit(X)
x_scaled = scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled data for model comparison. No test_size is given, so
# scikit-learn's default split proportion applies; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_scaled,
    y,
    random_state=42,
)

Now I tune several candidate regression models. First, I import lasso, ridge, decision tree regressor, random forest regressor, k-neighbors regressor, gradient boosting regressor, and support vector machine. For each of these methods, I choose tuning parameters with scikit-learn's GridSearchCV, which performs 5-fold cross-validation on the training set. Once GridSearchCV has chosen the best tuning parameters, I use the model refit with those parameters to compute a score on the held-out test set.


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [None]:
#Lasso
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over the L1 penalty strength.
# NOTE(review): the recorded run selected 0.02, the largest value in this grid, so the
# optimum may lie outside it.
alpha_grid = {'alpha': [.008, .01, .012, .014, .016, .018, .02]}
grid_search = GridSearchCV(
    estimator=Lasso(),
    param_grid=alpha_grid,
    cv=5,
    return_train_score=True,
)
best_model = grid_search.fit(X_train, y_train)

# Report the winning penalty and the refit model's score on the held-out split.
print("Best alpha: ", best_model.best_params_['alpha'])
print("Test set score: {:.2f}".format(best_model.score(X_test, y_test)))

Best alpha:  0.02
Test set score: 0.86


In [None]:
#Ridge
# 5-fold cross-validated search over the L2 penalty strength (same grid as the lasso cell).
# NOTE(review): the recorded run selected 0.02, the largest value in this grid.
alpha_grid = {'alpha': [.008, .01, .012, .014, .016, .018, .02]}
grid_search = GridSearchCV(
    estimator=Ridge(),
    param_grid=alpha_grid,
    cv=5,
    return_train_score=True,
)
best_model = grid_search.fit(X_train, y_train)

# Report the winning penalty and the refit model's score on the held-out split.
print("Best alpha: ", best_model.best_params_['alpha'])
print("Test set score: {:.2f}".format(best_model.score(X_test, y_test)))

Best alpha:  0.02
Test set score: 0.86


In [None]:
#Decision Tree
# 5-fold cross-validated search over the maximum tree depth.
# NOTE(review): the recorded run selected 12, the deepest value in this grid.
grid = {'max_depth': [2, 3, 5, 8, 10, 12]}
grid_search = GridSearchCV(
    estimator=DecisionTreeRegressor(),
    param_grid=grid,
    cv=5,
    return_train_score=True,
)
best_model = grid_search.fit(X_train, y_train)

# Report the winning depth and the refit model's score on the held-out split.
print("Best max depth: ", best_model.best_params_['max_depth'])
print("Test set score: {:.2f}".format(best_model.score(X_test, y_test)))

Best max depth:  12
Test set score: 0.77


In [None]:
#Random Forest
# 5-fold cross-validated search over depth, forest size, leaf size and features per split.
grid = {
    'max_depth': [2, 3, 5, 8, 10],
    'n_estimators': [10, 20, 50, 75, 100, 125],
    'min_samples_leaf': [1, 2, 3, 5],
    'max_features': ['auto', 'sqrt', 'log2'],
}
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=grid,
    cv=5,
    return_train_score=True,
)
best_model = grid_search.fit(X_train, y_train)

# Report each tuned setting, then the refit model's score on the held-out split.
forest_report = [
    ("Best number of estimators: ", 'n_estimators'),
    ("Best min samples per leaf: ", 'min_samples_leaf'),
    ("Best max features: ", 'max_features'),
    ("Best max depth: ", 'max_depth'),
]
for report_label, tuned_param in forest_report:
    print(report_label, best_model.best_params_[tuned_param])
print("Test set score: {:.2f}".format(best_model.score(X_test, y_test)))

Best number of estimators:  100
Best min samples per leaf:  1
Best max features:  sqrt
Best max depth:  10
Test set score: 0.86


In [None]:
#K Neighbors
# 5-fold cross-validated search over neighbourhood size, vote weighting and search algorithm.
# NOTE(review): the recorded run selected 10 neighbours, the largest value in this grid.
grid = {
    'n_neighbors': [2, 3, 5, 8, 10],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}
grid_search = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=grid,
    cv=5,
    return_train_score=True,
)
best_model = grid_search.fit(X_train, y_train)

# Report each tuned setting, then the refit model's score on the held-out split.
knn_report = [
    ("Best number of neighbors: ", 'n_neighbors'),
    ("Best weights: ", 'weights'),
    ("Best algorithm: ", 'algorithm'),
]
for report_label, tuned_param in knn_report:
    print(report_label, best_model.best_params_[tuned_param])
print("Test set score: {:.2f}".format(best_model.score(X_test, y_test)))

Best number of neighbors:  10
Best weights:  distance
Best algorithm:  auto
Test set score: 0.75


In [None]:
#Gradient Boosting
# 5-fold cross-validated search over tree depth, number of boosting stages, leaf size,
# features per split and loss function.
grid = {
    'max_depth': [2, 3, 5, 8, 10],
    'n_estimators': [10, 20, 50, 75, 100, 125],
    'min_samples_leaf': [1, 2, 3, 5],
    'max_features': ['auto', 'sqrt', 'log2'],
    'loss': ['ls', 'lad', 'huber', 'quantile'],
}
grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(),
    param_grid=grid,
    cv=5,
    return_train_score=True,
)
best_model = grid_search.fit(X_train, y_train)

# Report each tuned setting, then the refit model's score on the held-out split.
boosting_report = [
    ("Best number of estimators: ", 'n_estimators'),
    ("Best min samples per leaf: ", 'min_samples_leaf'),
    ("Best max features: ", 'max_features'),
    ("Best max depth: ", 'max_depth'),
    ("Best loss function: ", 'loss'),
]
for report_label, tuned_param in boosting_report:
    print(report_label, best_model.best_params_[tuned_param])
print("Test set score: {:.2f}".format(best_model.score(X_test, y_test)))

Best number of estimators:  100
Best min samples per leaf:  3
Best max features:  sqrt
Best max depth:  10
Best loss function:  ls
Test set score: 0.90


In [None]:
#Support Vector Machine
# SalePrice is a continuous target, so the search uses the support vector *regressor* (SVR).
# SVC is a classifier: it would treat every distinct price as its own class label.
# SVR takes no random_state argument, so none is passed.
# NOTE(review): this grid has 36 combinations x 5 folds and includes a polynomial kernel
# with large C and gamma; it may take a long time to run -- confirm before a full run.
grid = {'C': [0.10, 1.00, 10.00, 1000.00],'gamma': [0.10, 1.00, 10.00],'kernel': ['rbf', 'poly', 'linear']}
grid_search = GridSearchCV(SVR(),grid,cv=5,return_train_score=True)
best_model=grid_search.fit(X_train,y_train)

# Report each tuned setting, then the refit model's score on the held-out split.
print("Best C: ",best_model.best_estimator_.get_params()['C'])
print("Best gamma: ",best_model.best_estimator_.get_params()['gamma'])
print("Best kernel: ",best_model.best_estimator_.get_params()['kernel'])
print("Test set score: {:.2f}".format(grid_search.score(X_test,y_test)))

Finally, now that I have tuned several possible models, I choose the gradient boosted regressor model because it had the highest score on the test set. Then I use all of X and y (not just X_train and y_train) to fit the model, and then I predict sale prices on the test set provided by Kaggle. Before I can generate those predictions, however, I clean up Kaggle's test set and label X_final in the same way as I did for X and the train set (filling in missing values, generating the same binary variables, selecting the same columns for X_final, standardizing X_final).

In [None]:
# Final model: gradient boosting fitted on every labelled row (X, y), not only the training split.
# X is the unscaled feature frame, so whatever is passed to boost.predict must be unscaled too.
# NOTE(review): these hyperparameters differ from the ones the recorded grid search printed
# (n_estimators=100, min_samples_leaf=3, max_depth=10) -- confirm which set is intended.
# random_state pins the random feature subsampling that max_features='sqrt' introduces, so
# re-running the notebook reproduces the same model and predictions.
boost = GradientBoostingRegressor(max_depth=5,n_estimators=125,min_samples_leaf=1,max_features='sqrt',loss='ls',random_state=42)
boost.fit(X, y)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=5,
                          max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=125,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
#Makes the test set usable
# Same imputation rules as the training frame: median for LotFrontage, 0 for the two
# "feature not present" columns. The result is assigned back to the column instead of using
# `test[col].fillna(..., inplace=True)`, which acts on an intermediate Series and is not
# guaranteed to update `test` (pandas 2.x warns; with copy-on-write it is a no-op).
test['LotFrontage'] = test['LotFrontage'].fillna(test['LotFrontage'].median())
test['MasVnrArea'] = test['MasVnrArea'].fillna(0)
test['GarageYrBlt'] = test['GarageYrBlt'].fillna(0)

# One indicator frame per qualitative variable, under the names the concat cell expects.
# pd.get_dummies emits columns in sorted level order, so the columns are named from the data
# (`<column>_<level>`) rather than relabelled by hand: a hand-typed label list in any other
# order attaches the wrong name to each indicator. The prefix also keeps names unique, since
# several variables share level names such as 'Gd' or 'TA'.
# The training frame must be encoded over the same levels for the feature columns to line up.
MSZoningdummy = pd.get_dummies(test['MSZoning'], prefix='MSZoning', dtype=int)
LotShapedummy = pd.get_dummies(test['LotShape'], prefix='LotShape', dtype=int)
LotConfigdummy = pd.get_dummies(test['LotConfig'], prefix='LotConfig', dtype=int)
Neighborhooddummy = pd.get_dummies(test['Neighborhood'], prefix='Neighborhood', dtype=int)
BldgTypedummy = pd.get_dummies(test['BldgType'], prefix='BldgType', dtype=int)
HouseStyledummy = pd.get_dummies(test['HouseStyle'], prefix='HouseStyle', dtype=int)
RoofStyledummy = pd.get_dummies(test['RoofStyle'], prefix='RoofStyle', dtype=int)
Exterior1stdummy = pd.get_dummies(test['Exterior1st'], prefix='Exterior1st', dtype=int)
Exterior2nddummy = pd.get_dummies(test['Exterior2nd'], prefix='Exterior2nd', dtype=int)
MasVnrTypedummy = pd.get_dummies(test['MasVnrType'], prefix='MasVnrType', dtype=int)
Foundationdummy = pd.get_dummies(test['Foundation'], prefix='Foundation', dtype=int)
BsmtQualdummy = pd.get_dummies(test['BsmtQual'], prefix='BsmtQual', dtype=int)
BsmtConddummy = pd.get_dummies(test['BsmtCond'], prefix='BsmtCond', dtype=int)
BsmtExposuredummy = pd.get_dummies(test['BsmtExposure'], prefix='BsmtExposure', dtype=int)
HeatingQCdummy = pd.get_dummies(test['HeatingQC'], prefix='HeatingQC', dtype=int)
Electricaldummy = pd.get_dummies(test['Electrical'], prefix='Electrical', dtype=int)
KitchenQualdummy = pd.get_dummies(test['KitchenQual'], prefix='KitchenQual', dtype=int)
GarageTypedummy = pd.get_dummies(test['GarageType'], prefix='GarageType', dtype=int)
GarageFinishdummy = pd.get_dummies(test['GarageFinish'], prefix='GarageFinish', dtype=int)
GarageQualdummy = pd.get_dummies(test['GarageQual'], prefix='GarageQual', dtype=int)
PavedDrivedummy = pd.get_dummies(test['PavedDrive'], prefix='PavedDrive', dtype=int)
SaleTypedummy = pd.get_dummies(test['SaleType'], prefix='SaleType', dtype=int)
SaleConditiondummy = pd.get_dummies(test['SaleCondition'], prefix='SaleCondition', dtype=int)

In [None]:
# Append the indicator frames to the Kaggle test frame, in the same order as for train,
# then remove the raw qualitative columns they were built from.
test_dummy_frames = [
    MSZoningdummy, LotShapedummy, LotConfigdummy, Neighborhooddummy, BldgTypedummy,
    HouseStyledummy, RoofStyledummy, Exterior1stdummy, Exterior2nddummy, MasVnrTypedummy,
    Foundationdummy, BsmtQualdummy, BsmtConddummy, BsmtExposuredummy, HeatingQCdummy,
    Electricaldummy, KitchenQualdummy, GarageTypedummy, GarageFinishdummy, GarageQualdummy,
    PavedDrivedummy, SaleTypedummy, SaleConditiondummy,
]
test = pd.concat([test, *test_dummy_frames], axis=1)

test_encoded_sources = [
    'MSZoning', 'LotShape', 'LotConfig', 'Neighborhood', 'BldgType', 'HouseStyle',
    'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'HeatingQC', 'Electrical', 'KitchenQual', 'GarageType',
    'GarageFinish', 'GarageQual', 'PavedDrive', 'SaleType', 'SaleCondition',
]
test = test.drop(columns=test_encoded_sources)

test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,...,AdjLand,Alloca,Family,Partial
0,1461,20,80.0,11622,...,0,0,1,0
1,1462,20,81.0,14267,...,0,0,1,0
2,1463,60,74.0,13830,...,0,0,1,0
3,1464,60,78.0,9978,...,0,0,1,0
4,1465,120,43.0,5005,...,0,0,1,0


In [None]:
#creates X_final and gives it the same features as X
# Same exclusions as for X, minus 'SalePrice', which the Kaggle test set does not have.
final_columns_left_out = [
    'Id', 'Street', 'Alley', 'LandContour', 'Utilities', 'LandSlope', 'Condition1',
    'Condition2', 'RoofMatl', 'ExterQual', 'ExterCond', 'BsmtFinType1', 'BsmtFinSF1',
    'BsmtFinType2', 'BsmtFinSF2', 'Heating', 'CentralAir', 'LowQualFinSF', 'Functional',
    'FireplaceQu', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
]
X_final = test.drop(columns=final_columns_left_out)

# Force numeric dtypes (unparseable values become NaN), then fill every remaining gap with 0
# so the model never receives NaN.
X_final = X_final.apply(pd.to_numeric, errors='coerce').fillna(0)

# `boost` was fitted on the unscaled frame X, so X_final is deliberately left unscaled.
# No new StandardScaler is fitted here: doing so would overwrite the training `scaler` and
# `x_scaled` with statistics taken from the test set, and its output was never used.

# Fail early, with a readable message, if the two feature sets do not have the same width.
assert X_final.shape[1] == X.shape[1], (
    "X_final has {} columns but the model was trained on {}".format(X_final.shape[1], X.shape[1])
)

X_final.head()

In [None]:
# Predict sale prices for the Kaggle test houses with the final gradient boosting model.
prices_pred = boost.predict(X_final)