<a href="https://colab.research.google.com/github/mzignis/advance_house_pricing/blob/master/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install xgboost
!pip install lightgbm



In [2]:
HOME = '/content/drive/My Drive/ml_competition/advance_house_pricing'
%cd $HOME

/content/drive/My Drive/ml_competition/advance_house_pricing


In [62]:
import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_log_error, r2_score

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.kernel_ridge import KernelRidge

## Prepare data

In [4]:
sns.set()
warnings.filterwarnings('ignore')

In [5]:
data_dir = os.path.join(HOME, 'data')

In [6]:
train_df = pd.read_csv(os.path.join(data_dir, 'preprocessed', 'train.csv'), index_col=0)
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,...,SaleCondition_Alloca_oh-encoded,SaleCondition_Family_oh-encoded,SaleCondition_Normal_oh-encoded,Id_scaled,MSSubClass_scaled,LotFrontage_scaled,LotArea_scaled,YearBuilt_scaled,YearRemodAdd_scaled,MasVnrArea_scaled,BsmtFinSF1_scaled,BsmtFinSF2_scaled,BsmtUnfSF_scaled,TotalBsmtSF_scaled,1stFlrSF_scaled,2ndFlrSF_scaled,LowQualFinSF_scaled,GrLivArea_scaled,GarageYrBlt_scaled,GarageArea_scaled,WoodDeckSF_scaled,OpenPorchSF_scaled,EnclosedPorch_scaled,3SsnPorch_scaled,ScreenPorch_scaled,PoolArea_scaled,MiscVal_scaled,YrSold_scaled,OverallQual_unscaled,OverallCond_unscaled,BsmtFullBath_unscaled,BsmtHalfBath_unscaled,FullBath_unscaled,HalfBath_unscaled,BedroomAbvGr_unscaled,KitchenAbvGr_unscaled,TotRmsAbvGrd_unscaled,Fireplaces_unscaled,GarageCars_unscaled,MoSold_unscaled
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,...,0.0,0.0,1.0,-1.0,0.2,-0.210526,-0.254076,0.652174,0.243243,1.193303,0.45279,0.0,-0.559829,-0.269652,-0.453608,1.173077,0.0,0.38007,0.589744,0.281573,0.0,0.529412,0.0,0.0,0.0,0.0,0.0,0.0,7,5,1,0,2,1,3,1,8,0,2,2
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,...,0.0,0.0,1.0,-0.998629,-0.6,0.578947,0.030015,0.065217,-0.486486,0.0,0.834679,0.0,-0.330769,0.538308,0.343643,0.0,0.0,-0.31209,-0.102564,-0.082816,1.77381,-0.367647,0.0,0.0,0.0,0.0,0.0,-0.5,6,8,0,1,2,0,3,1,6,1,2,5
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,...,0.0,0.0,1.0,-0.997258,0.2,-0.052632,0.437624,0.608696,0.216216,0.986301,0.14391,0.0,-0.074359,-0.142289,-0.327933,1.18956,0.0,0.497489,0.538462,0.530021,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,7,5,1,0,2,1,3,1,6,1,2,9
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,...,0.0,0.0,0.0,-0.995888,0.4,-0.473684,0.017663,-1.26087,-0.648649,0.0,-0.23517,0.0,0.106838,-0.468657,-0.247423,1.038462,0.0,0.390885,0.461538,0.670807,0.0,0.147059,272.0,0.0,0.0,0.0,0.0,-1.0,7,5,1,0,1,0,3,1,7,1,3,2
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,...,0.0,0.0,1.0,-0.994517,0.2,0.789474,1.181201,0.586957,0.162162,2.130898,0.381186,0.0,0.021368,0.305473,0.113893,1.446429,0.0,1.134029,0.512821,1.47412,1.142857,0.867647,0.0,0.0,0.0,0.0,0.0,0.0,8,5,1,0,2,1,4,1,9,1,3,12


In [7]:
targets_df = pd.read_csv(os.path.join(data_dir, 'preprocessed', 'targets.csv'), index_col=0)
targets_df.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


In [8]:
test_df = pd.read_csv(os.path.join(data_dir, 'preprocessed', 'test.csv'), index_col=0)
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,...,SaleCondition_Alloca_oh-encoded,SaleCondition_Family_oh-encoded,SaleCondition_Normal_oh-encoded,Id_scaled,MSSubClass_scaled,LotFrontage_scaled,LotArea_scaled,YearBuilt_scaled,YearRemodAdd_scaled,MasVnrArea_scaled,BsmtFinSF1_scaled,BsmtFinSF2_scaled,BsmtUnfSF_scaled,TotalBsmtSF_scaled,1stFlrSF_scaled,2ndFlrSF_scaled,LowQualFinSF_scaled,GrLivArea_scaled,GarageYrBlt_scaled,GarageArea_scaled,WoodDeckSF_scaled,OpenPorchSF_scaled,EnclosedPorch_scaled,3SsnPorch_scaled,ScreenPorch_scaled,PoolArea_scaled,MiscVal_scaled,YrSold_scaled,OverallQual_unscaled,OverallCond_unscaled,BsmtFullBath_unscaled,BsmtHalfBath_unscaled,FullBath_unscaled,HalfBath_unscaled,BedroomAbvGr_unscaled,KitchenAbvGr_unscaled,TotRmsAbvGrd_unscaled,Fireplaces_unscaled,GarageCars_unscaled,MoSold_unscaled
0,1461,20,RH,80.0,11622,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,...,0.0,0.0,1.0,-1.0,-0.6,0.722222,0.538713,-0.25,-0.756098,0.0,0.15625,144.0,-0.32872,-0.203846,-0.359528,0.0,0.0,-0.888152,-0.444444,0.968992,0.833333,-0.388889,0.0,0.0,120.0,0.0,0.0,1.0,5,6,0.0,0.0,1,0,2,1,5,0,1.0,6
1,1462,20,RL,81.0,14267,Pave,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,...,0.0,0.0,1.0,-0.998628,-0.6,0.777778,1.179692,-0.3125,-0.829268,0.666667,0.761303,0.0,-0.093426,0.655769,0.491159,0.0,0.0,-0.170671,-0.518519,-0.651163,2.339286,0.111111,0.0,0.0,0.0,0.0,12500.0,1.0,6,6,0.0,0.0,1,1,3,1,6,0,1.0,6
2,1463,60,RL,74.0,13830,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,...,0.0,0.0,1.0,-0.997257,0.2,0.388889,1.073791,0.5,0.146341,0.0,0.585771,0.0,-0.558824,-0.115385,-0.29666,1.036982,0.0,0.326429,0.444444,0.007752,1.261905,0.083333,0.0,0.0,0.0,0.0,0.0,1.0,5,5,0.0,0.0,2,1,3,1,6,1,2.0,3
3,1464,60,RL,78.0,9978,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,...,0.0,0.0,1.0,-0.995885,0.2,0.611111,0.140313,0.520833,0.146341,0.123457,0.334441,0.0,-0.235294,-0.119231,-0.300589,1.002959,0.0,0.285004,0.469136,-0.03876,2.142857,0.111111,0.0,0.0,0.0,0.0,0.0,1.0,6,6,0.0,0.0,2,1,3,1,7,1,2.0,6
4,1465,120,RL,43.0,5005,Pave,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,...,0.0,0.0,1.0,-0.994513,1.4,-1.333333,-1.064825,0.395833,0.0,0.0,-0.116356,0.0,0.963668,0.561538,0.394892,0.0,0.0,-0.251864,0.320988,0.100775,0.0,0.75,0.0,0.0,144.0,0.0,0.0,1.0,8,5,0.0,0.0,2,0,2,1,5,0,2.0,1


In [9]:
features = [x for x in train_df.columns if '_scaled' in x or '_unscaled' in x or '_oh-encoded' in x]
len(features)

233

In [10]:
x = train_df[features].values
y = targets_df.values

x.shape, y.shape

((1460, 233), (1460, 1))

In [11]:
x_test = test_df[features].values

In [12]:
x_train, x_val, y_train, y_val = train_test_split(x, y)
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((1095, 233), (365, 233), (1095, 1), (365, 1))

## Default models comparations

In [63]:
def fit_and_score_model(model):
    model.fit(x_train, y_train)

    print(model.__class__.__name__)
    print(f'train score: {r2_score(y_train, model.predict(x_train)):.5f}')
    print(f'val score:   {r2_score(y_val, model.predict(x_val)):.5f}')
    print()


models = [DummyRegressor(), LinearRegression(), Ridge(), Lasso(), LinearSVR(), SVR(), SGDRegressor(),
          KNeighborsRegressor(), DecisionTreeRegressor(), RandomForestRegressor(), 
          XGBRegressor(), LGBMRegressor(), KernelRidge(), ElasticNet()]
for model in models:
    fit_and_score_model(model)

DummyRegressor
train score: 0.00000
val score:   -0.00473

LinearRegression
train score: 0.94143
val score:   0.83606

Ridge
train score: 0.92389
val score:   0.83773

Lasso
train score: 0.94139
val score:   0.83750

LinearSVR
train score: -0.18780
val score:   -0.32338

SVR
train score: -0.04622
val score:   -0.08012

SGDRegressor
train score: -2826148297262558720.00000
val score:   -306135123845566464.00000

KNeighborsRegressor
train score: 0.72461
val score:   0.57655

DecisionTreeRegressor
train score: 1.00000
val score:   0.71069

RandomForestRegressor
train score: 0.97870
val score:   0.83557

XGBRegressor
train score: 0.96707
val score:   0.86262

LGBMRegressor
train score: 0.97481
val score:   0.84075

KernelRidge
train score: 0.92380
val score:   0.83861

ElasticNet
train score: 0.82289
val score:   0.81051



## Model

In [16]:
xgb = XGBRegressor()
param_grid = {
    'n_estimators': [150, 200, 250, 300, 350],
    'max_depth': [2, 4, 6, 8, ],
    # 'learning_rate': []
}

model = GridSearchCV(xgb, param_grid=param_grid, scoring='neg_mean_squared_log_error', n_jobs=2, cv=5)
model.fit(x_train, y_train)

best_model = model.best_estimator_
print(best_model)

fit_and_score_model(best_model)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=2, min_child_weight=1, missing=None, n_estimators=350,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)
XGBRegressor
train score: 0.00728
val score:   0.01762



## Stacking model

In [111]:
estimators = [
    ('KernelRidge', LinearRegression()),
    ('ElasticNet', ElasticNet()), 
    ('XGBRegressor', XGBRegressor(max_depth=2, n_estimators=550)),
]

stack_model = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor(n_estimators=10))
fit_and_score_model(stack_model)

StackingRegressor
train score: 0.95063
val score:   0.85697



## Voting model

In [115]:
estimators = [
    ('Stacking', stack_model),
    # ('RandomForestRegressor', RandomForestRegressor(n_estimators=500)),
    ('XGBRegressor', XGBRegressor(max_depth=2, n_estimators=550)), 
    ('LGBMRegressor', LGBMRegressor(max_depth=2, n_estimators=550)),
]

model = VotingRegressor(estimators=estimators, weights=[2, 2, 2])
fit_and_score_model(model)

VotingRegressor
train score: 0.97544
val score:   0.87243



In [116]:
test_df['SalePrice'] = model.predict(x_test)
test_df[['Id', 'SalePrice']].to_csv(os.path.join(data_dir, 'results', 'result008.csv'), index=False)