<a href="https://colab.research.google.com/github/mzignis/advance_house_pricing/blob/master/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
HOME = '/content/drive/My Drive/ml_competition/advance_house_pricing'
%cd $HOME

/content/drive/My Drive/ml_competition/advance_house_pricing


In [95]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor, VotingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split, cross_validate
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_log_error

import warnings

warnings.filterwarnings('ignore')

sns.set()

In [122]:
train_data = pd.read_csv(os.path.join(HOME, 'data', 'preprocessed_train.csv'), index_col=0)
test_data = pd.read_csv(os.path.join(HOME, 'data', 'preprocessed_test.csv'), index_col=0)

In [4]:
train_data.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)

In [5]:
x, y = train_data.drop(columns=['SalePrice']).values, train_data['SalePrice'].values

In [6]:
x.shape, y.shape

((1175, 230), (1175,))

In [86]:
n_folds = 5

def rmsle_cv(model):
    model.fit(x, y)
    scores = np.sqrt(-cross_validate(model, x, y, cv=5, scoring='neg_mean_squared_log_error')['test_score'])
    return np.mean(scores), np.std(scores)

In [74]:
lasso = Lasso(alpha =0.0005, random_state=1)
rmsle_cv(lasso)

(0.13639326900327547, 0.014125677290640943)

In [75]:
e_net = make_pipeline(RobustScaler(), ElasticNet())
rmsle_cv(e_net)

(0.13360767583805194, 0.0066240542359323334)

In [90]:
lin = LinearRegression()
rmsle_cv(lin)

(0.1366942405993384, 0.014044236102700313)

In [78]:
g_boost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
rmsle_cv(g_boost)

(0.11680205179673901, 0.007825203040739909)

In [85]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

rmsle_cv(model_xgb)

(0.11348432915522277, 0.00956858750546073)

In [87]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

rmsle_cv(model_lgb)

(0.1160712632376445, 0.009293075651214958)

In [98]:
estimators = [
    ('Linear', lin),
    ('ElasticNet', e_net), 
    # ('Lasso', lasso), 
    ('GradientBoost', g_boost),
]

stack_model = StackingRegressor(estimators=estimators, final_estimator=lasso)
rmsle_cv(stack_model)

(0.11064409833934337, 0.006202875386233558)

In [143]:
estimators = [
    ('Stacking', stack_model),
    # ('XGBRegressor', model_xgb), 
    ('LGBMRegressor', model_lgb),
]

voting_model = VotingRegressor(estimators=estimators, weights=[0.50, 0.50])
rmsle_cv(model)

(0.11348432915522277, 0.00956858750546073)

In [110]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_log_error(y, y_pred))

In [111]:
stack_model.fit(x, y)
stacked_train_pred = stack_model.predict(x)
# stacked_pred = np.expm1(stack_model.predict(test.values))
print(rmsle(y, stacked_train_pred))

0.056558399014851775


In [112]:
model_xgb.fit(x, y)
xgb_train_pred = model_xgb.predict(x)
# xgb_pred = np.expm1(model_xgb.predict(test))
print(rmsle(y, xgb_train_pred))

0.01955277474239371


In [113]:
model_lgb.fit(x, y)
lgb_train_pred = model_lgb.predict(x)
# lgb_pred = np.expm1(model_lgb.predict(test.values))
print(rmsle(y, lgb_train_pred))

0.07385717794609563


In [155]:
model_lgb.fit(x, y)
model_lgb_pred = model_lgb.predict(x)
# voting_pred = np.expm1(model_voting.predict(test.values))

In [156]:
print(rmsle(y, model_lgb_pred))

0.07385717794609563


In [157]:
pred = stack_model.predict(test_data.values)
pred

array([120234.11023907, 155995.81170457, 187059.73688024, ...,
       151375.15137956, 114513.04291155, 203267.51400486])

In [158]:
sub = pd.DataFrame()
sub['Id'] = test_data.index
sub['SalePrice'] = pred
sub.to_csv('submission.csv', index=False)