In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, LinearRegression, RidgeCV

train = pd.read_csv('house-prices-advanced-regression-techniques\\train.csv', usecols=[1, 17, 18, 19, 43, 44, 46, 49, 50, 70, 77, 80])
test  = pd.read_csv('house-prices-advanced-regression-techniques\\test.csv', usecols=[1, 17, 18, 19, 43, 44, 46, 49, 50, 70, 77])

In [None]:
train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)

In [2]:
n_folds = 5
kfolds = KFold(n_splits=10, shuffle=True, random_state=420)

#Root mean squared logarithmic error
def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [3]:
X_train, y_train = train.loc[:, train.columns != 'SalePrice'], np.log(train['SalePrice'])

In [4]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
ridge = make_pipeline(RobustScaler(), RidgeCV(cv=kfolds))

In [5]:
ridge_score = rmsle_cv(ridge)
print("Ridge: ",ridge_score.mean(),ridge_score.std())

lasso_score = rmsle_cv(lasso)
print("Lasso: ",lasso_score.mean(), lasso_score.std())

elasticnet_score = rmsle_cv(ENet)
print("ENet: ",elasticnet_score.mean(), elasticnet_score.std())

KRR_score = rmsle_cv(KRR)
print("KRR: ",KRR_score.mean(), KRR_score.std())

GBoost_score = rmsle_cv(GBoost)
print("GBoost: ",GBoost_score.mean(), GBoost_score.std())

Ridge:  0.10559496908864836 0.010035116638289836
Lasso:  0.1053345289274696 0.009880980969159014
ENet:  0.10532964659103321 0.009890147283246775




KRR:  0.16773531473805003 0.0841215233170332
GBoost:  0.04153160891086459 0.004578851618782206


In [7]:
lasso_model = lasso.fit(X_train,y_train)
ridge_model = ridge.fit(X_train,y_train)
elasticnet_model = ENet.fit(X_train,y_train)
KRR_model = KRR.fit(X_train,y_train)
GBoost_model = GBoost.fit(X_train,y_train)

  overwrite_a=False)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.05, loss='huber', max_depth=4,
                          max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=15, min_samples_split=10,
                          min_weight_fraction_leaf=0.0, n_estimators=3000,
                          n_iter_no_change=None, presort='auto', random_state=5,
                          subsample=1.0, tol=0.0001, validation_fraction=0.1,
                          verbose=0, warm_start=False)

In [12]:
def blend_models(X):
    return (
        ((.1) * lasso_model.predict(X)) +
        ((.1) * ridge_model.predict(X)) +
        ((.1) * elasticnet_model.predict(X)) +
        ((.1) * KRR_model.predict(X)) +
        ((.6) * GBoost_model.predict(X))
           )

In [13]:
print(rmsle(y_train,blend_models(X_train)))

TypeError: estimator should be an estimator implementing 'fit' method, array([12.21973264, 12.10087929, 12.27728748, ..., 12.44632196,
       11.75670782, 11.90345719]) was passed

In [9]:
test['Prediction']=blend_models(test)

In [10]:
test.head()

Unnamed: 0,MSSubClass,OverallQual,OverallCond,YearBuilt,1stFlrSF,2ndFlrSF,GrLivArea,FullBath,HalfBath,ScreenPorch,YrSold,Prediction
0,20,5,6,1961,896,0,896,1,0,120,2010,11.75101
1,20,6,6,1958,1329,0,1329,1,1,0,2010,11.971719
2,60,5,5,1997,928,701,1629,2,1,0,2010,12.07243
3,60,6,6,1998,926,678,1604,2,1,0,2010,12.137646
4,120,8,5,1992,1280,0,1280,2,0,144,2010,12.184792


In [11]:
np.exp(test.Prediction)

0       126881.676818
1       158216.420196
2       174980.476150
3       186772.458510
4       195788.889144
5       178669.375165
6       180321.732687
7       170334.042516
8       185377.503759
9       106170.549005
10      188966.440540
11      101138.415543
12       93963.002492
13      147568.199164
14      134854.428560
15      404318.535823
16      255809.178232
17      304439.019112
18      276283.238076
19      455533.731040
20      310616.835158
21      215406.958961
22      187967.324254
23      171619.059630
24      177966.753628
25      208560.037335
26      335328.104807
27      252375.741371
28      201521.047338
29      212503.822536
            ...      
1429     74467.715338
1430    146296.464327
1431     66432.203387
1432    132196.635184
1433     66406.083955
1434    280721.185950
1435    266235.330454
1436    200210.271205
1437    157003.568115
1438    241321.481308
1439    159785.291438
1440    194646.902691
1441    176710.853077
1442    334581.899323
1443    33

In [12]:
output=pd.DataFrame({
    'Id':np.arange(1461,2920),
    'SalePrice':np.exp(test.Prediction)
})

In [13]:
output.to_csv('house-prices-advanced-regression-techniques\\bt_prediction14.csv',index=False)