In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV, LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.kernel_ridge import KernelRidge

In [2]:
#df_train = pd.read_csv('house-prices-advanced-regression-techniques\\train.csv')
train = pd.read_csv('house-prices-advanced-regression-techniques\\train.csv', usecols=[1, 17, 18, 19, 43, 44, 46, 49, 50, 70, 77, 80])
test  = pd.read_csv('house-prices-advanced-regression-techniques\\test.csv', usecols=[1, 17, 18, 19, 43, 44, 46, 49, 50, 70, 77])

In [3]:
#Cleaning the data

#Deleting outliers
train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)


In [4]:
#train['SalePrice'].describe()

In [5]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=420)

In [6]:
#create training model based on SalePrice
X_train, y_train = train.loc[:, train.columns != 'SalePrice'], np.log(train['SalePrice'])

In [7]:
#define mean square root error
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))
def rmse(model, X=X_train,y=y_train):
    rmse = np.sqrt(-cross_val_score(model, X, y,scoring="neg_mean_squared_error",cv=kfolds))
    return (rmse)

In [8]:
ridge = make_pipeline(RobustScaler(), RidgeCV(cv=kfolds))
lasso = make_pipeline(RobustScaler(), LassoCV(random_state=42, cv=kfolds))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=1e7, cv=kfolds))
rf=RandomForestRegressor()
lr=LinearRegression()
tree=DecisionTreeRegressor(random_state=1,max_depth=4)
KRR = KernelRidge()
GBoost = GradientBoostingRegressor()

In [17]:
#models

In [None]:
#for

In [9]:
ridge_score = rmse(ridge)
print(ridge_score.mean(),ridge_score.std())

lasso_score = rmse(lasso)
print(lasso_score.mean(), lasso_score.std())

elasticnet_score = rmse(elasticnet)
print(elasticnet_score.mean(), elasticnet_score.std())

rf_score = rmse(rf)
print(rf_score.mean(), rf_score.std())

lr_score = rmse(lr)
print(lr_score.mean(), lr_score.std())

tree_score = rmse(tree)
print(tree_score.mean(), tree_score.std())

KRR_score = rmse(KRR)
print(KRR_score.mean(), KRR_score.std())

GBoost_score = rmse(GBoost)
print(GBoost_score.mean(), GBoost_score.std())

0.1479630123222037 0.010450655696767005
0.14823988657128737 0.010253389322763469
0.14827394735177143 0.010273839897275948




0.15973315366787172 0.013844714620668904
0.14788819327462252 0.010554149238535482
0.20855409439616776 0.01496959560281795
0.1480029135736407 0.010526558852274624
0.14326067118090358 0.011284268359540226


In [10]:
lasso_model = lasso.fit(X_train,y_train)
ridge_model = ridge.fit(X_train,y_train)
elasticnet_model = elasticnet.fit(X_train,y_train)
randomforest_model = rf.fit(X_train,y_train)
lr_model = lr.fit(X_train,y_train)
tree_model =tree.fit(X_train,y_train)
KRR_model = KRR.fit(X_train,y_train)
GBoost_model = GBoost.fit(X_train,y_train)



In [14]:
def blend_models(X):
    return (
        ((1/7) * lasso_model.predict(X)) + \
        ((1/7) * ridge_model.predict(X)) + \
        ((1/7) * elasticnet_model.predict(X)) + \
        ((1/7) * KRR_model.predict(X)) + \
        ((1/7) * lr_model.predict(X)) + \
        ((1/7) * tree_model.predict(X)) + \
        ((1/7) * GBoost_model.predict(X))
           )

In [15]:
#print(rmsle(y_train, blend_models_predict(X_train)))
print(rmsle(y_train,blend_models(X_train)))

0.14033966732188033


In [16]:
test['Prediction']=blend_models(test)

ValueError: operands could not be broadcast together with shapes (1459,12) (11,) (1459,12) 

In [16]:
test.head()

Unnamed: 0,MSSubClass,OverallQual,OverallCond,YearBuilt,1stFlrSF,2ndFlrSF,GrLivArea,FullBath,HalfBath,ScreenPorch,YrSold,Prediction
0,20,5,6,1961,896,0,896,1,0,120,2010,11.720033
1,20,6,6,1958,1329,0,1329,1,1,0,2010,11.969654
2,60,5,5,1997,928,701,1629,2,1,0,2010,11.982485
3,60,6,6,1998,926,678,1604,2,1,0,2010,12.132429
4,120,8,5,1992,1280,0,1280,2,0,144,2010,12.241012


In [17]:
np.exp(test.Prediction)

0       123011.482710
1       157889.951870
2       159928.920970
3       185800.539453
4       207111.413351
5       173339.755968
6       172468.501599
7       167678.996695
8       190272.750072
9       107488.096104
10      183706.125821
11      111578.861868
12      106425.835178
13      153542.831766
14      144564.285447
15      361993.956529
16      250423.480256
17      303195.142114
18      274788.553378
19      451460.240645
20      306667.671538
21      216922.736865
22      174315.366511
23      174053.049287
24      178023.966470
25      199782.759886
26      338840.717210
27      250193.912186
28      192101.951668
29      202574.229736
            ...      
1429     77882.706805
1430    153938.980709
1431     76584.429732
1432    127153.084510
1433     70288.036085
1434    269613.621670
1435    254965.494148
1436    196453.215494
1437    180235.656264
1438    238356.888276
1439    154020.899841
1440    178464.834898
1441    184053.851127
1442    304622.049591
1443    32

In [18]:
output=pd.DataFrame({
    'Id':np.arange(1461,2920),
    'SalePrice':np.exp(test.Prediction)
})

In [19]:
output.head()

Unnamed: 0,Id,SalePrice
0,1461,123011.48271
1,1462,157889.95187
2,1463,159928.92097
3,1464,185800.539453
4,1465,207111.413351


In [20]:
output.to_csv('house-prices-advanced-regression-techniques\\bt_prediction12.csv',index=False)