In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF, StopWordsRemover,RegexTokenizer
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, IndexToString, StandardScaler
import shutil
import os
from pyspark.ml.stat import Correlation
import pandas as pd
import seaborn as sns

In [2]:
#spark.stop()

In [3]:
#open Spark Session
spark = SparkSession.builder.appName('prices_houses_v5').master("local[*]").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
#spark.sparkContext.addPyFile("sparkxgb.zip")

In [4]:
#read source I remove last 10 entries for predictions 
data=spark.read.csv("train.csv", inferSchema=True,sep=',',header=True)
datatest=spark.read.csv("test.csv", schema=data.schema,sep=',',header=True)

In [5]:
len(data.columns),len(datatest.columns)

(81, 81)

In [6]:
columnListS = [item[0] for item in data.dtypes if item[1].startswith('string')]
columnListI = [item[0] for item in data.dtypes if item[1].startswith('int')]
columnListS_T = [item[0] for item in datatest.dtypes if item[1].startswith('string')]
columnListI_T = [item[0] for item in datatest.dtypes if item[1].startswith('int')]

In [7]:
datatest=datatest.fillna("NO_Value",subset=columnListS_T)
#datatest=datatest.fillna(0,subset=columnListI_T)
data=data.fillna("NO_Value",subset=columnListS)

In [8]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index",handleInvalid="keep").fit(data) for column in list(columnListS) ]
pipeline = Pipeline(stages=indexers)
df_r = pipeline.fit(data).transform(data)

indexers2 = [StringIndexer(inputCol=column, outputCol=column+"_index",handleInvalid="keep").fit(datatest) for column in list(columnListS_T) ]
pipeline2 = Pipeline(stages=indexers2)
df_rt = pipeline2.fit(datatest).transform(datatest)

In [9]:
columnListS_Cat=[item for item in df_r.columns if str(item).endswith('_index')]

In [10]:
len(columnListS_Cat)+len(columnListI)

81

In [11]:

lr_data = df_r.select(sf.col("SalePrice").alias("label"), *columnListI,*columnListS_Cat).dropna()

columns = lr_data.columns

In [12]:
lr_datat = df_rt.select(sf.col("SalePrice").alias("label"), *columnListI_T,*columnListS_Cat)
columnst = lr_datat.columns

In [13]:
lr_datat=lr_datat.fillna(0)

In [14]:
lr_datat.count()

1459

In [15]:
resumen2=lr_data.toPandas()
resumen3=lr_datat.toPandas()
resumen2.describe().transpose()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,1460.0,180921.195890,79442.502883,34900.0,129975.00,163000.0,214000.00,755000.0
Id,1460.0,730.500000,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.897260,42.300571,20.0,20.00,50.0,70.00,190.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.50,9478.5,11601.50,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.00,6.0,7.00,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.00,5.0,6.00,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.00,1973.0,2000.00,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.00,1994.0,2004.00,2010.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.00,383.5,712.25,5644.0
BsmtFinSF2,1460.0,46.549315,161.319273,0.0,0.00,0.0,0.00,1474.0


In [16]:
resumen3.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,1459.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
Id,1459.0,2185.279644,432.726126,0.0,1822.5,2188.0,2553.5,2919.0
MSSubClass,1459.0,57.302947,42.802319,0.0,20.0,50.0,70.0,190.0
LotArea,1459.0,9776.661412,4877.960604,0.0,7379.5,9392.0,11507.5,56600.0
OverallQual,1459.0,6.069225,1.461440,0.0,5.0,6.0,7.0,10.0
OverallCond,1459.0,5.540096,1.140477,0.0,5.0,5.0,6.0,9.0
YearBuilt,1459.0,1967.363263,94.349573,0.0,1953.0,1973.0,2001.0,2010.0
YearRemodAdd,1459.0,1979.588074,92.332975,0.0,1963.0,1992.0,2004.0,2010.0
BsmtFinSF1,1459.0,438.527073,455.393086,0.0,0.0,350.0,752.0,4010.0
BsmtFinSF2,1459.0,52.583276,176.698671,0.0,0.0,0.0,0.0,1526.0


In [17]:
corr=resumen2.corr()

In [18]:
cmap = cmap=sns.diverging_palette(5, 250, as_cmap=True)

def magnify():
    return [dict(selector="th",
                 props=[("font-size", "7pt")]),
            dict(selector="td",
                 props=[('padding', "0em 0em")]),
            dict(selector="th:hover",
                 props=[("font-size", "12pt")]),
            dict(selector="tr:hover td:hover",
                 props=[('max-width', '200px'),
                        ('font-size', '12pt')])
]

In [19]:
corr.style.background_gradient(cmap, axis=1)\
    .set_properties(**{'max-width': '80px', 'font-size': '10pt'})\
    .set_caption("Hover to magify")\
    .set_precision(2)\
    .set_table_styles(magnify())

Unnamed: 0,label,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,MSZoning_index,LotFrontage_index,Street_index,Alley_index,LotShape_index,LandContour_index,Utilities_index,LotConfig_index,LandSlope_index,Neighborhood_index,Condition1_index,Condition2_index,BldgType_index,HouseStyle_index,RoofStyle_index,RoofMatl_index,Exterior1st_index,Exterior2nd_index,MasVnrType_index,MasVnrArea_index,ExterQual_index,ExterCond_index,Foundation_index,BsmtQual_index,BsmtCond_index,BsmtExposure_index,BsmtFinType1_index,BsmtFinType2_index,Heating_index,HeatingQC_index,CentralAir_index,Electrical_index,KitchenQual_index,Functional_index,FireplaceQu_index,GarageType_index,GarageYrBlt_index,GarageFinish_index,GarageQual_index,GarageCond_index,PavedDrive_index,PoolQC_index,Fence_index,MiscFeature_index,SaleType_index,SaleCondition_index
label,1.0,-0.022,-0.084,0.26,0.79,-0.078,0.52,0.51,0.39,-0.011,0.21,0.61,0.61,0.32,-0.026,0.71,0.23,-0.017,0.56,0.28,0.17,-0.14,0.53,0.47,0.64,0.62,0.32,0.32,-0.13,0.045,0.11,0.092,-0.021,0.046,-0.029,1.0,-0.18,0.29,-0.041,-0.093,0.27,0.073,-0.014,0.092,0.051,0.13,-0.029,0.0044,-0.17,-0.087,0.16,0.078,-0.17,-0.14,0.38,0.34,0.57,-0.13,-0.44,0.3,-0.17,0.15,-0.24,-0.089,-0.11,-0.37,-0.25,-0.23,0.46,-0.11,0.34,-0.25,-0.22,0.29,-0.16,-0.21,-0.21,0.092,-0.15,-0.061,0.084,0.0046
Id,-0.022,1.0,0.011,-0.033,-0.028,0.013,-0.013,-0.022,-0.005,-0.006,-0.0079,-0.015,0.01,0.0056,-0.044,0.0083,0.0023,-0.02,0.0056,0.0068,0.038,0.003,0.027,-0.02,0.017,0.018,-0.03,-0.00048,0.0029,-0.047,0.0013,0.057,-0.0062,0.021,0.00071,-0.022,-0.022,-0.03,-0.0089,-0.00029,-0.024,0.0083,0.013,-0.014,0.0058,-0.0018,-0.017,-0.0083,0.001,0.0058,0.014,-0.0033,0.0084,0.019,-0.012,-0.027,0.0027,-0.041,0.018,-0.04,-0.014,0.02,-0.016,-0.046,0.062,-0.011,-0.0098,-0.045,0.0041,-0.01,-0.019,-0.0043,-0.033,-3.4e-05,-0.013,0.00057,0.0089,0.043,0.0046,-0.014,-0.02,-0.017
MSSubClass,-0.084,0.011,1.0,-0.14,0.033,-0.059,0.028,0.041,-0.07,-0.066,-0.14,-0.24,-0.25,0.31,0.046,0.075,0.0035,-0.0023,0.13,0.18,-0.023,0.28,0.04,-0.046,-0.04,-0.099,-0.013,-0.0061,-0.012,-0.044,-0.026,0.0083,-0.0077,-0.014,-0.021,-0.084,0.28,0.03,0.025,0.18,-0.12,-0.025,-0.023,-0.038,-0.026,0.29,-0.048,0.063,0.78,0.36,-0.089,-0.014,0.031,0.063,-0.013,-0.018,0.032,0.011,-0.027,0.064,0.017,0.04,-0.085,-0.015,0.031,0.0087,0.1,-0.023,0.0035,0.0016,-0.018,0.16,-0.052,0.052,0.055,0.041,0.026,-0.0011,-0.1,-0.016,0.015,0.0048
LotArea,0.26,-0.033,-0.14,1.0,0.11,-0.0056,0.014,0.014,0.21,0.11,-0.0026,0.26,0.3,0.051,0.0048,0.26,0.16,0.048,0.13,0.014,0.12,-0.018,0.19,0.27,0.15,0.18,0.17,0.085,-0.018,0.02,0.043,0.078,0.038,0.0012,-0.014,0.26,-0.16,0.11,0.2,-0.084,0.32,0.34,0.01,0.13,0.44,0.068,0.036,0.028,-0.092,-0.024,0.11,0.18,0.068,0.066,0.071,0.072,0.038,-0.02,-0.0022,0.021,-0.026,0.14,0.029,0.048,0.023,-0.016,-0.05,-0.046,0.032,0.029,0.13,-0.066,0.022,0.047,-0.04,-0.056,-0.003,0.058,-0.035,0.09,-0.012,-0.016
OverallQual,0.79,-0.028,0.033,0.11,1.0,-0.092,0.57,0.55,0.24,-0.059,0.31,0.54,0.48,0.3,-0.03,0.59,0.11,-0.04,0.55,0.27,0.1,-0.18,0.43,0.4,0.6,0.56,0.24,0.31,-0.11,0.03,0.065,0.065,-0.031,0.071,-0.027,0.79,-0.094,0.26,-0.059,-0.027,0.2,-0.0016,-0.0019,0.064,-0.066,0.1,-0.011,0.017,-0.11,-0.04,0.096,0.089,-0.22,-0.2,0.39,0.33,0.58,-0.15,-0.48,0.28,-0.22,0.057,-0.36,-0.15,-0.12,-0.39,-0.27,-0.24,0.44,-0.14,0.28,-0.29,-0.25,0.28,-0.16,-0.23,-0.2,0.062,-0.17,-0.083,0.075,0.0025
OverallCond,-0.078,0.013,-0.059,-0.0056,-0.092,1.0,-0.38,0.074,-0.046,0.04,-0.14,-0.17,-0.14,0.029,0.025,-0.08,-0.055,0.12,-0.19,-0.061,0.013,-0.087,-0.058,-0.024,-0.19,-0.15,-0.0033,-0.033,0.07,0.026,0.055,-0.002,0.069,-0.0035,0.044,-0.078,-0.017,-0.15,-0.043,0.038,-0.034,-0.007,0.01,0.012,0.01,-0.024,0.071,0.048,-0.088,0.15,0.043,0.017,0.089,0.042,-0.17,-0.094,-0.2,0.17,0.25,-0.21,-0.099,-0.056,0.064,0.043,-0.072,0.07,-0.12,-0.1,-0.073,-0.12,-0.054,0.0035,0.25,-0.17,0.086,0.023,0.082,-0.015,0.14,0.062,-0.11,-0.13
YearBuilt,0.52,-0.013,0.028,0.014,0.57,-0.38,1.0,0.59,0.25,-0.049,0.15,0.39,0.28,0.01,-0.18,0.2,0.19,-0.038,0.47,0.24,-0.071,-0.17,0.096,0.15,0.54,0.48,0.22,0.19,-0.39,0.031,-0.05,0.0049,-0.034,0.012,-0.014,0.52,-0.17,0.2,-0.021,-0.17,0.23,-0.048,-0.012,0.083,-0.074,0.028,-0.083,-0.075,-0.024,-0.28,-0.037,-0.0038,-0.43,-0.35,0.4,0.27,0.49,-0.27,-0.68,0.3,-0.19,0.091,-0.26,-0.072,-0.18,-0.43,-0.38,-0.3,0.29,-0.14,0.17,-0.32,-0.49,0.34,-0.35,-0.32,-0.4,0.0014,-0.19,-0.071,0.082,0.016
YearRemodAdd,0.51,-0.022,0.041,0.014,0.55,0.074,0.59,1.0,0.13,-0.068,0.18,0.29,0.24,0.14,-0.062,0.29,0.12,-0.012,0.44,0.18,-0.041,-0.15,0.19,0.11,0.42,0.37,0.21,0.23,-0.19,0.045,-0.039,0.0058,-0.01,0.021,0.036,0.51,-0.041,0.19,-0.065,0.01,0.18,-0.025,-0.034,0.076,-0.059,0.016,-0.039,-0.012,-0.055,-0.14,0.0029,-0.012,-0.37,-0.34,0.27,0.14,0.47,-0.09,-0.49,0.25,-0.14,0.037,-0.36,-0.11,-0.16,-0.42,-0.3,-0.32,0.39,-0.072,0.11,-0.18,-0.38,0.31,-0.16,-0.2,-0.19,0.012,-0.15,-0.088,0.04,-0.018
BsmtFinSF1,0.39,-0.005,-0.07,0.21,0.24,-0.046,0.25,0.13,1.0,-0.05,-0.5,0.52,0.45,-0.14,-0.065,0.21,0.65,0.067,0.059,0.0043,-0.11,-0.081,0.044,0.26,0.22,0.3,0.2,0.11,-0.1,0.026,0.062,0.14,0.0036,-0.016,0.014,0.39,-0.17,0.088,0.016,-0.13,0.16,0.11,-0.019,0.066,0.11,0.07,0.0056,0.0068,-0.022,-0.12,0.12,0.17,0.012,0.031,0.24,0.2,0.14,-0.057,-0.19,0.041,-0.17,0.13,0.17,-0.087,-0.072,-0.11,-0.17,-0.16,0.11,-0.051,0.19,-0.2,-0.036,0.13,-0.15,-0.13,-0.18,0.12,-6.5e-05,-0.0068,0.02,0.0053
BsmtFinSF2,-0.011,-0.006,-0.066,0.11,-0.059,0.04,-0.049,-0.068,-0.05,1.0,-0.21,0.1,0.097,-0.099,0.015,-0.0096,0.16,0.071,-0.076,-0.032,-0.016,-0.041,-0.035,0.047,-0.038,-0.018,0.068,0.0031,0.037,-0.03,0.089,0.042,0.0049,-0.015,0.032,-0.011,-0.086,0.0032,0.038,-0.055,0.06,0.02,0.05,0.011,0.086,0.011,0.019,0.0074,-0.043,-0.045,0.079,0.076,0.11,0.1,-0.059,-0.045,-0.067,0.0066,0.048,-0.11,-0.033,0.063,0.27,0.69,-0.011,0.025,-0.04,-0.029,-0.065,0.13,0.051,-0.041,0.054,-0.051,-0.049,-0.023,-0.068,0.069,0.11,0.0092,0.012,-0.02


In [20]:
resumen2.columns

Index(['label', 'Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice',
       'MSZoning_index', 'LotFrontage_index', 'Street_index', 'Alley_index',
       'LotShape_index', 'LandContour_index', 'Utilities_index',
       'LotConfig_index', 'LandSlope_index', 'Neighborhood_index',
       'Condition1_index', 'Condition2_index', 'BldgType_index',
       'HouseStyle_index', 'RoofStyle_index', 'RoofMatl_index',
       'Exterior1st_index', 'Exterior2nd_index', 'MasVnrType_index',
       'MasVnrArea_index', 'ExterQual_index', 'ExterCond_ind

In [21]:
features_extra=['OverallQual','YearBuilt','YearRemodAdd','TotalBsmtSF','1stFlrSF','GrLivArea','FullBath','TotRmsAbvGrd','GarageCars','GarageArea','ExterQual_index']

In [55]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
from sklearn.model_selection import cross_val_score,KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.grid_search import GridSearchCV   #Perforing grid search
from scipy.stats import skew
from collections import OrderedDict
import numpy as np

In [27]:
df_train=resumen2[features_extra]
df_test=resumen3[features_extra]

In [28]:
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(df_train)
scaled_df = pd.DataFrame(scaled_df, columns=features_extra)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(scaled_df, resumen2['label'], test_size=0.20, random_state=42)

In [34]:
import xgboost
from xgboost import plot_importance

In [35]:
model = xgboost.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42) 

In [36]:
model.fit(X_train,y_train)


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.4, gamma=0, learning_rate=0.07, max_delta_step=0,
       max_depth=3, min_child_weight=1.5, missing=None, n_estimators=10000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0.75, reg_lambda=0.45, scale_pos_weight=1, seed=42,
       silent=True, subsample=0.6)

In [43]:
#for tuning parameters
parameters_for_testing = {
    'colsample_bytree':[0.4,0.6,0.8],
    'gamma':[0,0.03,0.1,0.3],
    'min_child_weight':[1.5,6,10],
    'learning_rate':[0.1,0.07],
    'max_depth':[3,5],
    'n_estimators':[10000],
    'reg_alpha':[1e-5, 1e-2,  0.75],
    'reg_lambda':[1e-5, 1e-2, 0.45],
    'subsample':[0.6,0.95]  
}

                   
xgb_model = xgboost.XGBRegressor(learning_rate =0.1, n_estimators=1000, max_depth=5,
     min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, nthread=6, scale_pos_weight=1, seed=27)

gsearch1 = GridSearchCV(estimator = xgb_model, param_grid = parameters_for_testing, n_jobs=6,iid=False, verbose=10,scoring='neg_mean_squared_error')
gsearch1.fit(X_train,y_train)
print (gsearch1.grid_scores_)
print('best params')
print (gsearch1.best_params_)
print('best score')
print (gsearch1.best_score_)

Fitting 3 folds for each of 2592 candidates, totalling 7776 fits


[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   22.0s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   46.7s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  1.8min
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  2.6min
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  3.4min
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:  4.4min
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:  5.6min
[Parallel(n_jobs=6)]: Done  73 tasks      | elapsed:  6.8min
[Parallel(n_jobs=6)]: Done  86 tasks      | elapsed:  8.0min
[Parallel(n_jobs=6)]: Done 101 tasks      | elapsed:  9.3min
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed: 10.8min
[Parallel(n_jobs=6)]: Done 133 tasks      | elapsed: 12.3min
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed: 13.8min
[Parallel(n_jobs=6)]: Done 169 tasks      | elapsed: 16.2min
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed: 19.1min
[Parallel(n_jobs=6)]: Do

[mean: -1126795250.68597, std: 151299208.16993, params: {'colsample_bytree': 0.4, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1.5, 'n_estimators': 10000, 'reg_alpha': 1e-05, 'reg_lambda': 1e-05, 'subsample': 0.6}, mean: -1060308655.35342, std: 143810189.49095, params: {'colsample_bytree': 0.4, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1.5, 'n_estimators': 10000, 'reg_alpha': 1e-05, 'reg_lambda': 1e-05, 'subsample': 0.95}, mean: -1128619788.44726, std: 157247181.35718, params: {'colsample_bytree': 0.4, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1.5, 'n_estimators': 10000, 'reg_alpha': 1e-05, 'reg_lambda': 0.01, 'subsample': 0.6}, mean: -1048836201.57415, std: 138144045.03041, params: {'colsample_bytree': 0.4, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1.5, 'n_estimators': 10000, 'reg_alpha': 1e-05, 'reg_lambda': 0.01, 'subsample': 0.95}, mean: -1133690934.67244, std: 163572907.175

In [44]:
best_xgb_model = xgboost.XGBRegressor(colsample_bytree=0.4,
                 gamma=0.03,                 
                 learning_rate=0.01,
                 max_depth=5,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=1e-05,
                 reg_lambda=1e-05,
                 subsample=0.95)
#                 seed=42)
best_xgb_model.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.4, gamma=0.03, learning_rate=0.01,
       max_delta_step=0, max_depth=5, min_child_weight=1.5, missing=None,
       n_estimators=10000, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=1e-05, reg_lambda=1e-05,
       scale_pos_weight=1, seed=None, silent=True, subsample=0.95)

In [50]:
ypred = best_xgb_model.predict(X_test)

In [58]:
RMSE= np.sqrt(mean_squared_error(y_test, ypred))

In [56]:
r2_score(y_test, ypred)

0.88476591236752655

In [59]:
print(RMSE)

29730.1705735


In [60]:
resumen3['Prediction'] = best_xgb_model.predict(df_test)

In [61]:
filename = 'submission.csv'
pd.DataFrame({'Id': resumen3.Id, 'SalePrice': resumen3.Prediction}).to_csv(filename, index=False)

In [63]:
print(resumen3['Prediction'].head())
print(resumen3['Prediction'].count())

0    248527.078125
1    248527.078125
2    248527.078125
3    248527.078125
4    239355.234375
Name: Prediction, dtype: float32
1459
