In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.ensemble import RandomForestRegressor
#from xgboost import XGBRegressor, DMatrix
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
%matplotlib inline

In [2]:
np.random.seed(21)

In [3]:
train = pd.read_csv("transform/train_transform_after_preparation.csv")
test = pd.read_csv("transform/test_transform_after_preparation.csv")

In [4]:
train.shape

(1456, 346)

In [5]:
test.shape

(1459, 345)

In [6]:
train.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice
0,1,0.235294,0.207668,0.03342,0.666667,0.5,0.050725,0.116667,0.1225,0.322669,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,208500
1,2,0.0,0.255591,0.038795,0.555556,0.875,0.246377,0.566667,0.0,0.446984,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,181500
2,3,0.235294,0.217252,0.046507,0.666667,0.5,0.065217,0.133333,0.10125,0.222121,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,223500
3,4,0.294118,0.191693,0.038561,0.666667,0.5,0.688406,0.666667,0.0,0.09872,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,140000
4,5,0.235294,0.268371,0.060576,0.777778,0.5,0.072464,0.166667,0.21875,0.29936,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,250000


In [7]:
test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,0.0,0.4,0.184147,0.444444,0.625,0.374046,0.816667,0.0,0.116708,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1462,0.0,0.405,0.232124,0.555556,0.625,0.396947,0.866667,0.083721,0.230175,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1463,0.235294,0.37,0.224197,0.444444,0.5,0.099237,0.2,0.0,0.197257,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1464,0.235294,0.39,0.154326,0.555556,0.625,0.091603,0.2,0.015504,0.150125,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1465,0.588235,0.215,0.064121,0.777778,0.5,0.137405,0.3,0.0,0.065586,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
X_train = train.drop(['SalePrice', 'Id'], 1)
y_train = np.log(train['SalePrice'])
#y = train['SalePrice'].reset_index(drop=True)

  X_train = train.drop(['SalePrice', 'Id'], 1)


In [9]:
X_test = test.drop('Id', 1)

  X_test = test.drop('Id', 1)


In [10]:
def rmse_cv(model):
    rmse= np.sqrt(-cross_val_score(model, X_train, y_train, scoring = "neg_mean_squared_error", cv = 5).mean())
    return(rmse)

In [11]:
def rmse_cv_search(mse_mean):
    rmse= np.sqrt(-mse_mean)
    return(rmse)

In [12]:
def get_results(search, name='NAN'):
    
    rcols = ['Name','Model', 'BestParameters', 'Scorer', 'Index', 'BestScore', 'BestScoreStd', 'Best Score Search']
    res = pd.DataFrame(columns=rcols)
    
    results = search.cv_results_
    model = search.best_estimator_

    scoring = {'MEA': 'neg_mean_absolute_error', 'R2': 'r2', 'RMSE': 'neg_mean_squared_error'}

    for scorer in sorted(scoring):
        best_index = search.best_index_
        if scorer == 'RMSE': 
            best = np.sqrt(-results['mean_test_%s' % scoring[scorer]][best_index])
            best_std = np.sqrt(results['std_test_%s' % scoring[scorer]][best_index])
        elif scorer == 'MEA':
            best = (-results['mean_test_%s' % scoring[scorer]][best_index])
            best_std = results['std_test_%s' % scoring[scorer]][best_index]
        else:
            best = results['mean_test_%s' % scoring[scorer]][best_index]*100
            best_std = results['std_test_%s' % scoring[scorer]][best_index]*100
        
        r1 = pd.DataFrame([(name, model, search.best_params_, scorer, best_index, best, best_std, search.best_score_)],
                          columns = rcols)
        res = res.append(r1)
        
        bestscore = np.sqrt(-search.best_score_)
        
    print("Best Score: {:.6f}".format(bestscore))
    print('---------------------------------------')
    print('Best Parameters:')
    print(search.best_params_)
    
    return res

# Linear regression

In [310]:
linReg = LinearRegression()

In [311]:
linReg.fit(X_train,y_train)

LinearRegression()

In [312]:
print("RMSE on Training set :", rmse_cv(linReg))

RMSE on Training set : 6095943657.116753


# Ridge

In [313]:
ridgeEst = Ridge()

In [314]:
alpha = [0.0003, 0.0007, 0.0005, 0.05, 0.5, 1.0, 2.0, 5.0, 10.0]
max_iter = [5] # , 10, 100, 200, 300, 400, 500, 600, 1000]
tol = [2e-03, 0.003, 0.001, 0.0005]

param_ridge =\
            dict(
                  alpha = alpha,
                  max_iter = max_iter,
                  tol = tol
                ) 

### RandomizedSearch

In [315]:
ridge = RandomizedSearchCV(estimator = ridgeEst, param_distributions = param_ridge, n_iter = 30,
                           scoring=['neg_mean_squared_error' , 'neg_mean_absolute_error', 'r2'],
                           n_jobs=-1, refit = 'neg_mean_squared_error',
                           cv = 5, verbose=1)

In [316]:
ridge.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


RandomizedSearchCV(cv=5, estimator=Ridge(), n_iter=30, n_jobs=-1,
                   param_distributions={'alpha': [0.0003, 0.0007, 0.0005, 0.05,
                                                  0.5, 1.0, 2.0, 5.0, 10.0],
                                        'max_iter': [5],
                                        'tol': [0.002, 0.003, 0.001, 0.0005]},
                   refit='neg_mean_squared_error',
                   scoring=['neg_mean_squared_error', 'neg_mean_absolute_error',
                            'r2'],
                   verbose=1)

In [317]:
ridge.best_params_

{'tol': 0.0005, 'max_iter': 5, 'alpha': 5.0}

In [318]:
ridge_results = get_results(ridge, 'ridge')
display(ridge_results.loc[:, 'Scorer' : 'BestScoreStd'])

Best Score: 0.116646
---------------------------------------
Best Parameters:
{'tol': 0.0005, 'max_iter': 5, 'alpha': 5.0}


Unnamed: 0,Scorer,Index,BestScore,BestScoreStd
0,MEA,11,0.081512,0.003166
0,R2,11,91.306854,0.727997
0,RMSE,11,0.116646,0.040267


#  Lasso

In [319]:
lassoEst = Lasso()

In [320]:
alpha = [0.0003, 0.0007, 0.0005, 0.05, 0.5, 1.0]
max_iter = [5 , 10, 100, 200, 300, 400, 500, 600, 1000]
tol = [2e-03, 0.003, 0.001, 0.0005]
selection = ['random', 'cyclic'] 

param_lasso =\
            dict(
                alpha = alpha,
                max_iter = max_iter,
                tol = tol,
                selection = selection
                ) 

In [321]:
lasso = RandomizedSearchCV(estimator = lassoEst, param_distributions = param_lasso, n_iter = 30,
                           scoring=['neg_mean_squared_error' , 'neg_mean_absolute_error', 'r2'],
                           n_jobs=-1, refit = 'neg_mean_squared_error',
                           cv = 5, verbose=1)

In [322]:
lasso.fit(X_train,y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


  model = cd_fast.enet_coordinate_descent(


RandomizedSearchCV(cv=5, estimator=Lasso(), n_iter=30, n_jobs=-1,
                   param_distributions={'alpha': [0.0003, 0.0007, 0.0005, 0.05,
                                                  0.5, 1.0],
                                        'max_iter': [5, 10, 100, 200, 300, 400,
                                                     500, 600, 1000],
                                        'selection': ['random', 'cyclic'],
                                        'tol': [0.002, 0.003, 0.001, 0.0005]},
                   refit='neg_mean_squared_error',
                   scoring=['neg_mean_squared_error', 'neg_mean_absolute_error',
                            'r2'],
                   verbose=1)

In [323]:
lasso.best_params_

{'tol': 0.0005, 'selection': 'random', 'max_iter': 300, 'alpha': 0.0003}

In [324]:
lasso_results = get_results(lasso, 'lasso')
display(lasso_results.loc[:, 'Scorer' : 'BestScoreStd'])

Best Score: 0.110943
---------------------------------------
Best Parameters:
{'tol': 0.0005, 'selection': 'random', 'max_iter': 300, 'alpha': 0.0003}


Unnamed: 0,Scorer,Index,BestScore,BestScoreStd
0,MEA,23,0.077609,0.00231
0,R2,23,92.139543,0.785565
0,RMSE,23,0.110943,0.040799


# ElasticNet

In [325]:
elasticnetEst = ElasticNet()

In [326]:
alpha = [0.0003, 0.0007, 0.0005, 0.05, 0.5, 1.0, 2.0, 5.0, 10.0]
l1_ratio = [0, 0.01, 0.1, 0.2, 0.5, 0.7, 0.9, 0.99, 1.0,]
max_iter = [5, 10, 100, 200, 300, 400, 500, 600, 1000]
tol = [2e-03, 0.003, 0.001, 0.0005]
selection = ['random', 'cyclic'] 

param_elasticnet =\
            dict(
                alpha = alpha,
                l1_ratio = l1_ratio,
                max_iter = max_iter,
                tol = tol,
                selection = selection
                ) 

In [327]:
elasticnet = RandomizedSearchCV(estimator = elasticnetEst, param_distributions = param_elasticnet, n_iter = 75,
                           scoring=['neg_mean_squared_error' , 'neg_mean_absolute_error', 'r2'],
                           n_jobs=-1, refit = 'neg_mean_squared_error',
                           cv = 5, verbose=1)

In [328]:
elasticnet.fit(X_train,y_train)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


RandomizedSearchCV(cv=5, estimator=ElasticNet(), n_iter=75, n_jobs=-1,
                   param_distributions={'alpha': [0.0003, 0.0007, 0.0005, 0.05,
                                                  0.5, 1.0, 2.0, 5.0, 10.0],
                                        'l1_ratio': [0, 0.01, 0.1, 0.2, 0.5,
                                                     0.7, 0.9, 0.99, 1.0],
                                        'max_iter': [5, 10, 100, 200, 300, 400,
                                                     500, 600, 1000],
                                        'selection': ['random', 'cyclic'],
                                        'tol': [0.002, 0.003, 0.001, 0.0005]},
                   refit='neg_mean_squared_error',
                   scoring=['neg_mean_squared_error', 'neg_mean_absolute_error',
                            'r2'],
                   verbose=1)

In [329]:
elasticnet.best_params_

{'tol': 0.001,
 'selection': 'cyclic',
 'max_iter': 1000,
 'l1_ratio': 1.0,
 'alpha': 0.0003}

In [330]:
elasticnet_results = get_results(elasticnet, 'elasticnet')
display(elasticnet_results.loc[:, 'Scorer' : 'BestScoreStd'])

Best Score: 0.110979
---------------------------------------
Best Parameters:
{'tol': 0.001, 'selection': 'cyclic', 'max_iter': 1000, 'l1_ratio': 1.0, 'alpha': 0.0003}


Unnamed: 0,Scorer,Index,BestScore,BestScoreStd
0,MEA,73,0.077606,0.002243
0,R2,73,92.134347,0.773922
0,RMSE,73,0.110979,0.040623


# Random Forest Regressor

In [331]:
randomForestRegEst = RandomForestRegressor()

In [332]:
n_estimators = [5, 10, 20, 40, 80, 100, 150, 300, 600, 1000]
criterion = ['squared_error', 'absolute_error', 'poisson']
max_depth = [5, 10, 15, 20, 30, None]
min_samples_split = [2, 3, 5, 7, 10]
min_samples_leaf = [2, 3, 4]

param_RFR =\
            dict(
                n_estimators = n_estimators,
                criterion = criterion,
                max_depth = max_depth,
                min_samples_split = min_samples_split,
                min_samples_leaf = min_samples_leaf
                ) 

In [333]:
randomForestReg = RandomizedSearchCV(estimator = randomForestRegEst, param_distributions = param_RFR, n_iter = 15,
                           scoring=['neg_mean_squared_error' , 'neg_mean_absolute_error', 'r2'],
                           n_jobs=-1, refit = 'neg_mean_squared_error',
                           cv = 5, verbose=1)

In [334]:
randomForestReg.fit(X_train,y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=15,
                   n_jobs=-1,
                   param_distributions={'criterion': ['squared_error',
                                                      'absolute_error',
                                                      'poisson'],
                                        'max_depth': [5, 10, 15, 20, 30, None],
                                        'min_samples_leaf': [2, 3, 4],
                                        'min_samples_split': [2, 3, 5, 7, 10],
                                        'n_estimators': [5, 10, 20, 40, 80, 100,
                                                         150, 300, 600, 1000]},
                   refit='neg_mean_squared_error',
                   scoring=['neg_mean_squared_error', 'neg_mean_absolute_error',
                            'r2'],
                   verbose=1)

In [335]:
randomForestReg.best_params_

{'n_estimators': 80,
 'min_samples_split': 3,
 'min_samples_leaf': 2,
 'max_depth': 15,
 'criterion': 'absolute_error'}

In [336]:
randomForestReg_results = get_results(randomForestReg, 'Random Forest')
display(randomForestReg_results.loc[:, 'Scorer' : 'BestScoreStd'])

Best Score: 0.134471
---------------------------------------
Best Parameters:
{'n_estimators': 80, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_depth': 15, 'criterion': 'absolute_error'}


Unnamed: 0,Scorer,Index,BestScore,BestScoreStd
0,MEA,5,0.092414,0.001603
0,R2,5,88.414701,0.598364
0,RMSE,5,0.134471,0.035261


# LightGBM

In [13]:
lgbEst = LGBMRegressor()

In [14]:
param_lgb = {"learning_rate": [0.01, 0.1, 0.5],
               "n_estimators": [50, 500, 1000],
               "max_depth": [3, 5, 30],
               "num_leaves": [2, 3, 4, 5, 20, 100],
               "reg_lambda": [0, 3],
               "colsample_bytree": [0.5, 0.8, 0.99, 1],
               "min_child_weight": [1e-3, 1, 10, 30]
               }

In [15]:
'''lightgbm = LGBMRegressor(objective='regression', 
                                       num_leaves=4,
                                       learning_rate=0.01, 
                                       n_estimators=5000,
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1,
                                       )'''

"lightgbm = LGBMRegressor(objective='regression', \n                                       num_leaves=4,\n                                       learning_rate=0.01, \n                                       n_estimators=5000,\n                                       max_bin=200, \n                                       bagging_fraction=0.75,\n                                       bagging_freq=5, \n                                       bagging_seed=7,\n                                       feature_fraction=0.2,\n                                       feature_fraction_seed=7,\n                                       verbose=-1,\n                                       )"

In [16]:
lgb = RandomizedSearchCV(estimator = lgbEst, param_distributions = param_lgb, n_iter = 30,
                           scoring=['neg_mean_squared_error' , 'neg_mean_absolute_error', 'r2'],
                           n_jobs=-1, refit = 'neg_mean_squared_error',
                           cv = 5, verbose=1)

In [17]:
lgb.fit(X_train,y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


RandomizedSearchCV(cv=5, estimator=LGBMRegressor(), n_iter=30, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.5, 0.8, 0.99, 1],
                                        'learning_rate': [0.01, 0.1, 0.5],
                                        'max_depth': [3, 5, 30],
                                        'min_child_weight': [0.001, 1, 10, 30],
                                        'n_estimators': [50, 500, 1000],
                                        'num_leaves': [2, 3, 4, 5, 20, 100],
                                        'reg_lambda': [0, 3]},
                   refit='neg_mean_squared_error',
                   scoring=['neg_mean_squared_error', 'neg_mean_absolute_error',
                            'r2'],
                   verbose=1)

In [18]:
lgb.best_params_

{'reg_lambda': 3,
 'num_leaves': 3,
 'n_estimators': 500,
 'min_child_weight': 0.001,
 'max_depth': 3,
 'learning_rate': 0.1,
 'colsample_bytree': 0.5}

In [19]:
lgb_results = get_results(lgb, 'lgb')
display(lgb_results.loc[:, 'Scorer' : 'BestScoreStd'])

Best Score: 0.115342
---------------------------------------
Best Parameters:
{'reg_lambda': 3, 'num_leaves': 3, 'n_estimators': 500, 'min_child_weight': 0.001, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.5}


Unnamed: 0,Scorer,Index,BestScore,BestScoreStd
0,MEA,7,0.078267,0.002436
0,R2,7,91.512738,0.375567
0,RMSE,7,0.115342,0.03872


# Xgboost

In [167]:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin

In [199]:
class XGBRegressor(BaseEstimator, RegressorMixin):
    """
    (Example)
    XGBRegressor in xgboost for sklearn doesnt have ALL parameters accessible, a simple wrapper to expose them
    params = {colsample_bytree=0.9,
              learning_rate=0.01,
              max_depth=5,
              min_child_weight=1,
              n_estimators=300,
              nthread=-1,
              objective='binary:logistic',
              seed=0,
              silent=True,
              subsample=0.8}
    a = XGBRegressor(params=params)
    a.fit(X_train, y_train)
    """

    def __init__(self, num_round=150, eval_metric=None, early_stopping_rounds=None, **params):
        self.params = params
        self.num_boost_round = num_round
        if 'num_boost_round' in params:
            self.num_boost_round = params.pop('num_boost_round')
        self.early_stopping_rounds = early_stopping_rounds
        self.eval_metric = eval_metric
        self.xgb = None

    def fit(self, X, y, x_val=None, y_val=None):
        dtrain = xgb.DMatrix(X, label=y)
        if x_val is not None:
            dtest = xgb.DMatrix(x_val, label=y_val)
            watchlist = [(dtrain, 'train'), (dtest, 'validation')]
            self.xgb = xgb.train(params=self.params,
                                 dtrain=dtrain,
                                 num_boost_round=self.num_boost_round,
                                 early_stopping_rounds=self.early_stopping_rounds,
                                 evals=watchlist)
        else:
            self.xgb = xgb.train(params=self.params,
                                 dtrain=dtrain,
                                 num_boost_round=self.num_boost_round,
                                 early_stopping_rounds=self.early_stopping_rounds)
        return

    def predict(self, X):
        dtest = xgb.DMatrix(X)
        preds = self.xgb.predict(dtest)
        return preds
    
    def get_params(self, deep=True):
        return self.params
    
    def set_params(self, **params):
        if 'num_boost_round' in params:
            self.num_boost_round = params.pop('num_boost_round')
        self.params.update(params)
        return self

In [200]:
xgbEst = XGBRegressor()

In [211]:
num_round = [500, 750, 1000, 3500] 
max_depth = [3, 4]
learning_rate = [0.01, 0.03, 0.1, 0.05]
reg_lambda = [0.1, 1e-03, 1e-05, 1, 0.0] 
reg_alpha= [0.5, 1, 0.0]
booster = ['gblinear', 'dart', 'gbtree']  
objective = ['reg:tweedie', 'reg:squarederror', 'reg:gamma']

param_xgb =\
            dict(
                num_boost_round = num_round,
                booster = booster,
                objective = objective,
                learning_rate = learning_rate,
                reg_lambda = reg_lambda,
                reg_alpha = reg_alpha,
                max_depth = max_depth
                ) 

In [220]:
## Не стоит запускать Xgboost c n_jobs=-1 или какой-то другой параллельной обработкой
xgbSearch = RandomizedSearchCV(estimator = xgbEst, param_distributions = param_xgb, n_iter = 30,
                           scoring=['neg_mean_squared_error' , 'neg_mean_absolute_error', 'r2'],
                           n_jobs=1, refit = 'neg_mean_squared_error',
                           cv = 5, verbose=1)

In [221]:
xgbSearch.fit(X_train,y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Parameters: { "max_depth", "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_depth", "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_depth", "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagge

Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } 

Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } 

Parameters: { "max_depth", "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_depth", "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_depth", "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such case

Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } 

Parameters: { "max_depth", "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_depth", "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "max_depth", "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such case

Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_boost_round" } 

RandomizedSearchCV(cv=5, estimator=XGBRegressor(), n_iter=30, n_jobs=1,
                   param_distributions={'booster': ['gblinear', 'dart',
                                                    'gbtree'],
                                        'learning_rate': [0.01, 0.03, 0.1,
                                                          0.05],
                                        'max_depth': [3, 4],
                                        'num_boost_round': [500, 750, 1000,
                                                            3500],
                                        'objective': ['reg:tweedie',
                                                      'reg:squarederror',
                                                      'reg:gamma'],
                                        'reg_alpha': [0.5, 1, 0.0],
                                        'reg_lambda': [0.1, 0.001, 1e-05, 1,
                                                       0.0]},
                   refit='neg_m

In [296]:
xgbSearch.best_params_

{'reg_lambda': 1,
 'reg_alpha': 0.0,
 'objective': 'reg:tweedie',
 'num_boost_round': 1000,
 'max_depth': 3,
 'learning_rate': 0.1,
 'booster': 'dart'}

In [297]:
results = get_results(xgbSearch, 'xgb')
display(results.loc[:, 'Scorer' : 'BestScoreStd'])

Best Score: 0.116075
---------------------------------------
Best Parameters:
{'reg_lambda': 1, 'reg_alpha': 0.0, 'objective': 'reg:tweedie', 'num_boost_round': 1000, 'max_depth': 3, 'learning_rate': 0.1, 'booster': 'dart'}


Unnamed: 0,Scorer,Index,BestScore,BestScoreStd
0,MEA,12,0.080333,0.003636
0,R2,12,91.385388,0.679612
0,RMSE,12,0.116075,0.038242


# Final Model

In [22]:
# LB 0.18378
lgb_preds = np.exp(lgb.best_estimator_.predict(X_test))

In [23]:
ens = lgb_preds
submission = pd.DataFrame()
submission['Id'] = test['Id']
submission["SalePrice"] = ens
submission.to_csv("submit_ensemble.csv", index=False)