In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

np.random.seed(42)
pd.set_option('display.max_columns', 500)

In [2]:
from sklearn.model_selection import KFold, cross_val_score
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
!pip install --upgrade lightgbm
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge, ElasticNet

Collecting lightgbm
  Downloading lightgbm-3.1.1-py2.py3-none-manylinux1_x86_64.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 884 kB/s 
Installing collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 2.3.1
    Uninstalling lightgbm-2.3.1:
      Successfully uninstalled lightgbm-2.3.1
Successfully installed lightgbm-3.1.1
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [3]:
X_train = np.loadtxt("../input/house-prices-parameter-tuning/xtrain.csv", delimiter=",")
y_train = np.loadtxt("../input/house-prices-parameter-tuning/ytrain.csv", delimiter=",")
X_test = np.loadtxt("../input/house-prices-parameter-tuning/xtest.csv", delimiter=",")
test_id = np.loadtxt("../input/house-prices-parameter-tuning/testid.csv", delimiter=",")

In [4]:
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)

def rmse_cv(model, X = X_train, y = y_train):    
    return np.sqrt(-cross_val_score(model, X, y, scoring = 'neg_mean_squared_error', cv = kf)).mean()

# BayesSearchCV

In this notebook, I won't try to explain meaning of parameters. I just want to share my codes on tuning parameters in https://www.kaggle.com/mustafacicek/comprehensive-eda-feature-engineering-top-4


And, of course, I want to show the performance of BayesSearchCV

**Pros**

-Instead of grid or distributions, you can give a space for parameter

-You can add a callback. It gives us a chance to monitoring best score or best parameters.
This process stops when you reach the best score you want.

Of course, Bayesian optimization

In [5]:
def on_step(optim_result):
    """
    Callback meant to view scores after
    each iteration while performing Bayesian
    Optimization in Skopt"""
    score = opt.best_score_
    print("best score: %s" % score)
    if score >= -0.11:
        print('Interrupting!')
        return True

This callback function defined in https://scikit-optimize.github.io/stable/auto_examples/sklearn-gridsearchcv-replacement.html#sphx-glr-auto-examples-sklearn-gridsearchcv-replacement-py

# Parameter Tuning For LightGBM, XGBoost, GradientBoosting

For detailed information of parameters, you can take a look https://sites.google.com/view/lauraepp/parameters

Boosting algorithms have lots of parameters but you don't have to tune all of them.

https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html

In [6]:
lgbr = lgb.LGBMRegressor(objective = "regression", n_jobs = -1, random_state = 42, learning_rate = 0.1, n_estimators = 200)

rmse_cv(lgbr)

0.1275962354965227

For default LGBMRegressor model, our RMSLE is 0.127596 on cv

**Parameter tuning for boosting algorithms**;

1- Tune the parameters except n_estimators and learning rate,

2- Then Increase the n_estimators(number of boosting rounds) and reduce the learning rate.

In [7]:
# %%time
# start = datetime.now()
# print(start)

# opt = BayesSearchCV(lgbr, 
#                     {
#                         "max_depth": Integer(3, 13),
#                         "num_leaves": Integer(20, 200),
#                         "min_child_samples": Integer(7, 75),
#                         "colsample_bytree": Real(0.25, 1),
#                         "subsample": Real(0.25, 1),
#                         "subsample_freq": Integer(1, 50),
#                         "reg_alpha": Real(0, 1),
#                         "reg_lambda": Real(0, 1),
#                         "min_split_gain": Real(0, 0.5)
#                     },
#                     n_iter = 150,
#                     cv = kf,
#                     n_jobs = -1,
#                     scoring = "neg_root_mean_squared_error",
#                     random_state = 42
#                    )


# opt.fit(X_train, y_train, callback = on_step)

# end = datetime.now()
# print(end)

# print("Best Score is: ", opt.best_score_, "\n")

# print("Best Parameters: ", opt.best_params_, "\n")

# lgbr2 = opt.best_estimator_
# lgbr2

In [8]:
lgbr2 = lgb.LGBMRegressor(colsample_bytree=0.25, max_depth=13, min_child_samples=7,
                          n_estimators=200, num_leaves=20, objective='regression',
                          random_state=42, subsample=0.9330025956033094, subsample_freq=1)

rmse_cv(lgbr2)

0.11875287245960472

This tuning process for LGBMRegressor took 14 minutes.

Remember, our default LGBMRegressor model has 0.127596 RMSLE score.
After the tuning, RMSLE score on local cv is 0.1187528, and the score on leaderboard is 0.1253.

In [9]:
xgbr = xgb.XGBRegressor(objective = "reg:squarederror", n_jobs = -1, random_state = 42, learning_rate = 0.1, n_estimators = 200)

rmse_cv(xgbr)

0.1244116985968082

In [10]:
# %%time
# start = datetime.now()
# print(start)

# opt = BayesSearchCV(xgbr, 
#                     {
#                         "max_depth": Integer(3, 13),
#                         "colsample_bytree": Real(0.25, 1),
#                         "subsample": Real(0.25, 1),
#                         "reg_alpha": Real(0, 1),
#                         "reg_lambda": Real(0, 1),
#                         "gamma": Real(0, 0.5)
#                     },
#                     n_iter = 150,
#                     cv = kf,
#                     n_jobs = -1,
#                     scoring = "neg_root_mean_squared_error",
#                     random_state = 42
#                    )


# opt.fit(X_train, y_train, callback = on_step)


# end = datetime.now()
# print(end)

# print("Best Score is: ", opt.best_score_, "\n")

# print("Best Parameters: ", opt.best_params_, "\n")

# xgbr2 = opt.best_estimator_
# xgbr2

In [11]:
xgbr2 = xgb.XGBRegressor(colsample_bytree=0.25, gamma=0.0, learning_rate=0.1, max_depth=3,
                         n_estimators=200, n_jobs=-1, random_state=42, reg_alpha=0.24206673672530965, 
                         reg_lambda=0.40464485640717085, subsample=1.0)

rmse_cv(xgbr2)

0.11642098813435972

This tuning process for XGBRegressor took 21 minutes.

Remember, our default XGBRegressor model has 0.124411 RMSLE score.
After the tuning, RMSLE score on local cv is 0.116420, and the score on leaderboard is 0.1236.

In [12]:
gbr = GradientBoostingRegressor(n_estimators = 200, learning_rate = 0.1, loss = "huber", 
                                random_state = 42, n_iter_no_change = 20)

rmse_cv(gbr)

0.12331130948866256

In [13]:
# %%time
# start = datetime.now()
# print(start)

# opt = BayesSearchCV(gbr,
#                     {
#                         "max_depth": Integer(3, 13),
#                         "max_features": Real(0.1, 1, prior = "log-uniform"),
#                         "subsample": Real(0.25, 1),
#                         "min_samples_split": Integer(20, 120),
#                         "min_samples_leaf": Integer(1, 10),
#                         "alpha": Real(0.75, 0.95),
#                         "min_impurity_decrease": Real(0, 0.5)
#                     },
#                     n_iter = 150,
#                     cv = kf,
#                     n_jobs = -1,
#                     scoring = "neg_root_mean_squared_error",
#                     random_state = 42
#                    )


# opt.fit(X_train, y_train, callback = on_step)


# end = datetime.now()
# print(end)

# print("Best Score is: ", opt.best_score_, "\n")

# print("Best Parameters: ", opt.best_params_, "\n")

# gbr2 = opt.best_estimator_
# gbr2

In [14]:
gbr2 = GradientBoostingRegressor(alpha=0.8979588317644014, loss='huber', max_depth=13,
                                 max_features=0.1, min_samples_split=109,
                                 n_estimators=200, n_iter_no_change=20,
                                 random_state=42)
rmse_cv(gbr2)

0.11835717290151222

This tuning process for GradientBoostingRegressor took 18 minutes.

Remember, our default GradientBoostingRegressor model has 0.123311 RMSLE score.
After the tuning, RMSLE score on local cv is 0.118357, and the score on leaderboard is 0.1228.

In [15]:
svr = SVR(max_iter = 15000)

rmse_cv(svr)

0.13025558077628668

In [16]:
# %%time
# start = datetime.now()
# print(start)

# opt = BayesSearchCV(svr, 
#                     {
#                         "kernel": Categorical(["linear", "poly", "rbf"]),
#                         "degree": Integer(2, 4),
#                         "gamma": Real(1e-4, 0.1, prior = "log-uniform"),
#                         "C": Real(1e-4, 10,  prior = "log-uniform"),
#                         "epsilon": Real(1e-4, 0.1, prior = "log-uniform"),
#                         "coef0": Real(1e-4, 10, prior = "log-uniform"),
#                     },
#                     n_iter = 150,
#                     cv = kf,
#                     n_jobs = -1,
#                     scoring = "neg_root_mean_squared_error",
#                     random_state = 42
#                    )


# opt.fit(X_train, y_train, callback = on_step)


# end = datetime.now()
# print(end)

# print("Best Score is: ", opt.best_score_, "\n")

# print("Best Parameters: ", opt.best_params_, "\n")

# svr2 = opt.best_estimator_
# svr2

In [17]:
svr2 = SVR(C=0.7682824405204463, coef0=0.0001, degree=2, epsilon=0.0001,
           gamma=0.0042151786393578635, max_iter=15000)

rmse_cv(svr2)

0.11564674136937295

This tuning process for SVR regression took 15 minutes.

Remember, our default SVR regression model has 0.130255 RMSLE score.
After the tuning, RMSLE score on local cv is 0.115646, and the score on leaderboard is 0.1226.

In [18]:
lasso = Lasso(random_state = 42, max_iter = 5000)

rmse_cv(lasso)

0.3980572321407768

In [19]:
# %%time
# start = datetime.now()
# print(start)

# opt = BayesSearchCV(lasso, 
#                     {
#                         "alpha": Real(1e-4, 100, prior = "log-uniform")
#                     },
#                     n_iter = 100,
#                     cv = kf,
#                     n_jobs = -1,
#                     scoring = "neg_root_mean_squared_error",
#                     random_state = 42
#                    )

# opt.fit(X_train, y_train, callback = on_step)


# end = datetime.now()
# print(end)

# print("Best Score is: ", opt.best_score_, "\n")

# print("Best Parameters: ", opt.best_params_, "\n")

# lasso2 = opt.best_estimator_
# lasso2

In [20]:
lasso2 = Lasso(alpha=0.00012609086150256233, max_iter=5000, random_state=42)

rmse_cv(lasso2)

0.122355450899966

This tuning process for Lasso regression took 5 minutes.

Remember, our default Lasso regression model has 0.3980 RMSLE score.
After the tuning, RMSLE score on local cv is 0.122355, and the score on leaderboard is 0.1278.

In [21]:
ridge = Ridge(random_state = 42, max_iter = 5000)

rmse_cv(ridge)

0.12304913866117542

In [22]:
# %%time
# start = datetime.now()
# print(start)

# opt = BayesSearchCV(ridge, 
#                     {
#                         "alpha": Real(1e-4, 100, prior = "log-uniform")
#                     },
#                     n_iter = 100,
#                     cv = kf,
#                     n_jobs = -1,
#                     scoring = "neg_root_mean_squared_error",
#                     random_state = 42
#                    )

# opt.fit(X_train, y_train, callback = on_step)


# end = datetime.now()
# print(end)

# print("Best Score is: ", opt.best_score_, "\n")

# print("Best Parameters: ", opt.best_params_, "\n")

# ridge2 = opt.best_estimator_
# ridge2

In [23]:
ridge2 = Ridge(alpha=2.651347536470113, max_iter=5000, random_state=42)

rmse_cv(ridge2)

0.12284056974454602

This tuning process for Ridge regression took 2 minutes.

Remember, our default Ridge regression model has 0.123049 RMSLE score.
After the tuning, RMSLE score on local cv is 0.122840, and the score on leaderboard is 0.1266.

In [24]:
enet = ElasticNet(random_state = 42, max_iter = 5000)

rmse_cv(enet)

0.3980572321407768

In [25]:
# %%time
# start = datetime.now()
# print(start)

# opt = BayesSearchCV(enet, 
#                     {
#                         "alpha": Real(1e-4, 100, prior = "log-uniform"),
#                         "l1_ratio": Real(1e-4, 100, prior = "log-uniform")
#                     },
#                     n_iter = 100,
#                     cv = kf,
#                     n_jobs = -1,
#                     scoring = "neg_root_mean_squared_error",
#                     random_state = 42
#                    )

# opt.fit(X_train, y_train, callback = on_step)


# end = datetime.now()
# print(end)

# print("Best Score is: ", opt.best_score_, "\n")

# enet2 = opt.best_estimator_
# enet2

In [26]:
enet2 = ElasticNet(alpha=0.0002286518512853544, l1_ratio=0.6510386358323069, max_iter=5000, random_state=42)

rmse_cv(enet2)

0.12240435140496686

This tuning process for ElasticNet regression took 5 minutes.

Remember, our default Ridge regression model has 0.3980 RMSLE score.
After the tuning, RMSLE score on local cv is 0.1224043, and the score on leaderboard is 0.1273.

For boosting models, I just increase the n_estimators parameters 200 to 10000. This is a boosting parameter, number of boosting rounds. You will set early_stopping_rounds in fit parameters for lightgbm and xgboost, and "n_iter_no_change" parameter that you define in GradientBoosting model. For that reason, you can set n_estimators to higher number.

I decrease learning_rate from 0.1 to 0.01.

In [27]:
lgb_model = lgb.LGBMRegressor(colsample_bytree=0.25, learning_rate=0.01,
                              max_depth=13, min_child_samples=7, n_estimators=10000,
                              num_leaves=20, objective='regression', random_state=42,
                              subsample=0.9330025956033094, subsample_freq=1)

xgb_model = xgb.XGBRegressor(colsample_bytree=0.25, gamma=0.0, learning_rate=0.01, max_depth=3,
                             n_estimators=10000, n_jobs=-1, random_state=42, 
                             reg_alpha=0.24206673672530965, reg_lambda=0.40464485640717085, subsample=1.0)

gbr_model = GradientBoostingRegressor(alpha=0.8979588317644014,
                                      learning_rate=0.01, loss='huber',
                                      max_depth=13, max_features=0.1, min_samples_split=109,
                                      n_estimators=10000, n_iter_no_change=100, random_state=42)

svr_model = SVR(C=0.7682824405204463, coef0=0.0001, degree=2, epsilon=0.0001, gamma=0.0042151786393578635, max_iter=10000)

lasso_model = Lasso(alpha=0.00012609086150256233, max_iter=5000, random_state=42)

ridge_model = Ridge(alpha=2.651347536470113, max_iter=5000, random_state=42)

enet_model = ElasticNet(alpha=0.0002286518512853544, l1_ratio=0.6510386358323069, max_iter=5000, random_state=42)

In [28]:
# rmse_cv(lgb_model)
# 0.11686439891203042

In [29]:
# rmse_cv(xgb_model)
# 0.11479692696444313

In [30]:
# rmse_cv(gbr_model)
# 0.11698399499241512

# OOF Predictions

In [31]:
%%time
models = {
    "LGBMRegressor": lgb_model,
    "XGBRegressor": xgb_model,
    "GradientBoostingRegressor": gbr_model,
    "SVR": svr_model,
    "Lasso": lasso_model,
    "Ridge": ridge_model,
#     "ElasticNet": enet_model,
         }

oof_df = pd.DataFrame()
predictions_df = pd.DataFrame()


for name, model in models.items():
    
    print("For model ", name, "\n")
    i = 1
    oof = np.zeros(len(X_train))
    predictions = np.zeros(len(X_test))
    
    for train_ix, test_ix in kf.split(X_train):
        
        print("Out of fold predictions generating for fold ", i)
        
        train_X, train_y = X_train[train_ix], y_train[train_ix]
        test_X, test_y = X_train[test_ix], y_train[test_ix]
        
        if name == "LGBMRegressor":
            model.fit(train_X, train_y,
                      eval_set = [(test_X, test_y)],
                      eval_metric = "rmse",
                      early_stopping_rounds=200,
                      verbose=0)
            
        elif name == "XGBRegressor":
            model.fit(train_X, train_y,
                      eval_set = [(test_X, test_y)],
                      eval_metric = "rmse",
                      early_stopping_rounds=250,
                      verbose=0)
        else:
            model.fit(train_X, train_y)
            
        oof[test_ix] = oof[test_ix] + model.predict(X_train[test_ix])
        predictions = predictions + model.predict(X_test)
        
        i = i + 1
        
        oof_df[name] = oof
        predictions_df[name] = predictions / 10
        
        
    print("\nDone \n")

For model  LGBMRegressor 

Out of fold predictions generating for fold  1
Out of fold predictions generating for fold  2
Out of fold predictions generating for fold  3
Out of fold predictions generating for fold  4
Out of fold predictions generating for fold  5
Out of fold predictions generating for fold  6
Out of fold predictions generating for fold  7
Out of fold predictions generating for fold  8
Out of fold predictions generating for fold  9
Out of fold predictions generating for fold  10

Done 

For model  XGBRegressor 

Out of fold predictions generating for fold  1
Out of fold predictions generating for fold  2
Out of fold predictions generating for fold  3
Out of fold predictions generating for fold  4
Out of fold predictions generating for fold  5
Out of fold predictions generating for fold  6
Out of fold predictions generating for fold  7
Out of fold predictions generating for fold  8
Out of fold predictions generating for fold  9
Out of fold predictions generating for fold  

In [32]:
predictions_df

Unnamed: 0,LGBMRegressor,XGBRegressor,GradientBoostingRegressor,SVR,Lasso,Ridge
0,11.748541,11.762736,11.751305,11.717839,11.657140,11.650488
1,11.988315,11.996193,11.998019,12.011693,12.008598,11.992237
2,12.107837,12.109984,12.128069,12.154825,12.137422,12.133347
3,12.180335,12.198591,12.167821,12.233828,12.218998,12.221573
4,12.146219,12.128007,12.159422,12.161629,12.185252,12.187210
...,...,...,...,...,...,...
1454,11.316778,11.336552,11.323174,11.441505,11.375150,11.370010
1455,11.315574,11.331060,11.331213,11.360464,11.303356,11.285439
1456,11.986408,12.006536,12.023309,12.034027,11.976249,11.959039
1457,11.673435,11.679955,11.662022,11.641669,11.747630,11.744732


**Out of folds predictions on public leaderboard**

LGBMRegressor: 12.012

XGBRegressor: 11.932

GradientBoostingRegressor: 11.653

SVR: 12.23

Lasso: 12.77

**Predict method**

LGBMRegressor: 12.09

XGBRegressor: 12.07

GradientBoostingRegressor: 12.00

SVR: 12.26

Lasso: 12.78

In [33]:
oof = np.zeros(len(X_train))
predictions = np.zeros(len(X_test))
i = 1

for train_ix, test_ix in kf.split(oof_df):

    print("Out of fold predictions generating for fold ", i)

    train_X, train_y = oof_df.values[train_ix], y_train[train_ix]
    test_X, test_y = oof_df.values[test_ix], y_train[test_ix]
    
    model = gbr_model
    model.fit(train_X, train_y)

#     model.fit(train_X, train_y,
#                   eval_set = [(test_X, test_y)],
#                   eval_metric = "rmse",
#                   early_stopping_rounds=250,
#                   verbose=0)        

    oof[test_ix] = oof[test_ix] + model.predict(oof_df.values[test_ix])
    predictions = predictions + model.predict(predictions_df)
    
    i = i + 1

    oof_stacked = oof
    stack_preds  = predictions / 10      

Out of fold predictions generating for fold  1
Out of fold predictions generating for fold  2
Out of fold predictions generating for fold  3
Out of fold predictions generating for fold  4
Out of fold predictions generating for fold  5
Out of fold predictions generating for fold  6
Out of fold predictions generating for fold  7
Out of fold predictions generating for fold  8
Out of fold predictions generating for fold  9
Out of fold predictions generating for fold  10


In [34]:
preds = (4 * stack_preds +
         predictions_df["LGBMRegressor"] +
         predictions_df["XGBRegressor"] +
         2 * predictions_df["GradientBoostingRegressor"] +
         predictions_df["SVR"] +
         predictions_df["Lasso"]) / 10

sub = pd.DataFrame({"Id": test_id, "SalePrice": np.expm1(preds)})
sub.to_csv("BlendedModel120121.csv", index = False)

sub

Unnamed: 0,Id,SalePrice
0,1461.0,125542.102906
1,1462.0,165300.092575
2,1463.0,185849.594145
3,1464.0,199328.447900
4,1465.0,189672.448139
...,...,...
1454,2915.0,88424.836791
1455,2916.0,83663.269066
1456,2917.0,163320.492257
1457,2918.0,119495.959590
