# Lightgbm tuning with Bayesian Optimization 

By: Traci 

Lightgbm was tuned using bayesian optimization. 

More details can be found in readme.

In [1]:
import numpy as np
from scipy import sparse
import pandas as pd
import xgboost as xgb
import re
import string
import time
import seaborn as sns
import itertools
import lightgbm as lgb
from bayes_opt import BayesianOptimization
import seaborn as sns
import gc

from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn import preprocessing, pipeline, metrics, model_selection
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import Imputer
%matplotlib inline
pd.set_option('display.max_columns', 100)

from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix

from sklearn.model_selection import KFold


## Save/load data

In [3]:
from scipy import sparse

#sparse.save_npz("X.npz", X)
X = sparse.load_npz("X.npz")

#y.to_pickle('y.pkl')    #to save the dataframe, df to 123.pkl
y = pd.read_pickle('y.pkl')

In [2]:
testing = sparse.load_npz("testing.npz")

In [15]:
import pickle
# with open("tfvocab.txt", "wb") as fp:   #Pickling
#     pickle.dump(tfvocab, fp)

with open("tfvocab.txt", "rb") as fp:   # Unpickling
    tfvocab = pickle.load(fp)

In [16]:
# For validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=23)

In [17]:
# LGBM Dataset Formatting
lgtrain = lgb.Dataset(X_train, y_train,
                     )
lgvalid = lgb.Dataset(X_valid, y_valid,
                     )

In [8]:
lgb_clf_final = open('lgb_clf_final.txt')

## Bayesian Optimization

In [None]:
from bayes_opt import BayesianOptimization

def lgb_evaluate(max_bin,
                 num_leaves,
                 min_gain_to_split,
                 feature_fraction,
                 bagging_fraction,
                 bagging_freq,
                 lambda_l1,
                 lambda_l2
                 ):
    params = dict()
    params['objective'] = 'regression'
    params['learning_rate'] = 0.2
    params['boosting_type'] = 'gbdt'
    params['metrics'] = 'rmse'
    params['max_bin'] = int(max_bin)
    params['num_leaves'] = int(num_leaves)    
    params['min_gain_to_split'] = min_gain_to_split    
    params['feature_fraction'] = feature_fraction
    params['bagging_fraction'] = bagging_fraction
    params['bagging_freq'] = int(bagging_freq)


    lgb_clf = lgb.train(params,
                           lgtrain,
                            valid_sets=[lgtrain, lgvalid],
                            valid_names=['train','valid'],
                            num_boost_round=50000,
                            #nfold=5,
                            early_stopping_rounds=50,
                            #metrics='rmse',
                            #stratified=False,
                            #shuffle=True,
                            verbose_eval=False
                           )

    return -np.sqrt(metrics.mean_squared_error(y_valid, lgb_clf.predict(X_valid)))


lgb_BO = BayesianOptimization(lgb_evaluate, 
                              {
                                'max_bin': (255,700),
                              'num_leaves': (100, 300),
                              'min_gain_to_split': (0,1),
                              'feature_fraction': (0.5, 0.8),
                              'bagging_fraction': (0.5,0.9),
                              'bagging_freq': (1,5),
                              'lambda_l1': (0,2),
                              'lambda_l2': (0,2)
                             }
                            )

lgb_BO.maximize(init_points=4, n_iter=20)

In [8]:
# Show tuning results
lgb_BO_scores = pd.DataFrame(lgb_BO.res['all']['params'])

lgb_BO_scores['score'] = pd.DataFrame(lgb_BO.res['all']['values'])

lgb_BO_scores = lgb_BO_scores.sort_values(by='score',ascending=False)
lgb_BO_scores

Unnamed: 0,bagging_fraction,bagging_freq,feature_fraction,lambda_l1,lambda_l2,max_bin,min_gain_to_split,num_leaves,score
7,0.832493,4.952672,0.598908,1.92528,0.134296,352.072435,0.140583,229.655521,-0.218156
15,0.893277,1.798229,0.537523,1.616637,0.606078,325.143306,0.029934,145.003845,-0.218222
17,0.878944,4.651486,0.59752,1.790466,0.084355,467.775227,0.02287,249.349329,-0.218229
10,0.881402,4.599764,0.585805,0.037786,1.983252,409.32433,0.124096,200.938474,-0.218234
19,0.8919,1.330146,0.55177,1.462721,0.231633,698.734747,0.231905,137.680223,-0.218238
5,0.885416,4.744184,0.785217,1.89859,1.906142,310.048828,0.094239,101.789619,-0.218265
13,0.825906,2.499046,0.616984,1.744791,0.009505,511.085531,0.025667,295.967637,-0.218292
16,0.894235,2.169236,0.709576,1.970531,0.54658,485.932589,0.04044,161.154025,-0.218299
9,0.783581,2.800925,0.715756,0.039713,0.025579,699.910645,0.256329,207.510256,-0.218411
14,0.886987,1.07481,0.760296,1.918598,0.655052,607.450986,0.013266,105.560628,-0.218467


In [9]:
lgb_BO_scores.to_pickle('lgb_BO_scores.pkl')    #to save the dataframe, df to 123.pkl
#y = pd.read_pickle('y.pkl')

In [13]:
# Train the model with smaller learning rate
params = lgb_BO_scores.iloc[0].to_dict() # get the set of params with highest score
lgb_params = dict()
lgb_params['objective'] = 'regression'
lgb_params['learning_rate'] = 0.02 # Smaller learning rate
lgb_params['boosting_type'] = 'gbdt'
lgb_params['metrics'] = 'root_mean_squared_error'
 
lgb_params['max_bin'] = int(params['max_bin'])   
lgb_params['num_leaves'] = int(params['num_leaves'])    
lgb_params['min_gain_to_split'] = params['min_gain_to_split']     
lgb_params['feature_fraction'] = params['feature_fraction']
lgb_params['bagging_fraction'] = params['bagging_fraction']
lgb_params['bagging_freq'] = int(params['bagging_freq'])
lgb_params['lambda_l1'] = int(params['lambda_l1'])
lgb_params['lambda_l2'] = int(params['lambda_l2'])


best_lgb_clf = lgb.train(lgb_params,
                lgtrain,
                valid_sets=[lgtrain, lgvalid],
                valid_names=['train','valid'],
                num_boost_round=1000000,
                #nfold=5,
                early_stopping_rounds=200, # Bigger stopping rounds
                #metrics='rmse',
                #shuffle=True, stratified=False,
                verbose_eval=100
               )

best_lgb_iteration = best_lgb_clf.current_iteration()
best_lgb_score = np.sqrt(metrics.mean_squared_error(y_valid, best_lgb_clf.predict(X_valid)))

print (best_lgb_iteration, best_lgb_score)


Training until validation scores don't improve for 200 rounds.
[100]	train's l2: 0.0494789	valid's l2: 0.050374
[200]	train's l2: 0.0474524	valid's l2: 0.0488577
[300]	train's l2: 0.0463288	valid's l2: 0.0482089
[400]	train's l2: 0.0454828	valid's l2: 0.047797
[500]	train's l2: 0.0448062	valid's l2: 0.04754
[600]	train's l2: 0.0442328	valid's l2: 0.0473618
[700]	train's l2: 0.0437354	valid's l2: 0.0472423
[800]	train's l2: 0.0432879	valid's l2: 0.0471556
[900]	train's l2: 0.0428721	valid's l2: 0.04708
[1000]	train's l2: 0.0424885	valid's l2: 0.0470195
[1100]	train's l2: 0.0421271	valid's l2: 0.0469712
[1200]	train's l2: 0.0417989	valid's l2: 0.0469257
[1300]	train's l2: 0.0414975	valid's l2: 0.0468891
[1400]	train's l2: 0.0411949	valid's l2: 0.0468484
[1500]	train's l2: 0.0409203	valid's l2: 0.0468147
[1600]	train's l2: 0.040676	valid's l2: 0.0467939
[1700]	train's l2: 0.0404391	valid's l2: 0.0467688
[1800]	train's l2: 0.0402227	valid's l2: 0.0467564
[1900]	train's l2: 0.0400241	valid'

In [12]:
lgb_params

{'objective': 'regression',
 'learning_rate': 0.02,
 'boosting_type': 'gbdt',
 'metrics': 'rmse',
 'max_bin': 352,
 'num_leaves': 229,
 'min_gain_to_split': 0.14058266065471703,
 'feature_fraction': 0.5989083426274472,
 'bagging_fraction': 0.8324926777545907,
 'bagging_freq': 4,
 'lambda_l1': 1,
 'lambda_l2': 0,
 'verbose': 1}

In [18]:
lgb_params = {'objective': 'regression',
             'learning_rate': 0.02,
             'boosting_type': 'gbdt',
             'metrics': 'rmse',
             'max_bin': 352,
             'num_leaves': 229,
             'min_gain_to_split': 0.14058266065471703,
             'feature_fraction': 0.5989083426274472,
             'bagging_fraction': 0.8324926777545907,
             'bagging_freq': 4,
             'lambda_l1': 1,
             'lambda_l2': 0,
             'verbose': 1}

best_lgb_iteration = 2152

In [19]:
# Prepare submission for best single model
start = time.time()
lgtrain = lgb.Dataset(X, y)
    #del X; gc.collect()
    # Go Go Go
lgb_clf = lgb.train(
                    lgb_params,
                    lgtrain,
                    num_boost_round=best_lgb_iteration,
                    verbose_eval=100
                )
print ("Training finished in %d seconds." % (time.time()-start))

Training finished in 5727 seconds.


In [20]:
from sklearn.externals import joblib
# save model
joblib.dump(lgb_clf, 'lgb_clf.pkl')
# load model
#lgb_clf = joblib.load('lgb_clf.pkl')

['lgb_clf.pkl']

In [5]:
from sklearn.externals import joblib
lgb_clf = joblib.load('lgb_clf.pkl')

In [6]:
lgb_clf

<lightgbm.basic.Booster at 0x1e86cc2c550>

In [11]:
# Prepare submission for best single model
start = time.time()
clf = lgb.LGBMRegressor(learning_rate = 0.02
                        , n_estimators =best_lgb_iteration 
                        , max_bin = lgb_params['max_bin']
                        , num_leaves = lgb_params['num_leaves']
                        , min_split_gain = lgb_params['min_gain_to_split'] 
                        , colsample_bytree = lgb_params['feature_fraction']
                        , subsample = lgb_params['bagging_fraction']
                        , subsample_freq = lgb_params['bagging_freq']
                        , lambda_l1 = lgb_params['lambda_l1']
                        , lambda_l2 = lgb_params['lambda_l2']
                        , random_state = 1234
                        , objective = 'root_mean_squared_error'
                       )

print (clf)

clf.fit(X, y)

print ("Training finished in %d seconds." % (time.time()-start))

LGBMRegressor(boosting_type='gbdt', class_weight=None,
       colsample_bytree=0.5989083426274472, lambda_l1=1, lambda_l2=0,
       learning_rate=0.02, max_bin=352, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.14058266065471703,
       n_estimators=2152, n_jobs=-1, num_leaves=229,
       objective='root_mean_squared_error', random_state=1234,
       reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=0.8324926777545907, subsample_for_bin=200000,
       subsample_freq=4)
Training finished in 5426 seconds.


In [None]:
preds = lgb_clf.predict(testing)



In [None]:
preds = clf.predict(testing)
lgsub = pd.DataFrame(preds,columns=["deal_probability"],index=testdex)
lgsub['deal_probability'] = lgsub['deal_probability'].clip(0.0, 1.0) # Between 0 and 1
lgsub.to_csv("bow-meta-text-and-dense-features-lgbm-.csv",index=True,header=True)



## Stacking

In [None]:
preds = clf.predict(testing)
lgsub = pd.DataFrame(preds,columns=["deal_probability"],index=testdex)
lgsub['deal_probability'] = lgsub['deal_probability'].clip(0.0, 1.0) # Between 0 and 1
lgsub.to_csv("bow-meta-text-and-dense-features-lgbm-.csv",index=True,header=True)