In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from bayes_opt import BayesianOptimization
import gc

In [2]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of a dataframe to smaller dtypes to reduce memory usage.

    The dataframe is modified in place and also returned.

    Per-column handling:
      * datetime and categorical columns are left untouched;
      * object / string columns are converted to ``category``;
      * signed integer columns are cast to the smallest of int8/16/32/64 that
        holds the column's [min, max] range (bounds are inclusive);
      * unsigned integer columns are cast to the smallest of uint8/16/32/64;
      * float columns are cast to float16 (only if ``use_float16``), float32
        or float64, depending on the column's value range;
      * every other dtype (bool, timedelta, nullable extension dtypes, ...) is
        left unchanged, because casting them to a float would either grow the
        column or lose information.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    use_float16 : bool, default False
        Allow float columns to be stored as float16 when their range fits.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    # Candidate targets, ordered from narrowest to widest.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if pd.api.types.is_datetime64_any_dtype(col_type) or isinstance(col_type, pd.CategoricalDtype):
            continue

        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype("category")
            continue

        # Extension dtypes (e.g. nullable Int64) may hold missing values that
        # a plain numpy cast cannot represent, so they are not downcast here.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.signedinteger):
            int_candidates = signed_types
        elif np.issubdtype(col_type, np.unsignedinteger):
            int_candidates = unsigned_types
        elif np.issubdtype(col_type, np.floating):
            int_candidates = None
        else:
            # bool, timedelta, complex, ...: nothing sensible to downcast to.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if int_candidates is not None:
            # An empty integer column has NaN as min/max; nothing to size against.
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            lowest, highest = int(c_min), int(c_max)
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= lowest and highest <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            # Also reached for all-NaN or infinite ranges, where the
            # comparisons above are False.
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # Guard against a zero-byte frame so the percentage never divides by zero.
    if start_mem > 0:
        print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
def xgb_evaluate(max_depth, subsample, gamma, colsample_bytree, max_leaves, max_bin, min_child_weight,
                 colsample_bylevel, reg_alpha, reg_lambda, random_seed=6):
    """
    Objective function for Bayesian optimisation of XGBoost hyper-parameters.

    Runs 3-fold cross-validation (up to 1000 boosting rounds, early stopping
    after 50 rounds) on the module-level ``dtrain`` DMatrix with the given
    hyper-parameters and returns the negated mean test RMSE taken from the
    last row of the CV history.

    The optimiser proposes every value as a float, so the parameters that
    XGBoost expects as integers (``max_depth``, ``max_leaves``, ``max_bin``)
    are truncated with ``int()`` before use.

    Parameters
    ----------
    max_depth, max_leaves, max_bin : float
        Truncated to int before being passed to XGBoost.
    subsample, gamma, colsample_bytree, min_child_weight,
    colsample_bylevel, reg_alpha, reg_lambda : float
        Passed to XGBoost unchanged.
    random_seed : int, default 6
        Seed handed to ``xgb.cv`` for the fold split.

    Returns
    -------
    float
        ``-1.0 * test-rmse-mean`` of the final CV row (higher is better).
    """
    params = {
        'eval_metric': 'rmse',
        # 'reg:squarederror' is the current name of the squared-error
        # objective; it matches the objective used in the final params cell.
        'objective': 'reg:squarederror',
        'booster': 'gbtree',
        'max_depth': int(max_depth),
        'subsample': subsample,
        'eta': 0.05,
        'tree_method': 'gpu_hist',
        'gamma': gamma,
        'colsample_bytree': colsample_bytree,
        'max_leaves': int(max_leaves),
        'max_bin': int(max_bin),
        'min_child_weight': min_child_weight,
        'colsample_bylevel': colsample_bylevel,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        # NOTE(review): 'n_gpus' may be ignored or rejected by newer XGBoost
        # releases -- confirm against the installed version.
        'n_gpus': 2,
    }
    cv_result = xgb.cv(
        params,
        dtrain,
        num_boost_round=1000,
        nfold=3,
        seed=random_seed,
        stratified=False,
        verbose_eval=100,
        early_stopping_rounds=50,
    )
    # Bayesian optimization only knows how to maximize, not minimize, so return the negative RMSE
    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

In [5]:
# Load the outlier-filtered training table and shrink its dtypes before use.
df_merge = reduce_mem_usage(
    pd.read_csv('../output/outlier_remove.csv')
)

# 'logerror' is the regression target; every remaining column is a feature.
target = df_merge['logerror']
features = df_merge.drop(columns=['logerror'])

# DMatrix shared by the cross-validation runs in xgb_evaluate.
dtrain = xgb.DMatrix(data=features, label=target)

Memory usage of dataframe is 58.98 MB
Memory usage after optimization is: 23.37 MB
Decreased by 60.4%


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [7]:
# Bayesian optimiser over xgb_evaluate; each entry is the (lower, upper)
# bound of the search interval for one hyper-parameter.
xgb_bo = BayesianOptimization(
    f=xgb_evaluate,
    pbounds={
        'max_depth': (3, 9),
        'gamma': (0.8, 1),
        'subsample': (0.6, 0.9),
        'max_leaves': (1000, 2000),
        'colsample_bytree': (0.4, 0.7),
        'reg_lambda': (1, 2),
        'reg_alpha': (1, 2),
        'max_bin': (250, 350),
        'colsample_bylevel': (0.4, 0.7),
        'min_child_weight': (14, 20),
    },
)

# Expected improvement ('ei') is used as the acquisition function because the
# objective (negative RMSE) takes negative values.
# 3 random initial points followed by 10 guided iterations.
xgb_bo.maximize(init_points=3, n_iter=10, acq='ei')

|   iter    |  target   | colsam... | colsam... |   gamma   |  max_bin  | max_depth | max_le... | min_ch... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------
[0]	train-rmse:0.474469+2.57207e-05	test-rmse:0.474469+7.15557e-05
[100]	train-rmse:0.0822457+9.14525e-05	test-rmse:0.0824603+0.000191771
[200]	train-rmse:0.0819063+8.3288e-05	test-rmse:0.0822737+0.000198611
[300]	train-rmse:0.0816927+8.14221e-05	test-rmse:0.0821917+0.000198847
[400]	train-rmse:0.08151+8.20853e-05	test-rmse:0.0821383+0.00019766
[500]	train-rmse:0.081341+8.53385e-05	test-rmse:0.082093+0.000197246
[600]	train-rmse:0.0811867+8.88907e-05	test-rmse:0.0820573+0.000194438
[700]	train-rmse:0.0810477+9.40792e-05	test-rmse:0.0820253+0.000195813
[800]	train-rmse:0.080921+9.48789e-05	test-rmse:0.082003+0.000198103
[900]	train-rmse:0.0808003+9.90769e-05	test-rmse:0.0819853+0.000197377
[999]	trai

[0]	train-rmse:0.474446+4.39394e-05	test-rmse:0.474446+5.40432e-05
[100]	train-rmse:0.080837+5.48695e-05	test-rmse:0.0821417+0.000221512
[200]	train-rmse:0.079731+7.44222e-05	test-rmse:0.081917+0.000222757
[300]	train-rmse:0.0788423+9.14525e-05	test-rmse:0.0818493+0.000219432
[400]	train-rmse:0.0780417+8.4917e-05	test-rmse:0.0818343+0.000239444
| [0m 8       [0m | [0m-0.08183 [0m | [0m 0.6798  [0m | [0m 0.6559  [0m | [0m 0.9685  [0m | [0m 250.2   [0m | [0m 7.362   [0m | [0m 1.998e+0[0m | [0m 15.06   [0m | [0m 1.964   [0m | [0m 1.198   [0m | [0m 0.6541  [0m |
[0]	train-rmse:0.474447+3.22594e-05	test-rmse:0.474448+6.33772e-05
[100]	train-rmse:0.082032+8.24783e-05	test-rmse:0.082376+0.000198912
[200]	train-rmse:0.0815927+9.23231e-05	test-rmse:0.0821773+0.000195623
[300]	train-rmse:0.081284+9.57914e-05	test-rmse:0.082086+0.000195153
[400]	train-rmse:0.081017+0.000111053	test-rmse:0.0820273+0.000186523
[500]	train-rmse:0.0807747+0.000110228	test-rmse:0.0819787+0.0001

In [None]:
|  2        | -0.08181  |  0.4715   |  0.5507   |  0.8069   |  330.5    |  8.922    |  1.919e+0 |  14.06    |  1.665    |  1.874    |  0.7119   |
|   iter    |  target   | colsam... | colsam... |   gamma   |  max_bin  | max_depth | max_le... | min_ch... | reg_alpha | reg_la... | subsample |

In [None]:
# Final training parameters, copied from the best Bayesian-optimisation
# iteration (iter 2, target -0.08181).
# Values mirror what xgb_evaluate actually passed to XGBoost for that
# iteration: integer-valued parameters were truncated with int(), so the
# proposed max_depth of 8.922 was evaluated as 8 and max_bin 330.5 as 330,
# while min_child_weight was passed through untruncated (14.06).
params = {
    'eval_metric': 'rmse',
    'objective': 'reg:squarederror',
    'booster': 'gbtree',
    'nthread': 4,
    'eta': 0.05,
    'max_leaves': 1919,
    'max_depth': 8,
    'subsample': 0.7119,
    'colsample_bytree': 0.5507,
    'colsample_bylevel': 0.4715,
    'gamma': 0.8069,
    'max_bin': 330,
    'min_child_weight': 14.06,
    'reg_alpha': 1.665,
    'reg_lambda': 1.874,
    'tree_method': 'gpu_hist',
    # NOTE(review): 'n_gpus' may be ignored or rejected by newer XGBoost
    # releases -- confirm against the installed version.
    'n_gpus': 2,
}