In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
import pandas as pd
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from skopt.plots import plot_convergence
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm import tqdm

In [2]:
import pandas as pd
# Load the pre-split train / validation / test frames from the HDF5 store.
# The store is opened read-only (a wrong path now fails immediately instead of
# creating an empty file) and inside a context manager, so the handle is
# closed even if one of the keys is missing.
with pd.HDFStore("../../store/storage4.h5", mode="r") as hdf:
    train = hdf["train"]
    valid = hdf["valid"]
    test = hdf["test"]

In [3]:
# List the columns of the loaded training frame before picking the model inputs.
train.columns

Index(['app', 'channel', 'device', 'ip', 'is_attributed', 'os', 'hour', 'day',
       'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'ip_tcount',
       'ip_app_count', 'ip_app_os_count', 'ip_tchan_count', 'ip_app_os_var',
       'ip_app_channel_var_day', 'ip_app_channel_mean_hour', 'nextClick'],
      dtype='object')

In [4]:
# Columns handed to LightGBM as categorical features.
cat = ['app', 'device', 'os', 'channel', 'hour', 'day']

# Full list of model input columns, in the order they are passed to the model:
# the nextClick column, the categorical columns, the ip_* count columns,
# the ip_* var/mean columns, and finally X0..X8.
features = (
    ['nextClick']
    + cat
    + ['ip_tcount', 'ip_tchan_count', 'ip_app_count', 'ip_app_os_count']
    + ['ip_app_os_var', 'ip_app_channel_var_day', 'ip_app_channel_mean_hour']
    + ['X%d' % idx for idx in range(9)]
)

In [5]:
# LightGBM settings for the num_leaves=24 runs.
# 'scale_pos_weight' is only a starting value: the sweep loop overwrites it
# before every training run.
params = {
    # tree shape / learning speed
    'learning_rate': 0.05,
    'num_leaves': 24,
    'max_depth': 6,
    'min_child_samples': 100,
    'max_bin': 100,
    # row / column subsampling
    'subsample': 0.7,
    'subsample_freq': 1,
    'colsample_bytree': 0.7,
    'min_child_weight': 0,
    # class weighting and binning sample size
    'scale_pos_weight': 230,
    'subsample_for_bin': 200000,
    # split gain threshold and regularisation
    'min_split_gain': 0,
    'reg_alpha': 0,
    'reg_lambda': 0,
    # runtime, objective and metric
    'nthread': 20,
    'verbose': 0,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
}

In [6]:
# Wrap the training and validation frames as LightGBM Datasets.
# Both use the same feature columns and categorical column list; the
# validation set is tied to the training set through `reference`.
dtrain = lgb.Dataset(
    train[features],
    label=train['is_attributed'],
    categorical_feature=cat,
    free_raw_data=False,
)
dtest = lgb.Dataset(
    valid[features],
    label=valid['is_attributed'],
    categorical_feature=cat,
    free_raw_data=False,
    reference=dtrain,
)

In [7]:
# Sweep scale_pos_weight: train one model per value with early stopping on the
# validation AUC, score the test set, and write one submission CSV per value
# (named "<scale_pos_weight>.csv").

# The Datasets do not depend on the swept value, so build them once instead of
# re-creating them on every iteration.
dtrain = lgb.Dataset(train[features], label=train.is_attributed, categorical_feature=cat)
dtest = lgb.Dataset(valid[features], valid.is_attributed, categorical_feature=cat, reference=dtrain)

for n in [200, 220, 240, 260]:
    params['scale_pos_weight'] = n
    print('scale_pos_weight', n)
    print(params)

    model = lgb.train(params=params, train_set=dtrain, num_boost_round=1000,
                      valid_sets=[dtest], valid_names=['valid'],
                      verbose_eval=50,
                      early_stopping_rounds=30)

    print(model.best_score)

    # Pass best_iteration by keyword: the second positional parameter of
    # Booster.predict is not num_iteration in every LightGBM release, so a
    # positional value can end up meaning "start at this iteration".
    pred = model.predict(test[features], num_iteration=model.best_iteration)

    # Submission frame: click id plus predicted probability.
    sub = pd.DataFrame({
        'click_id': test['click_id'].astype('int'),
        'is_attributed': pred,
    })

    sub.to_csv(str(n) + ".csv", index=False, float_format='%.9f')

scale_pos_weight 200
{'learning_rate': 0.05, 'num_leaves': 24, 'max_depth': 6, 'min_child_samples': 100, 'max_bin': 100, 'subsample': 0.7, 'subsample_freq': 1, 'colsample_bytree': 0.7, 'min_child_weight': 0, 'scale_pos_weight': 200, 'subsample_for_bin': 200000, 'min_split_gain': 0, 'reg_alpha': 0, 'reg_lambda': 0, 'nthread': 20, 'verbose': 0, 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc'}




Training until validation scores don't improve for 30 rounds.
[50]	valid's auc: 0.983766
[100]	valid's auc: 0.987797
[150]	valid's auc: 0.989528
[200]	valid's auc: 0.990178
[250]	valid's auc: 0.990442
[300]	valid's auc: 0.990695
[350]	valid's auc: 0.990844
[400]	valid's auc: 0.990939
[450]	valid's auc: 0.99114
[500]	valid's auc: 0.991174
Early stopping, best iteration is:
[506]	valid's auc: 0.991182
defaultdict(<class 'dict'>, {'valid': {'auc': 0.9911821582585557}})
scale_pos_weight 220
{'learning_rate': 0.05, 'num_leaves': 24, 'max_depth': 6, 'min_child_samples': 100, 'max_bin': 100, 'subsample': 0.7, 'subsample_freq': 1, 'colsample_bytree': 0.7, 'min_child_weight': 0, 'scale_pos_weight': 220, 'subsample_for_bin': 200000, 'min_split_gain': 0, 'reg_alpha': 0, 'reg_lambda': 0, 'nthread': 20, 'verbose': 0, 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'categorical_column': [1, 2, 3, 4, 5, 6]}
Training until validation scores don't improve for 30 rounds.
[50]	valid's au

In [8]:
# LightGBM settings for the num_leaves=28 runs.
# 'scale_pos_weight' is only a starting value: the sweep loop overwrites it
# before every training run.
params = {
    # tree shape / learning speed
    'learning_rate': 0.05,
    'num_leaves': 28,
    'max_depth': 6,
    'min_child_samples': 100,
    'max_bin': 100,
    # row / column subsampling
    'subsample': 0.7,
    'subsample_freq': 1,
    'colsample_bytree': 0.7,
    'min_child_weight': 0,
    # class weighting and binning sample size
    'scale_pos_weight': 230,
    'subsample_for_bin': 200000,
    # split gain threshold and regularisation
    'min_split_gain': 0,
    'reg_alpha': 0,
    'reg_lambda': 0,
    # runtime, objective and metric
    'nthread': 20,
    'verbose': 0,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
}

In [9]:
# Sweep scale_pos_weight with the current params: train with early stopping on
# the validation AUC, score the test set, and write one submission CSV per
# value, named "<scale_pos_weight>l<num_leaves>.csv".
for n in [200, 220, 240, 260]:
    params['scale_pos_weight'] = n
    print('scale_pos_weight', n)
    print(params)
    model = lgb.train(params=params, train_set=dtrain, num_boost_round=1000,
                      valid_sets=[dtest], valid_names=['valid'],
                      verbose_eval=50,
                      early_stopping_rounds=30)

    print(model.best_score)

    # Pass best_iteration by keyword: the second positional parameter of
    # Booster.predict is not num_iteration in every LightGBM release, so a
    # positional value can end up meaning "start at this iteration".
    pred = model.predict(test[features], num_iteration=model.best_iteration)

    # Submission frame: click id plus predicted probability.
    sub = pd.DataFrame({
        'click_id': test['click_id'].astype('int'),
        'is_attributed': pred,
    })

    # Take the leaf-count tag from params so the file name cannot drift from
    # the configuration that actually produced it.
    out_name = '{}l{}.csv'.format(n, params['num_leaves'])
    sub.to_csv(out_name, index=False, float_format='%.9f')

scale_pos_weight 200
{'learning_rate': 0.05, 'num_leaves': 28, 'max_depth': 6, 'min_child_samples': 100, 'max_bin': 100, 'subsample': 0.7, 'subsample_freq': 1, 'colsample_bytree': 0.7, 'min_child_weight': 0, 'scale_pos_weight': 200, 'subsample_for_bin': 200000, 'min_split_gain': 0, 'reg_alpha': 0, 'reg_lambda': 0, 'nthread': 20, 'verbose': 0, 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc'}




Training until validation scores don't improve for 30 rounds.
[50]	valid's auc: 0.984122
[100]	valid's auc: 0.988093
[150]	valid's auc: 0.989615
[200]	valid's auc: 0.990182
[250]	valid's auc: 0.990599
[300]	valid's auc: 0.990814
[350]	valid's auc: 0.991001
[400]	valid's auc: 0.991077
[450]	valid's auc: 0.991205
[500]	valid's auc: 0.991295
[550]	valid's auc: 0.991316
Early stopping, best iteration is:
[520]	valid's auc: 0.991319
defaultdict(<class 'dict'>, {'valid': {'auc': 0.991319258577067}})
scale_pos_weight 220
{'learning_rate': 0.05, 'num_leaves': 28, 'max_depth': 6, 'min_child_samples': 100, 'max_bin': 100, 'subsample': 0.7, 'subsample_freq': 1, 'colsample_bytree': 0.7, 'min_child_weight': 0, 'scale_pos_weight': 220, 'subsample_for_bin': 200000, 'min_split_gain': 0, 'reg_alpha': 0, 'reg_lambda': 0, 'nthread': 20, 'verbose': 0, 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc'}
Training until validation scores don't improve for 30 rounds.
[50]	valid's auc: 0.984139
[1

In [10]:
# LightGBM settings for the num_leaves=24, scale_pos_weight=230 run.
params = {
    # tree shape / learning speed
    'learning_rate': 0.05,
    'num_leaves': 24,
    'max_depth': 6,
    'min_child_samples': 100,
    'max_bin': 100,
    # row / column subsampling
    'subsample': 0.7,
    'subsample_freq': 1,
    'colsample_bytree': 0.7,
    'min_child_weight': 0,
    # class weighting and binning sample size
    'scale_pos_weight': 230,
    'subsample_for_bin': 200000,
    # split gain threshold and regularisation
    'min_split_gain': 0,
    'reg_alpha': 0,
    'reg_lambda': 0,
    # runtime, objective and metric
    'nthread': 20,
    'verbose': 0,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
}

In [11]:
# Single run at scale_pos_weight=230: train with early stopping on the
# validation AUC, score the test set, and write "<scale_pos_weight>.csv".
for n in [230]:
    params['scale_pos_weight'] = n
    print('scale_pos_weight', n)
    print(params)
    model = lgb.train(params=params, train_set=dtrain, num_boost_round=1000,
                      valid_sets=[dtest], valid_names=['valid'],
                      verbose_eval=50,
                      early_stopping_rounds=30)

    print(model.best_score)

    # Pass best_iteration by keyword: the second positional parameter of
    # Booster.predict is not num_iteration in every LightGBM release, so a
    # positional value can end up meaning "start at this iteration".
    pred = model.predict(test[features], num_iteration=model.best_iteration)

    # Submission frame: click id plus predicted probability.
    sub = pd.DataFrame({
        'click_id': test['click_id'].astype('int'),
        'is_attributed': pred,
    })

    sub.to_csv(str(n) + ".csv", index=False, float_format='%.9f')

scale_pos_weight 230
{'learning_rate': 0.05, 'num_leaves': 24, 'max_depth': 6, 'min_child_samples': 100, 'max_bin': 100, 'subsample': 0.7, 'subsample_freq': 1, 'colsample_bytree': 0.7, 'min_child_weight': 0, 'scale_pos_weight': 230, 'subsample_for_bin': 200000, 'min_split_gain': 0, 'reg_alpha': 0, 'reg_lambda': 0, 'nthread': 20, 'verbose': 0, 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc'}




Training until validation scores don't improve for 30 rounds.
[50]	valid's auc: 0.984178
[100]	valid's auc: 0.987768
[150]	valid's auc: 0.989378
[200]	valid's auc: 0.990084
[250]	valid's auc: 0.990447
[300]	valid's auc: 0.990654
[350]	valid's auc: 0.990785
[400]	valid's auc: 0.990933
[450]	valid's auc: 0.991049
[500]	valid's auc: 0.991114
[550]	valid's auc: 0.991121
[600]	valid's auc: 0.991195
[650]	valid's auc: 0.991219
[700]	valid's auc: 0.991246
[750]	valid's auc: 0.991272
[800]	valid's auc: 0.991282
[850]	valid's auc: 0.991298
[900]	valid's auc: 0.991334
[950]	valid's auc: 0.991374
Early stopping, best iteration is:
[940]	valid's auc: 0.991377
defaultdict(<class 'dict'>, {'valid': {'auc': 0.9913773220203835}})


In [12]:
# LightGBM settings for the num_leaves=34 runs.
# 'scale_pos_weight' is only a starting value: the sweep loop overwrites it
# before every training run.
params = {
    # tree shape / learning speed
    'learning_rate': 0.05,
    'num_leaves': 34,
    'max_depth': 6,
    'min_child_samples': 100,
    'max_bin': 100,
    # row / column subsampling
    'subsample': 0.7,
    'subsample_freq': 1,
    'colsample_bytree': 0.7,
    'min_child_weight': 0,
    # class weighting and binning sample size
    'scale_pos_weight': 230,
    'subsample_for_bin': 200000,
    # split gain threshold and regularisation
    'min_split_gain': 0,
    'reg_alpha': 0,
    'reg_lambda': 0,
    # runtime, objective and metric
    'nthread': 20,
    'verbose': 0,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
}

In [None]:
# Sweep scale_pos_weight with the current params: train with early stopping on
# the validation AUC, score the test set, and write one submission CSV per
# value, named "<scale_pos_weight>l<num_leaves>.csv".
for n in [200, 220]:
    params['scale_pos_weight'] = n
    print('scale_pos_weight', n)
    print(params)
    model = lgb.train(params=params, train_set=dtrain, num_boost_round=1000,
                      valid_sets=[dtest], valid_names=['valid'],
                      verbose_eval=50,
                      early_stopping_rounds=30)

    print(model.best_score)

    # Pass best_iteration by keyword: the second positional parameter of
    # Booster.predict is not num_iteration in every LightGBM release, so a
    # positional value can end up meaning "start at this iteration".
    pred = model.predict(test[features], num_iteration=model.best_iteration)

    # Submission frame: click id plus predicted probability.
    sub = pd.DataFrame({
        'click_id': test['click_id'].astype('int'),
        'is_attributed': pred,
    })

    # The file used to be tagged "l32" although the model is trained with
    # num_leaves=34; take the tag from params so the name matches the
    # configuration that produced the predictions.
    out_name = '{}l{}.csv'.format(n, params['num_leaves'])
    sub.to_csv(out_name, index=False, float_format='%.9f')

scale_pos_weight 200
{'learning_rate': 0.05, 'num_leaves': 34, 'max_depth': 6, 'min_child_samples': 100, 'max_bin': 100, 'subsample': 0.7, 'subsample_freq': 1, 'colsample_bytree': 0.7, 'min_child_weight': 0, 'scale_pos_weight': 200, 'subsample_for_bin': 200000, 'min_split_gain': 0, 'reg_alpha': 0, 'reg_lambda': 0, 'nthread': 20, 'verbose': 0, 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc'}




Training until validation scores don't improve for 30 rounds.
[50]	valid's auc: 0.984699
[100]	valid's auc: 0.988456
[150]	valid's auc: 0.989799
[200]	valid's auc: 0.990397
[250]	valid's auc: 0.990671
[300]	valid's auc: 0.990871
[350]	valid's auc: 0.991004
[400]	valid's auc: 0.991137
[450]	valid's auc: 0.991257
[500]	valid's auc: 0.991334
[550]	valid's auc: 0.991398
[600]	valid's auc: 0.991442
[650]	valid's auc: 0.991461
[700]	valid's auc: 0.991458
Early stopping, best iteration is:
[683]	valid's auc: 0.991475
defaultdict(<class 'dict'>, {'valid': {'auc': 0.9914754711753446}})
scale_pos_weight 220
{'learning_rate': 0.05, 'num_leaves': 34, 'max_depth': 6, 'min_child_samples': 100, 'max_bin': 100, 'subsample': 0.7, 'subsample_freq': 1, 'colsample_bytree': 0.7, 'min_child_weight': 0, 'scale_pos_weight': 220, 'subsample_for_bin': 200000, 'min_split_gain': 0, 'reg_alpha': 0, 'reg_lambda': 0, 'nthread': 20, 'verbose': 0, 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc'}
Traini