In [24]:
import pandas as pd
import numpy as np
# import random

import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
import gc
import utils 

plt.style.use('ggplot')
%matplotlib inline

# Tune LightGBM hyperparameters

In [25]:
# Load the raw train/test CSV files (semicolon-separated; the literal string
# 'None' in the test file is read as a missing value).
train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';', na_values='None')

# Project-local cleaning and feature engineering, applied identically to both sets.
train = utils.new_features(utils.clean_data(train))
test = utils.new_features(utils.clean_data(test))

X_train = train.drop(['cardio'], axis=1)
y_train = train['cardio'].values.ravel()
# Explicit copy: a column is added to X_test below and `test` must stay untouched.
X_test = test.copy()

# Mean age per original age group, computed over train and test together.
data = pd.concat((train.drop('cardio', axis=1), test), axis=0)
dic = data.groupby('age_group_orig')['age'].mean().to_dict()

# age_dif = age minus the mean age of the row's group. A vectorised Series.map
# replaces the former row-wise DataFrame.apply(axis=1) with a dict lookup.
X_train['age_dif'] = X_train['age'] - X_train['age_group_orig'].map(dic)
X_test['age_dif'] = X_test['age'] - X_test['age_group_orig'].map(dic)

# Columns fed to the models in the rest of the notebook.
FEATURE_COLUMNS = [
    'gender',
    'height',
    'weight',
    'ap_hi',
    'ap_lo',
    'cholesterol',
    'active_fair',
    'smoke_restored',
    'alco_restored',
    'height_mul_log_cholesterol',
    'height_mul_log_gluc',
    'BMI',
    'age_group',
    'cholesterol_div_log_gluc',
    'gluc_mul_log_age',
    'ap_hi_mul_weight',
    'age_dif',
    'ap_lo_mul_log_ap_hi',
    'age_group_div_height',
    'age_group_mul_log_MAP',
]
X = X_train[FEATURE_COLUMNS]
Y = y_train

In [26]:
from sklearn.base import clone
from sklearn.metrics import log_loss, accuracy_score

def modelfit(estimator, X_train, y_train, X_test=None, fit_params=None, n_splits=10,
             early_stopping_rounds=100, metric='binary_logloss', seed=1205):
    """Choose the number of boosting rounds for ``estimator`` via ``lgb.cv``.

    Runs LightGBM's built-in cross-validation with the estimator's own
    parameters and early stopping, prints the number of rounds kept and the
    last mean value of ``metric``, and writes that round count back into
    ``estimator`` through ``set_params(n_estimators=...)``.

    Parameters
    ----------
    estimator : object with ``get_params()`` / ``set_params()``
        Its ``n_estimators`` is used as the upper bound on boosting rounds.
        The estimator is modified in place.
    X_train, y_train
        Features and labels handed to ``lgb.Dataset``.
    X_test, fit_params
        Unused; kept so that existing calls keep working.
    n_splits : int
        Number of cross-validation folds.
    early_stopping_rounds : int or None
        Patience passed to the early-stopping callback; ``None`` disables it.
    metric : str
        Metric name requested from ``lgb.cv`` and read from its result.
    seed : int
        Seed passed to ``lgb.cv`` for fold generation.

    Returns
    -------
    int
        Length of the mean-metric series returned by ``lgb.cv``.

    Raises
    ------
    KeyError
        If the CV result holds no ``"<metric>-mean"`` series.
    """
    lgb_param = estimator.get_params()
    dataset = lgb.Dataset(X_train, label=y_train)

    # Early stopping is passed as a callback: lgb.cv accepts callbacks in old
    # and new LightGBM releases alike, whereas the `early_stopping_rounds`
    # keyword is not available in every release.
    callbacks = []
    if early_stopping_rounds is not None:
        callbacks.append(lgb.early_stopping(early_stopping_rounds, verbose=False))

    cvresult = lgb.cv(
        lgb_param,
        dataset,
        num_boost_round=lgb_param['n_estimators'],
        nfold=n_splits,
        metrics=metric,
        seed=seed,
        shuffle=True,
        callbacks=callbacks,
    )

    # Some LightGBM releases prefix the key with the dataset name
    # (e.g. "valid binary_logloss-mean"), so fall back to a suffix match.
    mean_key = metric + '-mean'
    if mean_key not in cvresult:
        candidates = [key for key in cvresult if key.endswith(mean_key)]
        if not candidates:
            raise KeyError(mean_key)
        mean_key = candidates[0]

    history = cvresult[mean_key]
    n_rounds = len(history)

    estimator.set_params(n_estimators=n_rounds)
    print("N:", n_rounds)
    print(history[-1])
    return n_rounds

In [27]:
# Baseline classifier with a generous tree budget; modelfit reports how many
# boosting rounds cross-validation actually keeps at learning_rate 0.1.
lgb1 = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    num_leaves=2 ** 5,
    min_child_weight=1,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=4,
    seed=2707,
    silent=True,
)

best_n_estimators = modelfit(estimator=lgb1, X_train=X, y_train=Y)

N: 53
0.539173203918


In [28]:
# Shared 7-fold stratified splitter, passed as `cv=` to the searches below.
kf = StratifiedKFold(n_splits=7, shuffle=True, random_state=2225)

In [29]:
# Base estimator for the num_leaves / min_child_weight search, with the tree
# count fixed to the value found by modelfit.
lgb2 = lgb.LGBMClassifier(
    n_estimators=best_n_estimators,
    learning_rate=0.1,
    num_leaves=2 ** 5,
    min_child_weight=1,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=4,
    seed=1205,
)

In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
# Coarse grid: num_leaves in {8, 32, 128, 512} x min_child_weight in {1, 3, 5}.
param_test1 = {
    'num_leaves': [2 ** exponent for exponent in range(3, 10, 2)],
    'min_child_weight': range(1, 6, 2),
}
# NOTE(review): `iid` and `grid_scores_` exist only in older scikit-learn
# releases (`cv_results_` is the newer replacement for `grid_scores_`) --
# confirm against the pinned scikit-learn version.
gsearch1 = GridSearchCV(
    estimator=lgb2,
    param_grid=param_test1,
    scoring='neg_log_loss',
    cv=kf,
    iid=False,
    n_jobs=1,
)

gsearch1.fit(X, Y)
(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)



([mean: -0.53936, std: 0.00433, params: {'min_child_weight': 1, 'num_leaves': 8},
  mean: -0.53870, std: 0.00465, params: {'min_child_weight': 1, 'num_leaves': 32},
  mean: -0.54231, std: 0.00482, params: {'min_child_weight': 1, 'num_leaves': 128},
  mean: -0.55643, std: 0.00566, params: {'min_child_weight': 1, 'num_leaves': 512},
  mean: -0.53934, std: 0.00438, params: {'min_child_weight': 3, 'num_leaves': 8},
  mean: -0.53855, std: 0.00465, params: {'min_child_weight': 3, 'num_leaves': 32},
  mean: -0.54310, std: 0.00500, params: {'min_child_weight': 3, 'num_leaves': 128},
  mean: -0.55636, std: 0.00512, params: {'min_child_weight': 3, 'num_leaves': 512},
  mean: -0.53933, std: 0.00441, params: {'min_child_weight': 5, 'num_leaves': 8},
  mean: -0.53837, std: 0.00488, params: {'min_child_weight': 5, 'num_leaves': 32},
  mean: -0.54262, std: 0.00496, params: {'min_child_weight': 5, 'num_leaves': 128},
  mean: -0.55820, std: 0.00517, params: {'min_child_weight': 5, 'num_leaves': 512}],


In [35]:
import math
# Express the best num_leaves found by gsearch1 as a base-2 exponent.
math.log(gsearch1.best_params_['num_leaves'], 2)

5.0

In [36]:
# Finer grid: num_leaves in {16, 32, 64} x min_child_weight in {4, 5, 6}.
param_test1 = {
    'num_leaves': [2 ** exponent for exponent in (4, 5, 6)],
    'min_child_weight': [4, 5, 6],
}
gsearch1 = GridSearchCV(
    estimator=lgb2,
    param_grid=param_test1,
    scoring='neg_log_loss',
    cv=kf,
    iid=False,
    n_jobs=1,
)

gsearch1.fit(X, Y)
(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)



([mean: -0.53853, std: 0.00461, params: {'min_child_weight': 4, 'num_leaves': 16},
  mean: -0.53825, std: 0.00488, params: {'min_child_weight': 4, 'num_leaves': 32},
  mean: -0.53954, std: 0.00478, params: {'min_child_weight': 4, 'num_leaves': 64},
  mean: -0.53847, std: 0.00452, params: {'min_child_weight': 5, 'num_leaves': 16},
  mean: -0.53837, std: 0.00488, params: {'min_child_weight': 5, 'num_leaves': 32},
  mean: -0.53936, std: 0.00476, params: {'min_child_weight': 5, 'num_leaves': 64},
  mean: -0.53849, std: 0.00461, params: {'min_child_weight': 6, 'num_leaves': 16},
  mean: -0.53837, std: 0.00463, params: {'min_child_weight': 6, 'num_leaves': 32},
  mean: -0.53942, std: 0.00507, params: {'min_child_weight': 6, 'num_leaves': 64}],
 {'min_child_weight': 4, 'num_leaves': 32},
 -0.53825453778937438)

In [37]:
# Before reusing these, check that the winners do not sit on the edge of the
# searched grid; if they do, widen the grid and search again.
best_num_leaves, best_min_child_weight = (
    gsearch1.best_params_['num_leaves'],
    gsearch1.best_params_['min_child_weight'],
)

In [38]:
# Re-estimate the number of boosting rounds with the tuned num_leaves and
# min_child_weight, again starting from a budget of 1000 trees.
lgb3 = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    num_leaves=best_num_leaves,
    min_child_weight=best_min_child_weight,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=4,
    seed=2707,
)
best_n_estimators = modelfit(estimator=lgb3, X_train=X, y_train=Y)

N: 44
0.539215063301


In [39]:
# Base estimator for the row / column subsampling search.
lgb4 = lgb.LGBMClassifier(
    n_estimators=best_n_estimators,
    learning_rate=0.1,
    num_leaves=best_num_leaves,
    min_child_weight=best_min_child_weight,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=4,
    seed=2707,
)

# Coarse grid: both fractions from 0.5 up to (but excluding) 1.0 in steps of 0.1.
param_test4 = {
    'subsample': np.arange(0.5, 1, 0.1),
    'colsample_bytree': np.arange(0.5, 1, 0.1),
}

gsearch4 = GridSearchCV(
    estimator=lgb4,
    param_grid=param_test4,
    scoring='neg_log_loss',
    cv=kf,
    iid=False,
    n_jobs=1,
)
gsearch4.fit(X, Y)
(gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_)



([mean: -0.53963, std: 0.00469, params: {'colsample_bytree': 0.5, 'subsample': 0.5},
  mean: -0.53911, std: 0.00466, params: {'colsample_bytree': 0.5, 'subsample': 0.59999999999999998},
  mean: -0.53902, std: 0.00451, params: {'colsample_bytree': 0.5, 'subsample': 0.69999999999999996},
  mean: -0.53921, std: 0.00465, params: {'colsample_bytree': 0.5, 'subsample': 0.79999999999999993},
  mean: -0.53899, std: 0.00449, params: {'colsample_bytree': 0.5, 'subsample': 0.89999999999999991},
  mean: -0.53908, std: 0.00451, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.5},
  mean: -0.53870, std: 0.00442, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.59999999999999998},
  mean: -0.53893, std: 0.00491, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.69999999999999996},
  mean: -0.53885, std: 0.00440, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.79999999999999993},
  mean: -0.53852, std: 0.00465, params: {'colsample_bytree'

In [44]:
# Finer grid in steps of 0.025: subsample from 0.85, colsample_bytree from 0.65.
param_test4 = {
    'subsample': np.arange(0.85, 0.96, 0.025),
    'colsample_bytree': np.arange(0.65, 0.76, 0.025),
}

gsearch4 = GridSearchCV(
    estimator=lgb4,
    param_grid=param_test4,
    scoring='neg_log_loss',
    cv=kf,
    iid=False,
    n_jobs=1,
)
gsearch4.fit(X, Y)
(gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_)



([mean: -0.53884, std: 0.00466, params: {'colsample_bytree': 0.65000000000000002, 'subsample': 0.84999999999999998},
  mean: -0.53853, std: 0.00455, params: {'colsample_bytree': 0.65000000000000002, 'subsample': 0.875},
  mean: -0.53865, std: 0.00483, params: {'colsample_bytree': 0.65000000000000002, 'subsample': 0.90000000000000002},
  mean: -0.53854, std: 0.00464, params: {'colsample_bytree': 0.65000000000000002, 'subsample': 0.92500000000000004},
  mean: -0.53873, std: 0.00472, params: {'colsample_bytree': 0.65000000000000002, 'subsample': 0.95000000000000007},
  mean: -0.53884, std: 0.00466, params: {'colsample_bytree': 0.67500000000000004, 'subsample': 0.84999999999999998},
  mean: -0.53853, std: 0.00455, params: {'colsample_bytree': 0.67500000000000004, 'subsample': 0.875},
  mean: -0.53865, std: 0.00483, params: {'colsample_bytree': 0.67500000000000004, 'subsample': 0.90000000000000002},
  mean: -0.53854, std: 0.00464, params: {'colsample_bytree': 0.67500000000000004, 'subsample

In [45]:
# Keep the sampling fractions selected by the last gsearch4 run.
best_colsample_bytree, best_subsample = (
    gsearch4.best_params_['colsample_bytree'],
    gsearch4.best_params_['subsample'],
)

In [46]:
# Base estimator for the reg_alpha search, carrying every value tuned so far.
lgb_5 = lgb.LGBMClassifier(
    n_estimators=best_n_estimators,
    learning_rate=0.1,
    num_leaves=best_num_leaves,
    min_child_weight=best_min_child_weight,
    colsample_bytree=best_colsample_bytree,
    subsample=best_subsample,
    nthread=4,
    seed=2707,
)

In [47]:
# Coarse reg_alpha scan over several orders of magnitude.
param_test5 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}
gsearch5 = GridSearchCV(
    estimator=lgb_5,
    param_grid=param_test5,
    scoring='neg_log_loss',
    cv=kf,
    iid=False,
    n_jobs=1,
)
gsearch5.fit(X, Y)
(gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_)



([mean: -0.53851, std: 0.00470, params: {'reg_alpha': 1e-05},
  mean: -0.53851, std: 0.00459, params: {'reg_alpha': 0.01},
  mean: -0.53866, std: 0.00472, params: {'reg_alpha': 0.1},
  mean: -0.53845, std: 0.00459, params: {'reg_alpha': 1},
  mean: -0.54235, std: 0.00391, params: {'reg_alpha': 100}],
 {'reg_alpha': 1},
 -0.53844688164123344)

In [48]:
# Finer reg_alpha scan from 0.1 in steps of 0.1.
param_test5 = {
    'reg_alpha': np.arange(0.1, 1.1, 0.1),
}
gsearch5 = GridSearchCV(
    estimator=lgb_5,
    param_grid=param_test5,
    scoring='neg_log_loss',
    cv=kf,
    iid=False,
    n_jobs=1,
)
gsearch5.fit(X, Y)
(gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_)



([mean: -0.53866, std: 0.00472, params: {'reg_alpha': 0.10000000000000001},
  mean: -0.53871, std: 0.00463, params: {'reg_alpha': 0.20000000000000001},
  mean: -0.53862, std: 0.00455, params: {'reg_alpha': 0.30000000000000004},
  mean: -0.53852, std: 0.00468, params: {'reg_alpha': 0.40000000000000002},
  mean: -0.53848, std: 0.00457, params: {'reg_alpha': 0.5},
  mean: -0.53858, std: 0.00474, params: {'reg_alpha': 0.59999999999999998},
  mean: -0.53837, std: 0.00470, params: {'reg_alpha': 0.70000000000000007},
  mean: -0.53849, std: 0.00446, params: {'reg_alpha': 0.80000000000000004},
  mean: -0.53869, std: 0.00463, params: {'reg_alpha': 0.90000000000000002},
  mean: -0.53845, std: 0.00459, params: {'reg_alpha': 1.0}],
 {'reg_alpha': 0.70000000000000007},
 -0.53836733635332956)

In [49]:
# Keep the reg_alpha value selected by the last gsearch5 run.
best_reg_alpha = gsearch5.best_params_['reg_alpha']

In [50]:
# Base estimator for the reg_lambda search, carrying every value tuned so far.
lgb_6 = lgb.LGBMClassifier(
    n_estimators=best_n_estimators,
    learning_rate=0.1,
    num_leaves=best_num_leaves,
    min_child_weight=best_min_child_weight,
    colsample_bytree=best_colsample_bytree,
    subsample=best_subsample,
    reg_alpha=best_reg_alpha,
    nthread=4,
    seed=2707,
)

In [51]:
# Coarse reg_lambda scan over several orders of magnitude.
param_test6 = {
    'reg_lambda': [1e-5, 1e-2, 0.1, 1, 2, 3, 5, 10, 100],
}
gsearch6 = GridSearchCV(
    estimator=lgb_6,
    param_grid=param_test6,
    scoring='neg_log_loss',
    cv=kf,
    iid=False,
    n_jobs=1,
)
gsearch6.fit(X, Y)
(gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_)



([mean: -0.53837, std: 0.00470, params: {'reg_lambda': 1e-05},
  mean: -0.53848, std: 0.00470, params: {'reg_lambda': 0.01},
  mean: -0.53847, std: 0.00462, params: {'reg_lambda': 0.1},
  mean: -0.53825, std: 0.00465, params: {'reg_lambda': 1},
  mean: -0.53840, std: 0.00473, params: {'reg_lambda': 2},
  mean: -0.53848, std: 0.00449, params: {'reg_lambda': 3},
  mean: -0.53841, std: 0.00469, params: {'reg_lambda': 5},
  mean: -0.53852, std: 0.00451, params: {'reg_lambda': 10},
  mean: -0.53916, std: 0.00428, params: {'reg_lambda': 100}],
 {'reg_lambda': 1},
 -0.53825271858508505)

In [52]:
# Finer reg_lambda scan from 0.1 in steps of 0.1.
param_test6 = {
    'reg_lambda': np.arange(0.1, 1.1, 0.1),
}
gsearch6 = GridSearchCV(
    estimator=lgb_6,
    param_grid=param_test6,
    scoring='neg_log_loss',
    cv=kf,
    iid=False,
    n_jobs=1,
)
gsearch6.fit(X, Y)
(gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_)



([mean: -0.53847, std: 0.00462, params: {'reg_lambda': 0.10000000000000001},
  mean: -0.53866, std: 0.00458, params: {'reg_lambda': 0.20000000000000001},
  mean: -0.53836, std: 0.00444, params: {'reg_lambda': 0.30000000000000004},
  mean: -0.53840, std: 0.00460, params: {'reg_lambda': 0.40000000000000002},
  mean: -0.53842, std: 0.00458, params: {'reg_lambda': 0.5},
  mean: -0.53842, std: 0.00471, params: {'reg_lambda': 0.59999999999999998},
  mean: -0.53851, std: 0.00459, params: {'reg_lambda': 0.70000000000000007},
  mean: -0.53831, std: 0.00459, params: {'reg_lambda': 0.80000000000000004},
  mean: -0.53852, std: 0.00456, params: {'reg_lambda': 0.90000000000000002},
  mean: -0.53825, std: 0.00465, params: {'reg_lambda': 1.0}],
 {'reg_lambda': 1.0},
 -0.53825271858508505)

In [53]:
# Keep the reg_lambda value selected by the last gsearch6 run.
best_reg_lambda = gsearch6.best_params_['reg_lambda']

In [54]:
# Sweep the learning rate with all other tuned values fixed, letting modelfit
# pick the number of boosting rounds (budget: 2000) for each rate.
# The (learning_rate, rounds) pairs are collected so they can be reused
# programmatically; previously they were only printed and `best_n_estimators`
# was overwritten on every iteration.
lr_cv_rounds = []
for lr in np.arange(0.01, 0.11, 0.01):
    lgb_fin = lgb.LGBMClassifier(
        learning_rate=lr,
        n_estimators=2000,
        num_leaves=best_num_leaves,
        min_child_weight=best_min_child_weight,
        subsample=best_subsample,
        colsample_bytree=best_colsample_bytree,
        reg_alpha=best_reg_alpha,
        reg_lambda=best_reg_lambda,
        nthread=4,
        seed=2707,
    )
    print(lr, "===================================")
    best_n_estimators = modelfit(lgb_fin, X, Y)
    lr_cv_rounds.append((lr, best_n_estimators))

N: 664
0.538457349708
N: 274
0.538514481337
N: 177
0.538574400259
N: 145
0.538724914114
N: 125
0.538811933277
N: 76
0.538795631122
N: 73
0.538711434561
N: 59
0.538864563732
N: 52
0.538999646975
N: 50
0.538825805619


In [None]:
# Note: learning_rate = 0.01 -> n_estimators = 666 (the values used for the final model)

In [55]:
# Final classifier: learning_rate 0.01 with 666 trees, plus all tuned values.
lgb_fin = lgb.LGBMClassifier(
    n_estimators=666,
    learning_rate=0.01,
    num_leaves=best_num_leaves,
    min_child_weight=best_min_child_weight,
    colsample_bytree=best_colsample_bytree,
    subsample=best_subsample,
    reg_alpha=best_reg_alpha,
    reg_lambda=best_reg_lambda,
    nthread=4,
    seed=2707,
)

In [56]:
# Display the full parameter set of the final classifier.
lgb_fin.get_params()

{'boosting_type': 'gbdt',
 'colsample_bytree': 0.70000000000000007,
 'drop_rate': 0.1,
 'is_unbalance': False,
 'learning_rate': 0.01,
 'max_bin': 255,
 'max_depth': -1,
 'max_drop': 50,
 'min_child_samples': 10,
 'min_child_weight': 4,
 'min_split_gain': 0,
 'n_estimators': 666,
 'nthread': 4,
 'num_leaves': 32,
 'objective': 'binary',
 'reg_alpha': 0.70000000000000007,
 'reg_lambda': 1.0,
 'scale_pos_weight': 1,
 'seed': 2707,
 'sigmoid': 1.0,
 'silent': True,
 'skip_drop': 0.5,
 'subsample': 0.90000000000000002,
 'subsample_for_bin': 50000,
 'subsample_freq': 1,
 'uniform_drop': False,
 'xgboost_dart_mode': False}

In [45]:
from sklearn.model_selection import cross_val_score
# (learning_rate, n_estimators) pairs to score with the shared `kf` splitter.
rates = [
    (0.01, 713), (0.02, 369), (0.03, 242), (0.04, 194), (0.05, 153),
    (0.06, 114), (0.07, 114), (0.08, 85), (0.09, 66), (0.1, 61),
]
for lr, n in rates:
    lgb_fin = lgb.LGBMClassifier(
        n_estimators=n,
        learning_rate=lr,
        num_leaves=best_num_leaves,
        min_child_weight=best_min_child_weight,
        colsample_bytree=best_colsample_bytree,
        subsample=best_subsample,
        reg_alpha=best_reg_alpha,
        reg_lambda=best_reg_lambda,
        nthread=4,
        seed=2707,
    )

    # Mean negative log-loss over the folds for this pair.
    scores = cross_val_score(estimator=lgb_fin, X=X, y=Y, scoring='neg_log_loss', cv=kf)
    print(lr, n, np.mean(scores))

0.01 713 -0.538162420234
0.02 369 -0.538157373435
0.03 242 -0.538190042071
0.04 194 -0.538225911238
0.05 153 -0.538427445476
0.06 114 -0.538354152819
0.07 114 -0.538477494464
0.08 85 -0.538507411006
0.09 66 -0.538652597946
0.1 61 -0.53844459685
