In [1]:
import pandas as pd
import numpy as np

import random

import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns
import gc
import utils 

# Apply the 'ggplot' matplotlib style and render figures inline in the notebook.
plt.style.use('ggplot')
%matplotlib inline

# TUNE XGB

In [2]:
# Read both ';'-separated files; only the test file maps the literal
# string 'None' to NaN.
train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';', na_values='None')

# Run the shared cleaning step, then the shared feature-engineering step,
# on both frames (train first, then test, for each step).
train, test = [utils.clean_data(frame) for frame in (train, test)]
train, test = [utils.new_features(frame) for frame in (train, test)]

# Target vector and the full predictor frames.
y_train = train['cardio'].values.ravel()
X_train = train.drop('cardio', axis=1)
X_test = test.drop([], axis=1)

# Columns actually fed to the models.
feature_columns = [
    "age_group", "gender", "height", "weight",
    "ap_hi", "ap_lo",
    "cholesterol", "gluc",
    "smoke_restored", "alco_restored", "active_restored",
    "BMI", "ap_dif", "MAP", "age_group_MAPX",
]
X = X_train[feature_columns]
Y = y_train

In [9]:
from sklearn.base import clone
from sklearn.metrics import log_loss, accuracy_score

def modelfit(estimator, X_train, y_train, X_test=None, fit_params=None, n_splits=7,
             early_stopping_rounds=100, seed=1205, stratified=False):
    """Choose the number of boosting rounds for ``estimator`` via ``xgb.cv``.

    Runs an ``n_splits``-fold cross-validation on the log-loss metric with
    early stopping, then writes the resulting number of rounds back into
    ``estimator`` (``n_estimators``) so it can be reused directly afterwards.

    Parameters
    ----------
    estimator : xgboost sklearn-style estimator
        Must provide ``get_xgb_params()``, ``get_params()`` and ``set_params()``.
        Its current ``n_estimators`` is the upper bound on boosting rounds.
        NOTE: the estimator is modified in place.
    X_train : DataFrame or array-like
        Training features. A ``.values`` attribute is used when present,
        otherwise the object is passed to ``xgb.DMatrix`` as is.
    y_train : array-like
        Training labels.
    X_test, fit_params :
        Accepted for backward compatibility only; not used.
    n_splits : int
        Number of CV folds.
    early_stopping_rounds : int
        Passed through to ``xgb.cv``.
    seed : int
        Seed for the CV fold shuffling (default keeps the previous fixed value).
    stratified : bool
        Passed through to ``xgb.cv``; the default keeps the previous behaviour.

    Returns
    -------
    int
        Number of rows in the CV result, i.e. the number of boosting rounds kept.
    """
    # DataFrames are unwrapped to their underlying array; anything else is
    # handed over unchanged.
    features = getattr(X_train, 'values', X_train)

    xgb_param = estimator.get_xgb_params()
    xgtrain = xgb.DMatrix(features, label=y_train)
    cvresult = xgb.cv(
        xgb_param,
        xgtrain,
        num_boost_round=estimator.get_params()['n_estimators'],
        nfold=n_splits,
        metrics='logloss',
        early_stopping_rounds=early_stopping_rounds,
        seed=seed,
        shuffle=True,
        stratified=stratified,
    )

    # One row per boosting round that survived early stopping.
    best_rounds = cvresult.shape[0]
    estimator.set_params(n_estimators=best_rounds)

    print("N:", best_rounds)
    print(cvresult.iloc[-1])
    return best_rounds

In [10]:
# Starting estimator for tuning: fixed learning rate and a generous cap on
# the number of trees, which modelfit trims via early-stopped CV.
xgb1 = xgb.XGBClassifier(**dict(
    # boosting schedule
    learning_rate=0.1,
    n_estimators=1000,
    # tree shape / growth
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    tree_method='hist',
    grow_policy='lossguide',
    # row / column sampling
    subsample=0.8,
    colsample_bytree=0.8,
    # runtime
    silent=True,
    nthread=4,
    seed=2707,
))

# modelfit also writes the selected tree count back into xgb1.
best_n_estimators = modelfit(xgb1, X, Y)

N: 67
test-logloss-mean     0.538495
test-logloss-std      0.004673
train-logloss-mean    0.526784
train-logloss-std     0.000772
Name: 66, dtype: float64


In [11]:
# Shared 7-fold stratified, shuffled splitter with a fixed seed.
kf = StratifiedKFold(n_splits=7, shuffle=True, random_state=2225)

In [12]:
# Base estimator for the depth / child-weight grid search; the tree count
# comes from the preceding early-stopped CV run.
xgb2 = xgb.XGBClassifier(**dict(
    learning_rate=0.1,
    n_estimators=best_n_estimators,
    # tree shape / growth
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    tree_method='hist',
    grow_policy='lossguide',
    # row / column sampling
    subsample=0.8,
    colsample_bytree=0.8,
    # runtime
    nthread=4,
    seed=1205,
))

In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
# Coarse grid over tree depth and minimum child weight.
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}
# `iid` is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24, where fold scores are always averaged unweighted
# (what iid=False asked for).
gsearch1 = GridSearchCV(estimator=xgb2, param_grid=param_test1,
                        scoring='neg_log_loss', n_jobs=1, cv=kf)
gsearch1.fit(X, Y)

# `grid_scores_` was removed in scikit-learn 0.20; rebuild the same
# (mean, std, params) listing from `cv_results_`.
(
    list(zip(gsearch1.cv_results_['mean_test_score'],
             gsearch1.cv_results_['std_test_score'],
             gsearch1.cv_results_['params'])),
    gsearch1.best_params_,
    gsearch1.best_score_,
)



([mean: -0.53974, std: 0.00416, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: -0.53969, std: 0.00423, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: -0.53969, std: 0.00427, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: -0.53867, std: 0.00460, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: -0.53861, std: 0.00462, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: -0.53863, std: 0.00469, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: -0.54017, std: 0.00480, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: -0.54014, std: 0.00475, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: -0.54012, std: 0.00487, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: -0.54503, std: 0.00476, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: -0.54365, std: 0.00511, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: -0.54353, std: 0.00531, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 5, 'min_chil

In [15]:
# Finer grid over tree depth and minimum child weight.
param_test1 = {
    'max_depth': [4, 5, 6],
    'min_child_weight': [2, 3, 4],
}
# `iid` is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24, where fold scores are always averaged unweighted
# (what iid=False asked for).
gsearch1 = GridSearchCV(estimator=xgb2, param_grid=param_test1,
                        scoring='neg_log_loss', n_jobs=1, cv=kf)
gsearch1.fit(X, Y)

# `grid_scores_` was removed in scikit-learn 0.20; rebuild the same
# (mean, std, params) listing from `cv_results_`.
(
    list(zip(gsearch1.cv_results_['mean_test_score'],
             gsearch1.cv_results_['std_test_score'],
             gsearch1.cv_results_['params'])),
    gsearch1.best_params_,
    gsearch1.best_score_,
)



([mean: -0.53875, std: 0.00454, params: {'max_depth': 4, 'min_child_weight': 2},
  mean: -0.53871, std: 0.00453, params: {'max_depth': 4, 'min_child_weight': 3},
  mean: -0.53876, std: 0.00445, params: {'max_depth': 4, 'min_child_weight': 4},
  mean: -0.53868, std: 0.00456, params: {'max_depth': 5, 'min_child_weight': 2},
  mean: -0.53861, std: 0.00462, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: -0.53862, std: 0.00475, params: {'max_depth': 5, 'min_child_weight': 4},
  mean: -0.53926, std: 0.00466, params: {'max_depth': 6, 'min_child_weight': 2},
  mean: -0.53925, std: 0.00491, params: {'max_depth': 6, 'min_child_weight': 3},
  mean: -0.53898, std: 0.00490, params: {'max_depth': 6, 'min_child_weight': 4}],
 {'max_depth': 5, 'min_child_weight': 3},
 -0.53861489142171515)

In [16]:
# Keep the winning depth / child-weight pair from gsearch1.
# Reminder: check that neither winner sits on the edge of its searched grid;
# if one does, widen the grid and search again.
best_max_depth, best_min_child_weight = [
    gsearch1.best_params_[key] for key in ('max_depth', 'min_child_weight')
]

In [17]:
# Re-estimate the tree count with the tuned depth / child weight: start from
# a generous cap and let modelfit trim it via early-stopped CV.
xgb3 = xgb.XGBClassifier(**dict(
    learning_rate=0.1,
    n_estimators=1000,
    # tree shape / growth
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=0,
    tree_method='hist',
    grow_policy='lossguide',
    # row / column sampling
    subsample=0.8,
    colsample_bytree=0.8,
    # runtime
    nthread=4,
    seed=2707,
))
# modelfit also writes the selected tree count back into xgb3.
best_n_estimators = modelfit(xgb3, X, Y)

N: 57
test-logloss-mean     0.538561
test-logloss-std      0.004438
train-logloss-mean    0.528687
train-logloss-std     0.000694
Name: 56, dtype: float64


In [18]:
# Grid over gamma from 0 to 1 in steps of 0.025.
param_test3 = {
    'gamma': np.arange(0, 1.025, 0.025),
}
# `iid` is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24, where fold scores are always averaged unweighted
# (what iid=False asked for).
gsearch3 = GridSearchCV(estimator=xgb3, param_grid=param_test3,
                        scoring='neg_log_loss', n_jobs=1, cv=kf)
gsearch3.fit(X, Y)

# `grid_scores_` was removed in scikit-learn 0.20; rebuild the same
# (mean, std, params) listing from `cv_results_`.
(
    list(zip(gsearch3.cv_results_['mean_test_score'],
             gsearch3.cv_results_['std_test_score'],
             gsearch3.cv_results_['params'])),
    gsearch3.best_params_,
    gsearch3.best_score_,
)



([mean: -0.53877, std: 0.00448, params: {'gamma': 0.0},
  mean: -0.53877, std: 0.00449, params: {'gamma': 0.025000000000000001},
  mean: -0.53877, std: 0.00449, params: {'gamma': 0.050000000000000003},
  mean: -0.53876, std: 0.00449, params: {'gamma': 0.075000000000000011},
  mean: -0.53881, std: 0.00451, params: {'gamma': 0.10000000000000001},
  mean: -0.53881, std: 0.00451, params: {'gamma': 0.125},
  mean: -0.53879, std: 0.00450, params: {'gamma': 0.15000000000000002},
  mean: -0.53878, std: 0.00449, params: {'gamma': 0.17500000000000002},
  mean: -0.53878, std: 0.00447, params: {'gamma': 0.20000000000000001},
  mean: -0.53878, std: 0.00447, params: {'gamma': 0.22500000000000001},
  mean: -0.53882, std: 0.00449, params: {'gamma': 0.25},
  mean: -0.53882, std: 0.00449, params: {'gamma': 0.27500000000000002},
  mean: -0.53881, std: 0.00450, params: {'gamma': 0.30000000000000004},
  mean: -0.53882, std: 0.00449, params: {'gamma': 0.32500000000000001},
  mean: -0.53877, std: 0.00449, pa

In [19]:
# Keep the gamma value that gsearch3 selected.
best_gamma = gsearch3.best_params_['gamma']

In [21]:
# Estimator carrying everything tuned so far; the two sampling ratios set
# here are overridden by the grid below.
xgb4 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    tree_method='hist',
    grow_policy='lossguide',
    seed=2707,
)

# Coarse grid over row and column sampling ratios (0.5 .. 0.9).
param_test4 = {
    'subsample': np.arange(0.5, 1.0, 0.1),
    'colsample_bytree': np.arange(0.5, 1.0, 0.1),
}

# `iid` is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24, where fold scores are always averaged unweighted
# (what iid=False asked for).
gsearch4 = GridSearchCV(estimator=xgb4, param_grid=param_test4,
                        scoring='neg_log_loss', n_jobs=1, cv=kf)
gsearch4.fit(X, Y)

# `grid_scores_` was removed in scikit-learn 0.20; rebuild the same
# (mean, std, params) listing from `cv_results_`.
(
    list(zip(gsearch4.cv_results_['mean_test_score'],
             gsearch4.cv_results_['std_test_score'],
             gsearch4.cv_results_['params'])),
    gsearch4.best_params_,
    gsearch4.best_score_,
)



([mean: -0.53895, std: 0.00444, params: {'colsample_bytree': 0.5, 'subsample': 0.5},
  mean: -0.53894, std: 0.00438, params: {'colsample_bytree': 0.5, 'subsample': 0.59999999999999998},
  mean: -0.53922, std: 0.00451, params: {'colsample_bytree': 0.5, 'subsample': 0.69999999999999996},
  mean: -0.53911, std: 0.00478, params: {'colsample_bytree': 0.5, 'subsample': 0.79999999999999993},
  mean: -0.53909, std: 0.00465, params: {'colsample_bytree': 0.5, 'subsample': 0.89999999999999991},
  mean: -0.53885, std: 0.00451, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.5},
  mean: -0.53890, std: 0.00453, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.59999999999999998},
  mean: -0.53875, std: 0.00481, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.69999999999999996},
  mean: -0.53876, std: 0.00472, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.79999999999999993},
  mean: -0.53869, std: 0.00475, params: {'colsample_bytree'

In [22]:
# Finer grid over row and column sampling ratios in steps of 0.025.
# The stop values sit just past the last wanted point so that floating-point
# np.arange still includes it.
param_test4 = {
    'subsample': np.arange(0.75, 0.86, 0.025),
    'colsample_bytree': np.arange(0.85, 0.96, 0.025),
}

# `iid` is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24, where fold scores are always averaged unweighted
# (what iid=False asked for).
gsearch4 = GridSearchCV(estimator=xgb4, param_grid=param_test4,
                        scoring='neg_log_loss', n_jobs=1, cv=kf)
gsearch4.fit(X, Y)

# `grid_scores_` was removed in scikit-learn 0.20; rebuild the same
# (mean, std, params) listing from `cv_results_`.
(
    list(zip(gsearch4.cv_results_['mean_test_score'],
             gsearch4.cv_results_['std_test_score'],
             gsearch4.cv_results_['params'])),
    gsearch4.best_params_,
    gsearch4.best_score_,
)



([mean: -0.53876, std: 0.00442, params: {'colsample_bytree': 0.84999999999999998, 'subsample': 0.75},
  mean: -0.53882, std: 0.00450, params: {'colsample_bytree': 0.84999999999999998, 'subsample': 0.77500000000000002},
  mean: -0.53871, std: 0.00446, params: {'colsample_bytree': 0.84999999999999998, 'subsample': 0.80000000000000004},
  mean: -0.53863, std: 0.00460, params: {'colsample_bytree': 0.84999999999999998, 'subsample': 0.82500000000000007},
  mean: -0.53876, std: 0.00467, params: {'colsample_bytree': 0.84999999999999998, 'subsample': 0.85000000000000009},
  mean: -0.53870, std: 0.00457, params: {'colsample_bytree': 0.875, 'subsample': 0.75},
  mean: -0.53883, std: 0.00434, params: {'colsample_bytree': 0.875, 'subsample': 0.77500000000000002},
  mean: -0.53851, std: 0.00446, params: {'colsample_bytree': 0.875, 'subsample': 0.80000000000000004},
  mean: -0.53853, std: 0.00453, params: {'colsample_bytree': 0.875, 'subsample': 0.82500000000000007},
  mean: -0.53861, std: 0.00465, p

In [23]:
# Keep the winning column / row sampling ratios from gsearch4.
best_colsample_bytree, best_subsample = [
    gsearch4.best_params_[key] for key in ('colsample_bytree', 'subsample')
]

In [24]:
# Estimator with tree shape, gamma and sampling ratios fixed to their tuned
# values; used as the base for the reg_alpha search.
xgb_5 = xgb.XGBClassifier(**dict(
    learning_rate=0.1,
    n_estimators=best_n_estimators,
    # tree shape / growth
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    tree_method='hist',
    grow_policy='lossguide',
    # row / column sampling
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    # runtime
    nthread=4,
    seed=2707,
))

In [25]:
# Coarse, roughly logarithmic grid over the L1 regularisation strength.
param_test5 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}
# `iid` is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24, where fold scores are always averaged unweighted
# (what iid=False asked for).
gsearch5 = GridSearchCV(estimator=xgb_5, param_grid=param_test5,
                        scoring='neg_log_loss', n_jobs=1, cv=kf)
gsearch5.fit(X, Y)

# `grid_scores_` was removed in scikit-learn 0.20; rebuild the same
# (mean, std, params) listing from `cv_results_`.
(
    list(zip(gsearch5.cv_results_['mean_test_score'],
             gsearch5.cv_results_['std_test_score'],
             gsearch5.cv_results_['params'])),
    gsearch5.best_params_,
    gsearch5.best_score_,
)



([mean: -0.53851, std: 0.00457, params: {'reg_alpha': 1e-05},
  mean: -0.53852, std: 0.00453, params: {'reg_alpha': 0.01},
  mean: -0.53852, std: 0.00463, params: {'reg_alpha': 0.1},
  mean: -0.53861, std: 0.00458, params: {'reg_alpha': 1},
  mean: -0.54172, std: 0.00388, params: {'reg_alpha': 100}],
 {'reg_alpha': 1e-05},
 -0.53850955116629073)

In [None]:
# Integer grid over reg_alpha from 5 to 14.
param_test5 = {
    'reg_alpha': np.arange(5, 15, 1),
}
# `iid` is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24, where fold scores are always averaged unweighted
# (what iid=False asked for).
gsearch5 = GridSearchCV(estimator=xgb_5, param_grid=param_test5,
                        scoring='neg_log_loss', n_jobs=1, cv=kf)
gsearch5.fit(X, Y)

# `grid_scores_` was removed in scikit-learn 0.20; rebuild the same
# (mean, std, params) listing from `cv_results_`.
(
    list(zip(gsearch5.cv_results_['mean_test_score'],
             gsearch5.cv_results_['std_test_score'],
             gsearch5.cv_results_['params'])),
    gsearch5.best_params_,
    gsearch5.best_score_,
)

In [26]:
# reg_alpha is pinned to 0 by hand; the lookup of the grid-search winner is
# left disabled in the trailing comment.
best_reg_alpha = 0# gsearch5.best_params_['reg_alpha']

In [27]:
# Estimator with all previously tuned values plus the chosen reg_alpha;
# used as the base for the reg_lambda search.
xgb_6 = xgb.XGBClassifier(**dict(
    learning_rate=0.1,
    n_estimators=best_n_estimators,
    # tree shape / growth
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    tree_method='hist',
    grow_policy='lossguide',
    # row / column sampling
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    # regularisation
    reg_alpha=best_reg_alpha,
    # runtime
    nthread=4,
    seed=2707,
))

In [28]:
# Coarse grid over the L2 regularisation strength.
param_test6 = {
    'reg_lambda': [1e-5, 1e-2, 0.1, 1, 2, 3, 5, 10, 100],
}
# `iid` is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24, where fold scores are always averaged unweighted
# (what iid=False asked for).
gsearch6 = GridSearchCV(estimator=xgb_6, param_grid=param_test6,
                        scoring='neg_log_loss', n_jobs=1, cv=kf)
gsearch6.fit(X, Y)

# `grid_scores_` was removed in scikit-learn 0.20; rebuild the same
# (mean, std, params) listing from `cv_results_`.
(
    list(zip(gsearch6.cv_results_['mean_test_score'],
             gsearch6.cv_results_['std_test_score'],
             gsearch6.cv_results_['params'])),
    gsearch6.best_params_,
    gsearch6.best_score_,
)



([mean: -0.53867, std: 0.00452, params: {'reg_lambda': 1e-05},
  mean: -0.53868, std: 0.00455, params: {'reg_lambda': 0.01},
  mean: -0.53862, std: 0.00459, params: {'reg_lambda': 0.1},
  mean: -0.53851, std: 0.00457, params: {'reg_lambda': 1},
  mean: -0.53862, std: 0.00454, params: {'reg_lambda': 2},
  mean: -0.53860, std: 0.00464, params: {'reg_lambda': 3},
  mean: -0.53857, std: 0.00462, params: {'reg_lambda': 5},
  mean: -0.53861, std: 0.00465, params: {'reg_lambda': 10},
  mean: -0.53888, std: 0.00444, params: {'reg_lambda': 100}],
 {'reg_lambda': 1},
 -0.53850955139441148)

In [29]:
# Finer grid over reg_lambda from 0.1 to 1.5 in steps of 0.1.
param_test6 = {
    'reg_lambda': np.arange(0.1, 1.6, 0.1),
}
# `iid` is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24, where fold scores are always averaged unweighted
# (what iid=False asked for).
gsearch6 = GridSearchCV(estimator=xgb_6, param_grid=param_test6,
                        scoring='neg_log_loss', n_jobs=1, cv=kf)
gsearch6.fit(X, Y)

# `grid_scores_` was removed in scikit-learn 0.20; rebuild the same
# (mean, std, params) listing from `cv_results_`.
(
    list(zip(gsearch6.cv_results_['mean_test_score'],
             gsearch6.cv_results_['std_test_score'],
             gsearch6.cv_results_['params'])),
    gsearch6.best_params_,
    gsearch6.best_score_,
)



([mean: -0.53862, std: 0.00459, params: {'reg_lambda': 0.10000000000000001},
  mean: -0.53865, std: 0.00457, params: {'reg_lambda': 0.20000000000000001},
  mean: -0.53863, std: 0.00453, params: {'reg_lambda': 0.30000000000000004},
  mean: -0.53845, std: 0.00462, params: {'reg_lambda': 0.40000000000000002},
  mean: -0.53846, std: 0.00456, params: {'reg_lambda': 0.5},
  mean: -0.53860, std: 0.00455, params: {'reg_lambda': 0.59999999999999998},
  mean: -0.53870, std: 0.00450, params: {'reg_lambda': 0.70000000000000007},
  mean: -0.53866, std: 0.00448, params: {'reg_lambda': 0.80000000000000004},
  mean: -0.53869, std: 0.00461, params: {'reg_lambda': 0.90000000000000002},
  mean: -0.53851, std: 0.00457, params: {'reg_lambda': 1.0},
  mean: -0.53869, std: 0.00461, params: {'reg_lambda': 1.1000000000000001},
  mean: -0.53855, std: 0.00469, params: {'reg_lambda': 1.2000000000000002},
  mean: -0.53852, std: 0.00466, params: {'reg_lambda': 1.3000000000000003},
  mean: -0.53850, std: 0.00473, pa

In [30]:
# Keep the reg_lambda value that gsearch6 selected.
best_reg_lambda = gsearch6.best_params_['reg_lambda']

In [33]:
# Sweep the learning rate from 0.01 to 0.10 with all tuned settings fixed.
# For each rate, modelfit picks the tree count by early-stopped CV.
# The (learning_rate, n_estimators) pairs are collected in `lr_results` so
# they do not have to be copied by hand from the printed log; previously
# only the last iteration's value survived in `best_n_estimators`.
lr_results = []
for lr in np.arange(0.01, 0.11, 0.01):
    xgb_fin = xgb.XGBClassifier(
        learning_rate=lr,
        n_estimators=2000,  # upper bound; early stopping decides the real count
        max_depth=best_max_depth,
        min_child_weight=best_min_child_weight,
        gamma=best_gamma,
        subsample=best_subsample,
        colsample_bytree=best_colsample_bytree,
        reg_alpha=best_reg_alpha,
        reg_lambda=best_reg_lambda,
        nthread=4,
        tree_method='hist',
        grow_policy='lossguide',
        seed=2707,
    )
    print(lr, "===================================")
    best_n_estimators = modelfit(xgb_fin, X, Y)
    # Store a rounded rate so the record reads 0.06 rather than 0.060000000000000005.
    lr_results.append((round(float(lr), 2), best_n_estimators))

lr_results

N: 732
test-logloss-mean     0.538251
test-logloss-std      0.004582
train-logloss-mean    0.525705
train-logloss-std     0.000843
Name: 731, dtype: float64
N: 392
test-logloss-mean     0.538247
test-logloss-std      0.004687
train-logloss-mean    0.525067
train-logloss-std     0.000887
Name: 391, dtype: float64
N: 243
test-logloss-mean     0.538463
test-logloss-std      0.004583
train-logloss-mean    0.525737
train-logloss-std     0.000830
Name: 242, dtype: float64
N: 168
test-logloss-mean     0.538308
test-logloss-std      0.004659
train-logloss-mean    0.526503
train-logloss-std     0.000864
Name: 167, dtype: float64
N: 148
test-logloss-mean     0.538347
test-logloss-std      0.004481
train-logloss-mean    0.525691
train-logloss-std     0.000886
Name: 147, dtype: float64
N: 122
test-logloss-mean     0.538426
test-logloss-std      0.004620
train-logloss-mean    0.525824
train-logloss-std     0.000837
Name: 121, dtype: float64
N: 86
test-logloss-mean     0.538517
test-logloss-std     

In [None]:
# Chosen configuration: learning_rate = 0.07, n_estimators = 218

In [31]:
# Final estimator: tuned tree / sampling / regularisation settings combined
# with a hand-picked learning rate and tree count.
# NOTE(review): unlike the estimators used during tuning, this one does not
# pass tree_method='hist' / grow_policy='lossguide' — confirm that is intended.
xgb_fin = xgb.XGBClassifier(**dict(
    # boosting schedule
    learning_rate=0.07,
    n_estimators=218,
    # tree shape
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    # row / column sampling
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    # regularisation
    reg_alpha=best_reg_alpha,
    reg_lambda=best_reg_lambda,
    # runtime
    nthread=4,
    seed=2707,
))

In [32]:
# Show the complete parameter set of the final estimator.
xgb_fin.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.95000000000000007,
 'gamma': 0.55000000000000004,
 'grow_policy': 'depthwise',
 'learning_rate': 0.07,
 'max_bin': 256,
 'max_delta_step': 0,
 'max_depth': 5,
 'max_leaves': 0,
 'min_child_weight': 3,
 'missing': None,
 'n_estimators': 218,
 'n_jobs': 4,
 'nthread': 4,
 'objective': 'binary:logistic',
 'random_state': 2707,
 'reg_alpha': 0,
 'reg_lambda': 0.40000000000000002,
 'scale_pos_weight': 1,
 'seed': 2707,
 'silent': True,
 'subsample': 0.85000000000000009,
 'tree_method': 'auto'}

In [45]:
from sklearn.model_selection import cross_val_score
# (learning_rate, n_estimators) pairs to compare under the shared CV splitter.
# NOTE(review): 0.06 and 0.07 both carry n=114 — confirm these values were
# transcribed correctly.
rates = [
    (0.01, 713), (0.02, 369), (0.03, 242), (0.04, 194), (0.05, 153),
    (0.06, 114), (0.07, 114), (0.08, 85), (0.09, 66), (0.1, 61),
]

# Settings that stay the same for every candidate pair.
shared_kwargs = dict(
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    reg_alpha=best_reg_alpha,
    reg_lambda=best_reg_lambda,
    nthread=4,
    seed=2707,
)

for lr, n in rates:
    xgb_fin = xgb.XGBClassifier(learning_rate=lr, n_estimators=n, **shared_kwargs)
    # Mean negative log-loss across the folds of `kf`.
    scores = cross_val_score(xgb_fin, X, Y, cv=kf, scoring='neg_log_loss')
    print(lr, n, scores.mean())

0.01 713 -0.538162420234
0.02 369 -0.538157373435
0.03 242 -0.538190042071
0.04 194 -0.538225911238
0.05 153 -0.538427445476
0.06 114 -0.538354152819
0.07 114 -0.538477494464
0.08 85 -0.538507411006
0.09 66 -0.538652597946
0.1 61 -0.53844459685
