In [8]:
import pandas as pd
import numpy as np

import random

import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns
import gc
import utils 

plt.style.use('ggplot')
%matplotlib inline

# TUNE XGB

In [9]:
# Read the semicolon-separated train/test files; in the test file the literal
# string 'None' is parsed as a missing value.
train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';', na_values='None')

# Run the project's cleaning step on both frames, then its feature engineering.
train = utils.clean_data(train)
test = utils.clean_data(test)

train = utils.new_features(train)
test = utils.new_features(test)

# Separate the 'cardio' target from the training predictors.
X_train = train.drop('cardio', axis=1)
y_train = train['cardio'].values.ravel()
# Dropping an empty label list leaves every column in place; X_test is a copy of `test`.
X_test = test.drop([], axis=1)

# Columns fed to the model during tuning.
feature_columns = [
    "age_group",
    "gender",
    "height",
    "weight",
    "ap_hi",
    "ap_lo",
    "cholesterol",
    "gluc",
    "smoke_restored",
    "alco_restored",
    "active_restored",
    "BMI",
    "ap_dif",
    "MAP",
    "age_group_MAPX",
]
X = X_train[feature_columns]
Y = y_train

In [46]:
from sklearn.base import clone
from sklearn.metrics import log_loss, accuracy_score

def modelfit(estimator, X_train, y_train, X_test=None, fit_params=None, n_splits=7, early_stopping_rounds=100):
    """Choose the number of boosting rounds for ``estimator`` with ``xgb.cv``.

    ``xgb.cv`` is run with the estimator's own booster parameters
    (``estimator.get_xgb_params()``), using the estimator's current
    ``n_estimators`` as the upper bound on rounds, ``n_splits`` folds, the
    ``logloss`` metric, early stopping, shuffling and a fixed seed of 1205.
    The number of rows in the returned result is taken as the tuned number
    of rounds.

    Side effect: ``estimator`` is modified in place -- its ``n_estimators``
    is overwritten with the tuned value via ``set_params``.

    Parameters
    ----------
    estimator : object exposing ``get_xgb_params``, ``get_params`` and
        ``set_params`` (an ``xgb.XGBClassifier`` in this notebook).
    X_train : object with a ``.values`` attribute (a DataFrame in this notebook).
    y_train : labels passed to ``xgb.DMatrix``.
    X_test : unused; kept so existing calls keep working.
    fit_params : unused; kept so existing calls keep working.
    n_splits : number of cross-validation folds.
    early_stopping_rounds : passed straight to ``xgb.cv``.

    Returns
    -------
    int
        Number of rows in the ``xgb.cv`` result.
    """
    xgb_param = estimator.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train.values, label=y_train)
    cvresult = xgb.cv(
        xgb_param,
        xgtrain,
        num_boost_round=estimator.get_params()['n_estimators'],
        nfold=n_splits,
        metrics='logloss',
        early_stopping_rounds=early_stopping_rounds,
        seed=1205,
        shuffle=True,
    )
    best_n_estimators = cvresult.shape[0]
    estimator.set_params(n_estimators=best_n_estimators)

    print("N:", best_n_estimators)
    # Positional access to the last row; cvresult[-1] would look up a column
    # labelled -1 and raise KeyError.
    print(cvresult.iloc[-1])
    return best_n_estimators

In [19]:
# Baseline classifier: generous tree budget (1000) so that modelfit's
# early-stopped cross-validation can pick the actual number of rounds.
xgb1 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    silent=True,
    nthread=4,
    seed=2707,
)

# modelfit also overwrites xgb1's n_estimators with the returned value.
best_n_estimators = modelfit(xgb1, X, Y)

N: 56


In [20]:
# Shared 7-fold stratified, shuffled splitter with a fixed random_state; it is
# passed as `cv` to the grid searches and cross_val_score calls in this notebook.
kf = StratifiedKFold(random_state=2225, n_splits=7, shuffle=True)

In [21]:
# Base estimator for the depth / child-weight search, using the round count
# found by the preceding modelfit call.
xgb2 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=best_n_estimators,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    seed=1205,
)

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
# Coarse grid over tree depth (3, 5, 7, 9) and minimum child weight (1, 3, 5).
# NOTE(review): `iid` and `grid_scores_` belong to older scikit-learn releases;
# newer ones expose `cv_results_` instead -- confirm the pinned version.
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}
gsearch1 = GridSearchCV(
    estimator=xgb2,
    param_grid=param_test1,
    scoring='neg_log_loss',
    n_jobs=1,
    iid=False,
    cv=kf,
)

gsearch1.fit(X, Y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: -0.54037, std: 0.00391, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: -0.54040, std: 0.00392, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: -0.54041, std: 0.00394, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: -0.53878, std: 0.00458, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: -0.53878, std: 0.00470, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: -0.53863, std: 0.00471, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: -0.53984, std: 0.00497, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: -0.53997, std: 0.00485, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: -0.53990, std: 0.00499, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: -0.54345, std: 0.00497, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: -0.54298, std: 0.00521, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: -0.54224, std: 0.00532, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 5, 'min_chil

In [24]:
# Finer grid on the same two parameters; this rebinds param_test1 and gsearch1.
param_test1 = {
    'max_depth': [4, 5, 6],
    'min_child_weight': [4, 5, 6],
}
gsearch1 = GridSearchCV(
    estimator=xgb2,
    param_grid=param_test1,
    scoring='neg_log_loss',
    n_jobs=1,
    iid=False,
    cv=kf,
)

gsearch1.fit(X, Y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: -0.53900, std: 0.00442, params: {'max_depth': 4, 'min_child_weight': 4},
  mean: -0.53900, std: 0.00447, params: {'max_depth': 4, 'min_child_weight': 5},
  mean: -0.53899, std: 0.00445, params: {'max_depth': 4, 'min_child_weight': 6},
  mean: -0.53867, std: 0.00468, params: {'max_depth': 5, 'min_child_weight': 4},
  mean: -0.53863, std: 0.00471, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: -0.53865, std: 0.00467, params: {'max_depth': 5, 'min_child_weight': 6},
  mean: -0.53909, std: 0.00485, params: {'max_depth': 6, 'min_child_weight': 4},
  mean: -0.53910, std: 0.00486, params: {'max_depth': 6, 'min_child_weight': 5},
  mean: -0.53895, std: 0.00476, params: {'max_depth': 6, 'min_child_weight': 6}],
 {'max_depth': 5, 'min_child_weight': 5},
 -0.5386313168832767)

In [25]:
# Keep the winners of the most recent gsearch1 run.
# Before relying on them, check that neither value sits on the edge of the
# searched grid -- an edge value means the grid should be extended.
best_max_depth = gsearch1.best_params_['max_depth']
best_min_child_weight = gsearch1.best_params_['min_child_weight']

In [26]:
# Re-tune the number of rounds now that depth and child weight are fixed.
xgb3 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    seed=2707,
)
# modelfit replaces xgb3's n_estimators (1000) with the value it returns.
best_n_estimators = modelfit(xgb3, X, Y)

N: 61


In [28]:
# Gamma search in steps of 0.025 starting at 0 and stopping before 0.525.
# The estimator is xgb3, whose n_estimators was already updated in place by modelfit.
param_test3 = {
    'gamma': np.arange(0, .525, 0.025),
}
gsearch3 = GridSearchCV(
    estimator=xgb3,
    param_grid=param_test3,
    scoring='neg_log_loss',
    n_jobs=1,
    iid=False,
    cv=kf,
)
gsearch3.fit(X, Y)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_



([mean: -0.53870, std: 0.00457, params: {'gamma': 0.0},
  mean: -0.53867, std: 0.00463, params: {'gamma': 0.025000000000000001},
  mean: -0.53864, std: 0.00465, params: {'gamma': 0.050000000000000003},
  mean: -0.53864, std: 0.00465, params: {'gamma': 0.075000000000000011},
  mean: -0.53865, std: 0.00465, params: {'gamma': 0.10000000000000001},
  mean: -0.53865, std: 0.00466, params: {'gamma': 0.125},
  mean: -0.53866, std: 0.00465, params: {'gamma': 0.15000000000000002},
  mean: -0.53867, std: 0.00464, params: {'gamma': 0.17500000000000002},
  mean: -0.53867, std: 0.00463, params: {'gamma': 0.20000000000000001},
  mean: -0.53867, std: 0.00463, params: {'gamma': 0.22500000000000001},
  mean: -0.53870, std: 0.00465, params: {'gamma': 0.25},
  mean: -0.53871, std: 0.00466, params: {'gamma': 0.27500000000000002},
  mean: -0.53871, std: 0.00465, params: {'gamma': 0.30000000000000004},
  mean: -0.53872, std: 0.00464, params: {'gamma': 0.32500000000000001},
  mean: -0.53872, std: 0.00464, pa

In [29]:
# Keep the gamma value selected by gsearch3 for the following tuning steps.
best_gamma = gsearch3.best_params_['gamma']

In [30]:
# Estimator carrying every parameter tuned so far; the row/column sampling
# ratios start at 0.8 and are searched next.
xgb4 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    seed=2707,
)

# Coarse grid: both ratios from 0.5 upward in steps of 0.1, stopping before 1.
param_test4 = {
    'subsample': np.arange(0.5, 1, 0.1),
    'colsample_bytree': np.arange(0.5, 1, 0.1),
}

gsearch4 = GridSearchCV(
    estimator=xgb4,
    param_grid=param_test4,
    scoring='neg_log_loss',
    n_jobs=1,
    iid=False,
    cv=kf,
)
gsearch4.fit(X, Y)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_



([mean: -0.53898, std: 0.00459, params: {'colsample_bytree': 0.5, 'subsample': 0.5},
  mean: -0.53900, std: 0.00450, params: {'colsample_bytree': 0.5, 'subsample': 0.59999999999999998},
  mean: -0.53935, std: 0.00468, params: {'colsample_bytree': 0.5, 'subsample': 0.69999999999999996},
  mean: -0.53899, std: 0.00468, params: {'colsample_bytree': 0.5, 'subsample': 0.79999999999999993},
  mean: -0.53914, std: 0.00470, params: {'colsample_bytree': 0.5, 'subsample': 0.89999999999999991},
  mean: -0.53881, std: 0.00458, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.5},
  mean: -0.53893, std: 0.00463, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.59999999999999998},
  mean: -0.53885, std: 0.00478, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.69999999999999996},
  mean: -0.53864, std: 0.00460, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.79999999999999993},
  mean: -0.53870, std: 0.00480, params: {'colsample_bytree'

In [31]:
# Finer grid in steps of 0.025; this rebinds param_test4 and gsearch4.
param_test4 = {
    'subsample': np.arange(0.65, 0.76, 0.025),
    'colsample_bytree': np.arange(0.85, 0.96, 0.025),
}

gsearch4 = GridSearchCV(
    estimator=xgb4,
    param_grid=param_test4,
    scoring='neg_log_loss',
    n_jobs=1,
    iid=False,
    cv=kf,
)
gsearch4.fit(X, Y)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_



([mean: -0.53875, std: 0.00462, params: {'colsample_bytree': 0.84999999999999998, 'subsample': 0.65000000000000002},
  mean: -0.53870, std: 0.00457, params: {'colsample_bytree': 0.84999999999999998, 'subsample': 0.67500000000000004},
  mean: -0.53888, std: 0.00463, params: {'colsample_bytree': 0.84999999999999998, 'subsample': 0.70000000000000007},
  mean: -0.53885, std: 0.00469, params: {'colsample_bytree': 0.84999999999999998, 'subsample': 0.72500000000000009},
  mean: -0.53858, std: 0.00464, params: {'colsample_bytree': 0.84999999999999998, 'subsample': 0.75000000000000011},
  mean: -0.53883, std: 0.00477, params: {'colsample_bytree': 0.875, 'subsample': 0.65000000000000002},
  mean: -0.53876, std: 0.00475, params: {'colsample_bytree': 0.875, 'subsample': 0.67500000000000004},
  mean: -0.53857, std: 0.00460, params: {'colsample_bytree': 0.875, 'subsample': 0.70000000000000007},
  mean: -0.53860, std: 0.00456, params: {'colsample_bytree': 0.875, 'subsample': 0.72500000000000009},
  m

In [32]:
# Keep the sampling ratios chosen by the most recent gsearch4 run.
best_colsample_bytree = gsearch4.best_params_['colsample_bytree']
best_subsample = gsearch4.best_params_['subsample']

In [33]:
# Estimator with the tuned tree and sampling parameters; base for the reg_alpha search.
xgb_5 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    nthread=4,
    seed=2707,
)

In [34]:
# Coarse search over reg_alpha across several orders of magnitude.
param_test5 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}
gsearch5 = GridSearchCV(
    estimator=xgb_5,
    param_grid=param_test5,
    scoring='neg_log_loss',
    n_jobs=1,
    iid=False,
    cv=kf,
)
gsearch5.fit(X, Y)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_



([mean: -0.53857, std: 0.00460, params: {'reg_alpha': 1e-05},
  mean: -0.53862, std: 0.00460, params: {'reg_alpha': 0.01},
  mean: -0.53872, std: 0.00466, params: {'reg_alpha': 0.1},
  mean: -0.53869, std: 0.00463, params: {'reg_alpha': 1},
  mean: -0.54213, std: 0.00398, params: {'reg_alpha': 100}],
 {'reg_alpha': 1e-05},
 -0.53856819922862831)

In [None]:
# Second reg_alpha grid over the integers 5..14.
# This cell has no recorded execution count or output in the notebook.
param_test5 = {
    'reg_alpha': np.arange(5, 15, 1),
}
gsearch5 = GridSearchCV(
    estimator=xgb_5,
    param_grid=param_test5,
    scoring='neg_log_loss',
    n_jobs=1,
    iid=False,
    cv=kf,
)
gsearch5.fit(X, Y)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_

In [35]:
# reg_alpha is pinned to 0 instead of being read from gsearch5: the recorded
# coarse search picked 1e-05, the smallest value on its grid.
best_reg_alpha = 0  # gsearch5.best_params_['reg_alpha']

In [36]:
# Estimator with reg_alpha fixed; base for the reg_lambda search.
xgb_6 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    reg_alpha=best_reg_alpha,
    nthread=4,
    seed=2707,
)

In [37]:
# Coarse search over reg_lambda across several orders of magnitude.
param_test6 = {
    'reg_lambda': [1e-5, 1e-2, 0.1, 1, 2, 3, 5, 10, 100],
}
gsearch6 = GridSearchCV(
    estimator=xgb_6,
    param_grid=param_test6,
    scoring='neg_log_loss',
    n_jobs=1,
    iid=False,
    cv=kf,
)
gsearch6.fit(X, Y)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_



([mean: -0.53862, std: 0.00475, params: {'reg_lambda': 1e-05},
  mean: -0.53859, std: 0.00475, params: {'reg_lambda': 0.01},
  mean: -0.53865, std: 0.00482, params: {'reg_lambda': 0.1},
  mean: -0.53857, std: 0.00460, params: {'reg_lambda': 1},
  mean: -0.53860, std: 0.00475, params: {'reg_lambda': 2},
  mean: -0.53856, std: 0.00461, params: {'reg_lambda': 3},
  mean: -0.53857, std: 0.00468, params: {'reg_lambda': 5},
  mean: -0.53844, std: 0.00479, params: {'reg_lambda': 10},
  mean: -0.53871, std: 0.00457, params: {'reg_lambda': 100}],
 {'reg_lambda': 10},
 -0.5384445968501802)

In [38]:
# Finer reg_lambda grid over the integers 5..15; rebinds param_test6 and gsearch6.
param_test6 = {
    'reg_lambda': np.arange(5, 16, 1),
}
gsearch6 = GridSearchCV(
    estimator=xgb_6,
    param_grid=param_test6,
    scoring='neg_log_loss',
    n_jobs=1,
    iid=False,
    cv=kf,
)
gsearch6.fit(X, Y)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_



([mean: -0.53857, std: 0.00468, params: {'reg_lambda': 5},
  mean: -0.53859, std: 0.00471, params: {'reg_lambda': 6},
  mean: -0.53865, std: 0.00476, params: {'reg_lambda': 7},
  mean: -0.53865, std: 0.00452, params: {'reg_lambda': 8},
  mean: -0.53865, std: 0.00463, params: {'reg_lambda': 9},
  mean: -0.53844, std: 0.00479, params: {'reg_lambda': 10},
  mean: -0.53850, std: 0.00468, params: {'reg_lambda': 11},
  mean: -0.53853, std: 0.00468, params: {'reg_lambda': 12},
  mean: -0.53860, std: 0.00475, params: {'reg_lambda': 13},
  mean: -0.53854, std: 0.00470, params: {'reg_lambda': 14},
  mean: -0.53859, std: 0.00468, params: {'reg_lambda': 15}],
 {'reg_lambda': 10},
 -0.5384445968501802)

In [40]:
# Keep the reg_lambda value chosen by the most recent gsearch6 run.
best_reg_lambda = gsearch6.best_params_['reg_lambda']

In [41]:
# Learning-rate sweep: for each rate, let modelfit pick the number of rounds
# (up to 2000) with all other tuned parameters held fixed.
# Each (learning_rate, n_estimators) pair is collected here so the results do
# not have to be copied by hand from the printed output.
lr_search_results = []
for lr in np.arange(0.01, 0.11, 0.01):
    xgb_fin = xgb.XGBClassifier(
        learning_rate=lr,
        n_estimators=2000,
        max_depth=best_max_depth,
        min_child_weight=best_min_child_weight,
        gamma=best_gamma,
        subsample=best_subsample,
        colsample_bytree=best_colsample_bytree,
        reg_alpha=best_reg_alpha,
        reg_lambda=best_reg_lambda,
        nthread=4,
        seed=2707,
    )
    print(lr, "===================================")
    # After the loop, xgb_fin and best_n_estimators hold the values from the
    # last learning rate only.
    best_n_estimators = modelfit(xgb_fin, X, Y)
    lr_search_results.append((lr, best_n_estimators))

N: 713
N: 369
N: 242
N: 194
N: 153
N: 114
N: 114
N: 85
N: 66
N: 61


In [None]:
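# Note: learning_rate=0.07 -> n_estimators=218
# NOTE(review): presumably from an earlier run; the recorded sweep output lists 114 rounds for 0.07.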

In [None]:
# Hand-picked final model. This cell has no recorded execution count.
# NOTE(review): n_estimators=218 does not match the 114 rounds recorded for
# learning rate 0.07 elsewhere in this notebook -- confirm which is intended.
xgb_fin = xgb.XGBClassifier(
    learning_rate=0.07,
    n_estimators=218,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    reg_alpha=best_reg_alpha,
    reg_lambda=best_reg_lambda,
    nthread=4,
    seed=2707,
)

In [42]:
# Display the full parameter set of whichever model xgb_fin currently refers to.
xgb_fin.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.875,
 'gamma': 0.050000000000000003,
 'grow_policy': 'depthwise',
 'learning_rate': 0.099999999999999992,
 'max_bin': 256,
 'max_delta_step': 0,
 'max_depth': 5,
 'max_leaves': 0,
 'min_child_weight': 5,
 'missing': None,
 'n_estimators': 61,
 'n_jobs': 4,
 'nthread': 4,
 'objective': 'binary:logistic',
 'random_state': 2707,
 'reg_alpha': 0,
 'reg_lambda': 10,
 'scale_pos_weight': 1,
 'seed': 2707,
 'silent': True,
 'subsample': 0.70000000000000007,
 'tree_method': 'auto'}

In [45]:
from sklearn.model_selection import cross_val_score
# (learning_rate, n_estimators) pairs, matching the round counts printed by
# the learning-rate sweep.
rates = [
    (0.01, 713),
    (0.02, 369),
    (0.03, 242),
    (0.04, 194),
    (0.05, 153),
    (0.06, 114),
    (0.07, 114),
    (0.08, 85),
    (0.09, 66),
    (0.1, 61),
]

# Parameters that stay the same for every candidate.
final_params = dict(
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    reg_alpha=best_reg_alpha,
    reg_lambda=best_reg_lambda,
    nthread=4,
    seed=2707,
)

# Score each pair with the shared splitter and print the mean negated log loss.
for lr, n in rates:
    xgb_fin = xgb.XGBClassifier(learning_rate=lr, n_estimators=n, **final_params)

    scores = cross_val_score(xgb_fin, X, Y, scoring='neg_log_loss', cv=kf)
    print(lr, n, np.mean(scores))

0.01 713 -0.538162420234
0.02 369 -0.538157373435
0.03 242 -0.538190042071
0.04 194 -0.538225911238
0.05 153 -0.538427445476
0.06 114 -0.538354152819
0.07 114 -0.538477494464
0.08 85 -0.538507411006
0.09 66 -0.538652597946
0.1 61 -0.53844459685
