In [63]:
import pandas as pd
import numpy as np

import random

import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
import gc
import utils 

plt.style.use('ggplot')
%matplotlib inline

In [8]:
train = pd.read_csv('train_original.csv', sep=';')
test = pd.read_csv('test_original.csv', sep=';', na_values='None')

train = utils.clean_data(train)
test = utils.clean_data(test)

train = utils.new_features(train)
test = utils.new_features(test)

TARGET = 'smoke'

test['cardio'] = np.nan

X = pd.concat((train, test), axis=0)
X_train = X.loc[~X[TARGET].isnull()]
y_train = X_train[TARGET].values.ravel()
X_train = X_train.drop(TARGET, axis=1)

X_test1 = train.loc[train[TARGET].isnull()].drop(TARGET, axis=1)
X_test2 = test.loc[test[TARGET].isnull()].drop(TARGET, axis=1)

In [9]:
from sklearn.pipeline import Pipeline, FeatureUnion
    
class ColumnsFilter:
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y=None):
#         print(X.__class__)
        return self
    
    def transform(self, X):
        return X[self.columns]
    
class LogLog:
    def __init__(self):
        self.columns = 1
    
    def fit(self, X, y=None):
        print(X[:4])
        return self
    
    def transform(self, X):
        return X
    
class SmoothLikelihood:
    def __init__(self, columns, glob_mean_value, kf, alpha=13):
        self.columns = columns
        self.glob_mean_value = glob_mean_value
        self.alpha = alpha
        self.kf = kf
        if isinstance(columns, (list, tuple)):
            self.new_column = '_'.join(columns) + '_target_mean'
            self.columns = columns
        else:
            self.new_column = columns + '_target_mean'
            self.columns = [columns]
    
    def fit_transform(self, X, y):
        X=X.copy()
        X['target'] = y
        
        def calc(x):
            return (x['sum'] + self.glob_mean_value * self.alpha) / (x['count'] + self.alpha)
        result = np.zeros(X.shape[0])
        for itr, ite in self.kf.split(X, y):
            tr = X.iloc[itr]
            te = X.iloc[ite]
            
            temp = tr.groupby(self.columns)['target'].agg(["count", "sum"])
            value_dict = temp.apply(calc, axis=1).to_dict()

            result[ite] = te[self.columns].apply(lambda x: value_dict.get(tuple(x.values), self.glob_mean_value), axis=1)

        result = pd.DataFrame(result, columns=[self.new_column])
        
        temp = X.groupby(self.columns)['target'].agg(["count", "sum"])
        self.value_dict = temp.apply(calc, axis=1).to_dict()
#         X.drop('target', axis=1, inplace=True)
        return result
    
    def transform(self, X):
        result = pd.DataFrame()
        result[self.new_column] = X[self.columns].apply(lambda x: self.value_dict.get(tuple(x.values), self.glob_mean_value), axis=1)
        return result

In [10]:
from sklearn.model_selection import StratifiedKFold

# a = SmoothLikelihood(['alco', 'gluc'], 0.08804784985046922, )
# a.fit_transform(X_train, y_train)

kf = StratifiedKFold(random_state=44444, n_splits=10, shuffle=True)
combined_features = FeatureUnion([("filter", ColumnsFilter([
    'age_group', 'gender', 'weight', 'ap_hi', 'ap_lo', 'cholesterol','alco', 'active', 'gluc', 'BMI','error_group', 'ap_dif', 'MAP'
])),
#                                   ("mean_1", SmoothLikelihood(['alco', 'active'], 0.08804784985046922, kf=kf, alpha=13))
                                 ])

# # Use combined features to transform dataset:
# X_features = combined_features.fit(X, y).transform(X)

# svm = SVC(kernel="linear")

# # Do grid search over k, n_components and C:

pipeline = Pipeline([ ("features", combined_features), ("model", xgb.XGBClassifier(**{
                                                                    'colsample_bytree': 0.8,
                                                                    'learning_rate': 0.1,
                                                                    'n_estimators': 200,
                                                                    'subsample': 0.8,
                                                                    'n_jobs': 1,
                                                                    'random_state': 2707,
                                                                    'silent': True,}))])
from sklearn.model_selection import cross_val_score
print(np.mean(cross_val_score(pipeline, X_train, y_train, cv=kf)))

0.922470865881


In [None]:
0.922450250517

In [1]:
1-0.08804784985046922

0.9119521501495308

In [14]:
y_train.mean()

0.08804784985046922

In [12]:
params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'n_estimators': 224,
    'subsample': 0.8,
    
    'n_jobs': 1,
    'random_state': 2707,
    'silent': True,
}

utils.execute_model(xgb.XGBClassifier(**params),
              X_train,
              y_train,
              mean_columns = mean_columns,
              use_columns = use_columns,
              n_folds = 7,
#               stratification_groups=rew,
              alpha = 13,
             )
gc.collect()

ValueError: cannot reindex from a duplicate axis

# TUNE XGB

In [52]:
from sklearn.base import clone
from sklearn.metrics import log_loss, accuracy_score

def modelfit(estimator, X_train, y_train, X_test=None, fit_params=None, n_splits=7, early_stopping_rounds=200):
    if fit_params is None:
        # fit_params = {'eval_metric':'error'} # for lgb
        fit_params = {'eval_metric':'error'} # for xgb
#         fit_params = {} # default
    else:
        fit_params = fit_params.copy()
    fit_params['early_stopping_rounds'] = early_stopping_rounds
    fit_params['verbose'] = False
    
    np.random.seed(1205)
    random.seed(1205)

    kf = StratifiedKFold(random_state=1205, n_splits=n_splits, shuffle=True)
    accuracy = []
    logloss = []

    dummies = pd.get_dummies(y_train)
    train_result = np.zeros(dummies.shape)
    X_train = pd.DataFrame(X_train)
    
    best_iterations = []
    for train_idx, test_idx in kf.split(X_train, y_train):
        clf = clone(estimator)

        eval_set = [(X_train.iloc[test_idx], y_train[test_idx])]
        clf.fit(X_train.iloc[train_idx], y_train[train_idx], eval_set=eval_set, **fit_params)
        #best_iterations.append(clf.best_iteration) # for lgb
        best_iterations.append(clf.best_ntree_limit) # for xgboost
        
        train_result[test_idx] = clf.predict_proba(X_train.iloc[test_idx])
        logloss.append(log_loss(dummies.iloc[test_idx], train_result[test_idx]))
        accuracy.append(accuracy_score(y_train[test_idx], np.argmax(train_result[test_idx], axis=1)))

    print("N:", max(best_iterations), best_iterations)
    print("Accuracy:")
    print(accuracy)
    print("mean:", np.mean(accuracy))
    print("std:", np.std(accuracy))
    print("")
    print("Logloss:")
    print(logloss)
    print("mean:", np.mean(logloss))
    print("std:", np.std(logloss))
    return max(best_iterations)

In [25]:
X = X_train[['age_group', 'gender', 'weight', 'ap_hi', 'ap_lo', 'cholesterol','alco', 'active', 'gluc', 'BMI','error_group', 'ap_dif', 'MAP']]
Y = y_train

In [26]:
xgb1 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
# objective='multi:softmax',
# num_class=5,
 silent=True,
 nthread=1,

 seed=2707)

best_n_estimators = modelfit(xgb1, X, Y)

N: 68 [19, 32, 9, 21, 40, 68, 46]
Accuracy:
[0.92023964198065544, 0.92261603984696461, 0.92211073413701006, 0.92138886883707505, 0.92369883779686712, 0.92282702858792953, 0.92196072769275195]
mean: 0.922120268411
std: 0.00102360886928

Logloss:
[0.22262724967405112, 0.21999449234965551, 0.2176888377914184, 0.21807458964031337, 0.21174351501354258, 0.22160328862356402, 0.22142029667248339]
mean: 0.219021752824
std: 0.003422816209


In [27]:
kf = StratifiedKFold(random_state=2, n_splits=7, shuffle=True)

In [28]:
xgb2 = xgb.XGBClassifier(
    learning_rate =0.1,
    n_estimators=best_n_estimators,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=1,
   
    seed=1205
)

In [29]:
from sklearn.model_selection import GridSearchCV

In [30]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = xgb2, param_grid = param_test1, scoring='accuracy', n_jobs=1, iid=False, cv=kf)

gsearch1.fit(X, Y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.92248, std: 0.00157, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.92243, std: 0.00157, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.92241, std: 0.00160, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.92245, std: 0.00155, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.92235, std: 0.00170, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.92252, std: 0.00170, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.92197, std: 0.00168, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.92203, std: 0.00192, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.92188, std: 0.00160, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: 0.92133, std: 0.00183, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.92146, std: 0.00173, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.92177, std: 0.00174, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 5, 'min_child_weight': 5

In [32]:
param_test1 = {
 'max_depth':[4,5,6],
 'min_child_weight':[4,5,6]
}
gsearch1 = GridSearchCV(estimator = xgb2, param_grid = param_test1, scoring='accuracy', n_jobs=1, iid=False, cv=kf)

gsearch1.fit(X, Y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.92235, std: 0.00168, params: {'max_depth': 4, 'min_child_weight': 4},
  mean: 0.92234, std: 0.00172, params: {'max_depth': 4, 'min_child_weight': 5},
  mean: 0.92244, std: 0.00172, params: {'max_depth': 4, 'min_child_weight': 6},
  mean: 0.92231, std: 0.00168, params: {'max_depth': 5, 'min_child_weight': 4},
  mean: 0.92252, std: 0.00170, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.92246, std: 0.00156, params: {'max_depth': 5, 'min_child_weight': 6},
  mean: 0.92219, std: 0.00167, params: {'max_depth': 6, 'min_child_weight': 4},
  mean: 0.92213, std: 0.00176, params: {'max_depth': 6, 'min_child_weight': 5},
  mean: 0.92214, std: 0.00169, params: {'max_depth': 6, 'min_child_weight': 6}],
 {'max_depth': 5, 'min_child_weight': 5},
 0.92252238797647312)

In [33]:
# CHECK THAT NOT EDGE CASES!
best_max_depth = gsearch1.best_params_['max_depth']
best_min_child_weight = gsearch1.best_params_['min_child_weight']

In [34]:
xgb3 = xgb.XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,

    nthread=8,
   
    seed=2707
)
best_n_estimators = modelfit(xgb3, X, Y)

N: 82 [37, 58, 9, 22, 36, 22, 82]
Accuracy:
[0.92052836725855347, 0.92254385331697109, 0.92268822637695802, 0.92174980148704255, 0.92348227820688655, 0.9230436038117239, 0.92196072769275195]
mean: 0.92228526545
std: 0.000903635472587

Logloss:
[0.2223168213889207, 0.22048724064221153, 0.21744168906518047, 0.21827879071262618, 0.21125888138282628, 0.22058107581549199, 0.22152339331034768]
mean: 0.218841127474
std: 0.00348009058341


In [35]:
param_test3 = {
 #'gamma':np.arange(0, 1.1, 0.1)
 'gamma':np.arange(0, 1.025, 0.025)
}
gsearch3 = GridSearchCV(estimator = xgb3, param_grid = param_test3, scoring='accuracy', n_jobs=1, iid=False, cv=kf)
gsearch3.fit(X, Y)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_



([mean: 0.92017, std: 0.00148, params: {'gamma': 0.0},
  mean: 0.92014, std: 0.00160, params: {'gamma': 0.025000000000000001},
  mean: 0.91998, std: 0.00146, params: {'gamma': 0.050000000000000003},
  mean: 0.92023, std: 0.00181, params: {'gamma': 0.075000000000000011},
  mean: 0.92009, std: 0.00170, params: {'gamma': 0.10000000000000001},
  mean: 0.92022, std: 0.00151, params: {'gamma': 0.125},
  mean: 0.92015, std: 0.00134, params: {'gamma': 0.15000000000000002},
  mean: 0.92038, std: 0.00167, params: {'gamma': 0.17500000000000002},
  mean: 0.92024, std: 0.00164, params: {'gamma': 0.20000000000000001},
  mean: 0.92023, std: 0.00175, params: {'gamma': 0.22500000000000001},
  mean: 0.92018, std: 0.00183, params: {'gamma': 0.25},
  mean: 0.92026, std: 0.00193, params: {'gamma': 0.27500000000000002},
  mean: 0.92058, std: 0.00165, params: {'gamma': 0.30000000000000004},
  mean: 0.92005, std: 0.00158, params: {'gamma': 0.32500000000000001},
  mean: 0.91999, std: 0.00161, params: {'gamma':

In [38]:
best_gamma = gsearch3.best_params_['gamma']

In [39]:
xgb4 = xgb.XGBClassifier(
    learning_rate =0.1,
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=0.8,
    colsample_bytree=0.8,

    nthread=8,
   
    seed=2707
)

param_test4 = {
 'subsample': np.arange(0.5, 1, 0.1),
 'colsample_bytree': np.arange(0.5, 1, 0.1),
}

gsearch4 = GridSearchCV(estimator = xgb4, param_grid = param_test4, scoring='accuracy', n_jobs=1, iid=False, cv=kf)
gsearch4.fit(X, Y)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_



([mean: 0.92249, std: 0.00153, params: {'colsample_bytree': 0.5, 'subsample': 0.5},
  mean: 0.92240, std: 0.00141, params: {'colsample_bytree': 0.5, 'subsample': 0.59999999999999998},
  mean: 0.92246, std: 0.00136, params: {'colsample_bytree': 0.5, 'subsample': 0.69999999999999996},
  mean: 0.92240, std: 0.00167, params: {'colsample_bytree': 0.5, 'subsample': 0.79999999999999993},
  mean: 0.92246, std: 0.00145, params: {'colsample_bytree': 0.5, 'subsample': 0.89999999999999991},
  mean: 0.92238, std: 0.00169, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.5},
  mean: 0.92246, std: 0.00147, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.59999999999999998},
  mean: 0.92233, std: 0.00150, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.69999999999999996},
  mean: 0.92229, std: 0.00165, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.79999999999999993},
  mean: 0.92232, std: 0.00182, params: {'colsample_bytree': 0.599999

In [40]:
param_test4 = {
 'subsample': np.arange(0.45, 0.56, 0.05),
 'colsample_bytree': np.arange(0.45, 0.56, 0.05),
}

gsearch4 = GridSearchCV(estimator = xgb4, param_grid = param_test4, scoring='accuracy', n_jobs=1, iid=False, cv=kf)
gsearch4.fit(X, Y)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_



([mean: 0.92224, std: 0.00147, params: {'colsample_bytree': 0.45000000000000001, 'subsample': 0.45000000000000001},
  mean: 0.92206, std: 0.00170, params: {'colsample_bytree': 0.45000000000000001, 'subsample': 0.5},
  mean: 0.92237, std: 0.00161, params: {'colsample_bytree': 0.45000000000000001, 'subsample': 0.55000000000000004},
  mean: 0.92248, std: 0.00153, params: {'colsample_bytree': 0.5, 'subsample': 0.45000000000000001},
  mean: 0.92249, std: 0.00153, params: {'colsample_bytree': 0.5, 'subsample': 0.5},
  mean: 0.92235, std: 0.00156, params: {'colsample_bytree': 0.5, 'subsample': 0.55000000000000004},
  mean: 0.92247, std: 0.00167, params: {'colsample_bytree': 0.55000000000000004, 'subsample': 0.45000000000000001},
  mean: 0.92238, std: 0.00169, params: {'colsample_bytree': 0.55000000000000004, 'subsample': 0.5},
  mean: 0.92242, std: 0.00161, params: {'colsample_bytree': 0.55000000000000004, 'subsample': 0.55000000000000004}],
 {'colsample_bytree': 0.5, 'subsample': 0.5},
 0.92

In [41]:
best_colsample_bytree = gsearch4.best_params_['colsample_bytree']
best_subsample = gsearch4.best_params_['subsample']

In [42]:
xgb_5 = xgb.XGBClassifier(
    learning_rate =0.1,
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    

    nthread=8,
   
    seed=2707
)

In [43]:
param_test5 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch5 = GridSearchCV(estimator = xgb_5, param_grid = param_test5, scoring='accuracy', n_jobs=1, iid=False, cv=kf)
gsearch5.fit(X, Y)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_



([mean: 0.92249, std: 0.00153, params: {'reg_alpha': 1e-05},
  mean: 0.92245, std: 0.00155, params: {'reg_alpha': 0.01},
  mean: 0.92238, std: 0.00156, params: {'reg_alpha': 0.1},
  mean: 0.92243, std: 0.00148, params: {'reg_alpha': 1},
  mean: 0.92250, std: 0.00152, params: {'reg_alpha': 100}],
 {'reg_alpha': 100},
 0.92250176325404787)

In [45]:
param_test5 = {
 'reg_alpha':np.arange(5,15, 1),
}
gsearch5 = GridSearchCV(estimator = xgb_5, param_grid = param_test5, scoring='accuracy', n_jobs=1, iid=False, cv=kf)
gsearch5.fit(X, Y)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_



([mean: 0.92255, std: 0.00160, params: {'reg_alpha': 5},
  mean: 0.92250, std: 0.00160, params: {'reg_alpha': 6},
  mean: 0.92249, std: 0.00144, params: {'reg_alpha': 7},
  mean: 0.92253, std: 0.00150, params: {'reg_alpha': 8},
  mean: 0.92263, std: 0.00163, params: {'reg_alpha': 9},
  mean: 0.92257, std: 0.00158, params: {'reg_alpha': 10},
  mean: 0.92250, std: 0.00153, params: {'reg_alpha': 11},
  mean: 0.92253, std: 0.00160, params: {'reg_alpha': 12},
  mean: 0.92253, std: 0.00166, params: {'reg_alpha': 13},
  mean: 0.92252, std: 0.00158, params: {'reg_alpha': 14}],
 {'reg_alpha': 9},
 0.92262551903639023)

In [46]:
best_reg_alpha = gsearch5.best_params_['reg_alpha']

In [47]:
xgb_6 = xgb.XGBClassifier(
    learning_rate =0.1,
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weight,
    gamma=best_gamma,
    subsample=best_subsample,
    colsample_bytree=best_colsample_bytree,
    reg_alpha = best_reg_alpha,
    

    nthread=8,
   
    seed=2707
)

In [48]:
param_test6 = {
 'reg_lambda':[1e-5, 1e-2, 0.1, 1, 2, 3, 5, 10, 100]
}
gsearch6 = GridSearchCV(estimator = xgb_6, param_grid = param_test6, scoring='accuracy', n_jobs=1, iid=False, cv=kf)
gsearch6.fit(X, Y)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_



([mean: 0.92262, std: 0.00155, params: {'reg_lambda': 1e-05},
  mean: 0.92262, std: 0.00155, params: {'reg_lambda': 0.01},
  mean: 0.92259, std: 0.00152, params: {'reg_lambda': 0.1},
  mean: 0.92263, std: 0.00163, params: {'reg_lambda': 1},
  mean: 0.92259, std: 0.00162, params: {'reg_lambda': 2},
  mean: 0.92258, std: 0.00158, params: {'reg_lambda': 3},
  mean: 0.92256, std: 0.00158, params: {'reg_lambda': 5},
  mean: 0.92258, std: 0.00154, params: {'reg_lambda': 10},
  mean: 0.92251, std: 0.00167, params: {'reg_lambda': 100}],
 {'reg_lambda': 1},
 0.92262551903639023)

In [49]:
param_test6 = {
 'reg_lambda':np.arange(0.5,1.6,0.1)
}
gsearch6 = GridSearchCV(estimator = xgb_6, param_grid = param_test6, scoring='accuracy', n_jobs=1, iid=False, cv=kf)
gsearch6.fit(X, Y)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_



([mean: 0.92263, std: 0.00164, params: {'reg_lambda': 0.5},
  mean: 0.92264, std: 0.00164, params: {'reg_lambda': 0.59999999999999998},
  mean: 0.92267, std: 0.00167, params: {'reg_lambda': 0.69999999999999996},
  mean: 0.92264, std: 0.00164, params: {'reg_lambda': 0.79999999999999993},
  mean: 0.92262, std: 0.00171, params: {'reg_lambda': 0.89999999999999991},
  mean: 0.92263, std: 0.00163, params: {'reg_lambda': 0.99999999999999989},
  mean: 0.92260, std: 0.00167, params: {'reg_lambda': 1.0999999999999999},
  mean: 0.92269, std: 0.00168, params: {'reg_lambda': 1.1999999999999997},
  mean: 0.92269, std: 0.00160, params: {'reg_lambda': 1.2999999999999998},
  mean: 0.92272, std: 0.00159, params: {'reg_lambda': 1.3999999999999999},
  mean: 0.92271, std: 0.00159, params: {'reg_lambda': 1.4999999999999998}],
 {'reg_lambda': 1.3999999999999999},
 0.92271833103359879)

In [50]:
best_reg_lambda = gsearch6.best_params_['reg_lambda']

In [53]:
for lr in np.arange(0.01, 0.11, 0.01):
    xgb_fin = xgb.XGBClassifier(
        learning_rate = lr,
        n_estimators=2000,
        max_depth=best_max_depth,
        min_child_weight=best_min_child_weight,
        gamma=best_gamma,
        subsample=best_subsample,
        colsample_bytree=best_colsample_bytree,
        reg_alpha = best_reg_alpha,
        reg_lambda = best_reg_lambda,
        
    
        nthread=8,
       
        seed=2707
    )
    print(lr, "===================================")
    best_n_estimators = modelfit(xgb_fin, X, Y)

N: 505 [1, 505, 434, 430, 500, 476, 430]
Accuracy:
[0.91302151003320342, 0.92290478596693859, 0.92239948025698404, 0.92167761495704903, 0.92434851656680861, 0.92253826162287034, 0.92217730291654632]
mean: 0.921295353189
std: 0.00346526515958

Logloss:
[0.26435496366397182, 0.21999780703319857, 0.21822870428108837, 0.2185297788854226, 0.21282388901286362, 0.22075844390513086, 0.22084109026699117]
mean: 0.225076382436
std: 0.0162351183443
N: 740 [1, 238, 188, 513, 227, 740, 243]
Accuracy:
[0.92016746066118094, 0.92304915902692553, 0.92225510719699699, 0.92232729372699052, 0.92434851656680861, 0.92318798729425355, 0.9221051111752816]
mean: 0.922491519378
std: 0.0011879086724

Logloss:
[0.22866472577201793, 0.21956701698411057, 0.21723508314424672, 0.21721605275029665, 0.2118782495159113, 0.22085282462165082, 0.21980713925178391]
mean: 0.219317298863
std: 0.0046923165865
N: 331 [1, 251, 139, 331, 158, 234, 190]
Accuracy:
[0.92060054857802798, 0.92326571861690609, 0.92203854760701653, 0.922

In [None]:
0.06 335

In [54]:
xgb_fin = xgb.XGBClassifier(
        learning_rate = 0.06,
        n_estimators=335,
        max_depth=best_max_depth,
        min_child_weight=best_min_child_weight,
        gamma=best_gamma,
        subsample=best_subsample,
        colsample_bytree=best_colsample_bytree,
        reg_alpha = best_reg_alpha,
        reg_lambda = best_reg_lambda,
        
    
        nthread=8,
       
        seed=2707
    )

In [55]:
scores = cross_val_score(xgb_fin, X, Y, cv=kf)

In [56]:
print(np.mean(scores), np.std(scores))

0.922532710016 0.00155244328908


In [57]:
0.9119521501495308

0.9119521501495308

In [61]:
xgb_fin.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.5,
 'gamma': 0.30000000000000004,
 'learning_rate': 0.06,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 5,
 'missing': None,
 'n_estimators': 335,
 'n_jobs': 1,
 'nthread': 8,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 9,
 'reg_lambda': 1.3999999999999999,
 'scale_pos_weight': 1,
 'seed': 2707,
 'silent': True,
 'subsample': 0.5}

In [77]:
clf11 = xgb.XGBClassifier(
        learning_rate=0.06,
        n_estimators=335,
        max_depth=5,
        min_child_weight=5,
        gamma=0.3,
        subsample=0.5,
        colsample_bytree=0.5,
        reg_alpha=9,
        reg_lambda=1.4,
        nthread=8,
       
        seed=2707
    )

clf22 = xgb.XGBClassifier(
        learning_rate=0.06,
        n_estimators=335,
        max_depth=5,
        min_child_weight=5,
        gamma=0.3,
        subsample=0.5,
        colsample_bytree=0.5,
        reg_alpha=9,
        reg_lambda=1.4,
        nthread=8,
        tree_method='hist',
        grow_policy='lossguide',
       
        seed=2707
    )

clf33 = lgb.LGBMClassifier(
        learning_rate=0.1,
        n_estimators=100,
#         num_leaves=2**5,
#         min_child_weight=5,
        subsample=0.8,
        colsample_bytree=0.8,
#         reg_alpha=9,
#         reg_lambda=1.4,
        nthread=8,
       
        seed=2707
    )

In [None]:
train = pd.read_csv('train_original.csv', sep=';')
test = pd.read_csv('test_original.csv', sep=';', na_values='None')

train = utils.clean_data(train)
test = utils.clean_data(test)

train = utils.new_features(train)
test = utils.new_features(test)

TARGET = 'smoke'
features = ['age_group', 'gender', 'weight', 'ap_hi', 'ap_lo', 'cholesterol','alco', 'active', 'gluc', 'BMI', 'ap_dif', 'MAP']
test['cardio'] = np.nan

X = pd.concat((train, test), axis=0)
X_train = X.loc[~X[TARGET].isnull()]
y_train = X_train[TARGET].values.ravel()
X_train = X_train[features]

X_test1 = train.loc[train[TARGET].isnull(), features]
X_test2 = test.loc[test[TARGET].isnull(), features]


kf = StratifiedKFold(random_state=44444, n_splits=10, shuffle=True)
fold_score = []
for train_idx, test_idx in kf.split(X_train, y_train):
    clf1 = clone(clf11)
    clf2 = clone(clf22)
    clf3 = clone(clf33)
    clf1.fit(X_train.iloc[train_idx], y_train[train_idx])
    clf2.fit(X_train.iloc[train_idx], y_train[train_idx])
    clf3.fit(X_train.iloc[train_idx], y_train[train_idx])
#     predict1 = clf1.predict_proba(X_train.iloc[test_idx])[:, 1]
#     predict2 = clf2.predict_proba(X_train.iloc[test_idx])[:, 1]
#     predict3 = clf3.predict_proba(X_train.iloc[test_idx])[:, 1]
#     predict= (predict1+predict2+predict3)/3
    
    predict1 = clf1.predict(X_train.iloc[test_idx])
    predict2 = clf2.predict(X_train.iloc[test_idx])
    predict3 = clf3.predict(X_train.iloc[test_idx])
    predict= (predict1+predict2+predict3)/3
    fold_score.append(accuracy_score(y_train[test_idx], np.round(predict)))
    
print(np.mean(fold_score), np.std(fold_score))


In [None]:
0.922532710016 0.00155244328908

In [79]:
train = pd.read_csv('train_original.csv', sep=';')
test = pd.read_csv('test_original.csv', sep=';', na_values='None')

train = utils.clean_data(train)
test = utils.clean_data(test)

train = utils.new_features(train)
test = utils.new_features(test)

TARGET = 'alco'
features = ['age_group', 'gender', 'weight', 'ap_hi', 'ap_lo', 'cholesterol','smoke', 'active', 'gluc', 'BMI', 'ap_dif', 'MAP']
test['cardio'] = np.nan

X = pd.concat((train, test), axis=0)
X_train = X.loc[~X[TARGET].isnull()]
y_train = X_train[TARGET].values.ravel()
X_train = X_train[features]

X_test1 = train.loc[train[TARGET].isnull(), features]
X_test2 = test.loc[test[TARGET].isnull(), features]


kf = StratifiedKFold(random_state=44444, n_splits=10, shuffle=True)
fold_score = []
for train_idx, test_idx in kf.split(X_train, y_train):
    clf1 = clone(clf11)
    clf2 = clone(clf22)
    clf3 = clone(clf33)
    clf1.fit(X_train.iloc[train_idx], y_train[train_idx])
    clf2.fit(X_train.iloc[train_idx], y_train[train_idx])
    clf3.fit(X_train.iloc[train_idx], y_train[train_idx])
#     predict1 = clf1.predict_proba(X_train.iloc[test_idx])[:, 1]
#     predict2 = clf2.predict_proba(X_train.iloc[test_idx])[:, 1]
#     predict3 = clf3.predict_proba(X_train.iloc[test_idx])[:, 1]
#     predict= (predict1+predict2+predict3)/3
    
    predict1 = clf1.predict(X_train.iloc[test_idx])
    predict2 = clf2.predict(X_train.iloc[test_idx])
    predict3 = clf3.predict(X_train.iloc[test_idx])
    predict= (predict1+predict2+predict3)/3
    fold_score.append(accuracy_score(y_train[test_idx], np.round(predict)))
    
print(np.mean(fold_score), np.std(fold_score))

0.922532710016 0.00155244328908


In [80]:
train = pd.read_csv('train_original.csv', sep=';')
test = pd.read_csv('test_original.csv', sep=';', na_values='None')

train = utils.clean_data(train)
test = utils.clean_data(test)

train = utils.new_features(train)
test = utils.new_features(test)

TARGET = 'gender'
features = [ 'active', 'weight', 'ap_hi', 'ap_lo', 'cholesterol','alco', 'smoke', 'gluc', 'BMI', 'ap_dif', 'MAP']
test['cardio'] = np.nan

X = pd.concat((train, test), axis=0)
X_train = X.loc[~X[TARGET].isnull()]
y_train = X_train[TARGET].values.ravel()
X_train = X_train[features]

X_test1 = train.loc[train[TARGET].isnull(), features]
X_test2 = test.loc[test[TARGET].isnull(), features]


kf = StratifiedKFold(random_state=44444, n_splits=10, shuffle=True)
fold_score = []
for train_idx, test_idx in kf.split(X_train, y_train):
    clf1 = clone(clf1)
    clf2 = clone(clf2)
    clf3 = clone(clf3)
    clf1.fit(X_train.iloc[train_idx], y_train[train_idx])
    clf2.fit(X_train.iloc[train_idx], y_train[train_idx])
    clf3.fit(X_train.iloc[train_idx], y_train[train_idx])
#     predict1 = clf1.predict_proba(X_train.iloc[test_idx])[:, 1]
#     predict2 = clf2.predict_proba(X_train.iloc[test_idx])[:, 1]
#     predict3 = clf3.predict_proba(X_train.iloc[test_idx])[:, 1]
#     predict= (predict1+predict2+predict3)/3
    
    predict1 = clf1.predict(X_train.iloc[test_idx])
    predict2 = clf2.predict(X_train.iloc[test_idx])
    predict3 = clf3.predict(X_train.iloc[test_idx])
    predict= (predict1+predict2+predict3)/3
    fold_score.append(accuracy_score(y_train[test_idx], np.round(predict)))
    
print(np.mean(fold_score), np.std(fold_score))

0.922532710016 0.00155244328908
