In [47]:
import pandas as pd
import numpy as np

import random

import xgboost as xgb
import lightgbm as lgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns
import gc
import utils

plt.style.use('ggplot')
%matplotlib inline

import importlib
utils = importlib.reload(utils)

In [44]:
# loo = pd.read_csv('loo.csv', sep=';', header=None)
# loo.columns=['loo']
# loo['target']=y_train
# loo_loss = loo.apply(lambda x: log_loss([x['target']], [x['loo']], labels=[1, 0]), axis=1)
# pd.DataFrame(loo_loss).to_csv('loo_loss.csv', index=False, header=False, sep=';')

In [45]:
train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';', na_values='None')

train = utils.clean_data(train, more_clean=True)
test = utils.clean_data(test, more_clean=True)

train = utils.new_features(train)
test = utils.new_features(test)

X_train = train.drop([ 'cardio'], axis=1)
y_train = train['cardio'].values.ravel()
X_test = test.drop([], axis=1)

data = pd.concat((train.drop('cardio', axis=1), test), axis=0)
dic = data.groupby('age_group_orig')['age'].mean().to_dict()
X_train['age_dif'] = X_train[['age_group_orig', 'age']].apply(lambda x: x['age'] - dic[x['age_group_orig']], axis=1)
X_test['age_dif'] = X_test[['age_group_orig', 'age']].apply(lambda x: x['age'] - dic[x['age_group_orig']], axis=1)

dic2 = data.groupby('age_group_orig')['age'].min().to_dict()
dic3 = data.groupby('age_group_orig')['age'].max().to_dict()
X_train['age_dif2'] = X_train[['age_group_orig', 'age']].apply(lambda x: (x['age'] - dic2[x['age_group_orig']]) / dic3[x['age_group_orig']], axis=1)
X_test['age_dif2']  =  X_test[['age_group_orig', 'age']].apply(lambda x: (x['age'] - dic2[x['age_group_orig']]) / dic3[x['age_group_orig']], axis=1)

dic3 = data.groupby('age_group_orig')['age'].std().to_dict()
X_train['age_dif3'] = X_train[['age_group_orig', 'age']].apply(lambda x: (x['age'] - dic2[x['age_group_orig']]) / dic3[x['age_group_orig']], axis=1)
X_test['age_dif3']  =  X_test[['age_group_orig', 'age']].apply(lambda x: (x['age'] - dic2[x['age_group_orig']]) / dic3[x['age_group_orig']], axis=1)

train_1 = X_train['ap_lo'] < 20
train_2 = X_train['ap_hi'] < 50
train_3 = (X_train['ap_hi']<10) | (X_train['ap_lo']<20)
train_4 = (data['weight'].mean() - X_train['weight']) > (3 * data['weight'].std())
train_5 = (data['height'].mean() - X_train['height']) > (3.5 * data['height'].std())

test_1 = X_test['ap_lo'] < 20
test_2 = X_test['ap_hi'] < 50
test_3 = (X_test['ap_hi']<10) | (X_test['ap_lo']<20)
test_4 = (data['weight'].mean() - X_test['weight']) > (3 * data['weight'].std())
test_5 = (data['height'].mean() - X_test['height']) > (3.5 * data['height'].std())

In [15]:
loo_loss = pd.read_csv('loo_loss.csv', sep=';', header=None)
strat = pd.qcut(loo_loss, 20, labels=False).astype(str)
strat = np.hstack((strat, y_train.reshape((-1,1))))
strat =np.apply_along_axis(lambda d: str(d[0]) + '_' + str(d[1]), 1, strat)

In [16]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
"active_fair",
"smoke_restored",
"alco_restored",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
'age_dif2',
'ap_lo_mul_log_ap_hi'
]
model_name = '+-XGB_1.5_hey'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,
    #'scale_pos_weight': 1.0008,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': -1,
    'random_state': 1223,
    'silent': True,
}
model = xgb.XGBClassifier(**params)
utils.execute_model(model,
              X_train[use_columns],
              y_train,
              X_test[use_columns],
              model_name=model_name,
              n_splits=0,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()

# 10 folds logloss:
# [0.53731990653713735, 0.5371451460103317, 0.53759305584367933, 0.53649916905123263, 0.5365669149833473, 0.53913942057634634, 0.53802636138177329, 0.53734196102539478, 0.53772344060100696, 0.53709262123337387]
# mean: 0.537444799724 7387 7372 7383 7381 7355 7363 7404

# 7329
# std: 0.000722112840278


10 folds logloss:
[0.53715752800673577, 0.5373108941114515, 0.53756577143336526, 0.53622748590635649, 0.53638501345469425, 0.53898592073605878, 0.53778592251224122, 0.53724861389483003, 0.53775801292281022, 0.53694562309885774]
mean: 0.537337078608
std: 0.00074035388879
+-XGB_1.5_hey results saved!


378

In [17]:
use_columns = [
# "gender",
# "height",
# "weight",
# "ap_hi",
# "ap_lo",
# "cholesterol",
# "active_fair",
# "smoke_restored",
# "alco_restored",
# "height_mul_log_gluc",
# "BMI",
# "age_group",
# "cholesterol_div_log_gluc",
# "gluc_mul_log_age",
# 'age_dif2',
# 'ap_lo_mul_log_ap_hi'
    
#     "gender",
# "ap_hi",
# "ap_lo",
# "cholesterol",
# "active_fair",
# "smoke_restored",
# "alco_restored",
# "height_mul_log_cholesterol",
# "height_mul_log_gluc",
# "BMI",
# "age_group",
# "cholesterol_div_log_gluc",
# "gluc_mul_log_age",
# "ap_hi_mul_weight",
# "age_dif2",
# 'ap_lo_mul_log_ap_hi',
# 'age_group_div_height',
# 'age_group_mul_log_MAP'
    
    'gender',
'ap_hi',
'ap_lo',
'cholesterol',
'active_fair',
'smoke_restored',
'alco_restored',
'height_mul_log_cholesterol',
'height_mul_log_gluc',
'BMI',
'age_group',
'cholesterol_div_log_gluc',
'gluc_mul_log_age',
'age_dif2',
'ap_lo_mul_log_ap_hi',
'age_group_div_height',
'age_group_mul_log_MAP',
'cholesterol_div_ap_hi',
'ap_hi_mul_log_gluc',
'BMI_div_ap_hi',
'BMI_div_log_age',
# 'gluc',
'gluc_mul_height',
]
model_name = '+-XGB_hist_last_hey'
params = {
     'colsample_bytree': 0.95,
     'gamma': 0.55,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 3,
     'n_estimators': 392,
     'reg_alpha': 0,
     'reg_lambda': 0.4,
     'subsample': 0.85,
    
    #'scale_pos_weight': 1.0008,

    'tree_method': 'hist',
    'grow_policy': 'lossguide',
    
    'n_jobs': 4,
    'random_state': 2222,
    'silent': True,
}
utils.execute_model(xgb.XGBClassifier(**params),
              X_train[use_columns],
              y_train,
              X_test[use_columns],
              model_name=model_name,
              n_splits=0,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()
# 10 folds logloss:
# [0.53757190997207693, 0.5380473325125551, 0.53740657876655074, 0.53637490868959747, 0.53654954023473578, 0.53888017941311606, 0.53783108141170444, 0.53687143028967788, 0.53758322567055628, 0.53653519701878949]
# mean: 0.537365138398
# std: 0.000751543774796

# 10 folds logloss:
# [0.53652502003303926, 0.53822653738835646, 0.53701765458617179, 0.53608297558267504, 0.53673609970384017, 0.53826651063431563, 0.53853680535003579, 0.53728310895360654, 0.53718299642914946, 0.53730986089232202]
# mean: 0.537316756955
# std: 0.000762431052301


10 folds logloss:
[0.53652132170002453, 0.53853172703366781, 0.53701626663763691, 0.53575467657581621, 0.53640585688004316, 0.53786365939736092, 0.53793720902855335, 0.53698934633880691, 0.53792219387713891, 0.53754400077483155]
mean: 0.537248625824
std: 0.000814373004508
+-XGB_hist_last_hey results saved!


91

In [18]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
"height_div_ap_lo",
"active_fair",
"smoke_restored",
"alco_restored",
"height_mul_log_cholesterol",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
"ap_hi_mul_weight",
]

model_name = '+-XGB_5++_hey'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,
    #'scale_pos_weight': 1.0008,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': -1,
    'random_state': 5555,
    'silent': True,
}
utils.execute_model(xgb.XGBClassifier(**params),
              X_train[use_columns],
              y_train,
              X_test[use_columns],
              model_name=model_name,
              n_splits=0,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53735667601641068, 0.53651338825631978, 0.53730000980301718, 0.53641026389851232, 0.53707447720078783, 0.53868050722776184, 0.53849308880150426, 0.5377645754844903, 0.53685784031652872, 0.53773113909775949]
mean: 0.53741819661
std: 0.000724654004672
+-XGB_5++_hey results saved!


42

In [19]:
use_columns = [
"gender",
"ap_hi",
"ap_lo",
"cholesterol",
"active_fair",
"smoke_restored",
"alco_restored",
"height_mul_log_cholesterol",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
"ap_hi_mul_weight",
"age_dif2",
'ap_lo_mul_log_ap_hi',
'age_group_div_height',
'age_group_mul_log_MAP'
]
model_name = '+-XGB_11.5_hey'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,
    
    #'scale_pos_weight': 1.0008,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': -1,
    'random_state': 1223,
    'silent': True,
}
utils.execute_model(xgb.XGBClassifier(**params),
              X_train[use_columns],
              y_train,
              X_test[use_columns],
              model_name=model_name,
              n_splits=0,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53661436704239829, 0.53763204853848523, 0.53667955237596443, 0.53548566612412907, 0.53649631879979898, 0.53876049277591409, 0.53812302005611234, 0.53747629965599164, 0.53686211045986598, 0.53713116826364737]
mean: 0.537126104409
std: 0.000875348633
+-XGB_11.5_hey results saved!


92

In [20]:
use_columns = [
'gender',
'height',
'weight',
'ap_hi',
'ap_lo',
'cholesterol',
'active_fair',
'smoke_restored',
'alco_restored',
'height_mul_log_cholesterol',
'height_mul_log_gluc',
'BMI',
'age_group',
'cholesterol_div_log_gluc',
'gluc_mul_log_age',
'ap_hi_mul_weight',
'age_dif',
'ap_lo_mul_log_ap_hi',
'age_group_div_height',
'age_group_mul_log_MAP',
]
model_name = '+-XGB_10++_hey'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,
    
    #'scale_pos_weight': 1.0008,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': -1,
    'random_state': 1223,
    'silent': True,
}
utils.execute_model(xgb.XGBClassifier(**params),
              X_train[use_columns],
              y_train,
              X_test[use_columns],
              model_name=model_name,
              n_splits=0,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53659697523131511, 0.53716205685734852, 0.53646771445689923, 0.53555430732682074, 0.53702334168876376, 0.53848326200678476, 0.53795099132916557, 0.53755935381168218, 0.53713108845448565, 0.53728859777922156]
mean: 0.537121768894
std: 0.000770202248041
+-XGB_10++_hey results saved!


91

In [48]:
use_columns = [
'gender',
'ap_hi',
'ap_lo',
'cholesterol',
'active_fair',
'smoke_restored',
'alco_restored',
'height_mul_log_cholesterol',
'height_mul_log_gluc',
'BMI',
'age_group',
'cholesterol_div_log_gluc',
'gluc_mul_log_age',
'age_dif2',
'ap_lo_mul_log_ap_hi',
'age_group_div_height',
'age_group_mul_log_MAP',
'cholesterol_div_ap_hi',
'ap_hi_mul_log_gluc',
'BMI_div_ap_hi',
'BMI_div_log_age',
# 'gluc',
'gluc_mul_height',

# 'cholesterol_div_weight',
# 'ap_hi_div_ap_lo',
# 'height_mul_log_ap_hi',
# 'BMI_div_cholesterol',
# 'cholesterol_div_log_ap_lo',
#     'BMI_1',
#     'age_group_div_log_gluc',
]
model_name = '+-XGB_15_hey'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,
    'scale_pos_weight': 1.0008,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': -1,
    'random_state': 1223,
    'silent': True,
}
utils.execute_model(xgb.XGBClassifier(**params),
              X_train[use_columns],
              y_train,
              X_test[use_columns],
              model_name=model_name,
              n_splits=0,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()
# 10 folds logloss:
# [0.53646016616605852, 0.53819110210055832, 0.53700349915524548, 0.53586434324496812, 0.53614972118656656, 0.53817725547795814, 0.537725036665749, 0.53720775463564263, 0.53785448705315742, 0.53667016882603535]
# mean: 0.537130353451
# std: 0.000796657672578

# 10 folds logloss:
# [0.53646141939657832, 0.53815644988699762, 0.53697043853302318, 0.53582988165501011, 0.536220359169709, 0.53827735193706205, 0.537880909573071, 0.53717420261473092, 0.5379235707759743, 0.53683144369524272]
# mean: 0.537172602724
# std: 0.000814050947736


10 folds logloss:
[0.53642714099202538, 0.53811459261772754, 0.53708974684511568, 0.53582265335425983, 0.53616242744154852, 0.53818200963393326, 0.53792698418081897, 0.53719790817843127, 0.53785361626253447, 0.53679881485631686]
mean: 0.537157589436
std: 0.000805147038272
+-XGB_15_hey results saved!


96

In [22]:
use_columns = [
'gender',
'ap_hi',
'ap_lo',
'cholesterol',
'active_fair',
'smoke_restored',
'alco_restored',
'height_mul_log_cholesterol',
'height_mul_log_gluc',
'BMI',
'age_group',
'cholesterol_div_log_gluc',
'gluc_mul_log_age',
'age_dif2',
'ap_lo_mul_log_ap_hi',
'age_group_div_height',
'age_group_mul_log_MAP',
'cholesterol_div_ap_hi',
'ap_hi_mul_log_gluc',
'BMI_div_ap_hi',
'BMI_div_log_age',
# 'gluc',
'gluc_mul_height',

# 'cholesterol_div_weight',
# 'ap_hi_div_ap_lo',
# 'height_mul_log_ap_hi',
# 'BMI_div_cholesterol',
# 'cholesterol_div_log_ap_lo',
#     'MAP_div_ap_lo',
#     'ap_dif',
#     'BMI_3'
]
model_name = 'XGB_16_hey'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,
    #'scale_pos_weight': 1.0008,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': -1,
    'random_state': 1223,
    'silent': True,
}

X1 = X_train[use_columns].copy()
X2 = X_test[use_columns].copy()

for m in ['XGB_10',
 'XGB_10+',
 'XGB_12+',
 'XGB_13+',
 'XGB_14',
 'XGB_1_new2',
 'XGB_1_new3+',
 'XGB_2hist_new',
 'XGB_5',
 'XGB_5+']:

    tr_pr, ts_pr, _, _ = utils.load_model(m)
    X1[m] = tr_pr
    X2[m] = ts_pr
utils.execute_model(xgb.XGBClassifier(**params),
              X1,
              y_train,
              X2,
              model_name=model_name,
              n_splits=0,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53867506741042592, 0.54054023688199926, 0.53756564577900212, 0.53611703432271596, 0.53845780134205867, 0.54029530102726975, 0.53962356360469177, 0.53850994461364055, 0.53906353748760116, 0.53857616217669435]
mean: 0.538742429465
std: 0.00122342934658
XGB_16_hey results saved!


142

In [23]:
model_name = 'LGB_1_hey'
params = {
    'colsample_bytree': 0.7,
     'learning_rate': 0.01,
     'min_child_weight': 4,
     'n_estimators': 666,
     'num_leaves': 32,
     'reg_alpha': 0.7,
     'reg_lambda': 1.0,
     'subsample': 0.9,
       
    'nthread': 4,
    'seed': 8718,
    'silent': True,
}
use_columns = [
'gender',
'ap_hi',
'ap_lo',
'cholesterol',
'active_fair',
'smoke_restored',
'alco_restored',
'height_mul_log_cholesterol',
'height_mul_log_gluc',
'BMI',
'age_group',
'cholesterol_div_log_gluc',
'gluc_mul_log_age',
'age_dif2',
'ap_lo_mul_log_ap_hi',
'age_group_div_height',
'age_group_mul_log_MAP',
'cholesterol_div_ap_hi',
'ap_hi_mul_log_gluc',
'BMI_div_ap_hi',
'BMI_div_log_age',
'gluc_mul_height',
]
utils.execute_model(lgb.LGBMClassifier(**params),
              X_train[use_columns],
              y_train,
              X_test[use_columns],
              model_name=model_name,
              n_splits=0,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()
# 10 folds logloss:
# [0.53646016616605852, 0.53819110210055832, 0.53700349915524548, 0.53586434324496812, 0.53614972118656656, 0.53817725547795814, 0.537725036665749, 0.53720775463564263, 0.53785448705315742, 0.53667016882603535]
# mean: 0.537130353451
# std: 0.000796657672578


10 folds logloss:
[0.53668302129426326, 0.53854883743245474, 0.53721547074154286, 0.5359349155186991, 0.53687686452697514, 0.53893018930255421, 0.5376679075781392, 0.53769615771569002, 0.53812062157592333, 0.53708290951781379]
mean: 0.53747568952
std: 0.00085741449589
LGB_1_hey results saved!


102

# KERAS models

In [11]:
from keras.layers.core import Dense, Dropout
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential
from keras.optimizers import Adamax
import pandas as pd


class KerasModel(object):
    def __init__(self,
                 var_num,
                 epochs=70,
                 learn_rate=0.1,
                 config=None,
                 batch_size=512,
                 verbose=0,
                 validation_split=0.2,
                 loss="binary_crossentropy"):

        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        self.validation_split = validation_split
        
        self.model = Sequential()
        model = self.model
        
        if config is None:
            config =[(var_num, 0.0)]
        else:
            config = config.copy()
            
        n, dp = config.pop(0)

        model.add(Dense(n, input_dim=var_num, kernel_initializer='uniform'))
        model.add(LeakyReLU())
        if 0 < dp < 1:
            model.add(Dropout(dp))
        
        while config:
            n, dp = config.pop(0)
            model.add(Dense(n, kernel_initializer='uniform'))
            model.add(LeakyReLU())
            if 0 < dp < 1:
                model.add(Dropout(dp))


        model.add(Dense(1, activation='sigmoid'))
        opt = Adamax(lr=learn_rate)

        model.compile(loss=loss, optimizer=opt, metrics=['accuracy'])


    def fit(self, X, y, sample_weight=None, callbacks=[]):
        process_X = X.values if hasattr(X, 'iloc') else X
        process_y = y
        return self.model.fit(process_X, process_y, batch_size=self.batch_size,
                       epochs=self.epochs, verbose=self.verbose,
                       sample_weight=sample_weight,
                       callbacks=callbacks,
                       validation_split=self.validation_split,
                       shuffle=True)

    def predict_proba(self, X):
        process_x = X.values if hasattr(X, 'iloc') else X
        result  = self.model.predict(process_x)
        classone_probs = result
        classzero_probs = 1.0 - classone_probs
        return np.hstack((classzero_probs, classone_probs))
#         return result
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

Using Theano backend.


In [13]:
use_columns = [
'gender',
'height',
'weight',
'ap_hi',
'ap_lo',
'cholesterol',
'active_restored',
'smoke_restored',
'alco_restored',
'height_mul_log_cholesterol',
'height_mul_log_gluc',
'BMI',
# 'age_group',
'cholesterol_div_log_gluc',
'gluc_mul_log_age',
'ap_hi_mul_weight',
'age_dif2',
'ap_lo_mul_log_ap_hi',
'age_group_div_height',
'age_group_mul_log_MAP',
]

X1 = X_train[use_columns]
X2 = X_test[use_columns]

X = pd.concat((X1,X2), axis=0)

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
scaler = StandardScaler().fit(X)
X1 = pd.DataFrame(scaler.transform(X1)) #.values
X2 = pd.DataFrame(scaler.transform(X2)) #.values

def create(x1, x2):
    config = [(64,0.075), (64,0.025)]
    return KerasModel(var_num=len(use_columns),
                   epochs=200,
                   learn_rate=0.001,
                   config=config,
                   batch_size=1024,
                   verbose=0,
                   validation_split=0.0)
utils.execute_model(None,
              X1,
              y_train,
              X2,
              model_name="+-KERAS_3_hey",
              n_splits=0,
              n_folds=10,
              stratification_groups=strat,
              create_callback=create
             )


10 folds logloss:
[0.53914003932664611, 0.53974235800686543, 0.54045273288346729, 0.53872986886558294, 0.53718777955037189, 0.54068484223042579, 0.53823223962626177, 0.53851067567761335, 0.53929717428819057, 0.53986375673608011]
mean: 0.539184146719
std: 0.00101000637534
+-KERAS_3_hey results saved!


(0.5391841467191506, None)

In [49]:
models = [
    '+-KERAS_3_hey',
    '+-XGB_11.5_hey',
    '+-XGB_1.5_hey',
    '+-XGB_hist_last_hey',
    '+-XGB_5++_hey',
    'XGB_9_new',
    '+-XGB_10++_hey',
    '+-XGB_15_hey',
    'LGB_1_hey',
    
#      'KERAS_2_new2', 'XGB_11', 'XGB_1_new2', 'XGB_2hist_new', 'LGB_1_new', 'XGB_5', 'XGB_9',
    
#      'KERAS_3', 'XGB_11', 'XGB_1_new2', 'XGB_2hist_new', 'XGB_5', 'XGB_9_new', 'XGB_10',
#     'XGB_16_hey',
         ]
result = utils.merge_models(models, method='mean')
pd.DataFrame(result).to_csv('merged_models.csv', index=False, header=False, sep=';') # 0.5430089

# # Don't risk too much.
result[test_3 & (result < 0.2)] = 0.2
result[test_4 & (result < 0.2)] = 0.2
# result[test_5 & (result < 0.2)] = 0.2
# result[test_3 & (result > 0.8)] = 0.8
# result[test_4 & (result > 0.8)] = 0.8
result[test_5 & (result > 0.8)] = 0.8
pd.DataFrame(result).to_csv('merged_models-corrected.csv', index=False, header=False, sep=';') # 0.5430089

# t = pd.read_csv('test.csv', sep=';')
# result[(X_test['ap_lo']!=t['ap_lo']) & (result < 0.15)] = 0.15
# pd.DataFrame(result).to_csv('merged_models-corrected+.csv', index=False, header=False, sep=';') # 0.5430188


+-KERAS_3_hey
0.539184252718	0.577349632449	0.539184252718	0.577349632449

+-XGB_11.5_hey
0.537125919672	0.577349632449	0.53727601432	0.577349632449

+-XGB_1.5_hey
0.537336974965	0.577349632449	0.537060496897	0.577349632449

+-XGB_hist_last_hey
0.537248466552	0.577349632449	0.53690891588	0.577349632449

+-XGB_5++_hey
0.537417962421	0.577349632449	0.536878107751	0.577349632449

XGB_9_new
0.537758655716	0.538017158735	0.536843450618	0.566322933484

+-XGB_10++_hey
0.537121476251	0.577349632449	0.536837582151	0.567799645983

+-XGB_15_hey
0.537157448477	0.577349632449	0.536826235386	0.568928763556

LGB_1_hey
0.537475547952	0.577349632449	0.536818063274	0.569819740564


In [15]:
1/0

ZeroDivisionError: division by zero

In [None]:
t = pd.read_csv('test.csv', sep=';')
t.loc[(result<0.2 )&((X_test['ap_lo']!=t['ap_lo']))]

In [None]:
X_test['BMI'].hist(bins=100)

In [26]:
models = [
    '+-KERAS_3_hey',
    '+-XGB_11.5_hey',
    '+-XGB_1.5_hey',
    '+-XGB_hist_last_hey',
    '+-XGB_5++_hey',
    'XGB_9_new',
    '+-XGB_10++_hey',
    '+-XGB_15_hey',
    'LGB_1_hey',
]
r0, r2 = utils.merge_models2(models, method='mean')
# models = [
#     'KERAS_2_new2', 'XGB_11', 'XGB_1_new2', 'XGB_2hist_new', 'LGB_1_new', 'XGB_5', 'XGB_9',
# ]
# r00, r22 = utils.merge_models2(models, method='mean')


+-KERAS_3_hey
0.539184252718	0.577349632449	0.539184252718	0.577349632449

+-XGB_11.5_hey
0.537125919672	0.577349632449	0.53727601432	0.577349632449

+-XGB_1.5_hey
0.537336974965	0.577349632449	0.537060496897	0.577349632449

+-XGB_hist_last_hey
0.537248466552	0.577349632449	0.53690891588	0.577349632449

+-XGB_5++_hey
0.537417962421	0.577349632449	0.536878107751	0.577349632449

XGB_9_new
0.537758655716	0.538017158735	0.536843450618	0.566322933484

+-XGB_10++_hey
0.537121476251	0.577349632449	0.536837582151	0.567799645983

+-XGB_15_hey
0.537172452298	0.577349632449	0.536827877426	0.568928763556

LGB_1_hey
0.537475547952	0.577349632449	0.536819543708	0.569819740564


In [33]:
t = pd.read_csv('train.csv', sep=';')
best = 0
best_score = 1
for i in np.arange(0,0.5,0.005):
    r3 = r2.copy()
#     r3[((X_train['ap_lo']!=t['ap_lo'])) & (r3<i)]=i
    r3[train_4 & (r3<i)]=i
    
    a = log_loss(y_train, r3)
    if a < best_score:
        best_score=a
        best=i
print(best, best_score)

0.375 0.536813391444


In [32]:
t = pd.read_csv('train.csv', sep=';')
best = 0
best_score = 1
for i in np.arange(1.0,0.5,-0.005):
    r3 = r2.copy()
#     r3[((X_train['ap_lo']!=t['ap_lo'])|(X_train['ap_hi']!=t['ap_hi'])) &(r3>i)]=i
    r3[train_4 & (r3>i)]=i
    a = log_loss(y_train, r3)
    if a < best_score:
        best_score=a
        best=i
print(best, best_score)

1.0 0.536819543708


In [38]:
r3 = r2.copy()
# r3[train_1&(r3>0.505)]=0.505
# r3[test_3 & (r3 < 0.2)] = 0.2
# r3[test_4 & (r3 < 0.2)] = 0.2
# r3[test_5 & (r3 < 0.2)] = 0.2
# r3[test_3 & (r3 > 0.8)] = 0.8
# r3[test_4 & (r3 > 0.8)] = 0.8
# r3[test_5 & (r3 > 0.8)] = 0.8
log_loss(y_train, r3)

0.53681970108690569

In [None]:
X_train.loc[train_3&(r2<0.2)]

In [None]:
r2[train_3&(r2<0.2)]

In [None]:
X_test.loc[test_2]

In [None]:
r0[test_3]

In [51]:
models = [
    'KERAS_3_hey',
#     'XGB_11.5_hey',
#     'XGB_1.5_hey',
#     'XGB_hist_last_hey',
#     'XGB_5++_hey',
#     'XGB_9_new',
#     'XGB_10++_hey',
#     'XGB_15_hey',
    
#     '+-XGB_1.5_hey',
#  '+-XGB_10++_hey',
#  '+-XGB_11.5_hey',
#  '+-XGB_15_hey',
#  '+-XGB_hist_last_hey',
#  'XGB_10+',
#  'XGB_12+',
#  'XGB_13+',
#  'XGB_14',
#  'XGB_16_hey',
#  'XGB_1_new3+',
#  'XGB_5',
#  'XGB_5+'
         ]
result = utils.merge_models(models, method='mean')
pd.DataFrame(result).to_csv('merged_models.csv', index=False, header=False, sep=';')


+-KERAS_3_hey
0.539184252718	0.577349632449	0.539184252718	0.577349632449

+-XGB_11.5_hey
0.53712804872	0.577349632449	0.537275502801	0.577349632449

+-XGB_1.5_hey
0.537348885274	0.577349632449	0.537064484006	0.577349632449

+-XGB_hist_last_hey
0.537365260088	0.577349632449	0.536995256734	0.577349632449

+-XGB_5++_hey
0.5374474216	0.577349632449	0.536958651131	0.577349632449

XGB_9_new
0.537758655716	0.538017158735	0.536908682872	0.566322933484

+-XGB_10++_hey
0.537162920365	0.577349632449	0.536895276291	0.567799645983

+-XGB_15_hey
0.537130241705	0.577349632449	0.536863947454	0.568928763556

XGB_10+
0.537193855056	0.537648958464	0.536870932848	0.562947741617

XGB_12+
0.537164551952	0.537551808943	0.536875983198	0.558557627109

XGB_13+
0.537218943044	0.537574981347	0.53687232649	0.55523539621

XGB_14
0.537255606962	0.537599952893	0.536874751448	0.552641197632

XGB_16_hey
0.537285707357	0.577349632449	0.536881727484	0.554178786486

XGB_1_new3+
0.537300164785	0.53765817062	0.53689040909

In [28]:
# 0.536964010575	0.537253961124 = 0.5430002 less cleaning, more cleaning - 0.5429488

In [67]:
models = [
#     '+-KERAS_3_hey',
#     '+-XGB_11.5_hey',
#     '+-XGB_1.5_hey',
#     '+-XGB_hist_last_hey',
#     '+-XGB_5++_hey',
    'XGB_9_new',
#     '+-XGB_10++_hey',
#     '+-XGB_15_hey',
    'XGB_16_hey',
    'KERAS_3', 'XGB_1_new3+', 'XGB_2hist_new', 'XGB_5+', 'XGB_9_new', 'XGB_10+',  'XGB_12+', 'XGB_13+', 'XGB_14',
          'XGB_11', 'XGB_1_new2', 'XGB_2hist_new', 'XGB_5', 'XGB_9_new', 'XGB_10',
    'KERAS_2_new2',  'XGB_1_new2', 'XGB_2hist_new', 'LGB_1_new',
          'KERAS_2_new2', 'XGB_11', 'XGB_1_new2', 'XGB_2hist_new', 'LGB_1_new', 'XGB_5', 'XGB_9',
         ]
folds = utils.score_by_folds(models, X_train, n_folds=10, stratification_groups=strat)

In [68]:
r = set()
for i, f in enumerate(folds):
    r.update([a[1] for a in f[:3]])
    print(i, [a[1] for a in f[:3]])

0 ['XGB_14', 'XGB_13+', 'XGB_10']
1 ['XGB_5+', 'XGB_5', 'XGB_5']
2 ['XGB_10+', 'XGB_12+', 'XGB_10']
3 ['XGB_12+', 'XGB_10+', 'XGB_14']
4 ['XGB_1_new3+', 'XGB_14', 'XGB_1_new2']
5 ['XGB_14', 'XGB_13+', 'XGB_10+']
6 ['XGB_1_new3+', 'XGB_14', 'XGB_1_new2']
7 ['XGB_1_new3+', 'XGB_1_new2', 'XGB_1_new2']
8 ['XGB_12+', 'XGB_5+', 'XGB_5']
9 ['XGB_13+', 'XGB_2hist_new', 'XGB_2hist_new']


In [69]:
r

{'XGB_10',
 'XGB_10+',
 'XGB_12+',
 'XGB_13+',
 'XGB_14',
 'XGB_1_new2',
 'XGB_1_new3+',
 'XGB_2hist_new',
 'XGB_5',
 'XGB_5+'}