In [169]:
import pandas as pd
import numpy as np

import random

import xgboost as xgb
import lightgbm as lgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns
import gc
import utils

plt.style.use('ggplot')
%matplotlib inline

import importlib
utils = importlib.reload(utils)

In [170]:
# loo = pd.read_csv('loo.csv', sep=';', header=None)
# loo.columns=['loo']
# loo['target']=y_train
# loo_loss = loo.apply(lambda x: log_loss([x['target']], [x['loo']], labels=[1, 0]), axis=1)
# pd.DataFrame(loo_loss).to_csv('loo_loss.csv', index=False, header=False, sep=';')

In [171]:
from sklearn.pipeline import Pipeline, FeatureUnion
from utils import SmoothLikelihood,SmoothLikelihood2,SmoothLikelihood3,SmoothLikelihood4, ColumnsFilter

def wrap_classifier(clf, use_columns, mean_columns):
    fs = [("filter", ColumnsFilter(use_columns))]
    
    for i, cc in enumerate(mean_columns):
        fs.append(('mean_'+str(i), SmoothLikelihood4(cc, 0.5,
                                                     kf=StratifiedKFold(random_state=111111+i, n_splits=20, shuffle=True),
                                                     alpha=13,
                                                     seed=10+i,
                                                     std=0.0003)))
    combined_features = FeatureUnion(fs)
    return Pipeline([("features", combined_features), ("model", clf)])

In [15]:
model_name = 'XGB_raw'

train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';', na_values='None')

train = utils.clean_data(train, light_clean=True)
test = utils.clean_data(test, light_clean=True)

# train = utils.new_features(train)
# test = utils.new_features(test)

X_train = train.drop([ 'cardio'], axis=1)
y_train = train['cardio'].values.ravel()
X_test = test.drop([], axis=1)

params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': 1,
    'random_state': 5555,
    'silent': True,
}
model = xgb.XGBClassifier(**params)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.5378472632515674, 0.53747802321409233, 0.53841321712714674, 0.53748802503075122, 0.53708116769299297, 0.53895782394882996, 0.53880991178416571, 0.53780336914881466, 0.5382301981409312, 0.53837661648625423]
mean: 0.538048561583
std: 0.000579327562488
15 Splits logloss:
[0.53815101382225039, 0.53922203350067144, 0.53844920865836599, 0.53906946631290376, 0.53819479897973088, 0.53854909934156703, 0.53793389583680606, 0.53758419167037519, 0.53917354914342008, 0.53870947058924612, 0.53795858176300926, 0.53838735470477317, 0.53822071172367958, 0.53786733888089655, 0.53858950677443118]
mean: 0.53840401478
std: 0.000474424402404
XGB_raw results saved!


855

In [184]:
train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';', na_values='None')

train = utils.clean_data(train)
test = utils.clean_data(test)

train = utils.new_features(train)
test = utils.new_features(test)

X_train = train.drop([ 'cardio'], axis=1)
y_train = train['cardio'].values.ravel()
X_test = test.drop([], axis=1)

data = pd.concat((train.drop('cardio', axis=1), test), axis=0)
dic = data.groupby('age_group_orig')['age'].mean().to_dict()
X_train['age_dif'] = X_train[['age_group_orig', 'age']].apply(lambda x: x['age'] - dic[x['age_group_orig']], axis=1)
X_test['age_dif'] = X_test[['age_group_orig', 'age']].apply(lambda x: x['age'] - dic[x['age_group_orig']], axis=1)

In [185]:
loo_loss = pd.read_csv('loo_loss.csv', sep=';', header=None)
strat = pd.qcut(loo_loss, 20, labels=False).astype(str)
strat = np.hstack((strat, y_train.reshape((-1,1))))
strat =np.apply_along_axis(lambda d: str(d[0]) + '_' + str(d[1]), 1, strat)

In [186]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
# "gluc",
"active_fair",
"smoke_restored",
"alco_restored",
# "active_restored",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
#     "MAP",
#     "MAP_2",
#     "ap_dif",
#     "ap_dif_2"
    'age_dif'
]
mean_columns = [
#     ['gender','age_group','BMI_group', 'ap_hi_group']
]

In [187]:
model_name = 'XGB_1_new'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': 1,
    'random_state': 2707,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()
# 10 folds logloss:
# [0.53738498425183678, 0.53677908088457116, 0.53770915332159341, 0.53642631845512079, 0.53676456015622709, 0.53878675389632313, 0.53836206743417025, 0.53744774297941589, 0.53755816713189852, 0.537097236617332]
# mean: 0.537431606513
# std: 0.000692157037285
# 15 Splits logloss:
# [0.53763639168405819, 0.53886063596233724, 0.53784681931617007, 0.53773258502195986, 0.53745225650551065, 0.53805173773726533, 0.53725377004735531, 0.53679796278192882, 0.53810787266315452, 0.53753800572366228, 0.53773746853348403, 0.53755873253622222, 0.53793157278471404, 0.53761289643780108, 0.53821744652580294]
# mean: 0.537755743617
# std: 0.000452249518864
# XGB_1 results saved!


10 folds logloss:
[0.53766870782391707, 0.5371781589982656, 0.53764979482730646, 0.53592581451612442, 0.53674140330896469, 0.53876232484002406, 0.53799331229425473, 0.53764991774813253, 0.53766763662006734, 0.53664362431963064]
mean: 0.53738806953
std: 0.000755215513669
15 Splits logloss:
[0.53731972628370639, 0.53877173196240546, 0.53778848880405228, 0.53813508655059905, 0.53755323041496528, 0.53772702134196604, 0.53670914507852419, 0.53664368834115916, 0.53790470273384738, 0.53736529400873756, 0.53741512203997088, 0.53732824028088222, 0.53794231541295134, 0.53781718299848336, 0.53799249489853784]
mean: 0.537627564743
std: 0.000520435603982
XGB_1_new results saved!


518

In [188]:
model_name = 'XGB_2hist_new'
params = {
     'colsample_bytree': 0.95,
     'gamma': 0.55,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 3,
     'n_estimators': 392,
     'reg_alpha': 0,
     'reg_lambda': 0.4,
     'subsample': 0.85,

    'tree_method': 'hist',
    'grow_policy': 'lossguide',
    
    'n_jobs': 4,
    'random_state': 2222,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53742095253026101, 0.53767337073955745, 0.53724969669021405, 0.536155548064702, 0.5365284309133298, 0.53895756242222637, 0.53802455899607027, 0.53781807399199777, 0.53742604232789148, 0.53674733701047095]
mean: 0.537400157369
std: 0.000763122688786
15 Splits logloss:
[0.53769708232945279, 0.53898821373824912, 0.5379198568361323, 0.53799003746129925, 0.53750276593447088, 0.53852462245107047, 0.5371592566763006, 0.53693911509304526, 0.53807369166195751, 0.53787331296414842, 0.53755220067474463, 0.53773441704983516, 0.53751191239245233, 0.53805531309704691, 0.53854589558277455]
mean: 0.537871179596
std: 0.000517207465662
XGB_2hist_new results saved!


138

In [189]:
model_name = 'XGB_3hist_new'
clf = xgb.XGBClassifier(
        learning_rate=0.07,
        n_estimators=218,
        max_depth=3,
        min_child_weight=5,
        gamma=0.2,
        subsample=0.6,
        colsample_bytree=0.55,
        reg_alpha=0,
        reg_lambda=1.0,
        nthread=4,
        tree_method='hist',
        grow_policy='lossguide',
       
        seed=3333
    )
model = wrap_classifier(clf, use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53826489304966763, 0.53798224668341754, 0.53859057713016834, 0.53796218054044909, 0.53713616471547965, 0.53816194652215021, 0.53949251167125778, 0.53885003490168393, 0.5379599304908681, 0.53690351934188729]
mean: 0.538130400505
std: 0.000719521795838
15 Splits logloss:
[0.53774224973567541, 0.53970737251976419, 0.53815620637614103, 0.53932043082142866, 0.53854982834442389, 0.53876422947144043, 0.53733083212260335, 0.53731033780124215, 0.53880647705020823, 0.53792458517264041, 0.53835267348649596, 0.53811634844883038, 0.53848130716507636, 0.53848878099521003, 0.53877992630058102]
mean: 0.538388772387
std: 0.00064109119052
XGB_3hist_new results saved!


390

In [190]:
model_name = 'XGB_4_new'
clf = xgb.XGBClassifier(
        learning_rate=0.06,
        n_estimators=114,
        max_depth=5,
        min_child_weight=5,
        gamma=0.3,
        subsample=0.5,
        colsample_bytree=0.5,
        reg_alpha=9,
        reg_lambda=1.4,
        nthread=4,
       
        seed=4444
    )
model = wrap_classifier(clf, use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53818056194016639, 0.53785110441554917, 0.53880208941580432, 0.53759546457943341, 0.53795668659688778, 0.53925279130278592, 0.54023044240296769, 0.53988280982509873, 0.54026161465426914, 0.53826465705926907]
mean: 0.538827822219
std: 0.000963588347452
15 Splits logloss:
[0.53854497322901373, 0.53947848093208106, 0.53895008758437779, 0.53902409400081353, 0.53833429579064251, 0.53848167358888754, 0.53762720214877102, 0.53743138609205687, 0.53926151614600704, 0.53855956849349396, 0.53855676796305041, 0.53854136349438197, 0.53859907100083571, 0.53874987606189795, 0.53844995009473395]
mean: 0.538572687108
std: 0.000513296668634
XGB_4_new results saved!


390

In [191]:
model_name = 'LGB_1_new'
params = {
     'colsample_bytree': 0.95,
     'learning_rate': 0.02,
#      'max_depth': 5,
     'num_leaves': 2**5,
     'min_child_weight': 3,
     'n_estimators': 392,
     'reg_alpha': 0,
     'reg_lambda': 0.4,
     'subsample': 0.85,
       
    'nthread': 4,
    'seed': 8718,
    'silent': True,
}
model = wrap_classifier(lgb.LGBMClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53803517382758959, 0.53774908041682512, 0.5375514233839005, 0.53678010595518055, 0.53804411440895172, 0.53945955116548783, 0.53789086416460552, 0.53826605942225358, 0.5378012988810259, 0.53696597647804667]
mean: 0.53785436481
std: 0.000697501607386
15 Splits logloss:
[0.53785457323324815, 0.53928087049349438, 0.53845319420131743, 0.53860079115431658, 0.53828235020615101, 0.53875716276496943, 0.53727536846888879, 0.53711399587738318, 0.53888586974957875, 0.53853146374396033, 0.53815841380613227, 0.53815491175746843, 0.5373999884097046, 0.53854279105450342, 0.53872449324350813]
mean: 0.538267749211
std: 0.00060065063734
LGB_1_new results saved!


349

In [192]:
model_name = 'LGB_2_new'
clf = lgb.LGBMClassifier(
        learning_rate=0.07,
        n_estimators=218,
#         max_depth=3,
        num_leaves=2**3,
        min_child_weight=5,
        subsample=0.6,
        colsample_bytree=0.55,
        reg_alpha=0,
        reg_lambda=1.0,
        nthread=4,
       
        seed=2222
    )
model = wrap_classifier(clf, use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53823819723712585, 0.53815642365090577, 0.53903338333044326, 0.53716444061798696, 0.53793499289340041, 0.539295060892931, 0.53830716508138177, 0.53971412664835727, 0.53832091062189724, 0.53797215425003397]
mean: 0.538413685522
std: 0.000703820855275
15 Splits logloss:
[0.53800627020240532, 0.53946319594046499, 0.53869993823313034, 0.53945724590196265, 0.5387846122397838, 0.53848822445097111, 0.53738588985495028, 0.53712389938214244, 0.53905264536064612, 0.53843536284032678, 0.53822389449908004, 0.53857285256350707, 0.53760995842000092, 0.53884653050319098, 0.53886452627238512]
mean: 0.538467669778
std: 0.000671537659485
LGB_2_new results saved!


355

In [None]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
"height_div_ap_lo",
"active_fair",
"smoke_restored",
"alco_restored",
"height_mul_log_cholesterol",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
"ap_hi_mul_weight",
]

In [None]:
model_name = 'XGB_5'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': 1,
    'random_state': 5555,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()

In [None]:
model_name = 'XGB_6hist'
params = {
     'colsample_bytree': 0.95,
     'gamma': 0.55,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 3,
     'n_estimators': 392,
     'reg_alpha': 0,
     'reg_lambda': 0.4,
     'subsample': 0.85,

    'tree_method': 'hist',
    'grow_policy': 'lossguide',
    
    'n_jobs': 4,
    'random_state': 6666,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()

In [None]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
"gluc",
# "active_fair",
"smoke_restored",
"alco_restored",
"active_restored",
# "height_mul_log_gluc",
"BMI",
"age_group",
# "cholesterol_div_log_age_group",
"MAP",
"ap_dif",
# "gluc_mul_log_age",
# "age_group_mul_log_ap_lo",
]
model_name = 'XGB_7'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': 1,
    'random_state': 5555,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()

In [None]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
# "gluc",
"active_fair",
"smoke_restored",
"alco_restored",
# "active_restored",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
]
mean_columns = [
    ['cholesterol','gluc','smoke_restored','active_restored'],
    ['ap_hi_group', 'age_group', 'gender'],
    ['gender','cholesterol','age_group']
]
model_name = 'XGB_9'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': 1,
    'random_state': 5555,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()

In [145]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
"gluc",
"active_fair",
"alco_fair",
"smoke_restored",
# "alco_restored",
# "active_restored",
"height_mul_log_gluc",
"BMI",
"BMI_3",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
    'ap_dif_2',
    'age_group_MAPX',
    'age_div_age_group',
    'age_mul_log_gluc',
    
]
mean_columns = [
]
model_name = 'XGB_11'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': 1,
    'random_state': 5555,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53702754166239408, 0.53732704219352057, 0.53762902438564697, 0.53598334690260996, 0.53677554472915723, 0.53921571647577682, 0.53853838956150135, 0.53842079784396368, 0.537217876242024, 0.53718332616262865]
mean: 0.537531860616
std: 0.000901907997232
15 Splits logloss:
[0.53755476690757842, 0.539016408252042, 0.53787276211735746, 0.53811473230875673, 0.53783332744985823, 0.53800176815919221, 0.53714927350392649, 0.53689691185082, 0.53816779236876888, 0.53817296514624646, 0.53753700531593396, 0.53799478836996217, 0.53818051950048118, 0.53767774554297687, 0.53840506153465029]
mean: 0.537905055222
std: 0.000494418715413
XGB_11 results saved!


49292

In [183]:
def wrap_classifier(clf, use_columns, mean_columns):
    fs = [("filter", ColumnsFilter(use_columns))]
    
    for i, cc in enumerate(mean_columns):
        fs.append(('mean_'+str(i), SmoothLikelihood4(cc, 0.5,
                                                     kf=StratifiedKFold(random_state=111111+i, n_splits=10, shuffle=True),
                                                     alpha=13,
                                                     seed=10+i,
                                                     std=0.0005
                                                   )))
    combined_features = FeatureUnion(fs)
    return Pipeline([("features", combined_features), ("model", clf)])

use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
"gluc",
"active_fair",
"smoke_restored",
"alco_restored",
# "active_restored",
# "height_mul_log_gluc",
"BMI",
"age_group",
# "cholesterol_div_log_gluc",
# "gluc_mul_log_age",
    'age_dif',
    "ap_hi_group_2"
]
mean_columns = [
#     ['age_group', 'gender', "smoke_restored", 'cholesterol'],
#     ['gender','cholesterol','age_group', 'gluc']
#      ['cholesterol','gluc','smoke_restored','active_restored'],
    ['error_group']
]
model_name = 'XGB_10'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': 1,
    'random_state': 1223,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.5372111269936507, 0.53788198172508528, 0.53787346234721201, 0.53726198306558748, 0.53691768028034792, 0.53922079779856802, 0.53837426917475084, 0.5380849575126494, 0.5379320815823645, 0.53847840810145864]
mean: 0.537923674858
std: 0.000646623545141
15 Splits logloss:
[0.53813554203563507, 0.53889735095061009, 0.53857056367113476, 0.53890790383783838, 0.53776885750623682, 0.53844837977151783, 0.53722770847681733, 0.53746262225863484, 0.53826124022830102, 0.53843488726658484, 0.53763134277824842, 0.53794951593272744, 0.53811147255095693, 0.53814386241510515, 0.53823092000612194]
mean: 0.538145477979
std: 0.000467471359279
XGB_10 results saved!


1490

In [None]:
# 10 folds logloss:
# [0.53738498425183678, 0.53677908088457116, 0.53770915332159341, 0.53642631845512079, 0.53676456015622709, 0.53878675389632313, 0.53836206743417025, 0.53744774297941589, 0.53755816713189852, 0.537097236617332]
# mean: 0.537431606513
# std: 0.000692157037285
# 15 Splits logloss:
# [0.53763639168405819, 0.53886063596233724, 0.53784681931617007, 0.53773258502195986, 0.53745225650551065, 0.53805173773726533, 0.53725377004735531, 0.53679796278192882, 0.53810787266315452, 0.53753800572366228, 0.53773746853348403, 0.53755873253622222, 0.53793157278471404, 0.53761289643780108, 0.53821744652580294]
# mean: 0.537755743617
# std: 0.000452249518864
# XGB_1 results saved!

# KERAS models

In [198]:
from keras.layers.core import Dense, Dropout
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential
from keras.optimizers import Adamax
import pandas as pd


class KerasModel(object):
    def __init__(self,
                 var_num,
                 epochs=70,
                 learn_rate=0.1,
                 config=None,
                 batch_size=512,
                 verbose=0,
                 validation_split=0.2,
                 loss="binary_crossentropy"):

        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        self.validation_split = validation_split
        
        self.model = Sequential()
        model = self.model
        
        if config is None:
            config =[(var_num, 0.0)]
        else:
            config = config.copy()
            
        n, dp = config.pop(0)

        model.add(Dense(n, input_dim=var_num, kernel_initializer='uniform'))
        model.add(LeakyReLU())
        if 0 < dp < 1:
            model.add(Dropout(dp))
        
        while config:
            n, dp = config.pop(0)
            model.add(Dense(n, kernel_initializer='uniform'))
            model.add(LeakyReLU())
            if 0 < dp < 1:
                model.add(Dropout(dp))


        model.add(Dense(1, activation='sigmoid'))
        opt = Adamax(lr=learn_rate)

        model.compile(loss=loss, optimizer=opt, metrics=['accuracy'])


    def fit(self, X, y, sample_weight=None, callbacks=[]):
        process_X = X.values if hasattr(X, 'iloc') else X
        process_y = y
        return self.model.fit(process_X, process_y, batch_size=self.batch_size,
                       epochs=self.epochs, verbose=self.verbose,
                       sample_weight=sample_weight,
                       callbacks=callbacks,
                       validation_split=self.validation_split,
                       shuffle=True)

    def predict_proba(self, X):
        process_x = X.values if hasattr(X, 'iloc') else X
        result  = self.model.predict(process_x)
        classone_probs = result
        classzero_probs = 1.0 - classone_probs
        return np.hstack((classzero_probs, classone_probs))
#         return result
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

In [None]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
# "gluc",
# "active_fair",
"smoke_restored",
"alco_restored",
"active_restored",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
]

X1 = X_train[use_columns]
X2 = X_test[use_columns]

X = pd.concat((X1,X2), axis=0)

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
scaler = StandardScaler().fit(X)
X1 = pd.DataFrame(scaler.transform(X1)) #.values
X2 = pd.DataFrame(scaler.transform(X2)) #.values

In [None]:
{'batch_size': 512, 'choice': {'dropout2': 0.2, 'layers': 'two', 'units2': 64}, 
 'dropout1': 0.2, 'epochs': 100, 'learning_rate': 0.01, 'units1': 64}

def create(x1, x2):
    config = [(64,0.2), (64,0.2)]
    return KerasModel(var_num=len(use_columns),
                   epochs=200,
                   learn_rate=0.001,
                   config=config,
                   batch_size=1024,
                   verbose=0,
                   validation_split=0.0)
utils.execute_model(None,
              X1,
              y_train,
              X2,
              model_name="KERAS_1",
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
              create_callback=create
             )

In [200]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
# "gluc",
# "active_fair",
"smoke_restored",
"alco_restored",
"active_restored",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
    "age_dif"
]

X1 = X_train[use_columns]
X2 = X_test[use_columns]

X = pd.concat((X1,X2), axis=0)

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(X)
X1 = pd.DataFrame(scaler.transform(X1)) #.values
X2 = pd.DataFrame(scaler.transform(X2)) #.values

In [201]:
def create(x1, x2):
    config = [(64,0.075), (64,0.025)]
    return KerasModel(var_num=len(use_columns),
                   epochs=200,
                   learn_rate=0.001,
                   config=config,
                   batch_size=1024,
                   verbose=0,
                   validation_split=0.0)
utils.execute_model(None,
              X1,
              y_train,
              X2,
              model_name="KERAS_2_new",
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
              create_callback=create
             )
# 10 folds logloss:
# [0.53965103053793229, 0.53896566715560834, 0.54138009090647121, 0.53903986118884439, 0.53730151855359221, 0.54045185878077562, 0.54030395891497984, 0.53968888114681612, 0.54007287932858927, 0.54085279261079078]
# mean: 0.539770853912
# std: 0.00109090757361
# 15 Splits logloss:
# [0.54073159602319909, 0.54097557777459071, 0.54112020575538988, 0.53956214497673016, 0.54038192202266133, 0.54060559865353364, 0.53947380368367193, 0.54109671255476066, 0.54037745703960816, 0.54021880053285332, 0.54036240602399976, 0.53981261554897009, 0.5410725153582836, 0.53928201454735936, 0.53999037307128306]
# mean: 0.540337582904
# std: 0.000590992843301
# KERAS_2 results saved!


10 folds logloss:
[0.53941655247960096, 0.53948285905635651, 0.5410106729314389, 0.53912962744409887, 0.53698305462716001, 0.5403335904846005, 0.54010221090437982, 0.53951386144785773, 0.54022708162887678, 0.54065412124808676]
mean: 0.539685363225
std: 0.00106472303221
15 Splits logloss:
[0.54007965267547187, 0.54125255258470062, 0.54130464466328598, 0.53925078093961232, 0.54033351755230907, 0.54033504540224875, 0.53899051997952518, 0.54045933061341445, 0.54048469060233661, 0.54023878135549885, 0.54058610645051863, 0.53928027534440515, 0.54102960345102469, 0.53928483830747154, 0.54022886490688793]
mean: 0.540209280322
std: 0.000702974350206
KERAS_2_new results saved!


(0.53968536322524574, 0.54020928032191406)

In [208]:
models = ['KERAS_2_new', 'XGB_11',
#           'XGB_1',
#           'XGB_2hist',
#           'XGB_3hist',
#           'LGB_1',
          'XGB_1_new','XGB_2hist_new','XGB_3hist_new','LGB_1_new',
          'XGB_5','XGB_7','XGB_9',
#           'XGB_10'
         ]#,'XGB_6hist'
result = utils.merge_models(models, method='mean')
pd.DataFrame(result).to_csv('merged_models.csv', index=False, header=False, sep=';')


KERAS_2_new
0.539685175486	0.540209280363	0.539685175486	0.540209280363

XGB_11
0.53753164306	0.537905055126	0.5376548215	0.537953317537

XGB_1_new
0.537388013254	0.537627564789	0.537305646979	0.53755126205

XGB_2hist_new
0.537400099099	0.537871179589	0.5371744248	0.537449710497

XGB_3hist_new
0.538130456162	0.538388772384	0.537172995316	0.537434991092

LGB_1_new
0.537854351958	0.538267749211	0.537139391736	0.537409443804

XGB_5
0.537530269381	0.537988954721	0.537112881679	0.537399602515

XGB_7
0.537807145189	0.53821442309	0.537109283787	0.537398921549

XGB_9
0.538012812474	0.53827815187	0.537076838277	0.53734943096


In [None]:
# 0.537098245519	0.537377737357 = 0.5427524 # 'KERAS_2', 'XGB_11', 'XGB_1_new','XGB_2hist_new','XGB_3hist_new','LGB_1_new','XGB_5','XGB_7','XGB_9',
# 0.537199897152	0.537483979535 = 0.5429358 # ['KERAS_2', 'XGB_11','XGB_1','XGB_2hist','XGB_3hist','LGB_1','XGB_5','XGB_7','XGB_9']
# 0.537250398407	0.537528540545 = 0.5429539 # ['less_KERAS_1', 'less_clean_XGB_1','less_clean_XGB_2hist', 'less_clean_XGB_3hist','less_clean_LGB_1','less_clean_XGB_5','less_clean_XGB_7','less_clean_XGB_9']
# 0.537214504856	0.537510747850 = 0.5430159 # ['KERAS_1', 'XGB_1','XGB_2hist','XGB_3hist','LGB_1','XGB_5','XGB_7','XGB_9']
# 0.537275189857	0.537591079712 = 0.5430573 # ['KERAS_1', 'XGB_1','XGB_2hist','XGB_3hist','LGB_1','XGB_5']
# 0.537309757757	0.537609167308 = 0.5430907 # ['KERAS_1', 'XGB_1','XGB_2hist','XGB_3hist','LGB_1']
############################################################
# 0.537557163108	0.537699150119 = 0.5428822 # + keras
# 0.537519393165	0.537619764695 = 0.5432057 * new log features

# 0.538126862768	0.538047373071 = 0.5431843
# 0.538084892954	0.538027778903 = 0.5433475
# 0.538157706723	0.538186899005 = 0.5430039 # без стасо-колонок
# 0.537450021945	0.537849282831 = 0.5434434
# 0.537513993512	0.537749328355 = 0.5433418

# 0.537790804547	0.538014148853 = 0.5438901 # overfit full

# 0.537368545033                   = 0.5428417
# 0.537561506209                   = 0.5432193
# 0.537449005194                   = 0.5428384


# 0.537727687638	0.537773366433 = 0.5437636
# 0.537662820586	0.538506869394 = 0.5433681
# 0.537639730851	0.538322959718 = 0.5436416 # overfit due to smoke leak
# 0.537723242359	0.538418956272 = 0.5433819
# 0.537697918691	0.538454608940 = 0.5435035
# 0.537797383673	0.538468964022 = 0.5433098
# 0.537998452494	0.538697384187 = 0.5433633
# 0.538141859286	0.538727942758 = 0.5434983


#--------------
# 0.5430039

In [213]:
from itertools import combinations

models = [
          'KERAS_1', 'XGB_1', 'XGB_2hist','XGB_3hist', 'LGB_1', 'XGB_5', 'XGB_7', 'XGB_9',
          'KERAS_2_new','XGB_11',
          'less_KERAS_1', 
          'XGB_1_new','XGB_2hist_new','XGB_3hist_new','LGB_1_new',
          'XGB_5','XGB_7','XGB_9',
#           'less_clean_XGB_1','less_clean_XGB_2hist',
#           'less_clean_XGB_3hist','less_clean_LGB_1','less_clean_XGB_5','less_clean_XGB_7','less_clean_XGB_9'
]
result = []
for i in range(9, 11):
#     print(len(list(combinations(models, i))))
#     continue
    for comb in combinations(models, i):
        score1, score2 = utils.get_merge_score(comb, method='mean')
        score = (score1 + score2) / 2.0
        result.append((score, comb))
        print(score, comb)

0.537344374554 ('KERAS_1', 'XGB_1', 'XGB_2hist', 'XGB_3hist', 'LGB_1', 'XGB_5', 'XGB_7', 'XGB_9', 'KERAS_2_new')
0.537347410595 ('KERAS_1', 'XGB_1', 'XGB_2hist', 'XGB_3hist', 'LGB_1', 'XGB_5', 'XGB_7', 'XGB_9', 'XGB_11')
0.53741059484 ('KERAS_1', 'XGB_1', 'XGB_2hist', 'XGB_3hist', 'LGB_1', 'XGB_5', 'XGB_7', 'XGB_9', 'less_KERAS_1')
0.537317839747 ('KERAS_1', 'XGB_1', 'XGB_2hist', 'XGB_3hist', 'LGB_1', 'XGB_5', 'XGB_7', 'XGB_9', 'XGB_1_new')
0.537317948572 ('KERAS_1', 'XGB_1', 'XGB_2hist', 'XGB_3hist', 'LGB_1', 'XGB_5', 'XGB_7', 'XGB_9', 'XGB_2hist_new')
0.537346353574 ('KERAS_1', 'XGB_1', 'XGB_2hist', 'XGB_3hist', 'LGB_1', 'XGB_5', 'XGB_7', 'XGB_9', 'XGB_3hist_new')
0.537323547629 ('KERAS_1', 'XGB_1', 'XGB_2hist', 'XGB_3hist', 'LGB_1', 'XGB_5', 'XGB_7', 'XGB_9', 'LGB_1_new')
0.537365759321 ('KERAS_1', 'XGB_1', 'XGB_2hist', 'XGB_3hist', 'LGB_1', 'XGB_5', 'XGB_7', 'XGB_9', 'XGB_5')
0.537375323921 ('KERAS_1', 'XGB_1', 'XGB_2hist', 'XGB_3hist', 'LGB_1', 'XGB_5', 'XGB_7', 'XGB_9', 'XGB_7')


KeyboardInterrupt: 

In [None]:
# sorted(result)[:10]