In [30]:
import pandas as pd
import numpy as np

import random

import xgboost as xgb
import lightgbm as lgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns
import gc
import utils

plt.style.use('ggplot')
%matplotlib inline

import importlib
utils = importlib.reload(utils)

In [31]:
# loo = pd.read_csv('loo.csv', sep=';', header=None)
# loo.columns=['loo']
# loo['target']=y_train
# loo_loss = loo.apply(lambda x: log_loss([x['target']], [x['loo']], labels=[1, 0]), axis=1)
# pd.DataFrame(loo_loss).to_csv('loo_loss.csv', index=False, header=False, sep=';')

In [32]:
from sklearn.pipeline import Pipeline, FeatureUnion
from utils import SmoothLikelihood,SmoothLikelihood2,SmoothLikelihood3,SmoothLikelihood4, ColumnsFilter

def wrap_classifier(clf, use_columns, mean_columns):
    fs = [("filter", ColumnsFilter(use_columns))]
    
    for i, cc in enumerate(mean_columns):
        fs.append(('mean_'+str(i), SmoothLikelihood4(cc, 0.5,
                                                     kf=StratifiedKFold(random_state=111111+i, n_splits=20, shuffle=True),
                                                     alpha=13,
                                                     seed=10+i,
                                                     std=0.0003)))
    combined_features = FeatureUnion(fs)
    return Pipeline([("features", combined_features), ("model", clf)])

In [37]:
train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';', na_values='None')

train = utils.clean_data(train, light_clean=True)
test = utils.clean_data(test, light_clean=True)

train = utils.new_features(train)
test = utils.new_features(test)

X_train = train.drop([ 'cardio'], axis=1)
y_train = train['cardio'].values.ravel()
X_test = test.drop([], axis=1)

data = pd.concat((train.drop('cardio', axis=1), test), axis=0)
dic = data.groupby('age_group_orig')['age'].mean().to_dict()
X_train['age_dif'] = X_train[['age_group_orig', 'age']].apply(lambda x: x['age'] - dic[x['age_group_orig']], axis=1)
X_test['age_dif'] = X_test[['age_group_orig', 'age']].apply(lambda x: x['age'] - dic[x['age_group_orig']], axis=1)

In [38]:
loo_loss = pd.read_csv('loo_loss.csv', sep=';', header=None)
strat = pd.qcut(loo_loss, 20, labels=False).astype(str)
strat = np.hstack((strat, y_train.reshape((-1,1))))
strat =np.apply_along_axis(lambda d: str(d[0]) + '_' + str(d[1]), 1, strat)

In [39]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
# "gluc",
"active_fair",
"smoke_restored",
"alco_restored",
# "active_restored",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
    'age_dif',
    'ap_lo_mul_log_ap_hi'
]
mean_columns = []

In [40]:
model_name = 'less_clean_XGB_1'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': 1,
    'random_state': 1111,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()
# 10 folds logloss:
# [0.53750662927884696, 0.53682361733818373, 0.53772246391589884, 0.53641438649073925, 0.53681142146966887, 0.5386806105270695, 0.53851692181918176, 0.53765561764272107, 0.53761915585305875, 0.5372220835395054]
# mean: 0.537497290787
# std: 0.000685804276892
# 15 Splits logloss:
# [0.53752077755172334, 0.53884876460706199, 0.53793286362964488, 0.53782186562816303, 0.53746432078682949, 0.53795492306129922, 0.53722755332912009, 0.53677501827691287, 0.53817421751389549, 0.53748560265362977, 0.53772447056482942, 0.53752604069161625, 0.53790496196766335, 0.53766607886224627, 0.53821279794039822]
# mean: 0.537749350471
# std: 0.000461265207885
# less_clean_XGB_1 results saved!


10 folds logloss:
[0.53741734668921193, 0.53731271895681432, 0.53709940435306536, 0.5360551803482928, 0.53663742854255403, 0.53921646996837669, 0.53782424343931556, 0.53759149644189574, 0.53764000608122842, 0.53662160706381201]
mean: 0.537341590188
std: 0.000814489392489
15 Splits logloss:
[0.53748771650148996, 0.53872671181832754, 0.53799987039803754, 0.53800371278751469, 0.5374988112907324, 0.53794360463622781, 0.53675037235997258, 0.5367343247710239, 0.53788853202902132, 0.53750437614179791, 0.53746412236917585, 0.53747436232776158, 0.53775120756367134, 0.53784639235852016, 0.5379987562392794]
mean: 0.537671524906
std: 0.000482839930105
less_clean_XGB_1 results saved!


57904

In [8]:
model_name = 'less_clean_XGB_2hist'
params = {
     'colsample_bytree': 0.95,
     'gamma': 0.55,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 3,
     'n_estimators': 392,
     'reg_alpha': 0,
     'reg_lambda': 0.4,
     'subsample': 0.85,

    'tree_method': 'hist',
    'grow_policy': 'lossguide',
    
    'n_jobs': 4,
    'random_state': 2222,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.537681674407812, 0.5374036427870823, 0.53780396278918163, 0.53644202404198837, 0.53698163481322481, 0.53877963627714331, 0.53837328512392357, 0.53747535489649723, 0.53783993328185542, 0.53739387889157031]
mean: 0.537617502731
std: 0.000625471819756
15 Splits logloss:
[0.53803666369288805, 0.53900778780487324, 0.53805946787837011, 0.5382160618031131, 0.53763524591018041, 0.53839894471690064, 0.53769013202057359, 0.53691560897408497, 0.53829935738825729, 0.53818675257212345, 0.53793615029370856, 0.53786078193100795, 0.53780439842483474, 0.53810153918119064, 0.53862016495601051]
mean: 0.538051270503
std: 0.000459928887534
less_clean_XGB_2hist results saved!


128

In [9]:
model_name = 'less_clean_XGB_3hist'
clf = xgb.XGBClassifier(
        learning_rate=0.07,
        n_estimators=218,
        max_depth=3,
        min_child_weight=5,
        gamma=0.2,
        subsample=0.6,
        colsample_bytree=0.55,
        reg_alpha=0,
        reg_lambda=1.0,
        nthread=4,
        tree_method='hist',
        grow_policy='lossguide',
       
        seed=3333
    )
model = wrap_classifier(clf, use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53794848962274122, 0.53840454240346336, 0.53911028899306412, 0.53780152227009459, 0.53700387875268729, 0.53811086092129412, 0.53910745269471216, 0.53869158948141072, 0.53839149058581515, 0.53767008704289487]
mean: 0.538224020277
std: 0.000625139559729
15 Splits logloss:
[0.53774433149690071, 0.53952158135477279, 0.53874664456698862, 0.53915631803515407, 0.53817723964216813, 0.53879480247669631, 0.53735064151271117, 0.53694645060394841, 0.53912702845604643, 0.53822399018305755, 0.53823582653612612, 0.53851612334463395, 0.53823700841729127, 0.53840360449760083, 0.53894271681377925]
mean: 0.538408287196
std: 0.000669305790344
less_clean_XGB_3hist results saved!


380

In [10]:
model_name = 'less_clean_LGB_1'
params = {
     'colsample_bytree': 0.95,
     'learning_rate': 0.02,
#      'max_depth': 5,
     'num_leaves': 2**5,
     'min_child_weight': 3,
     'n_estimators': 392,
     'reg_alpha': 0,
     'reg_lambda': 0.4,
     'subsample': 0.85,
       
    'nthread': 4,
    'seed': 8718,
    'silent': True,
}
model = wrap_classifier(lgb.LGBMClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53822546165928242, 0.53726557791675777, 0.53813297033292207, 0.53661041737825366, 0.53779583477187076, 0.54007333984716932, 0.53824334797007334, 0.53820414949704376, 0.53863375499703448, 0.53748342235480762]
mean: 0.538066827673
std: 0.000872008315546
15 Splits logloss:
[0.53824024667981485, 0.5394249547142218, 0.53812800184939402, 0.53857082085055563, 0.53824170322226716, 0.53874019614054713, 0.5377725676434808, 0.53725665104326037, 0.53895406767439147, 0.53880727791822314, 0.53825700742261584, 0.53877016813931389, 0.53760391762484561, 0.53860657540014034, 0.53877083563186834]
mean: 0.53840966613
std: 0.000545515123343
less_clean_LGB_1 results saved!


339

In [11]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
"height_div_ap_lo",
"active_fair",
"smoke_restored",
"alco_restored",
"height_mul_log_cholesterol",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
"ap_hi_mul_weight",
]

In [12]:
model_name = 'less_clean_XGB_5'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': 1,
    'random_state': 5555,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53757817234046468, 0.53642089334171095, 0.53756244257296049, 0.53637306485601355, 0.53703105920654515, 0.53866482434383978, 0.53879766958755826, 0.53806448356140324, 0.53709252253974182, 0.53777785896916253]
mean: 0.537536299132
std: 0.000790812270729
15 Splits logloss:
[0.53777840967139323, 0.53906536858262766, 0.53821866284630127, 0.53833766663376059, 0.53788316612451204, 0.53826855468750001, 0.53743274176209455, 0.5369712367485322, 0.53869985720213676, 0.53776989293417765, 0.53775621078642355, 0.53795500151866249, 0.53828220149768247, 0.53765067268349231, 0.53792258344360055]
mean: 0.537999481808
std: 0.000491508979978
less_clean_XGB_5 results saved!


380

In [13]:
model_name = 'less_clean_XGB_6hist'
params = {
     'colsample_bytree': 0.95,
     'gamma': 0.55,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 3,
     'n_estimators': 392,
     'reg_alpha': 0,
     'reg_lambda': 0.4,
     'subsample': 0.85,

    'tree_method': 'hist',
    'grow_policy': 'lossguide',
    
    'n_jobs': 4,
    'random_state': 6666,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53783426029783155, 0.53730454837956509, 0.53757442879888706, 0.53648401374802823, 0.53742934435363576, 0.53891227691794941, 0.53858347490109859, 0.53775422021290686, 0.53742341372307445, 0.53810254381192968]
mean: 0.537740252514
std: 0.000649420886648
15 Splits logloss:
[0.53801744681579011, 0.53917585243692712, 0.53825795536488297, 0.53867842462738713, 0.53800474340850046, 0.53856530982761508, 0.53778419572318947, 0.53720672703206185, 0.53883805142613572, 0.53822902200920952, 0.53819271245306088, 0.53872584251145872, 0.53851081431293413, 0.53828595760509013, 0.53840662245709625]
mean: 0.538325311867
std: 0.000457863142863
less_clean_XGB_6hist results saved!


380

In [14]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
"gluc",
# "active_fair",
"smoke_restored",
"alco_restored",
"active_restored",
# "height_mul_log_gluc",
"BMI",
"age_group",
# "cholesterol_div_log_age_group",
"MAP",
"ap_dif",
# "gluc_mul_log_age",
# "age_group_mul_log_ap_lo",
]
model_name = 'less_clean_XGB_7'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': 1,
    'random_state': 5555,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53783434031589394, 0.53711212534478159, 0.53763021268741329, 0.53673781867084536, 0.53626783718895155, 0.5388187945284717, 0.53852322454475721, 0.53844219548127847, 0.53769695753646285, 0.53907871918712025]
mean: 0.537814222549
std: 0.00087155374792
15 Splits logloss:
[0.5384691365061417, 0.53911952030463584, 0.53844912255253818, 0.53853678701445462, 0.53798074507216609, 0.53820041192491497, 0.53779381840995377, 0.53785413937483517, 0.53826854386491085, 0.53858630601227997, 0.53732406100754937, 0.53830437463167169, 0.53827298504903553, 0.5378959310109771, 0.53847693627061588]
mean: 0.538235521267
std: 0.000409370360108
less_clean_XGB_7 results saved!


128

In [15]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
# "gluc",
"active_fair",
"smoke_restored",
"alco_restored",
# "active_restored",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
]
mean_columns = [
    ['cholesterol','gluc','smoke_restored','active_restored'],
    ['ap_hi_group', 'age_group', 'gender'],
    ['gender','cholesterol','age_group']
]
model_name = 'less_clean_XGB_9'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': 1,
    'random_state': 5555,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53678117155269223, 0.53749002082402308, 0.53880420271988838, 0.53788791030522687, 0.53688843970436917, 0.53889757402852068, 0.53872159459945312, 0.53948858685566536, 0.53759171754554413, 0.53802164856520251]
mean: 0.53805728667
std: 0.000855398636654
15 Splits logloss:
[0.53883947741541838, 0.53907805475308779, 0.53885972528337012, 0.53833372443861194, 0.53816895376305496, 0.53805926224110379, 0.53751982665895703, 0.53728270039689685, 0.53832711018763835, 0.53812446051623142, 0.53818328344981581, 0.53891519370568652, 0.53856118834675071, 0.5379699590323227, 0.53798829974145401]
mean: 0.538280747995
std: 0.000491386507105
less_clean_XGB_9 results saved!


1799

# KERAS models

In [16]:
from keras.layers.core import Dense, Dropout
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential
from keras.optimizers import Adamax
import pandas as pd


class KerasModel(object):
    def __init__(self,
                 var_num,
                 epochs=70,
                 learn_rate=0.1,
                 config=None,
                 batch_size=512,
                 verbose=0,
                 validation_split=0.2,
                 loss="binary_crossentropy"):

        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        self.validation_split = validation_split
        
        self.model = Sequential()
        model = self.model
        
        if config is None:
            config =[(var_num, 0.0)]
        else:
            config = config.copy()
            
        n, dp = config.pop(0)

        model.add(Dense(n, input_dim=var_num, kernel_initializer='uniform'))
        model.add(LeakyReLU())
        if 0 < dp < 1:
            model.add(Dropout(dp))
        
        while config:
            n, dp = config.pop(0)
            model.add(Dense(n, kernel_initializer='uniform'))
            model.add(LeakyReLU())
            if 0 < dp < 1:
                model.add(Dropout(dp))


        model.add(Dense(1, activation='sigmoid'))
        opt = Adamax(lr=learn_rate)

        model.compile(loss=loss, optimizer=opt, metrics=['accuracy'])


    def fit(self, X, y, sample_weight=None, callbacks=[]):
        process_X = X.values if hasattr(X, 'iloc') else X
        process_y = y
        return self.model.fit(process_X, process_y, batch_size=self.batch_size,
                       epochs=self.epochs, verbose=self.verbose,
                       sample_weight=sample_weight,
                       callbacks=callbacks,
                       validation_split=self.validation_split,
                       shuffle=True)

    def predict_proba(self, X):
        process_x = X.values if hasattr(X, 'iloc') else X
        result  = self.model.predict(process_x)
        classone_probs = result
        classzero_probs = 1.0 - classone_probs
        return np.hstack((classzero_probs, classone_probs))
#         return result
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

Using Theano backend.


In [17]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
# "gluc",
# "active_fair",
"smoke_restored",
"alco_restored",
"active_restored",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
]

X1 = X_train[use_columns]
X2 = X_test[use_columns]

X = pd.concat((X1,X2), axis=0)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
X1 = pd.DataFrame(scaler.transform(X1)) #.values
X2 = pd.DataFrame(scaler.transform(X2)) #.values

In [18]:
{'batch_size': 512, 'choice': {'dropout2': 0.2, 'layers': 'two', 'units2': 64}, 
 'dropout1': 0.2, 'epochs': 100, 'learning_rate': 0.01, 'units1': 64}

def create(x1, x2):
    config = [(64,0.2), (64,0.2)]
    return KerasModel(var_num=len(use_columns),
                   epochs=200,
                   learn_rate=0.001,
                   config=config,
                   batch_size=1024,
                   verbose=0,
                   validation_split=0.0)
utils.execute_model(None,
              X1,
              y_train,
              X2,
              model_name="less_KERAS_1",
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
              create_callback=create
             )


10 folds logloss:
[0.54010186852355924, 0.5389756124294105, 0.5422224340391163, 0.53965950358793857, 0.53684625128683128, 0.54024911674876808, 0.53980558129210743, 0.5396944318478758, 0.54021687020924092, 0.54141733697018979]
mean: 0.539918900694
std: 0.00135071700904
15 Splits logloss:
[0.5405580942484417, 0.541737956541546, 0.54160325897405193, 0.53933949688156801, 0.54073086427657735, 0.54070588151871091, 0.54016684151263461, 0.5398751308805354, 0.54038940654854695, 0.5407173610321645, 0.54002828889215981, 0.54062301767209453, 0.54065891653228371, 0.5395765451684239, 0.54039274100498069]
mean: 0.540473586779
std: 0.000625736102818
less_KERAS_1 results saved!


(0.53991890069350368, 0.54047358677898127)

In [29]:
models = [#'KERAS_1', 'XGB_1', 'XGB_2hist','XGB_3hist', 'LGB_1', 'XGB_5', 'XGB_7', 'XGB_9',
          'less_KERAS_1', 
          'less_clean_XGB_1','less_clean_XGB_2hist',
          'less_clean_XGB_3hist','less_clean_LGB_1','less_clean_XGB_5','less_clean_XGB_7','less_clean_XGB_9']#,'XGB_6hist'
result = utils.merge_models(models, method='mean')
pd.DataFrame(result).to_csv('merged_models_less.csv', index=False, header=False, sep=';')


less_KERAS_1
0.539918770309	0.540473586819	0.539918770309	0.540473586819

less_clean_XGB_1
0.537497121307	0.537749350414	0.53779701521	0.538104510181

less_clean_XGB_2hist
0.537617405472	0.538051270465	0.537458941722	0.537766451683

less_clean_XGB_3hist
0.538224017692	0.538408287285	0.537421902447	0.537702524975

less_clean_LGB_1
0.538066647494	0.53840966613	0.537360137058	0.537632215256

less_clean_XGB_5
0.53753602927	0.537999481829	0.537317560601	0.537611496467

less_clean_XGB_7
0.537813830938	0.538235521251	0.537298524697	0.53759884592

less_clean_XGB_9
0.538057010186	0.538280747852	0.537250398407	0.537528540545
