In [20]:
import pandas as pd
import numpy as np

import random

import xgboost as xgb
import lightgbm as lgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns
import gc
import utils

plt.style.use('ggplot')
%matplotlib inline

# import importlib
# utils = importlib.reload(utils)

In [None]:
# loo = pd.read_csv('loo.csv', sep=';', header=None)
# loo.columns=['loo']
# loo['target']=y_train
# loo_loss = loo.apply(lambda x: log_loss([x['target']], [x['loo']], labels=[1, 0]), axis=1)
# pd.DataFrame(loo_loss).to_csv('loo_loss.csv', index=False, header=False, sep=';')

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from utils import SmoothLikelihood,SmoothLikelihood2,SmoothLikelihood3,SmoothLikelihood4, ColumnsFilter

def wrap_classifier(clf, use_columns, mean_columns):
    fs = [("filter", ColumnsFilter(use_columns))]
    
    for i, cc in enumerate(mean_columns):
        fs.append(('mean_'+str(i), SmoothLikelihood4(cc, 0.5,
                                                     kf=StratifiedKFold(random_state=111111+i, n_splits=20, shuffle=True),
                                                     alpha=13,
                                                     seed=10+i,
                                                     std=0.0013)))
    combined_features = FeatureUnion(fs)
    return Pipeline([("features", combined_features), ("model", clf)])

In [25]:
train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';', na_values='None')

train = utils.clean_data(train)
test = utils.clean_data(test)

train = utils.new_features(train)
test = utils.new_features(test)

X_train = train.drop([ 'cardio'], axis=1)
y_train = train['cardio'].values.ravel()
X_test = test.drop([], axis=1)

# ccc = ['age', 'age_group', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'BMI', 'MAP']
# def new_cols(data):    
#     for col1 in ccc:
#         data[col1 + '_log'] = np.log(data[col1] + 1.1)
#         for col2 in ccc:
#             data['%s_mul_%s' % (col1, col2)] = data[col1] * data[col2]
#             data['%s_mul_log_%s' % (col1, col2)] = data[col1] * np.log(data[col2] + 1)
#             data['%s_div_log_%s' % (col1, col2)] = data[col1] / (np.log(data[col2] + 1) + 1)

#             if col2 == col1:
#                 continue

#             data['%s_div_%s' % (col1, col2)] = data[col1] / (data[col2] + 1)

# new_cols(X_train)
# new_cols(X_test)

In [26]:
loo_loss = pd.read_csv('loo_loss.csv', sep=';', header=None)
strat = pd.qcut(loo_loss, 20, labels=False).astype(str)
strat = np.hstack((strat, y_train.reshape((-1,1))))
strat =np.apply_along_axis(lambda d: str(d[0]) + '_' + str(d[1]), 1, strat)

In [None]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
# "gluc",
"active_fair",
"smoke_restored",
"alco_restored",
# "active_restored",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
]
mean_columns = []

In [None]:
model_name = 'XGB_1'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': 1,
    'random_state': 1111,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()

In [None]:
model_name = 'XGB_2hist'
params = {
     'colsample_bytree': 0.95,
     'gamma': 0.55,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 3,
     'n_estimators': 392,
     'reg_alpha': 0,
     'reg_lambda': 0.4,
     'subsample': 0.85,

    'tree_method': 'hist',
    'grow_policy': 'lossguide',
    
    'n_jobs': 4,
    'random_state': 2222,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()

In [None]:
model_name = 'XGB_3hist'
clf = xgb.XGBClassifier(
        learning_rate=0.07,
        n_estimators=218,
        max_depth=3,
        min_child_weight=5,
        gamma=0.2,
        subsample=0.6,
        colsample_bytree=0.55,
        reg_alpha=0,
        reg_lambda=1.0,
        nthread=4,
        tree_method='hist',
        grow_policy='lossguide',
       
        seed=3333
    )
model = wrap_classifier(clf, use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()

In [None]:
model_name = 'XGB_4'
clf = xgb.XGBClassifier(
        learning_rate=0.06,
        n_estimators=114,
        max_depth=5,
        min_child_weight=5,
        gamma=0.3,
        subsample=0.5,
        colsample_bytree=0.5,
        reg_alpha=9,
        reg_lambda=1.4,
        nthread=4,
       
        seed=4444
    )
model = wrap_classifier(clf, use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()

In [None]:
model_name = 'LGB_1'
params = {
     'colsample_bytree': 0.95,
     'learning_rate': 0.02,
#      'max_depth': 5,
     'num_leaves': 2**5,
     'min_child_weight': 3,
     'n_estimators': 392,
     'reg_alpha': 0,
     'reg_lambda': 0.4,
     'subsample': 0.85,
       
    'nthread': 4,
    'seed': 8718,
    'silent': True,
}
model = wrap_classifier(lgb.LGBMClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()

In [None]:
model_name = 'LGB_2'
clf = lgb.LGBMClassifier(
        learning_rate=0.07,
        n_estimators=218,
#         max_depth=3,
        num_leaves=2**3,
        min_child_weight=5,
        subsample=0.6,
        colsample_bytree=0.55,
        reg_alpha=0,
        reg_lambda=1.0,
        nthread=4,
       
        seed=2222
    )
model = wrap_classifier(clf, use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()

# KERAS models

In [30]:
from keras.layers.core import Dense, Dropout
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential
from keras.optimizers import Adamax
import pandas as pd


class KerasModel(object):
    def __init__(self,
                 var_num,
                 epochs=70,
                 learn_rate=0.1,
                 config=None,
                 batch_size=512,
                 verbose=0,
                 validation_split=0.2,
                 loss="binary_crossentropy"):

        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        self.validation_split = validation_split
        
        self.model = Sequential()
        model = self.model
        
        if config is None:
            config =[(var_num, 0.0)]
        else:
            config = config.copy()
            
        n, dp = config.pop(0)

        model.add(Dense(n, input_dim=var_num, kernel_initializer='uniform'))
        model.add(LeakyReLU())
        if 0 < dp < 1:
            model.add(Dropout(dp))
        
        while config:
            n, dp = config.pop(0)
            model.add(Dense(n, kernel_initializer='uniform'))
            model.add(LeakyReLU())
            if 0 < dp < 1:
                model.add(Dropout(dp))


        model.add(Dense(1, activation='sigmoid'))
        opt = Adamax(lr=learn_rate)

        model.compile(loss=loss, optimizer=opt, metrics=['accuracy'])


    def fit(self, X, y, sample_weight=None, callbacks=[]):
        process_X = X.values if hasattr(X, 'iloc') else X
        process_y = y
        return self.model.fit(process_X, process_y, batch_size=self.batch_size,
                       epochs=self.epochs, verbose=self.verbose,
                       sample_weight=sample_weight,
                       callbacks=callbacks,
                       validation_split=self.validation_split,
                       shuffle=True)

    def predict_proba(self, X):
        process_x = X.values if hasattr(X, 'iloc') else X
        result  = self.model.predict(process_x)
        classone_probs = result
        classzero_probs = 1.0 - classone_probs
        return np.hstack((classzero_probs, classone_probs))
#         return result
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

Using Theano backend.


In [33]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
# "gluc",
# "active_fair",
"smoke_restored",
"alco_restored",
"active_restored",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
]

X1 = X_train[use_columns]
X2 = X_test[use_columns]

X = pd.concat((X1,X2), axis=0)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
X1 = pd.DataFrame(scaler.transform(X1)) #.values
X2 = pd.DataFrame(scaler.transform(X2)) #.values

In [34]:
{'batch_size': 512, 'choice': {'dropout2': 0.2, 'layers': 'two', 'units2': 64}, 
 'dropout1': 0.2, 'epochs': 100, 'learning_rate': 0.01, 'units1': 64}

def create(x1, x2):
    config = [(64,0.2), (64,0.2)]
    return KerasModel(var_num=len(use_columns),
                   epochs=200,
                   learn_rate=0.001,
                   config=config,
                   batch_size=1024,
                   verbose=0,
                   validation_split=0.0)
utils.execute_model(None,
              X1,
              y_train,
              X2,
              model_name="KERAS_1",
              n_splits=15,
              n_folds=10,
              stratification_groups=strat,
              create_callback=create
             )


10 folds logloss:
[0.5398176975716229, 0.53902177177050103, 0.54205365875576927, 0.53952215795429959, 0.53682056784376031, 0.54021127974049987, 0.5398021075327597, 0.53969405788531211, 0.54014786682364979, 0.54143110054802046]
mean: 0.539852226643
std: 0.00132566004865


KeyboardInterrupt: 

In [11]:
models = ['KERAS_1', 'XGB_1','XGB_2hist','XGB_3hist','XGB_4','LGB_1','LGB_2',]
result = utils.merge_models(models, method='mean')
pd.DataFrame(result).to_csv('merged_models.csv', index=False, header=False, sep=';')


KERAS_1
0.541188333484	0.541405679164	0.541188333484	0.541405679164

model1-
0.537495498978	0.537706445895	0.53815759386	0.538294889252

model2-
0.537771013645	0.538051774932	0.537698317632	0.53784253395

model3-
0.538029709996	0.538415276079	0.53753730697	0.53771354451

model5-
0.538296979367	0.538547578947	0.537508023582	0.537696880375

model1
0.538080079502	0.538009376104	0.537479453883	0.537609018688

model3
0.538702687693	0.539402696417	0.53741390216	0.770362462535

model1+
0.537856175071	0.538141755107	0.537347025354	0.544398539016

model2+
0.538247526491	0.53857718903	0.537341094639	0.543130882008


In [None]:

############################################################
# 0.537557163108	0.537699150119 = 0.5428822 # + keras
# 0.537519393165	0.537619764695 = 0.5432057 * new log features

# 0.538126862768	0.538047373071 = 0.5431843
# 0.538084892954	0.538027778903 = 0.5433475
# 0.538157706723	0.538186899005 = 0.5430039 # без стасо-колонок
# 0.537450021945	0.537849282831 = 0.5434434
# 0.537513993512	0.537749328355 = 0.5433418

# 0.537790804547	0.538014148853 = 0.5438901 # overfit full

# 0.537368545033                   = 0.5428417
# 0.537561506209                   = 0.5432193
# 0.537449005194                   = 0.5428384


# 0.537727687638	0.537773366433 = 0.5437636
# 0.537662820586	0.538506869394 = 0.5433681
# 0.537639730851	0.538322959718 = 0.5436416 # overfit due to smoke leak
# 0.537723242359	0.538418956272 = 0.5433819
# 0.537697918691	0.538454608940 = 0.5435035
# 0.537797383673	0.538468964022 = 0.5433098
# 0.537998452494	0.538697384187 = 0.5433633
# 0.538141859286	0.538727942758 = 0.5434983


#--------------
# 0.5430039