In [1]:
import gc

import pandas as pd
import numpy as np

import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.base import clone

import utils

In [2]:
# Base learners for the soft-voting ensemble `CLF`.
# Every model runs on 4 threads and has its own fixed seed.

# XGBoost, depth-5 trees, heavy row/column subsampling.
clf11 = xgb.XGBClassifier(
    n_estimators=335, learning_rate=0.06,
    max_depth=5, min_child_weight=5, gamma=0.3,
    subsample=0.5, colsample_bytree=0.5,
    reg_alpha=9, reg_lambda=1.4,
    nthread=4, seed=1111,
)

# XGBoost, depth-3 trees built with the histogram method and loss-guided growth.
clf22 = xgb.XGBClassifier(
    n_estimators=218, learning_rate=0.07,
    max_depth=3, min_child_weight=5, gamma=0.2,
    subsample=0.6, colsample_bytree=0.55,
    reg_alpha=0, reg_lambda=1.0,
    tree_method='hist', grow_policy='lossguide',
    nthread=4, seed=2222,
)

# LightGBM with only the sampling rates set explicitly.
# NOTE(review): depending on the LightGBM version, `subsample` may only take
# effect when `subsample_freq` > 0 - confirm against the installed release.
clf33 = lgb.LGBMClassifier(
    n_estimators=100, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8,
    nthread=4, seed=3333,
)

# Small XGBoost model (50 rounds), everything else left at library defaults.
clf44 = xgb.XGBClassifier(n_estimators=50, learning_rate=0.1, nthread=4, seed=4444)

# Larger XGBoost model (200 rounds) with 80% row and column sampling.
clf55 = xgb.XGBClassifier(
    n_estimators=200, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8,
    nthread=4, seed=5555,
)

# Soft voting over the five models; they are registered under the
# names '1'..'5' in the order listed here.
CLF = VotingClassifier(
    estimators=[
        (str(position), model)
        for position, model in enumerate([clf11, clf22, clf33, clf44, clf55], start=1)
    ],
    voting='soft',
)



In [3]:
# Shared cross-validation splitter: 10 stratified folds, shuffled with a fixed
# seed so the splits are reproducible between runs.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=74581)

In [4]:
# Load the raw data. In the test file the literal string 'None' marks a
# missing value, so it is parsed as NaN.
train = pd.read_csv('train_original.csv', sep=';')
test = pd.read_csv('test_original.csv', sep=';', na_values='None')

# '<col>_fair' starts out as a copy of the raw column in both frames.
# The train copies are cast to float up front because NaN is written into
# them below; this avoids relying on implicit int -> float upcasting during
# assignment, which recent pandas releases deprecate.
for col in ('smoke', 'alco', 'active'):
    train[col + '_fair'] = train[col].astype('float64')
    test[col + '_fair'] = test[col]

# Blank out a random share of each '<col>_fair' column in train.
# (column, share of rows to blank, seed) - each column is drawn independently.
row_positions = np.arange(len(train))
for col, hidden_share, seed in (
        ('alco_fair', 0.101, 7581),
        ('smoke_fair', 0.101, 3258),
        ('active_fair', 0.096, 7459),
):
    # train_test_split returns [kept, held_out]; only the held-out positions
    # are needed. Indexing the result avoids assigning to `_`/`__`, which
    # IPython uses for its output history.
    idx = train_test_split(row_positions, test_size=hidden_share, random_state=seed)[1]
    # The split yields row positions, so write positionally; this stays
    # correct even if the frame's index is not the default 0..n-1 range.
    train.iloc[idx, train.columns.get_loc(col)] = np.nan

# '<col>_restored' starts as a copy of '<col>_fair' (including its gaps);
# the gaps are filled by the models fitted in the following cells.
for col in ('smoke', 'alco', 'active'):
    train[col + '_restored'] = train[col + '_fair']
for col in ('smoke', 'alco', 'active'):
    test[col + '_restored'] = test[col + '_fair']

In [5]:
# Restore 'smoke': fit the ensemble on rows where the target is known, then
# predict the target for the rows where it is missing.
TARGET = 'smoke_restored'
features = ['age_group', 'gender', 'weight', 'ap_hi', 'ap_lo', 'cholesterol',
            'alco_fair', 'active_fair', 'gluc', 'BMI', 'ap_dif', 'MAP']

# Feature frame built from train (without the 'cardio' label) stacked on test.
X = utils.new_features(utils.clean_data(pd.concat((train.drop('cardio', axis=1), test), axis=0), more_clean=True))
known = X[TARGET].notnull()
y_train = X.loc[known, TARGET].values.ravel()
X_train = X.loc[known, features]

scores = cross_val_score(CLF, X_train, y_train, cv=kf)
# Reference value printed next to the CV score: the share of the more frequent
# class, i.e. the accuracy of always predicting that class.
# Assumes a 0/1 target - TODO confirm. Taking the max makes this independent
# of which class happens to be the majority.
positive_share = y_train.mean()
baseline = max(positive_share, 1 - positive_share)
print(TARGET, 'score:', np.mean(scores), baseline, 'std:', np.std(scores), sep='\t')

clf = clone(CLF)
clf.fit(X_train, y_train)

# NOTE(review): the model is fitted on the output of utils.clean_data /
# utils.new_features, but predictions are made on rows taken directly from
# `train` / `test` - confirm those frames carry the same feature columns and
# the same cleaning.
for frame in (train, test):
    idx = frame[TARGET].isnull()
    frame.loc[idx, TARGET] = clf.predict(frame.loc[idx, features])

# Earlier recorded run: score 0.921535053005, baseline 0.911857619577, std 0.00118175426621

smoke_restored	score:	0.921624037058	0.911857619577	std:	0.00119759592711


In [6]:
# Restore 'alco': fit the ensemble on rows where the target is known, then
# predict the target for the rows where it is missing.
TARGET = 'alco_restored'
features = ['age_group', 'gender', 'weight', 'ap_hi', 'ap_lo', 'cholesterol',
            'smoke_fair', 'active_fair', 'gluc', 'BMI', 'ap_dif', 'MAP']

# Feature frame built from train (without the 'cardio' label) stacked on test.
X = utils.new_features(utils.clean_data(pd.concat((train.drop('cardio', axis=1), test), axis=0), more_clean=True))
known = X[TARGET].notnull()
y_train = X.loc[known, TARGET].values.ravel()
X_train = X.loc[known, features]

scores = cross_val_score(CLF, X_train, y_train, cv=kf)
# Reference value printed next to the CV score: the share of the more frequent
# class, i.e. the accuracy of always predicting that class.
# Assumes a 0/1 target - TODO confirm. Taking the max makes this independent
# of which class happens to be the majority.
positive_share = y_train.mean()
baseline = max(positive_share, 1 - positive_share)
print(TARGET, 'score:', np.mean(scores), baseline, 'std:', np.std(scores), sep='\t')

clf = clone(CLF)
clf.fit(X_train, y_train)

# NOTE(review): the model is fitted on the output of utils.clean_data /
# utils.new_features, but predictions are made on rows taken directly from
# `train` / `test` - confirm those frames carry the same feature columns and
# the same cleaning.
for frame in (train, test):
    idx = frame[TARGET].isnull()
    frame.loc[idx, TARGET] = clf.predict(frame.loc[idx, features])

# Earlier recorded run: score 0.946006075991, baseline 0.946017197077, std 7.1756050826e-05

alco_restored	score:	0.945994954995	0.946017197077	std:	8.64069377288e-05


In [7]:
# Restore 'active': fit the ensemble on rows where the target is known, then
# predict the target for the rows where it is missing.
TARGET = 'active_restored'
features = ['age_group', 'gender', 'weight', 'ap_hi', 'ap_lo', 'cholesterol',
            'alco_fair', 'smoke_fair', 'gluc', 'BMI', 'ap_dif', 'MAP']

# Feature frame built from train (without the 'cardio' label) stacked on test.
X = utils.new_features(utils.clean_data(pd.concat((train.drop('cardio', axis=1), test), axis=0), more_clean=True))
known = X[TARGET].notnull()
y_train = X.loc[known, TARGET].values.ravel()
X_train = X.loc[known, features]

scores = cross_val_score(CLF, X_train, y_train, cv=kf)
# Reference value printed next to the CV score: the share of the more frequent
# class, i.e. the accuracy of always predicting that class.
# Assumes a 0/1 target - TODO confirm. Taking the max makes this independent
# of which class happens to be the majority (for this target it is the
# positive class, unlike the smoke/alco targets).
positive_share = y_train.mean()
baseline = max(positive_share, 1 - positive_share)
print(TARGET, 'score:', np.mean(scores), baseline, 'std:', np.std(scores), sep='\t')

clf = clone(CLF)
clf.fit(X_train, y_train)

# NOTE(review): the model is fitted on the output of utils.clean_data /
# utils.new_features, but predictions are made on rows taken directly from
# `train` / `test` - confirm those frames carry the same feature columns and
# the same cleaning.
for frame in (train, test):
    idx = frame[TARGET].isnull()
    frame.loc[idx, TARGET] = clf.predict(frame.loc[idx, features])

# Earlier recorded run: score 0.804708861278, std 0.000152627764982.
# That run printed 1 - mean (0.195457110297), i.e. the minority share, in the
# baseline slot.

active_restored	score:	0.804719929346	0.804542889703	std:	0.000194306205362


In [8]:
# Write both frames, now carrying the *_fair and *_restored columns, as
# ';'-separated CSV files without the index column.
train.to_csv('train.csv', sep=';', index=False)
test.to_csv('test.csv', sep=';', index=False)