In [1]:
import pandas as pd
import numpy as np

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

import matplotlib.pyplot as plt
from xgboost import plot_importance
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import seaborn as sns

import lightgbm as lgb
import xgboost as xgb
from itertools import combinations

import utils

plt.style.use('ggplot')
%matplotlib inline

In [2]:
best_interactions = [
]

In [3]:
def clean_data(data):
    data['error_group'] = 0

    # weight/height correction
    idx = (data['height']<130) & (data['weight']>150)
    data.loc[idx, ["height", "weight"]] = data.loc[idx, ["weight", "height"]].values
#     data.loc[idx, 'error_group'] = 100-1

    # preasure correction
    data.loc[data["ap_hi"] < 0, "ap_hi"] *= -1
    data.loc[data["ap_lo"] < 0, "ap_lo"] *= -1
    
    for i in range(10):
        str_i = str(i)
        data['hi_' + str_i + 's'] = data['ap_hi'].apply(lambda x: str(x).count(str_i))
        data['lo_' + str_i + 's'] = data['ap_lo'].apply(lambda x: str(x).count(str_i))
        for j in range(10):
            str_j = str_i + str(j)
            data['hi_' + str_j + 's'] = data['ap_hi'].apply(lambda x: str(x).count(str_j))
            data['lo_' + str_j + 's'] = data['ap_lo'].apply(lambda x: str(x).count(str_j))
        
    data.loc[(data['ap_lo'] < 20), 'error_group'] = 5
    data.loc[(data['ap_hi'] < 50), 'error_group'] = 6
    data.loc[(data['ap_lo'] > 250), 'error_group'] = 1
    data.loc[(data['ap_lo'] > 4000), 'error_group'] = 2
    data.loc[(data['ap_hi'] > 250), 'error_group'] = 3
    data.loc[(data['ap_hi'] > 10000), 'error_group'] = 4

    data.loc[(data["ap_hi"] < 20) & (data["ap_hi"] > 10), "ap_hi"] *= 10
    data.loc[(data["ap_lo"] < 15) & (data["ap_lo"] > 2), "ap_lo"] *= 10

    idx = data['ap_hi'] > 10000
    data.loc[idx, 'ap_hi'] = 10 * (data.loc[idx, 'ap_hi'] // 1000)
    data.loc[data['ap_lo'] >= 10000, 'ap_lo'] //= 100
    
    manual_update = [
        (12494, ['ap_hi', 'ap_lo'], [120, 80]),
        (42591, ['ap_hi', 'ap_lo'], [140, 90]), # ?
        (78873, ['ap_hi', 'ap_lo'], [130, 100]), # ?
        (51749, ['ap_hi', 'ap_lo'], [120, 80]),
        (57807, ['ap_hi', 'ap_lo'], [170, 100]),
        (60477, ['ap_hi', 'ap_lo'], [110, 80]),
        (91198, ['ap_hi', 'ap_lo'], [100, 90]),
        (6580,  ['ap_hi', 'ap_lo'], [110, 90]),
        (44701, ['ap_hi', 'ap_lo'], [120, 70]),
        (94673,  ['ap_hi', 'ap_lo'], [110, 60]),
        
    ]
    for idx, cols, update in manual_update:
        data.loc[data['id']==idx, cols] = update
    return data

In [4]:
train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';', na_values='None')

train = clean_data(train)
test = clean_data(test)

train = utils.new_features(train)
test = utils.new_features(test)

X_train = train.drop([ 'cardio'], axis=1)
y_train = train['cardio'].values.ravel()
X_test = test.drop([], axis=1)

In [5]:
import gc
gc.collect()

131

In [6]:
from sklearn.model_selection import train_test_split
__, idx = train_test_split(list(range(X_train.shape[0])), test_size=0.101, random_state=42)
X_train.loc[idx, 'alco'] = np.nan
__, idx = train_test_split(list(range(X_train.shape[0])), test_size=0.101, random_state=43)
X_train.loc[idx, 'smoke'] = np.nan
__, idx = train_test_split(list(range(X_train.shape[0])), test_size=0.096, random_state=44)
X_train.loc[idx, 'active'] = np.nan

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score
from sklearn.base import clone

def impute_nans(model, X_train, X_test, columns, target, perform_cv = False, scoring='accuracy'):
    X = pd.concat((X_train, X_test), axis=0)
    train_idx = ~X[target].isnull()
    model = clone(model)
    model.fit(X.loc[train_idx, columns], X.loc[train_idx, target])
    X_train.loc[X_train[target].isnull(), target] = model.predict(X_train.loc[X_train[target].isnull(), columns])
    X_test.loc[X_test[target].isnull(), target] = model.predict(X_test.loc[X_test[target].isnull(), columns])
    
    if perform_cv:
        n_folds = 7
        kf = StratifiedKFold(random_state=111, n_splits=n_folds, shuffle=True)
        scores = cross_val_score(model, X.loc[train_idx, columns], X.loc[train_idx, target], cv=kf, scoring=scoring)
        print(target, np.mean(scores), 'on', n_folds, 'folds', 'with mean on train', X.loc[train_idx, target].mean())
    
params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'n_estimators': 200,
    'subsample': 0.8,
    'n_jobs': 4,
    'random_state': 2707,
    'silent': True,
}
clf = xgb.XGBClassifier(**params)

columns = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc','BMI', 'ap_dif', 'MAP']
impute_nans(clf, X_train, X_test, columns, 'smoke')#, perform_cv = True)

columns = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc','BMI', 'ap_dif', 'MAP', ]
impute_nans(clf, X_train, X_test, columns, 'alco')#, perform_cv = True)

columns = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc','BMI', 'ap_dif', 'MAP', ]
impute_nans(clf, X_train, X_test, columns, 'active')#, perform_cv = True)

In [7]:
gc.collect()
use_columns = [
    'age_group', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke', 'alco', 'active'
'BMI',
 'MAP',
    'ap_dif',
]

In [8]:
params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'n_estimators': 200,
    'subsample': 0.8,
    'n_jobs': 1,
    'random_state': 27,
    'silent': True,
}

def get_score(column=None):
    score1, score2 = utils.execute_model(xgb.XGBClassifier(**params),
              X_train,
              y_train,
#               X_test,
#               model_name="model1",
               mean_columns = [column] if column else [],
              use_columns=use_columns,
#               n_splits=15,
              n_folds=10,
              alpha=13,
            verbose=0
             )
    return score1

def get_score2(columns):
#     print(columns)
    score1, score2 = utils.execute_model(xgb.XGBClassifier(**params),
              X_train,
              y_train,
#               X_test,
#               model_name="model1",
               mean_columns = columns,
               use_columns=use_columns,
#               n_splits=15,
              n_folds=10,
              alpha=13,
            verbose=0
             )
    return score1

In [None]:
reference = get_score()
print('reference', reference)
mean_cols = ['cholesterol_gluc_smoke_active']
while len(mean_cols) < 10:
    best_score = 99
    best = None
    for c in best_interactions:
        if 'error' in c or 'MAP' in c or c in mean_cols:
            continue
        score = get_score2(mean_cols + [c])
        gc.collect()
        if score < best_score:
            best_score = score
            best = c
            print('..',c)
    mean_cols.append(best)
    print(best_score, best, mean_cols)

In [None]:
reference = get_score()
print('reference', reference)
for c in X_train.columns:
    if 'lo_' in c or 'hi_' in c or 'id' in c:
        continue
    score = get_score(c)
    gc.collect()
    if score < reference:
        print(score, c)        

In [19]:
import importlib
utils = importlib.reload(utils)

In [20]:
import gc
gc.collect()

117

In [16]:
a=set()
a.update([1,2,3])
a.update([1,4,3])
a.add(6)
list(a)

[1, 2, 3, 4, 6]

In [None]:
columns_to_interact = ['age_group', 'gender', 'MAP', 'error_group', 'cholesterol', 'gluc', 
                       'BMI_group', 'smoke', 'alco', 'active', "ap_hi_group", "ap_lo_group",
                      "weight_group", "height_group"]

reference = get_score()
print('reference', reference)
gc.collect()
for i in range(5,1,-1):
    for comb in combinations(columns_to_interact, i):
        score = get_score(comb)
        gc.collect()
        if score < reference:
            print(score, comb)    

reference 0.539621817593
0.539441890402 ('age_group', 'gender', 'MAP', 'gluc', 'active')
0.539486722965 ('age_group', 'gender', 'MAP', 'BMI_group', 'active')
0.539568276225 ('age_group', 'gender', 'MAP', 'smoke', 'active')
0.539131997639 ('age_group', 'gender', 'MAP', 'active', 'ap_hi_group')
0.539439302446 ('age_group', 'gender', 'MAP', 'active', 'ap_lo_group')
0.539486722965 ('age_group', 'gender', 'MAP', 'active', 'height_group')
0.539542360336 ('age_group', 'gender', 'error_group', 'cholesterol', 'gluc')
0.539590106151 ('age_group', 'gender', 'error_group', 'cholesterol', 'alco')
0.539476921793 ('age_group', 'gender', 'error_group', 'cholesterol', 'active')
0.539360420903 ('age_group', 'gender', 'error_group', 'gluc', 'alco')
0.539310460404 ('age_group', 'gender', 'error_group', 'gluc', 'active')
0.539523872868 ('age_group', 'gender', 'error_group', 'smoke', 'alco')
0.539238124747 ('age_group', 'gender', 'error_group', 'smoke', 'active')
0.539305261039 ('age_group', 'gender', 'erro

In [None]:
# reference = get_score()
# print('reference', reference)
# mean_cols = ['cholesterol_gluc_smoke_active']
while len(mean_cols) < 20:
    best_score = 99
    best = None
    for c in best_interactions:
        if 'error' in c or 'MAP' in c or c in mean_cols:
            continue
        score = get_score2(mean_cols + [c])
        gc.collect()
        if score < best_score:
            best_score = score
            best = c
            print('..',c)
    mean_cols.append(best)
    print(best_score, best, mean_cols)