In [10]:
import pandas as pd
import numpy as np

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

import matplotlib.pyplot as plt
from xgboost import plot_importance
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.model_selection import StratifiedKFold
import seaborn as sns

import lightgbm as lgb
import xgboost as xgb
from itertools import combinations

import utils

plt.style.use('ggplot')
%matplotlib inline

In [11]:
# Load the raw tables. Both files are ';'-separated; only the test file is
# read with the literal string 'None' mapped to NaN.
train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';', na_values='None')

# Project helpers from utils, applied in the same order to both frames.
# NOTE(review): what clean_data does is not visible from this notebook --
# confirm in utils before relying on any particular column being present.
train = utils.clean_data(train)
test = utils.clean_data(test)

# NOTE(review): presumably adds derived columns (later cells reference names
# such as 'BMI', 'MAP' and 'age_group') -- confirm in utils.
train = utils.new_features(train)
test = utils.new_features(test)

# Separate the 'cardio' target from the predictors.
X_train = train.drop([ 'cardio'], axis=1)
y_train = train['cardio'].values.ravel()
# Dropping an empty label list removes nothing: X_test is a separate frame
# object holding every column of test.
X_test = test.drop([], axis=1)
# Default set of numeric source columns used to derive interaction features.
ccc = ['age', 'age_group', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'BMI', 'MAP']


def new_cols(data, columns=None):
    """Add log and pairwise interaction features to ``data`` in place.

    For every column ``a`` in ``columns`` this adds ``a_log`` = log(a + 1.1),
    and for every ordered pair ``(a, b)``:

    * ``a_mul_b``      = a * b
    * ``a_mul_log_b``  = a * log(b + 1)
    * ``a_div_log_b``  = a / (log(b + 1) + 1)
    * ``a_div_b``      = a / (b + 1)   (only when ``a != b``)

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every source column; it is mutated in place.
    columns : sequence of str, optional
        Source columns to combine. Defaults to the module-level ``ccc`` list,
        which reproduces the original behaviour.

    Returns
    -------
    None
        The new columns are written directly into ``data``.

    Raises
    ------
    KeyError
        If a requested source column is missing from ``data``.
    """
    if columns is None:
        columns = ccc

    # log(b + 1) depends only on b, so compute it once per column instead of
    # twice for every (a, b) pair. Source columns are never overwritten by
    # the derived ones, so hoisting does not change any value.
    shifted_logs = {col: np.log(data[col] + 1) for col in columns}

    for col1 in columns:
        # The stand-alone log feature uses a 1.1 offset (not 1) -- kept as is
        # because downstream feature values depend on it.
        data[col1 + '_log'] = np.log(data[col1] + 1.1)
        for col2 in columns:
            log_col2 = shifted_logs[col2]
            data['%s_mul_%s' % (col1, col2)] = data[col1] * data[col2]
            data['%s_mul_log_%s' % (col1, col2)] = data[col1] * log_col2
            data['%s_div_log_%s' % (col1, col2)] = data[col1] / (log_col2 + 1)

            # The plain ratio of a column with itself is skipped.
            if col2 != col1:
                data['%s_div_%s' % (col1, col2)] = data[col1] / (data[col2] + 1)

# new_cols assigns columns directly into the frame it receives and returns
# nothing, so both feature matrices are extended in place here.
new_cols(X_train)
new_cols(X_test)

In [12]:
import gc
# Force a full garbage-collection pass; gc.collect() returns the number of
# unreachable objects it found, which is the integer echoed as the cell output.
gc.collect()

498

In [13]:
gc.collect()

# Columns handed to ColumnsFilter inside wrap_classifier: a set of base
# columns followed by a handful of pairwise features whose names follow the
# '<a>_mul_log_<b>' / '<a>_div_log_<b>' patterns generated by new_cols.
use_columns = [
    "age_group", "gender", "height", "weight",
    "ap_hi", "ap_lo", "cholesterol", "gluc",
    "smoke_restored", "alco_restored", "active_restored",
    "BMI",
] + [
    "height_mul_log_gluc",
    "cholesterol_div_log_age_group",
    "cholesterol_div_log_gluc",
    "gluc_mul_log_age",
    "age_group_mul_log_ap_lo",
]

from sklearn.pipeline import Pipeline, FeatureUnion
from utils import SmoothLikelihood,SmoothLikelihood2,SmoothLikelihood3,SmoothLikelihood4, ColumnsFilter
def wrap_classifier(clf, columns):
    """Wrap ``clf`` in a Pipeline whose features are a FeatureUnion.

    The union always starts with ``ColumnsFilter(use_columns)`` and then gets
    one ``SmoothLikelihood4`` transformer per entry of ``columns``, named
    ``mean_0``, ``mean_1``, ... Each transformer receives its own 20-split
    shuffled ``StratifiedKFold`` and seed, both offset by the entry's index.

    NOTE(review): ColumnsFilter and SmoothLikelihood4 come from utils and are
    not visible here -- presumably a column selector and a smoothed
    target-likelihood encoder; confirm before changing their arguments.

    Parameters
    ----------
    clf : estimator
        Final step of the pipeline (registered as ``"model"``).
    columns : iterable
        Items passed one by one as the first argument of SmoothLikelihood4.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``[("features", FeatureUnion(...)), ("model", clf)]``.
    """
    steps = [("filter", ColumnsFilter(use_columns))]
    steps.extend(
        (
            'mean_' + str(idx),
            SmoothLikelihood4(
                item,
                0.5,
                kf=StratifiedKFold(random_state=111111 + idx, n_splits=20, shuffle=True),
                alpha=13,
                seed=10 + idx,
                std=0.0013,
            ),
        )
        for idx, item in enumerate(columns)
    )
    return Pipeline([("features", FeatureUnion(steps)), ("model", clf)])

In [22]:
# Keyword arguments unpacked into xgb.XGBClassifier(**params) by get_score.
params = dict(
    colsample_bytree=0.875,
    gamma=0.05,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=5,
    n_estimators=61,
    reg_alpha=0,
    reg_lambda=10,
    subsample=0.7,
    n_jobs=1,
    random_state=207,
    silent=True,
)

def get_score(column=None):
    """Score the XGBoost pipeline built for the given extra columns.

    Builds ``wrap_classifier(xgb.XGBClassifier(**params), ...)`` and passes it
    to ``utils.execute_model`` together with ``X_train`` / ``y_train``
    (``n_folds=10``, ``verbose=0``, ``seed=207``).

    Parameters
    ----------
    column : sequence, optional
        Columns forwarded to ``wrap_classifier``; any falsy value (``None``,
        empty list/tuple) is replaced by an empty list.

    Returns
    -------
    The first of the two values returned by ``utils.execute_model``.
    NOTE(review): callers in this notebook treat lower as better -- confirm
    the metric in utils.
    """
    extra_columns = column if column else []
    pipeline = wrap_classifier(xgb.XGBClassifier(**params), extra_columns)
    first_score, _second_score = utils.execute_model(
        pipeline,
        X_train,
        y_train,
        n_folds=10,
        verbose=0,
        seed=207,
    )
    return first_score

# def get_score2(columns):
# #     print(columns)
#     score1, score2 = utils.execute_model(xgb.XGBClassifier(**params),
#               X_train,
#               y_train,
# #               X_test,
# #               model_name="model1",
#                mean_columns = columns,
#                use_columns=use_columns,
# #               n_splits=15,
#               n_folds=10,
#               alpha=13,
#             verbose=0
#              )
#     return score1

In [None]:
# Baseline score with no extra encoded columns.
reference = get_score()
print('reference', reference)

# Greedy forward selection: starting from one seed column, repeatedly add the
# candidate that gives the lowest score until 10 columns are selected.
# NOTE(review): best_interactions is not defined in any visible cell -- it
# must be provided (iterable of column-name strings) before running this.
mean_cols = ['cholesterol_gluc_smoke_active']
while len(mean_cols) < 10:
    best_score = float('inf')
    best = None
    for c in best_interactions:
        if 'error' in c or 'MAP' in c or c in mean_cols:
            continue
        # get_score2 only survives as commented-out code, so calling it raises
        # NameError on a fresh kernel; get_score accepts the column list.
        score = get_score(mean_cols + [c])
        gc.collect()
        if score < best_score:
            best_score = score
            best = c
            print('..',c)
    if best is None:
        # No eligible candidate left: stop instead of appending None.
        break
    mean_cols.append(best)
    print(best_score, best, mean_cols)

In [15]:
import importlib
# Re-execute the utils module so edits made on disk take effect without a
# kernel restart. Names bound earlier via `from utils import ...` keep
# pointing at the pre-reload objects until that import is run again.
utils = importlib.reload(utils)

In [23]:
import gc
# Another explicit collection pass; the integer echoed below the cell is the
# number of unreachable objects gc.collect() found.
gc.collect()

170

In [24]:
# X_train['age_group']

In [None]:
# Candidate columns; every subset of size 1 to 3 is scored below.
columns_to_interact = [
    'age_group', 'gender', 'MAP', 'cholesterol', 'gluc',
    'BMI_group', 'smoke', 'alco', 'active',
    "ap_hi_group", "ap_lo_group", "ap_hi_group_2", "ap_lo_group_2",
    "weight_group", "height_group",
]

# Baseline score with no extra columns.
reference = get_score()
print('reference', reference)
gc.collect()

# Exhaustive search: report each subset that scores below the baseline.
for size in (1, 2, 3):
    for comb in combinations(columns_to_interact, size):
        score = get_score(comb)
        gc.collect()
        if score < reference:
            print(score, comb)

reference 0.538102999893


In [None]:
# Continues the greedy forward selection on the existing mean_cols list (it
# is intentionally not re-initialised here), growing it to 20 columns.
# NOTE(review): best_interactions is not defined in any visible cell -- it
# must be provided (iterable of column-name strings) before running this.
while len(mean_cols) < 20:
    best_score = float('inf')
    best = None
    for c in best_interactions:
        if 'error' in c or 'MAP' in c or c in mean_cols:
            continue
        # get_score2 only survives as commented-out code, so calling it raises
        # NameError on a fresh kernel; get_score accepts the column list.
        score = get_score(mean_cols + [c])
        gc.collect()
        if score < best_score:
            best_score = score
            best = c
            print('..',c)
    if best is None:
        # No eligible candidate left: stop instead of appending None.
        break
    mean_cols.append(best)
    print(best_score, best, mean_cols)