In [1]:
import pandas as pd
import numpy as np

import random

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
import gc

plt.style.use('ggplot')
%matplotlib inline

In [6]:
# Metric sanity check: with 100 labels of which exactly one is 0, scan the constant
# predictions 0.00, 0.01, ..., 1.00 and keep the one with the lowest log loss.
a = np.ones(100)
a[99] = 0
best_score = 99  # starting sentinel; replaced by the first loss below it
best = None
for i in range(101):
    b = np.ones(100) * i / 100  # the same probability i/100 for every sample
    c = log_loss(a,b)
    if c < best_score:
        best_score = c
        best = i / 100
print(best, best_score)

0.99 0.0560015343548


In [7]:
# Labels: 99 ones and a single zero; prediction: exactly 1.0 for every sample.
# The recorded output of this cell is finite, so log_loss evidently does not return
# inf for the one confidently wrong sample — presumably it clips probabilities; TODO confirm.
a = np.ones(100)
a[99] = 0
b = np.ones(100)
log_loss(a,b)

0.34539575992340976

In [3]:
a

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

In [65]:
# Keyword arguments shared by every xgboost estimator in this notebook
# (unpacked as **params into XGBRegressor / XGBClassifier further down).
# Commented-out keys are tuning knobs that are currently not passed at all.
# NOTE(review): 'silent', 'nthread' and 'seed' look like older xgboost parameter
# names — confirm they are still accepted by the installed xgboost version.
params = {
'colsample_bytree': 0.8,
#'gamma': 0.125,
'learning_rate': 0.1,
# 'max_depth': 9,
# 'min_child_weight': 1,
'n_estimators': 200,
'subsample': 0.8,
# 'reg_alpha': 0.0,
# 'reg_lambda': 2.0,

'nthread': 1,
'seed': 2707,
'silent': True,
}

In [66]:
from sklearn.model_selection import StratifiedKFold
from itertools import combinations

def generate_interactions(data, columns, degree=3):
    """Build string-valued interaction features from combinations of columns.

    For every combination of 2 .. ``degree`` names taken from ``columns`` a new
    column is produced whose name is the member names joined by ``'_'`` and whose
    value, per row, is the ``str()`` of each member value joined by ``'_'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains every name listed in ``columns``.
    columns : sequence of str
        Column names to combine.
    degree : int, default 3
        Largest combination size (inclusive); sizes start at 2.

    Returns
    -------
    pandas.DataFrame
        One column per combination, indexed like ``data``. An empty frame is
        returned when no combination exists (fewer than two columns or
        ``degree < 2``).
    """
    def _join_row(row):
        # Row-wise join, so each value is rendered exactly as it appears in the row.
        return '_'.join([str(value) for value in row])

    # Collect the columns first and assemble the frame once: inserting columns
    # one by one fragments the frame when many combinations are generated.
    pieces = {}
    for size in range(2, degree + 1):
        for comb in combinations(columns, size):
            # A repeated name overwrites the earlier entry, like column assignment would.
            pieces['_'.join(comb)] = data[list(comb)].apply(_join_row, axis=1)

    if not pieces:
        return pd.DataFrame()
    return pd.concat(list(pieces.values()), axis=1, keys=list(pieces.keys()))

def get_mean_columns(x_train, y_train, x_test, columns, alpha=10):
    """Smoothed target-mean encoding of ``columns`` for the rows of ``x_test``.

    For each column ``c`` and each category seen in ``x_train`` the encoded value is
    ``(category_mean * category_count + global_mean * alpha) / (category_count + alpha)``
    where the means are taken over ``y_train``. Rows of ``x_test`` whose category
    was not matched receive the global mean of ``y_train``.

    Returns a frame indexed like ``x_test`` that holds only the new
    ``<c>_target_mean`` columns (the source columns are dropped).
    """
    fit_frame = x_train[columns].copy()
    fit_frame["target"] = y_train
    prior = y_train.mean()

    encoded = x_test[columns].copy()
    for col in columns:
        by_category = fit_frame.groupby([col])
        counts = by_category.size()
        smoothed = (by_category["target"].mean() * counts + prior * alpha) / (counts + alpha)
        smoothed.name = col + "_target_mean"
        # Left join on the category value keeps the row order of x_test.
        encoded = encoded.join(smoothed, on=col)
        # Categories without a match fall back to the global mean.
        encoded[smoothed.name] = encoded[smoothed.name].fillna(prior)

    return encoded.drop(columns, axis=1)

def populate_mean_columns(x_train, y_train, x_test, columns, alpha=10, n_splits=5):
    """Append target-mean encoded columns to both the train and the test frame.

    The test frame is encoded with statistics from the whole of ``x_train``.
    The train frame is encoded out-of-fold: it is split with a shuffled
    ``StratifiedKFold`` (fixed ``random_state=2707``) and each held-out part is
    encoded from the remaining rows only.

    ``y_train`` is indexed with positional index arrays, so it is expected to
    support that (e.g. a numpy array).

    Returns ``(x_train, x_test)`` as new frames; the returned train frame has the
    columns of ``x_test`` followed by the encoded columns.
    """
    # Test side: statistics from the complete training data.
    encoded_test = get_mean_columns(x_train, y_train, x_test, columns, alpha)

    # Align the train columns with the test frame and reserve the encoded columns.
    wanted_columns = np.append(x_test.columns.values, encoded_test.columns.values)
    x_train = x_train.reindex(columns=wanted_columns)
    x_test = pd.concat((x_test, encoded_test), axis=1)

    splitter = StratifiedKFold(random_state=2707, n_splits=n_splits, shuffle=True)
    for fit_pos, hold_pos in splitter.split(x_train, y_train):
        encoded_hold = get_mean_columns(
            x_train.iloc[fit_pos],
            y_train[fit_pos],
            x_train.iloc[hold_pos],
            columns,
            alpha,
        )
        # Label-based write-back into the rows that were held out in this fold.
        x_train.loc[x_train.index[hold_pos], encoded_hold.columns] = encoded_hold

    return x_train, x_test

In [49]:
# Scratch check of the target-mean encoder on a single binned column.
# NOTE(review): 'height_round' is only created in a later cell of this file, so this
# cell depends on out-of-order execution and fails on a fresh top-to-bottom run.
X.columns
r = get_mean_columns(X_train, y_train, X_test, ['height_round'], alpha=10)

In [60]:
r['height_round_target_mean'].value_counts()

0.485701    7555
0.499843    6746
0.505395    4998
0.495172    4578
0.521298    2357
0.507695    2179
0.504346     725
0.541523     505
0.484308     191
0.579425      96
0.530084      30
0.478196      13
0.413690       8
0.568114       7
0.458208       5
0.615154       2
0.437312       2
0.466467       1
0.538231       1
0.599800       1
Name: height_round_target_mean, dtype: int64

In [63]:
np.any(X.isnull())

True

In [67]:
import os
from sklearn.base import clone

def folder(directory):
    """Make sure ``directory`` exists, creating it and any missing parents.

    Calling it for a directory that already exists is a no-op. Uses
    ``exist_ok=True`` instead of a separate existence check so that a directory
    created concurrently between check and creation cannot raise. If the path
    exists but is not a directory, ``FileExistsError`` is raised.
    """
    os.makedirs(directory, exist_ok=True)
        
def save_model_result(model_name, X_train, X_test):
    """Write the two prediction sets of a model to ``models/<model_name>/``.

    ``X_train`` goes to ``train.csv`` and ``X_test`` to ``test.csv``; both are
    wrapped in a DataFrame and written without index or header, ``;``-separated.
    The directories are created when missing.
    """
    folder('models')
    model_dir = 'models/' + model_name
    folder(model_dir)
    for values, file_name in ((X_train, '/train.csv'), (X_test, '/test.csv')):
        pd.DataFrame(values).to_csv(model_dir + file_name, index=False, header=False, sep=';')

def _new_estimator(estimator, create_callback):
    """Return a fresh unfitted model: ``create_callback()`` if given, else ``clone(estimator)``."""
    if create_callback is None:
        return clone(estimator)
    return create_callback()


def _prepare_fold(x_fit, y_fit, x_apply, mean_columns, drop_columns):
    """Add target-mean columns to both frames, then remove ``drop_columns`` from each."""
    x_fit, x_apply = populate_mean_columns(x_fit, y_fit, x_apply, mean_columns)
    return x_fit.drop(drop_columns, axis=1), x_apply.drop(drop_columns, axis=1)


def execute_model(estimator, X_train, y_train, X_test=None, mean_columns=[], drop_columns=[], model_name="", n_splits=10,
                  create_callback=None, verbose=1, seed=1205, stratification_groups = None):
    """Cross-validate a binary classifier and optionally save train/test predictions.

    Parameters
    ----------
    estimator : object
        Model cloned for every fit (ignored when ``create_callback`` is given).
        It must provide ``fit`` and ``predict_proba``.
    X_train, y_train : training features and labels. ``X_train`` is wrapped in a
        DataFrame and ``y_train`` is converted to a numpy array so that the
        positional fold indices always apply.
    X_test : test features; required when ``model_name`` is non-empty.
    mean_columns : columns that receive target-mean encoded companions
        (via ``populate_mean_columns``) inside every fit.
    drop_columns : columns removed after encoding and before fitting.
    model_name : when non-empty, a final model is fitted on all training rows and
        the out-of-fold train predictions plus the test predictions are written
        with ``save_model_result`` under this name.
    n_splits : number of stratified folds.
    create_callback : optional zero-argument factory returning a new model.
    verbose : when truthy, print per-fold log losses, their mean and std.
    seed : seeds numpy's and ``random``'s global generators and the fold splitter.
    stratification_groups : optional labels to stratify the folds on instead of
        ``y_train``.

    Returns
    -------
    float
        Mean of the per-fold log losses.

    Raises
    ------
    ValueError
        If ``model_name`` is given without ``X_test`` (raised up front, before
        any cross-validation time is spent).
    """
    if model_name and X_test is None:
        raise ValueError("X_test must be provided when model_name is set")

    np.random.seed(seed)
    random.seed(seed)

    X_train = pd.DataFrame(X_train)
    # Fold indices are positional; an ndarray guarantees positional label lookup.
    y_train = np.asarray(y_train)
    split_labels = y_train if stratification_groups is None else stratification_groups

    kf = StratifiedKFold(random_state=seed, n_splits=n_splits, shuffle=True)
    logloss = []

    # Out-of-fold positive-class probabilities for every training row.
    train_result = np.zeros(X_train.shape[0])
    for train_idx, test_idx in kf.split(X_train, split_labels):
        clf = _new_estimator(estimator, create_callback)
        x1, x2 = _prepare_fold(X_train.iloc[train_idx],
                               y_train[train_idx],
                               X_train.iloc[test_idx],
                               mean_columns,
                               drop_columns)

        clf.fit(x1, y_train[train_idx])
        train_result[test_idx] = clf.predict_proba(x2)[:, 1]
        logloss.append(log_loss(y_train[test_idx], train_result[test_idx]))

    if verbose:
        print("Logloss:")
        print(logloss)
        print("mean:", np.mean(logloss))
        print("std:", np.std(logloss))

    if model_name:
        # Final fit on all training rows to predict the test set.
        X_test = pd.DataFrame(X_test)
        clf = _new_estimator(estimator, create_callback)
        x1, x2 = _prepare_fold(X_train, y_train, X_test, mean_columns, drop_columns)

        clf.fit(x1, y_train)
        test_result = clf.predict_proba(x2)[:, 1]
        save_model_result(model_name, train_result, test_result)
        if verbose:
            print(model_name, 'results saved!')
    return np.mean(logloss)

In [68]:
def clean_data(data):
    """Repair implausible 'ap_hi' / 'ap_lo' / 'height' values with hand-written rules.

    The rules run in a fixed order and later rules rely on the results of earlier
    ones, so the statement order matters. `data` must contain the columns 'id',
    'ap_hi', 'ap_lo', 'height' and 'weight'.

    Side effects: every step up to and including the creation of 'temp_column'
    modifies the passed frame in place; the `drop` that follows rebinds `data` to a
    new frame, and that new frame is what is returned. Callers should therefore
    use the return value rather than rely on the argument.

    NOTE(review): 'ap_hi' / 'ap_lo' are presumably systolic / diastolic blood
    pressure — confirm against the data description.
    """
    # Negative readings are treated as sign errors: flip them to positive.
    data.loc[data["ap_hi"] < 0, "ap_hi"] *= -1
    data.loc[data["ap_lo"] < 0, "ap_lo"] *= -1
#     data.loc[data["ap_hi"] > 10000, "ap_hi"] /= 100
#     data.loc[data["ap_hi"] > 600, "ap_hi"] /= 10
#     data.loc[data["ap_hi"] < 30, "ap_hi"] *= 10
#     data.loc[data["ap_lo"] > 4000, "ap_lo"] /= 100
#     data.loc[data["ap_lo"] > 300, "ap_lo"] /= 10





#     data["temp_column"] = data["ap_lo"]
#     idx = (data["ap_hi"] < data["ap_lo"]) & (data["ap_hi"] > 50)
#     data.loc[idx, "ap_lo"] = data.loc[idx, "ap_hi"]
#     data.loc[idx, "ap_hi"] = data.loc[idx, "temp_column"]
#     data = data.drop('temp_column', axis=1)

#     idx = (data["ap_lo"] < 1) & (data["ap_hi"] % 1 > 0)
#     data.loc[idx, "ap_lo"] = 10 * (data.loc[idx, "ap_hi"] % 1)
#     data.loc[idx, "ap_hi"] = (data.loc[idx, "ap_hi"] // 1)

#     idx = (data["ap_lo"] < 1) & (data["ap_hi"] % 10 > 0)
#     data.loc[idx, "ap_lo"] = 10 * (data.loc[idx, "ap_hi"] % 10)
#     data.loc[idx, "ap_hi"] = (data.loc[idx, "ap_hi"] // 10) * 10

#     data.loc[data["ap_lo"] < 2, "ap_lo"] *= 100
#     data.loc[data["ap_lo"] < 11, "ap_lo"] *= 10
#------------------

#     data.loc[(data['height']==data['ap_hi'])
#             &(data['weight']==data['ap_lo'])
#             &((data['weight']%10!=0)|(data['height']%10!=0)), ['ap_hi', 'ap_lo']]=np.nan

#     data.loc[(data['height']==data['ap_hi'])
#             &(data['weight']==data['ap_lo']), ['height', 'weight']]=np.nan

    # Hand-picked overrides: (value of the 'id' column, columns to set, new values).
    manual_update = [
        (91933, ['ap_hi', 'ap_lo'], [130, 85]),
        (97818, ['ap_hi', 'ap_lo'], [110, 70]),
        
        (12494, ['ap_hi', 'ap_lo'], [120, 80]),
        (51749, ['ap_hi', 'ap_lo'], [120, 80]),
        
        (75399, ['ap_hi', 'ap_lo'], [120, 80]),
        
        (9482 , ['ap_lo'], [90]),
        (20438, ['ap_lo'], [100]),
        (22832, ['ap_lo'], [80]),
        (29821, ['ap_lo'], [100]),
        (33191, ['ap_lo'], [70]),
        (62058, ['ap_lo'], [80]),
        (62921, ['ap_lo'], [80]),
        (75482, ['ap_lo'], [80]),
        (95886, ['ap_lo'], [70]),
        (10586, ['ap_lo'], [100]),
        (26985, ['ap_lo'], [90]),
        (45450, ['ap_lo'], [70]),
        (50848, ['ap_lo'], [100]),
        (63276, ['ap_lo'], [90]),
        (74784, ['ap_lo'], [80]),
#         (10586, ['ap_lo'], [100]),
        (57993, ['ap_lo'], [90]),
        
        (11089, ['ap_hi'], [150]),
        (56466, ['ap_hi'], [140]),
        
        (2654 ,['ap_hi'], [90]),
        (12710 ,['ap_hi'], [140.0]),
        (13616 ,['ap_hi'], [70.0]),
        (19827 ,['ap_hi'], [150.0]),
        (36793 ,['ap_hi'], [140.0]),
        (40239 ,['ap_hi'], [160.0]),
        (52725 ,['ap_hi'], [130.0]),
        (58349 ,['ap_hi'], [140.0]),
        (58728 ,['ap_hi'], [120.0]),
        (61618 ,['ap_hi'], [140.0]),
        (61725 ,['ap_hi'], [140.0]),
        (62154 ,['ap_hi'], [130.0]),
        (69672 ,['ap_hi'], [140.0]),
        (77010 ,['ap_hi'], [90.0]),
        (81769 ,['ap_hi'], [130.0]),
        (82660 ,['ap_hi'], [110.0]),
        (91364 ,['ap_hi'], [120.0]),
        (99089 ,['ap_hi'], [200.0]),
        (1079 ,['ap_hi'], [140.0]),
        (35256 ,['ap_hi'], [150.0]),
        (44904 ,['ap_hi'], [140.0]),
        
    ]
    # Rows are matched on the 'id' column, not on the frame index.
    for idx, cols, update in manual_update:
        data.loc[data['id']==idx, cols] = update
        
    # ap_lo strictly between 300 and 3000: drop the last digit (floor-divide by 10).
    data.loc[(data['ap_lo']>300) & (data['ap_lo']<3000), 'ap_lo'] //= 10
    # Values too small to rescale are marked as missing.
    data.loc[(data['ap_lo']<1), 'ap_lo'] = np.nan
    data.loc[(data['ap_hi']<10), 'ap_hi'] = np.nan
    
    # Remaining ap_lo above 300 whose hundreds digit is 0 (e.g. 8000): floor-divide by 100.
    data.loc[(data['ap_lo']>300)&((data['ap_lo']//100)%10==0), 'ap_lo'] //= 100
    # ap_lo below 20 is scaled up by 10; the second pass scales values that are still
    # below 20 after the first one (1 -> 10 -> 100).
    data.loc[(data['ap_lo']<20), 'ap_lo'] *= 10
    data.loc[(data['ap_lo']<20), 'ap_lo'] *= 10 # for ap_lo = 1.0
    
    # ap_hi of exactly 20 becomes 120; other ap_hi below 20 are scaled up by 10
    # (values below 10 were already set to NaN above).
    data.loc[(data['ap_hi']==20), 'ap_hi'] = 120
    data.loc[(data['ap_hi']<20), 'ap_hi'] *= 10
    
    # ap_hi above 10000: keep the leading digits, i.e. 10 * (value // 1000).
    idx = data['ap_hi'] > 10000
    data.loc[idx, 'ap_hi'] = 10 * (data.loc[idx, 'ap_hi'] // 1000)
    
    # ap_hi above 250 whose last digit is greater than 5: the last digit times 10
    # becomes ap_lo and the remaining digits become ap_hi (e.g. 1409 -> 140 / 90).
    idx = (250<data['ap_hi'])&(data['ap_hi']%10>5)
    data.loc[idx, 'ap_lo'] = 10 * (data.loc[idx, 'ap_hi'] % 10)
    data.loc[idx, 'ap_hi'] //= 10
    
    
    # Swap the two readings wherever ap_hi is smaller than ap_lo, using a scratch column.
    data["temp_column"] = data["ap_lo"]
    idx = data["ap_hi"] < data["ap_lo"]
    data.loc[idx, "ap_lo"] = data.loc[idx, "ap_hi"]
    data.loc[idx, "ap_hi"] = data.loc[idx, "temp_column"]
    # `drop` returns a new frame: from here on the caller's original object is no
    # longer modified (and it still carries 'temp_column').
    data = data.drop('temp_column', axis=1)
    
    # Heights below 100 get 100 added.
    data.loc[data['height']<100, 'height'] += 100

    # Rows where (height, weight) equals (ap_hi, ap_lo): if either height or weight is
    # not a multiple of 10, the pressure columns are set to missing ...
    data.loc[(data['height']==data['ap_hi'])
            &(data['weight']==data['ap_lo'])
            &((data['weight']%10!=0)|(data['height']%10!=0)), ['ap_hi', 'ap_lo']]=np.nan
    
    # ... and for the rows that still match (the pressures above are NaN now and no
    # longer compare equal), height and weight are set to missing instead.
    data.loc[(data['height']==data['ap_hi'])
            &(data['weight']==data['ap_lo']), ['height', 'weight']]=np.nan

    return data

def new_features(data):
    """Add derived columns to ``data`` in place and return the same frame.

    Added columns:
      * ``imt``       - weight divided by height squared
      * ``ap_dif``    - ``ap_hi`` minus ``ap_lo``
      * ``age_years`` - ``age`` divided by 365, rounded to the nearest integer value
      * ``age_group`` - ``age`` bucketed by fixed edges and labelled with a
        nominal age; ages outside the edges become missing
    """
    height = data["height"]
    data["imt"] = data["weight"] / (height * height)
    data["ap_dif"] = data["ap_hi"] - data["ap_lo"]

    data["age_years"] = (data["age"] / 365).round()

    # Right-closed bucket edges on the raw `age` values and the label of each bucket.
    bucket_edges = [
        0, 10000, 14000, 14980, 15700, 16420, 17140, 17890,
        18625, 19355, 20090, 20820, 21555, 22280, 22990, 24000,
    ]
    bucket_labels = [25, 30, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64]
    data["age_group"] = pd.cut(data['age'], bucket_edges, labels=bucket_labels)

    return data

In [69]:
# Load the raw data. Only the test file is read with the literal string 'None'
# treated as a missing value.
train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';', na_values='None')

# clear train
#train = train.loc[train["height"] < 249]
# train.reset_index(inplace=True, drop=True)

# Rule-based repairs first, derived features second (the derived features are
# computed from the repaired columns).
train = clean_data(train)
test = clean_data(test)

train = new_features(train)
test = new_features(test)

# Masks of the test rows where the lifestyle flags are missing.
# They are only referenced by the commented-out lines further down in this cell.
no_smoke = pd.isnull(test['smoke'])
no_alco = pd.isnull(test['alco'])
no_active = pd.isnull(test['active'])

# Separate the target from the features; drop([]) just produces a copy of test.
X_train = train.drop([ 'cardio'], axis=1)
y_train = train['cardio'].values.ravel()
X_test = test.drop([], axis=1)

# X_test.loc[no_smoke, 'smoke'] = 0
# X_test.loc[no_alco, 'alco'] = 0
# X_test.loc[no_active, 'active'] = 1

# X_train["is_test"] = 0
# X_test["is_test"] = 1

# Train and test features stacked row-wise (row labels are not reset here).
X = pd.concat((X_train, X_test), axis=0)

from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
# One LabelEncoder per column name, created lazily on first access.
label_encoder_dict = defaultdict(LabelEncoder)
columns_to_encode = ['age_group']



# Fit each encoder on train+test together so both frames share one label mapping,
# then replace the column in each frame by its integer codes.
X[columns_to_encode].apply(lambda x: label_encoder_dict[x.name].fit(x))
X_train[columns_to_encode] = X_train[columns_to_encode].apply(lambda x: label_encoder_dict[x.name].transform(x))
X_test[columns_to_encode] = X_test[columns_to_encode].apply(lambda x: label_encoder_dict[x.name].transform(x))

In [70]:
# X.loc[(X['height'].isnull())&(X['weight'].isnull())]

In [71]:
# X_train_plus = X_train.copy()
# X_train_plus['TARGET'] = y_train
# X_train_plus.loc[X_train_plus['ap_hi'].isnull(),'ap_hi']=-1
# X_train_plus.loc[X_train_plus['ap_lo'].isnull(),'ap_lo']=-1
# sns.lmplot(x='ap_hi', y='ap_lo', data=X_train_plus, hue='TARGET', size=10)

In [72]:
# X_train_plus = X_train.copy()
# X_train_plus['TARGET'] = y_train
# sns.lmplot(x='height', y='weight', data=X_train_plus, hue='TARGET', size=10)

In [73]:
def get_sub_model(x_train, x_test, train_columns, target, model):
    """Impute missing values of ``target`` with a model trained on the known ones.

    A clone of ``model`` is fitted on all rows of ``x_train`` and ``x_test``
    (stacked) where ``target`` is present, using ``train_columns`` as features.
    Its predictions then replace the missing ``target`` values in each frame.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Frames containing ``train_columns`` and ``target``. Both are modified
        in place.
    train_columns : list of str
        Feature columns used to predict ``target``.
    target : str
        Column to impute.
    model : estimator
        Unfitted template; it is cloned, so the passed object is left untouched.

    Returns
    -------
    tuple
        ``(x_train, x_test)`` - the same objects that were passed in.
    """
    x = pd.concat((x_train, x_test), axis=0)
    known = x[target].notnull()

    model = clone(model)
    model.fit(x.loc[known, train_columns], x.loc[known, target])

    for frame in (x_train, x_test):
        missing = frame[target].isnull()
        # Nothing to impute: skip the call, since many estimators reject
        # prediction on an empty feature matrix.
        if missing.any():
            frame.loc[missing, target] = model.predict(frame.loc[missing, train_columns])
    return x_train, x_test

# Chained imputation of the numeric columns: each target is predicted from the
# base columns plus the targets imputed before it (height -> weight -> ap_hi -> ap_lo).
# NOTE(review): 'imt' and 'ap_dif' were derived in new_features() before this cell and
# are not recomputed here, so they stay missing for rows whose inputs get imputed —
# confirm this is intended.
clf = xgb.XGBRegressor(**params)
train_cols = ['age', 'gender', 'cholesterol', 'gluc']
X_train, X_test = get_sub_model(X_train, X_test, train_cols, 'height', clf)

train_cols = ['age','height', 'gender', 'cholesterol', 'gluc']
X_train, X_test = get_sub_model(X_train, X_test, train_cols, 'weight', clf)

train_cols = ['age', 'weight','height', 'gender', 'cholesterol', 'gluc']
X_train, X_test = get_sub_model(X_train, X_test, train_cols, 'ap_hi', clf)

train_cols = ['age', 'weight','height', 'gender', 'cholesterol', 'gluc', 'ap_hi']
X_train, X_test = get_sub_model(X_train, X_test, train_cols, 'ap_lo', clf)

from sklearn.model_selection import train_test_split
# The binary lifestyle flags are imputed with a classifier instead of a regressor.
clf = xgb.XGBClassifier(**params)

# Blank out a fixed random share of each flag in the training frame before imputing
# (about 10.1% / 10.1% / 9.6% of the rows) — presumably to mimic the share of missing
# flags in the test set; TODO confirm.
# The sampled values are row positions used as .loc labels, which is only correct
# while X_train still has its default 0..n-1 index.
__, idx = train_test_split(list(range(X_train.shape[0])), test_size=0.101, random_state=42)
X_train.loc[idx, 'alco'] = np.nan

__, idx = train_test_split(list(range(X_train.shape[0])), test_size=0.101, random_state=43)
X_train.loc[idx, 'smoke'] = np.nan

__, idx = train_test_split(list(range(X_train.shape[0])), test_size=0.096, random_state=44)
X_train.loc[idx, 'active'] = np.nan

# Chained again: smoke -> alco -> active, each using the flags filled before it.
train_cols = ['age', 'weight','height', 'gender', 'cholesterol', 'gluc', 'ap_hi', 'ap_lo']
X_train, X_test = get_sub_model(X_train, X_test, train_cols, 'smoke', clf)

train_cols = ['age', 'weight','height', 'gender', 'cholesterol', 'gluc', 'ap_hi', 'ap_lo', 'smoke']
X_train, X_test = get_sub_model(X_train, X_test, train_cols, 'alco', clf)

train_cols = ['age', 'weight','height', 'gender', 'cholesterol', 'gluc', 'ap_hi', 'ap_lo', 'smoke', 'alco']
X_train, X_test = get_sub_model(X_train, X_test, train_cols, 'active', clf)

# Refresh the stacked frame so that it reflects the imputed values.
X = pd.concat((X_train, X_test), axis=0)

In [74]:
# Candidate ingredients for a stratification key. In the visible code only a, b and h
# are used (to build `rew`); c..g are assigned but not referenced afterwards.
# Note that a, b and c reuse names from the log-loss scratch cells at the top.
a = X_train['age_years']
b = X_train['gender']
c = X_train['weight']//5
d = X_train['height']//5
e = X_train['ap_dif'] // 10
f = X_train['cholesterol']
g = X_train['gluc']
h = pd.Series(y_train)


In [75]:
# X.loc[X['imt']>0.006]
# X.loc[X['height']<100]
# X.loc[X['weight']<30]

In [76]:
# Stratification key passed to execute_model(stratification_groups=rew): the string
# forms of a, b and h concatenated per row (age in years, gender and target label,
# assuming a, b and h still hold the series assigned in the preceding cell).
rew = a.apply(str) + b.apply(str) + h.apply(str)
# (rew).value_counts()

In [77]:
# Model "test2": 5-fold CV plus test prediction. Only the two age features get
# target-mean encodings; 'id' and the raw age_years / age_group columns are dropped
# before fitting. Folds are stratified on `rew`.
execute_model(xgb.XGBClassifier(**params),
              X_train,
              y_train,
              X_test,
              model_name="test2",
              mean_columns = ['age_years', 'age_group'],
              drop_columns=["id",  'age_years', 'age_group', ],
              n_splits=5,
              stratification_groups=rew
             )



Logloss:
[0.53865280103144375, 0.53653914483065202, 0.54012481623526643, 0.54289168743819749, 0.53599795644458692]
mean: 0.538841281196
std: 0.00250890833563
test2 results saved!


0.53884128119602936

In [39]:
((X['imt']*1000000)//100).value_counts()

24.0     10856
23.0     10781
25.0      9342
26.0      7995
27.0      7473
22.0      6690
28.0      5926
29.0      5384
30.0      4765
31.0      4130
21.0      4081
32.0      3560
33.0      2718
20.0      2502
34.0      2265
35.0      1971
36.0      1464
19.0      1360
37.0      1132
38.0       921
39.0       728
18.0       701
40.0       546
41.0       445
17.0       370
42.0       348
44.0       232
43.0       225
16.0       148
46.0       139
45.0       121
47.0        91
48.0        83
49.0        53
50.0        50
15.0        49
52.0        37
51.0        27
14.0        20
53.0        20
55.0        18
54.0        15
56.0        15
57.0        10
58.0        10
59.0        10
60.0        10
65.0         8
62.0         6
13.0         6
66.0         4
3.0          4
61.0         3
11.0         3
69.0         2
9.0          2
10.0         2
86.0         2
12.0         2
72.0         2
133.0        1
152.0        1
8.0          1
64.0         1
68.0         1
84.0         1
67.0      

In [78]:
# Columns whose pairwise combinations become interaction features.
columns_to_interact = [
    'gender', 
    'height_round', 'weight_round', 'ap_hi_round', 'ap_lo_round', 'ap_dif_round', 'imt_round',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active',
       'age_years', 'age_group'
]

# Binned helper columns that are added to drop_columns in the later
# execute_model calls.
columns_to_drop = [
    'height_round', 'weight_round', 'ap_hi_round', 'ap_lo_round', 'ap_dif_round',
]

# Coarse bins of the continuous measurements, applied identically to both frames
# (one loop instead of two copy-pasted blocks, so the bin widths cannot drift apart).
for frame in (X_train, X_test):
    frame['height_round'] = frame['height'] // 5
    frame['weight_round'] = frame['weight'] // 5
    frame['ap_hi_round']  = frame['ap_hi'] // 10
    frame['ap_lo_round']  = frame['ap_lo'] // 10
    frame['ap_dif_round'] = frame['ap_dif'] // 10
    frame['imt_round'] = (frame['imt']*1000000)//100

# Pairwise (degree 2) interaction columns. `temp` is kept because a later cell takes
# the list of columns to label-encode from it.
# NOTE: re-running this cell appends the interaction columns a second time.
temp = generate_interactions(X_train, columns_to_interact, 2)
X_train = pd.concat((X_train, temp), axis=1)
X_test = pd.concat((X_test, generate_interactions(X_test, columns_to_interact, 2)), axis=1)

temp.columns

Index(['gender_height_round', 'gender_weight_round', 'gender_ap_hi_round',
       'gender_ap_lo_round', 'gender_ap_dif_round', 'gender_imt_round',
       'gender_cholesterol', 'gender_gluc', 'gender_smoke', 'gender_alco',
       'gender_active', 'gender_age_years', 'gender_age_group',
       'height_round_weight_round', 'height_round_ap_hi_round',
       'height_round_ap_lo_round', 'height_round_ap_dif_round',
       'height_round_imt_round', 'height_round_cholesterol',
       'height_round_gluc', 'height_round_smoke', 'height_round_alco',
       'height_round_active', 'height_round_age_years',
       'height_round_age_group', 'weight_round_ap_hi_round',
       'weight_round_ap_lo_round', 'weight_round_ap_dif_round',
       'weight_round_imt_round', 'weight_round_cholesterol',
       'weight_round_gluc', 'weight_round_smoke', 'weight_round_alco',
       'weight_round_active', 'weight_round_age_years',
       'weight_round_age_group', 'ap_hi_round_ap_lo_round',
       'ap_hi_round_ap_di

In [79]:
X = pd.concat((X_train, X_test), axis=0)

In [80]:
# Same label-encoding recipe as used for 'age_group' earlier, now applied to every
# generated interaction column (the names come from `temp`).
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
# Fresh dictionary: one LabelEncoder per column name, created on first access.
label_encoder_dict = defaultdict(LabelEncoder)
columns_to_encode = list(temp.columns)



# Fit on the stacked train+test frame so both share one mapping, then transform each.
X[columns_to_encode].apply(lambda x: label_encoder_dict[x.name].fit(x))
X_train[columns_to_encode] = X_train[columns_to_encode].apply(lambda x: label_encoder_dict[x.name].transform(x))
X_test[columns_to_encode] = X_test[columns_to_encode].apply(lambda x: label_encoder_dict[x.name].transform(x))

In [81]:
X_train.groupby(['age_group']).size()

age_group
1        4
2     3404
3     3319
4     3545
5     3712
6     4009
7     6635
8     6644
9     7475
10    7532
11    7097
12    6777
13    4924
14    4923
dtype: int64

In [82]:
columns_to_interact

['gender',
 'height_round',
 'weight_round',
 'ap_hi_round',
 'ap_lo_round',
 'ap_dif_round',
 'imt_round',
 'cholesterol',
 'gluc',
 'smoke',
 'alco',
 'active',
 'age_years',
 'age_group']

In [83]:
# Model "test1": target-mean encodings for the base interaction inputs only (the
# generated pairwise columns stay as label-encoded features). 'id', the raw age
# columns and the binned helper columns are dropped before fitting.
execute_model(xgb.XGBClassifier(**params),
              X_train,
              y_train,
              X_test,
              model_name="test1",
              mean_columns = columns_to_interact,# + columns_to_encode,
              drop_columns=["id",  'age_years', 'age_group', ] + columns_to_drop,
              n_splits=5,
              stratification_groups=rew
             )
# Logloss:
# [0.53606295374894086, 0.54558918257530531, 0.55208194023929991, 0.53862096692588302, 0.54030507154921392, 0.52728928389793639, 0.54445819770394066, 0.5471169774755773, 0.53937070812633525, 0.53763649226068233]
# mean: 0.54085317745
# std: 0.00652303329993

# Logloss:
# [0.53583038247638315, 0.54544756661076377, 0.55207535525647966, 0.53875549079382201, 0.54033393128170781, 0.52712229988336357, 0.54422856284791976, 0.54695083486057405, 0.53913600056444511, 0.53769506663089839]
# mean: 0.540757549121
# std: 0.00653365614016



Logloss:
[0.53811907823933691, 0.53687966559339328, 0.54060007002521782, 0.54424048765991262, 0.53534928879453503]
mean: 0.539037718062
std: 0.00311818393261
test1 results saved!


0.53903771806247913

In [84]:
# Model "test3": like "test1", but the generated pairwise interaction columns
# (columns_to_encode) receive target-mean encodings as well.
execute_model(xgb.XGBClassifier(**params),
              X_train,
              y_train,
              X_test,
              model_name="test3",
              mean_columns = columns_to_interact + columns_to_encode,
              drop_columns=["id",  'age_years', 'age_group', ] + columns_to_drop,
              n_splits=5,
              stratification_groups=rew
             )



Logloss:
[0.53868046242070411, 0.53747653413431473, 0.54043864369340444, 0.54435090061374836, 0.53568999099992265]
mean: 0.539327306372
std: 0.0029511149702
test3 results saved!


0.53932730637241888

In [19]:
# plt.subplots(figsize=(20,7))
# X.loc[(X['ap_lo']>200), 'ap_lo'].hist(bins=500)

# Show up to 500 rows when a frame is displayed in the inspection cells.
# The former 'display.height' option is no longer set: pandas reported it as
# deprecated in this cell's own output, and releases that removed it raise on it.
pd.set_option('display.max_rows', 500)

height has been deprecated.



In [279]:
X.loc[(X['ap_lo'].isnull())]

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,imt,ap_dif,age_years,age_group


In [281]:
X.loc[(X['ap_lo']%10!=0)]

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,imt,ap_dif,age_years,age_group
22,32,23046,1,158,90.0,145.0,85,2,2,0.0,0.0,1.0,0.003605,60,63.0,14
33,45,20652,1,160,73.0,130.0,85,1,1,0.0,0.0,0.0,0.002852,45,57.0,10
79,108,20370,2,164,74.0,140.0,85,1,1,0.0,0.0,0.0,0.002751,55,56.0,10
94,127,15946,2,185,88.0,133.0,89,2,3,0.0,0.0,1.0,0.002571,44,44.0,4
113,149,18938,1,157,79.0,125.0,65,1,2,0.0,0.0,1.0,0.003205,60,52.0,8
132,177,22585,2,164,84.0,150.0,89,2,2,0.0,0.0,1.0,0.003123,61,62.0,13
160,216,23117,2,180,104.0,120.0,85,2,2,0.0,0.0,1.0,0.003210,35,63.0,14
176,239,16621,1,177,81.0,120.0,63,1,1,0.0,0.0,1.0,0.002585,57,46.0,5
213,292,19085,1,154,64.0,110.0,65,1,1,0.0,0.0,1.0,0.002699,45,52.0,8
227,312,14373,1,168,68.0,120.0,79,1,1,0.0,0.0,1.0,0.002409,41,39.0,2


In [270]:
# Scratch experiment: for the rows whose ap_hi is below 10, estimate a plausible
# ap_hi from the other measurements and show it next to the recorded value.
idx = (10>X['ap_hi'])
what = 'ap_hi'
fromc = ['age', 'weight','height', 'gender', 'cholesterol', 'gluc', 'ap_lo']

# Training rows: ap_hi present and not one of the suspicious (< 10) values.
x_lo = X.loc[(~X[what].isnull())&(~idx), fromc + [what]]
y_lo = x_lo[what]
x_lo = x_lo.drop([what], axis=1)
model_lo = xgb.XGBRegressor(seed=1)
model_lo.fit(x_lo, y_lo)

# Predict for the suspicious rows only; 'raz' stays 0 everywhere else.
# This adds a 'raz' column to the shared frame X.
x2_lo = X.loc[idx, fromc]
X['raz'] = 0

X.loc[idx, 'raz'] = model_lo.predict(x2_lo)
X.loc[idx]

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,imt,ap_dif,age_years,age_group,raz
5382,7657,21240,1,162.0,69.0,7.0,80.0,1,1,0.0,0.0,1.0,0.002629,-73,58.0,11,122.588516
42334,60477,18716,1,171.0,80.0,1.0,108.0,1,1,0.0,0.0,1.0,0.002736,-1087,51.0,8,157.962418
1929,6580,19079,1,176.0,92.0,1.0,109.0,1,1,0.0,,1.0,0.00297,-1098,52.0,8,158.920624
12852,42755,19136,1,158.0,61.0,1.0,30.0,1,1,,,1.0,0.002444,-29,52.0,8,107.735397


In [271]:
X.loc[idx, ['id','ap_hi', 'ap_lo', 'raz']]

Unnamed: 0,id,ap_hi,ap_lo,raz
5382,7657,7.0,80.0,122.588516
42334,60477,1.0,108.0,157.962418
1929,6580,1.0,109.0,158.920624
12852,42755,1.0,30.0,107.735397


In [249]:
X.loc[X['ap_hi']<X['ap_lo']]

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,imt,ap_dif,age_years,age_group
474,681,19099,1,156.0,65.0,120.0,150.0,2,1,0.0,0.0,1.0,0.002671,-30,52.0,8
636,913,20457,2,169.0,68.0,70.0,110.0,1,1,0.0,0.0,1.0,0.002381,-40,56.0,10
2384,3356,23361,1,154.0,102.0,90.0,150.0,1,1,0.0,0.0,0.0,0.004301,-60,64.0,14
2990,4214,21957,2,182.0,90.0,80.0,140.0,3,3,0.0,0.0,1.0,0.002717,-60,60.0,12
3447,4880,19992,2,180.0,80.0,80.0,125.0,3,3,1.0,1.0,1.0,0.002469,-45,55.0,9
3623,5130,21874,1,160.0,83.0,80.0,120.0,1,1,0.0,0.0,1.0,0.003242,-40,60.0,12
4825,6836,19618,2,164.0,89.0,90.0,140.0,2,2,0.0,0.0,1.0,0.003309,-50,54.0,9
4830,6843,16969,2,159.0,68.0,70.0,100.0,1,1,0.0,0.0,0.0,0.00269,-30,46.0,5
4941,6992,20501,1,160.0,69.0,80.0,170.0,1,1,0.0,0.0,1.0,0.002695,-90,56.0,10
5121,7277,17600,2,173.0,78.0,90.0,140.0,1,1,0.0,1.0,1.0,0.002606,-50,48.0,6


In [216]:
# mmin = 99
# g=0
# for i in range(50,1050,50):
#     X_train["age_years"] = np.round(X_train["age"] / i)
#     f = execute_model(xgb.XGBClassifier(seed=1),
#               X_train,
#               y_train,
#               mean_columns=['age_years'],
#               drop_columns=["id", 'age_years', 'imt',],
#               n_splits=10
#              )
#     if f < mmin:
#         mmin = f
#         g = i
# print(i)

In [None]:
Logloss:
[0.53422969918597507, 0.54446269467012454, 0.55027883568192537, 0.53687928813613639, 0.53864020987537253, 0.52530850938530493, 0.54294463323440822, 0.54588922270463258, 0.53702049074152336, 0.53530571997748999]
mean: 0.539095930359
std: 0.00672598228164