In [2]:
import pandas as pd
import numpy as np

import random

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
import gc
import utils 

plt.style.use('ggplot')
%matplotlib inline

In [3]:
def clean_data(data):
    data['error_group'] = 0

    # weight/height correction
    idx = (data['height']<130) & (data['weight']>150)
    data.loc[idx, ["height", "weight"]] = data.loc[idx, ["weight", "height"]].values
#     data.loc[idx, 'error_group'] = 100-1
#     data.loc[data['weight']<20, "weight"] *= 10
#     data.loc[data['weight']<20, "weight"] *= 10
#     data.loc[data['weight']<25, "weight"] += 100

    # preasure correction

    data.loc[data["ap_hi"] < 0, "ap_hi"] *= -1
    data.loc[data["ap_lo"] < 0, "ap_lo"] *= -1
    
    for i in range(10):
        str_i = str(i)
        data['hi_' + str_i + 's'] = data['ap_hi'].apply(lambda x: str(x).count(str_i))
#         data[str(i)+'lo'] = data['ap_lo'].apply(lambda x: str(x).count(str(i)))
#         data[str(i)+'hilo'] = data[str(i)+'hi']+data[str(i)+'lo']
#         data=data.drop(str(i)+'lo', axis=1)
        for j in range(10):
            str_j = str_i + str(j)
            data['hi_' + str_j + 's'] = data['ap_hi'].apply(lambda x: str(x).count(str_j))
        

    data.loc[(data['ap_lo'] < 20), 'error_group'] = 5
    data.loc[(data['ap_hi'] < 50), 'error_group'] = 6
    data.loc[(data['ap_lo'] > 250), 'error_group'] = 1
    data.loc[(data['ap_lo'] > 4000), 'error_group'] = 2
    data.loc[(data['ap_hi'] > 250), 'error_group'] = 3
    data.loc[(data['ap_hi'] > 10000), 'error_group'] = 4

    data.loc[(data["ap_hi"] < 20) & (data["ap_hi"] > 10), "ap_hi"] *= 10
    data.loc[(data["ap_lo"] < 15) & (data["ap_lo"] > 2), "ap_lo"] *= 10

    idx = data['ap_hi'] > 10000
    data.loc[idx, 'ap_hi'] = 10 * (data.loc[idx, 'ap_hi'] // 1000)
    data.loc[data['ap_lo'] >= 10000, 'ap_lo'] //= 100

#     data.loc[data['ap_lo'].isin([1100])&(data['ap_hi']>160), 'ap_lo'] = 110
#     data.loc[data['ap_lo'].isin([1100]), 'ap_lo'] = 100
#     data.loc[(data['ap_lo']>250)&(data['ap_lo']<4000)&(data['ap_lo']%100==0), 'ap_lo'] /= 10
    
    manual_update = [
        
# id	age	gender	height	weight	ap_hi	ap_lo	cholesterol	gluc	smoke	alco	active	cardio	BMI
# 12494	16905	2	163	63.0	1	2088	1	1	1.0	0.0	1.0	0	23.711845
# 42591	18191	2	162	63.0	140	1900	1	1	1.0	0.0	1.0	1	24.005487
# 78873	20323	1	168	68.0	130	1900	1	1	0.0	0.0	1.0	0	24.092971
# 51749	18419	1	169	62.0	1	2088	1	1	0.0	0.0	1.0	-5	21.707923
        (12494, ['ap_hi', 'ap_lo'], [120, 80]),
        (42591, ['ap_hi', 'ap_lo'], [140, 90]), # ?
        (78873, ['ap_hi', 'ap_lo'], [130, 100]), # ?
        (51749, ['ap_hi', 'ap_lo'], [120, 80]),
        
# 57807	20496	1	164	62.0	70	1100	1	1	0.0	0.0	0.0	0	23.051755
# 60477	18716	1	171	80.0	1	1088	1	1	0.0	0.0	1.0	1	27.358845
# 91198	18182	2	186	95.0	100	901	2	2	0.0	0.0	1.0	0	27.459822
# 6580	19079	1	176	92.0	1	1099	1	1	0.0	NaN	1.0	-5	29.700413
        (57807, ['ap_hi', 'ap_lo'], [170, 100]),
        (60477, ['ap_hi', 'ap_lo'], [110, 80]),
        (91198, ['ap_hi', 'ap_lo'], [100, 90]),
        (6580,  ['ap_hi', 'ap_lo'], [110, 90]),
        
# 44701	22801	1	163	115.0	20	170	1	1	0.0	0.0	1.0	1	43.283526
# 94673	22551	1	169	88.0	10	160	3	3	0.0	0.0	0.0	1	30.811246
        (44701, ['ap_hi', 'ap_lo'], [120, 70]),
        (94673,  ['ap_hi', 'ap_lo'], [110, 60]),
        
    ]
    for idx, cols, update in manual_update:
        data.loc[data['id']==idx, cols] = update
        

    return data

In [4]:
train = pd.read_csv('train_original.csv', sep=';')
test = pd.read_csv('test_original.csv', sep=';', na_values='None')

train = clean_data(train)
test = clean_data(test)

train = utils.new_features(train)
test = utils.new_features(test)

X_train = train.drop([ 'cardio'], axis=1)
y_train = train['cardio'].values.ravel()
X_test = test

from sklearn.model_selection import train_test_split
__, idx = train_test_split(list(range(X_train.shape[0])), test_size=0.101, random_state=12321)
X_train.loc[idx, 'alco'] = np.nan
__, idx = train_test_split(list(range(X_train.shape[0])), test_size=0.101, random_state=11111)
X_train.loc[idx, 'smoke'] = np.nan
__, idx = train_test_split(list(range(X_train.shape[0])), test_size=0.096, random_state=32123)
X_train.loc[idx, 'active'] = np.nan

In [76]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

def impute_nans(X_train, X_test, columns, mean_columns, target, perform_cv = False):
    X = pd.concat((X_train, X_test), axis=0)
    train_idx = ~X[target].isnull()
#     model = clone(model)
#     model.fit(X.loc[train_idx, columns], X.loc[train_idx, target])
#     X_train.loc[X_train[target].isnull(), target] = model.predict(X_train.loc[X_train[target].isnull(), columns])
#     X_test.loc[X_test[target].isnull(), target] = model.predict(X_test.loc[X_test[target].isnull(), columns])
    
    drop_columns = [c for c in X_train.columns if c not in columns]
    
    params1 = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'n_estimators': 200,
    'subsample': 0.8,
    'n_jobs': 1,
    'random_state': 2707,
    'silent': True,
    }

    def create_callback(tr, tst):
        return xgb.XGBClassifier(**params1)
    
    params2 = {
        'colsample_bytree': 0.8,
        'learning_rate': 0.1,
        'n_estimators': 44,
        'subsample': 0.8,

        'nthread': 1,
        'seed': 2707,
        'silent': True,
    }
    def create_callback2(tr, tst):
        return lgb.LGBMClassifier(**params2)
    
    params3 = {
        'colsample_bytree': 0.8,
        'learning_rate': 0.1,
        'n_estimators': 200,
        'subsample': 0.8,
        'tree_method': 'hist',
        'grow_policy': 'lossguide',

        'n_jobs': 1,
        'random_state': 2707,
        'silent': True,
    }
    def create_callback3(tr, tst):
        return xgb.XGBClassifier(**params3)
    
    if perform_cv:
        n_folds = 7
        print(X.loc[train_idx, target].mean(), 1-X.loc[train_idx, target].mean())
        kf = StratifiedKFold(random_state=11111, n_splits=n_folds, shuffle=True)
        fold_score = []
        train = X.loc[train_idx]
        y = train[target].values.ravel()
        for train_idx, test_idx in kf.split(train, train[target]):
            predict1 = utils.fit_predict_model(create_callback,
                                                        train.iloc[train_idx],
                                                        y[train_idx],
                                                        train.iloc[test_idx],
                                                        mean_columns=mean_columns,
                                                        drop_columns=drop_columns,
                                                        alpha=13)
            
            predict2 = utils.fit_predict_model(create_callback2,
                                                        train.iloc[train_idx],
                                                        y[train_idx],
                                                        train.iloc[test_idx],
                                                        mean_columns=mean_columns,
                                                        drop_columns=drop_columns,
                                                        alpha=13)
            
            predict3 = utils.fit_predict_model(create_callback3,
                                                        train.iloc[train_idx],
                                                        y[train_idx],
                                                        train.iloc[test_idx],
                                                        mean_columns=mean_columns,
                                                        drop_columns=drop_columns,
                                                        alpha=13)
            
            predict = (predict1+predict2+predict3)/3
            
            fold_score.append(accuracy_score(y[test_idx], np.round(predict)))
            print(np.round(predict).mean(), predict.mean())

        print(target, np.mean(fold_score), np.std(fold_score))

In [61]:
X_test['cardio']=-1
X_train['cardio']=y_train
columns = ['age_group', 'gender', 'weight', 'ap_hi','alco', 'active', 'ap_lo', 'cholesterol', 'gluc','BMI', 'ap_dif', 'MAP']
impute_nans(X_train, X_test, columns, ['alco'], 'smoke', perform_cv = True)

0.08761957730812013 0.9123804226918799


ValueError: cannot reindex from a duplicate axis

In [28]:
smoke 0.921668557493 0.00139147924516

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke',
       ...
       'ap_dif', 'MAP', 'age_years', 'age_group', 'age_group_MAPX',
       'ap_hi_group', 'ap_lo_group', 'weight_group', 'height_group',
       'BMI_group'],
      dtype='object', length=137)

In [84]:
smoke 0.912213571502 0.912213571502

SyntaxError: invalid syntax (<ipython-input-84-e5e11a6c8f55>, line 1)

In [85]:
X_train.columns.get_loc(['ap_hi', 'ap_lo'])

TypeError: '['ap_hi', 'ap_lo']' is an invalid key

In [64]:
np.ndarray((10,5))[:, 1]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [70]:
from sklearn.utils import safe_indexing,column_or_1d

# column_or_1d(X_train, 1)

In [74]:
X_train.columns.where(X_train.columns=='height')

Index([     nan,      nan,      nan, 'height',      nan,      nan,      nan,
            nan,      nan,      nan,
       ...
            nan,      nan,      nan,      nan,      nan,      nan,      nan,
            nan,      nan,      nan],
      dtype='object', length=138)

In [161]:
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score
from sklearn.base import clone

def impute_nans(model, X_train, X_test, columns, target, perform_cv = False, scoring='accuracy'):
    X = pd.concat((X_train, X_test), axis=0)
    train_idx = ~X[target].isnull()
    model = clone(model)
    model.fit(X.loc[train_idx, columns], X.loc[train_idx, target])
    X_train.loc[X_train[target].isnull(), target] = model.predict(X_train.loc[X_train[target].isnull(), columns])
    X_test.loc[X_test[target].isnull(), target] = model.predict(X_test.loc[X_test[target].isnull(), columns])
    
    if perform_cv:
        n_folds = 7
        kf = StratifiedKFold(random_state=111, n_splits=n_folds, shuffle=True)
        scores = cross_val_score(model, X.loc[train_idx, columns], X.loc[train_idx, target], cv=kf, scoring=scoring)
        print(target, np.mean(scores), 'on', n_folds, 'folds', 'with mean on train', X.loc[train_idx, target].mean())
    
params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'n_estimators': 200,
    'subsample': 0.8,
    'n_jobs': 4,
    'random_state': 2707,
    'silent': True,
}
clf = xgb.XGBClassifier(**params)

columns = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc','BMI', 'ap_dif', 'MAP']
impute_nans(clf, X_train, X_test, columns, 'smoke')#, perform_cv = True)

columns = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc','BMI', 'ap_dif', 'MAP', ]
impute_nans(clf, X_train, X_test, columns, 'alco')#, perform_cv = True)

columns = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc','BMI', 'ap_dif', 'MAP', ]
impute_nans(clf, X_train, X_test, columns, 'active')#, perform_cv = True)

In [162]:
columns_to_interact = ['age_group', 'gender', 'MAP', 'error_group', 'cholesterol', 'gluc', 
                       'BMI_group', 'smoke', 'alco', 'active', "ap_hi_group", "ap_lo_group",
                      "weight_group", "height_group"]
temp = utils.generate_interactions(X_train, columns_to_interact, degree=5, white_list=best_interactions)
X_train = pd.concat((X_train, temp), axis=1)
X_test = pd.concat((X_test, utils.generate_interactions(X_test, columns_to_interact, degree=5, white_list=best_interactions)), axis=1)
print(temp.columns)

X = pd.concat((X_train, X_test), axis=0)

Index(['age_group_gender', 'age_group_gluc', 'gender_MAP', 'gender_smoke',
       'gender_active', 'MAP_error_group', 'MAP_smoke', 'MAP_active',
       'error_group_gluc', 'error_group_active', 'cholesterol_gluc',
       'gluc_smoke', 'gluc_alco', 'gluc_active', 'smoke_active',
       'age_group_gender_error_group', 'age_group_gender_gluc',
       'age_group_gender_smoke', 'age_group_gender_alco',
       'age_group_error_group_gluc', 'age_group_error_group_alco',
       'age_group_gluc_smoke', 'age_group_gluc_alco', 'age_group_gluc_active',
       'gender_MAP_error_group', 'gender_MAP_smoke', 'gender_MAP_alco',
       'gender_cholesterol_gluc', 'gender_cholesterol_smoke',
       'gender_cholesterol_active', 'gender_gluc_smoke', 'gender_gluc_active',
       'gender_smoke_active', 'gender_alco_active', 'MAP_error_group_smoke',
       'MAP_error_group_alco', 'MAP_gluc_active', 'MAP_BMI_group_alco',
       'MAP_smoke_alco', 'MAP_alco_height_group',
       'error_group_cholesterol_gluc', 'e

In [224]:
use_columns = [
#     'age_group', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke', 'alco', 'active',
# 'BMI',
#  'MAP',
#     'ap_dif',
    
"age_group",
"gender",
"height",
"ap_hi",
"ap_lo",
"cholesterol",
"gluc",
"smoke",
"alco",
"active",
# "error_group",
# "lo_14s",
# "hi_2s",
"BMI",
"ap_dif",
"MAP",

"age_group_MAPX",
]
mean_columns = [
    'cholesterol_gluc_smoke_active',
    'ap_hi',
    "error_group",
    'gluc', 
    'MAP_active',
    'gender_MAP', 
    'cholesterol_gluc',
    'gluc_smoke',
                'gluc_active', 'gluc_alco', 'smoke_active', 'cholesterol_gluc_active', 'gender_cholesterol_gluc'
] #, 'age_group_MAP', 'gender_MAP'

# for i in range(10):
#         str_i = str(i)
#         use_columns.append(str_i)
#         for j in range(10):
#             str_j = str_i + str(j)
#             use_columns.append(str_j)

In [225]:
import gc
gc.collect()

168

In [240]:
X_train['cholesterol_gluc_smoke_active'].value_counts()

1.0_1.0_0.0_1.0    36879
1.0_1.0_0.0_0.0     8125
2.0_1.0_0.0_1.0     4975
3.0_1.0_0.0_1.0     3178
1.0_1.0_1.0_1.0     3135
3.0_3.0_0.0_1.0     2632
2.0_2.0_0.0_1.0     1796
1.0_2.0_0.0_1.0     1616
1.0_3.0_0.0_1.0     1132
2.0_1.0_0.0_0.0     1085
3.0_3.0_0.0_0.0      626
2.0_1.0_1.0_1.0      569
3.0_1.0_0.0_0.0      533
1.0_1.0_1.0_0.0      533
2.0_2.0_0.0_0.0      439
1.0_2.0_0.0_0.0      401
3.0_2.0_0.0_1.0      385
3.0_1.0_1.0_1.0      312
2.0_3.0_0.0_1.0      292
1.0_3.0_0.0_0.0      227
1.0_2.0_1.0_1.0      188
3.0_3.0_1.0_1.0      176
2.0_2.0_1.0_1.0      167
2.0_1.0_1.0_0.0      100
1.0_3.0_1.0_1.0       99
3.0_2.0_0.0_0.0       80
3.0_1.0_1.0_0.0       55
2.0_3.0_0.0_0.0       54
3.0_2.0_1.0_1.0       42
3.0_3.0_1.0_0.0       38
2.0_2.0_1.0_0.0       37
1.0_2.0_1.0_0.0       30
2.0_3.0_1.0_1.0       29
1.0_3.0_1.0_0.0       20
3.0_2.0_1.0_0.0        9
2.0_3.0_1.0_0.0        6
Name: cholesterol_gluc_smoke_active, dtype: int64

In [251]:
train = X_train.copy()
train["target"] = y_train
test = X_test.copy()
c = ['cholesterol', 'gluc','smoke','active','MAP']
K = train.groupby(c).size()
mean_loc = train.groupby(c)["target"].mean()
values = (mean_loc * K + 0.5 * 13) / (K + 13)
values.name='RAZ'
test = test.join(values, on=c)
test.loc[test[values.name].isnull()]

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,...,gender_cholesterol_gluc_active,gender_gluc_smoke_active,cholesterol_gluc_smoke_alco,cholesterol_gluc_smoke_active,cholesterol_gluc_alco_active,age_group_gender_gluc_smoke_active,gender_cholesterol_gluc_smoke_active,gender_cholesterol_gluc_alco_active,cholesterol_gluc_smoke_alco_active,RAZ
37,151,20891,2,157,66.0,150,1000,3,1,1.0,...,2.0_3.0_1.0_0.0,2.0_1.0_1.0_0.0,3.0_1.0_1.0_1.0,3.0_1.0_1.0_0.0,3.0_1.0_1.0_0.0,11.0_2.0_1.0_1.0_0.0,2.0_3.0_1.0_1.0_0.0,2.0_3.0_1.0_1.0_0.0,3.0_1.0_1.0_1.0_0.0,
279,940,16186,1,155,103.0,119,72,2,2,0.0,...,1.0_2.0_2.0_1.0,1.0_2.0_0.0_1.0,2.0_2.0_0.0_0.0,2.0_2.0_0.0_1.0,2.0_2.0_0.0_1.0,4.0_1.0_2.0_0.0_1.0,1.0_2.0_2.0_0.0_1.0,1.0_2.0_2.0_0.0_1.0,2.0_2.0_0.0_0.0_1.0,
297,1020,18719,2,166,62.0,120,71,3,1,1.0,...,2.0_3.0_1.0_1.0,2.0_1.0_1.0_1.0,3.0_1.0_1.0_0.0,3.0_1.0_1.0_1.0,3.0_1.0_0.0_1.0,8.0_2.0_1.0_1.0_1.0,2.0_3.0_1.0_1.0_1.0,2.0_3.0_1.0_0.0_1.0,3.0_1.0_1.0_0.0_1.0,
303,1079,22571,2,170,74.0,400,60,1,1,0.0,...,2.0_1.0_1.0_1.0,2.0_1.0_0.0_1.0,1.0_1.0_0.0_0.0,1.0_1.0_0.0_1.0,1.0_1.0_0.0_1.0,13.0_2.0_1.0_0.0_1.0,2.0_1.0_1.0_0.0_1.0,2.0_1.0_1.0_0.0_1.0,1.0_1.0_0.0_0.0_1.0,
328,1157,22018,2,176,80.0,132,80,1,1,1.0,...,2.0_1.0_1.0_1.0,2.0_1.0_1.0_1.0,1.0_1.0_1.0_0.0,1.0_1.0_1.0_1.0,1.0_1.0_0.0_1.0,12.0_2.0_1.0_1.0_1.0,2.0_1.0_1.0_1.0_1.0,2.0_1.0_1.0_0.0_1.0,1.0_1.0_1.0_0.0_1.0,
342,1217,18036,1,155,110.0,145,90,2,1,0.0,...,1.0_2.0_1.0_0.0,1.0_1.0_0.0_0.0,2.0_1.0_0.0_0.0,2.0_1.0_0.0_0.0,2.0_1.0_0.0_0.0,7.0_1.0_1.0_0.0_0.0,1.0_2.0_1.0_0.0_0.0,1.0_2.0_1.0_0.0_0.0,2.0_1.0_0.0_0.0_0.0,
448,1602,18122,2,171,76.0,180,110,3,3,1.0,...,2.0_3.0_3.0_0.0,2.0_3.0_1.0_0.0,3.0_3.0_1.0_0.0,3.0_3.0_1.0_0.0,3.0_3.0_0.0_0.0,7.0_2.0_3.0_1.0_0.0,2.0_3.0_3.0_1.0_0.0,2.0_3.0_3.0_0.0_0.0,3.0_3.0_1.0_0.0_0.0,
644,2195,19640,2,182,90.0,160,1110,1,2,0.0,...,2.0_1.0_2.0_0.0,2.0_2.0_0.0_0.0,1.0_2.0_0.0_0.0,1.0_2.0_0.0_0.0,1.0_2.0_0.0_0.0,9.0_2.0_2.0_0.0_0.0,2.0_1.0_2.0_0.0_0.0,2.0_1.0_2.0_0.0_0.0,1.0_2.0_0.0_0.0_0.0,
724,2475,19553,1,155,72.0,95,70,1,1,1.0,...,1.0_1.0_1.0_0.0,1.0_1.0_1.0_0.0,1.0_1.0_1.0_0.0,1.0_1.0_1.0_0.0,1.0_1.0_0.0_0.0,9.0_1.0_1.0_1.0_0.0,1.0_1.0_1.0_1.0_0.0,1.0_1.0_1.0_0.0_0.0,1.0_1.0_1.0_0.0_0.0,
995,3430,18771,1,159,85.0,160,1000,3,3,0.0,...,1.0_3.0_3.0_0.0,1.0_3.0_0.0_0.0,3.0_3.0_0.0_0.0,3.0_3.0_0.0_0.0,3.0_3.0_0.0_0.0,8.0_1.0_3.0_0.0_0.0,1.0_3.0_3.0_0.0_0.0,1.0_3.0_3.0_0.0_0.0,3.0_3.0_0.0_0.0_0.0,


In [227]:
# X_test['cholesterol_gluc_smoke_active'].value_counts()

In [228]:
#  5	(0.5387210612358031, 0.53908495582477467)
#  6	(0.53875910103167679, 0.53914751218811652)
#  7	(0.53880255564816204, 0.53916006288317908)
#  8	(0.53885523453108464, 0.53925112418043775)
#  9	(0.53883513664677474, 0.53926419236173784)
# 10	(0.53859977707105033, 0.53901168020149492)
# 11	(0.53885940801773513, 0.5392083353358218)
# 12	(0.5387449524031015, 0.53910693602820126)
# 13	(0.53860169446196149, 0.53894647849839095)
# 14	(0.53868075060547327, 0.53902469873650505)
# 15	(0.53862027392521605, 0.53901222140087546)
# 16	(0.53888470508839914, 0.53924245981066532)
# 17	(0.53871842606094211, 0.53910013326813211)
# 18	(0.53875072382243228, 0.53913302528622098)
# 19	(0.53871978044672797, 0.53907222634869367)
# 20	(0.53873956020603608, 0.5391327381456501)
# 21	(0.53867673120877213, 0.53906722821160313)
# 22	(0.53872675503165057, 0.53909177387999974)
# 23	(0.5387593012507248, 0.53915932529166388)
# 24	(0.53871720156688541, 0.53901944164662974)
# 25	(0.53871319790519423, 0.53907590822454066)
# 26	(0.5387557838828374, 0.53917015878887609)
# 27	(0.53871462107471579, 0.5390504570167638)

In [230]:
import utils
params = {
    'colsample_bytree': 0.8,
    #'gamma': 0.125,
    'learning_rate': 0.1,
#     'max_depth': 4,
    # 'min_child_weight': 1,
    'n_estimators': 224,
    'subsample': 0.8,
    # 'reg_alpha': 0.0,
    # 'reg_lambda': 2.0,
    
    'n_jobs': 4,
    'random_state': 2707,
    'silent': True,
}

utils.execute_model(xgb.XGBClassifier(**params),
              X_train,
              y_train,
              X_test,
              model_name="model1-",
               mean_columns = mean_columns,
              use_columns = use_columns,
              n_splits=15,
              n_folds=10,
#               stratification_groups=rew,
              alpha=13,
#               seed=150
             )
gc.collect()
# 10 folds logloss:
# [0.53879171211127175, 0.54238427117159227, 0.53835795056164537, 0.53590450923434263, 0.53965623001675889, 0.53466719153279707, 0.53797157618311608, 0.54158551989326476, 0.53629106584952935, 0.541785575506043]
# mean: 0.538739560206
# std: 0.00250712719565
# 15 Splits logloss:
# [0.53990746477150675, 0.54170948794032159, 0.53577194222410518, 0.54126022680376951, 0.53938495633157968, 0.53626944090268058, 0.54246021538176914, 0.53646336087510926, 0.54158070565743344, 0.53745181211087723, 0.53646428880860408, 0.53886036689953631, 0.54046600754234764, 0.53630953872061249, 0.54263125721449978]
# mean: 0.539132738146
# std: 0.00241552376841
# model1+ results saved!


10 folds logloss:
[0.53780406650227752, 0.54321798307690283, 0.53768109113512974, 0.53502370267065347, 0.53999987807637873, 0.53447152769677753, 0.53612779679476119, 0.54165492304339824, 0.53749675539156083, 0.54079713834002863]
mean: 0.538427486273
std: 0.00275297684858
15 Splits logloss:
[0.54000550939349135, 0.54164752770875035, 0.53500783416896258, 0.54075476522308352, 0.53915419898300765, 0.53591072473764945, 0.54240650931976875, 0.53601667667049235, 0.54082981467044777, 0.53698305442433381, 0.53655938510635337, 0.53819483177733252, 0.54006342031754306, 0.53584646522918367, 0.54186097923605503]
mean: 0.538749446464
std: 0.00244198474517
model1- results saved!


854

In [231]:
params = {
    'colsample_bytree': 0.8,
    #'gamma': 0.125,
    'learning_rate': 0.1,
    # 'max_depth': 9,
    # 'min_child_weight': 1,
    'n_estimators': 200,
    'subsample': 0.8,
    # 'reg_alpha': 0.0,
    # 'reg_lambda': 2.0,
    'tree_method': 'hist',
    'grow_policy': 'lossguide',
    
    'n_jobs': 1,
    'random_state': 2707,
    'silent': True,
}

utils.execute_model(xgb.XGBClassifier(**params),
              X_train,
              y_train,
              X_test,
              model_name="model2-",
              mean_columns = mean_columns,
              use_columns = use_columns,
              n_splits=15,
              n_folds=10,
#               stratification_groups=rew,
              alpha=13,
             )
gc.collect()
# ds logloss:
# [0.53838775050345211, 0.54261874382933317, 0.53928867846307305, 0.53517951068627134, 0.53954022503431909, 0.53495162372994898, 0.53832780278071102, 0.54151861980412785, 0.53622335988762093, 0.54144787624529611]
# mean: 0.538748419096
# std: 0.00254352152431
# 15 Splits logloss:
# [0.53975054028836633, 0.54185785865419411, 0.53596000542052324, 0.541394881952169, 0.53921806647281545, 0.53599988924526454, 0.54255523619557255, 0.53663155144904706, 0.54115136838759059, 0.53740464346588646, 0.5365239498279466, 0.53874940825129614, 0.54019631961036263, 0.53671776224865975, 0.54252445237787772]
# mean: 0.539109062257
# std: 0.002355921174
# model2+ results saved!


10 folds logloss:
[0.53809696971452015, 0.54267932448817846, 0.53726353840112029, 0.53544755255448584, 0.5402763877548139, 0.53406033898307315, 0.53703481750885407, 0.54135055841265689, 0.53717603217301857, 0.54038460278769362]
mean: 0.538377012278
std: 0.00258393893803
15 Splits logloss:
[0.53966115798704184, 0.54167830830454466, 0.53496114205938894, 0.54087141052834009, 0.53897133728340307, 0.53589446620906811, 0.54223077546604503, 0.53641504173447263, 0.54053579379282068, 0.53726932214346268, 0.53622725034449714, 0.53836969903162391, 0.53996776625857246, 0.53606081640943903, 0.54188865725515345]
mean: 0.538733529654
std: 0.00237324027804
model2- results saved!


854

In [232]:
import lightgbm as lgb
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import KFold

params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
#     'num_leaves': 2**3,
#     'min_child_weight': 1,
    'n_estimators': 44,
    'subsample': 0.8,
#     'reg_alpha': 1.2,
#     'reg_lambda': 0.0,
       
    'nthread': 1,
    'seed': 2707,
    'silent': True,
}

#kf = KFold(n_splits=7, random_state=12345)
utils.execute_model(lgb.LGBMClassifier(**params), #CalibratedClassifierCV(lgb.LGBMClassifier(**params), cv=kf),
              X_train,
              y_train,
              X_test,
              model_name="model3-",
              mean_columns = mean_columns,
              use_columns = use_columns,
              n_splits=15,
              n_folds=10,
#               stratification_groups=rew,
              alpha=13
             )
gc.collect()
# 10 folds logloss:
# [0.53878261099232327, 0.54210784783582577, 0.53809482292303445, 0.53609552244928205, 0.54026202753931118, 0.53530128563380508, 0.53786962161525931, 0.54130416790795344, 0.53657631979284648, 0.54054578419130106]
# mean: 0.538694001088
# std: 0.00219428300863
# 15 Splits logloss:
# [0.53948937210350822, 0.54161359215348182, 0.53507513769243864, 0.54104127811197134, 0.53934383317359258, 0.53645848684775677, 0.54283650309623088, 0.53660968529008857, 0.54159792050850664, 0.53694623232395022, 0.53561972528550217, 0.5387600102110226, 0.53955752137824287, 0.53634922619574232, 0.54234585724637496]
# mean: 0.538909625441
# std: 0.00251014294768
# model3+ results saved!


10 folds logloss:
[0.53813999550530101, 0.54156197829297859, 0.53776042160086701, 0.53589112216067902, 0.54032527004830488, 0.53453887887254303, 0.53774912696694455, 0.54206452028545016, 0.5363155730521858, 0.54157050130437412]
mean: 0.538591738809
std: 0.00251526896901
15 Splits logloss:
[0.54000451972587482, 0.54157275118553672, 0.53512486654736513, 0.54144504712359687, 0.53922167136464838, 0.53688530338119778, 0.54294510573581534, 0.53671041587250357, 0.54090389878424905, 0.53698476620290192, 0.53586473090919862, 0.53861995242703531, 0.53941241054626288, 0.53601253218752976, 0.54198752987790799]
mean: 0.538913033458
std: 0.00244291407569
model3- results saved!


903

In [174]:
# from sklearn.neural_network import MLPClassifier
# from sklearn.preprocessing import StandardScaler  

# scaler = StandardScaler()  
# X = pd.concat((X_train, X_test), axis=0)
# scaler.fit(X)
# gc.collect()

# clf = MLPClassifier(solver='adam', activation='relu', learning_rate = 'adaptive', tol = 1e-5,
#                     hidden_layer_sizes=(8, 3), random_state=1100)

# utils.execute_model(clf,
#               scaler.transform(X_train),
#               y_train,
#               scaler.transform(X_test) ,
#               model_name="model4",
#               mean_columns = mean_columns,
#               use_columns = use_columns,
#               n_splits=15,
#               n_folds=10,
#               alpha=20
#              )
# gc.collect()

In [239]:
models = ['model1','model2','model3','model1+','model2+','model3+']
result1 = utils.merge_models(models, method='mean')
pd.DataFrame(result1).to_csv('models1234_mean.csv', index=False, header=False, sep=';')


model1
0.538618972443	0.539337497689	0.538618972443	0.539337497689

model2
0.538615159126	0.539160708115	0.538496052347	0.539114773246

model3
0.538702687693	0.539402696417	0.538141859286	0.538727942758

model1+
0.538601643568	0.539670976936	0.538094189224	0.538763999819

model2+
0.538517667601	0.539501720811	0.538092073243	0.538807972121

model3+
0.538577200052	0.539349318922	0.537998452494	0.538697384187


In [238]:
models = ['model1','model2','model3']
result2 = utils.merge_models(models, method='mean')
pd.DataFrame(result2).to_csv('models1234+_mean.csv', index=False, header=False, sep=';')


model1
0.538618972443	0.539337497689	0.538618972443	0.539337497689

model2
0.538615159126	0.539160708115	0.538496052347	0.539114773246

model3
0.538702687693	0.539402696417	0.538141859286	0.538727942758


In [117]:
# X_test['error_group']==0

In [237]:
# pd.Series(result1-result2).hist(bins=100)

In [236]:
# pd.Series((result1-result2)[X_test['error_group']==0]).hist(bins=100)

In [235]:
# pd.Series((result1-result2)[X_test['error_group']!=0]).hist(bins=100)

In [128]:
result3 = result2.copy()
result3[X_test['error_group']==0] = result1[X_test['error_group']==0]

In [129]:
pd.DataFrame(result3).to_csv('test_mean.csv', index=False, header=False, sep=';')

In [185]:
import importlib
utils = importlib.reload(utils)

In [None]:
# 0.537797383673	0.538468964022 = 0.5433098
# 0.537998452494	0.538697384187 = 0.5433633
# 0.538141859286	0.538727942758 = 0.5434983

In [None]:
# 0.537735274001	0.537762262313 = 0.5439345
# 0.538040557134	0.538014744278 = 0.5436639
# 0.5374	0.5374                 = 0.5440495
# 0.537603843701	0.537606129043 = 0.5448082

# Со столбцами Стаса альфа=20
# 0.537387582699	0.537559051543 = 0.5482682

# 0.537684932504	0.537640948053 = 0.5514443 добавил один столбик "по Стасу"
# 0.537748490027	0.537673097079 = 0.5517563 20 фолдов для подсчета среднего

# Без них
# 0.538947547578	0.538808662263 = 0.5441786
# 0.538231869380	0.538120316604 = 0.5435621