In [1]:
###############################
#                             #
#        ENCODE FACTORS       #
#                             #
###############################
import pandas as pd
# performs label encoding
from sklearn.preprocessing import LabelEncoder
def label_encoding(df_train, df_valid, df_test):
    """Integer-encode the object-dtype columns of three data splits with a shared mapping.

    The columns to encode are the object-dtype columns of ``df_train``. For each
    of them the encoder is fitted on the values of all three frames together and
    then applied to each frame.

    The frames are modified in place; the same three objects are returned as a
    tuple ``(df_train, df_valid, df_test)``.
    """
    encoder = LabelEncoder()
    splits = (df_train, df_valid, df_test)

    for col in df_train.select_dtypes('object').columns:
        # build the vocabulary from every split so each split can be transformed
        vocabulary = []
        for part in splits:
            vocabulary.extend(part[col].values)
        encoder.fit(vocabulary)

        # overwrite the column in train, valid, test order
        for part in splits:
            part[col] = encoder.transform(list(part[col].values))

    return df_train, df_valid, df_test

import numpy as np
def reduce_mem_usage(df, verbose = True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered. Integer columns are cast to the first of int8, int16, int32,
    int64 whose range contains the column's min and max (bounds inclusive, so a
    column spanning exactly -128..127 becomes int8). Float columns are cast to
    float16 or float32 when their range fits, otherwise float64.

    Caveat: the float check looks at range only, so casting to float16/float32
    can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        If true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates   = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # an empty column has NaN min/max: every comparison is False and
            # the column is left untouched
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # out of float32 range, or all-NaN / empty column
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # avoid a nan percentage when the frame occupies no memory at all
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [2]:
# read the raw training and unlabeled CSV files; y is the hospital_death column of the training data
train_, test_ = map(pd.read_csv, ('../raw/training.csv', '../raw/unlabeled.csv'))
y = train_['hospital_death']

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
############ LIBRARIES

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('dark_background')
%matplotlib inline

import os
import time
import random
import multiprocessing
import pickle

import scipy.stats
from scipy.stats import rankdata

import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb

from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.decomposition import PCA, FastICA, FactorAnalysis
from sklearn.metrics import log_loss, roc_auc_score, confusion_matrix
from sklearn.neighbors import NearestNeighbors

In [4]:
############ RANDOMNESS

# seed function
def seed_everything(seed = 23):
    """Seed Python's ``random`` and NumPy's global generator and export PYTHONHASHSEED.

    ``PYTHONHASHSEED`` is written to ``os.environ`` as the string form of ``seed``.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

# set seed
seed = 23
seed_everything(seed = seed)

In [5]:
# list the stored OOF prediction files in sorted order, skipping Jupyter's
# checkpoint folder; filtering (rather than list.remove) does not raise when
# the folder is absent
files = sorted(f for f in os.listdir("../oof_preds") if f != '.ipynb_checkpoints')

In [11]:
# prediction files used as stacking inputs; the same file names are read from
# ../oof_preds and ../submissions further down
files = [
    '09072.csv',
    'lgb_v17_90519.csv',
    'lgb_v73_seed111_90648.csv',
    'lgb_v80_seed111_90642.csv',
    'lgb_v84_seed111_90668.csv',
    'lgb_v92_seed111_90701.csv',
    'lgb_v97_seed111_90680.csv',
    'lgb_v11_90707.csv',
    'lgb_v99_seed111_90749.csv',
    'goss_v1_seed111_90736.csv',
    'goss_v2_seed111_90716.csv',
    'goss_v3_seed111_90704.csv',
    'goss_v5_seed111_90764.csv',
]

In [12]:
# reload the training data and keep only the rows whose hospital_death label is present
train_ = pd.read_csv('../raw/training.csv')
# notnull() states the intent directly; unary minus on a boolean mask only
# works through a pandas special case and is rejected by plain numpy arrays
train_ = train_[train_['hospital_death'].notnull()]
train_.shape

(91713, 186)

In [13]:
# two separate empty frames with one column per prediction file
train, test = (pd.DataFrame(columns = files) for _ in range(2))

In [14]:
# target vector: the hospital_death column of the filtered training data
y = train_.loc[:, 'hospital_death']

In [15]:
# read every OOF prediction file into its own column of `train`
for file in files:
    predictions = pd.read_csv(f'../oof_preds/{file}')
    if file == files[0]:
        # the first file supplies the row identifiers
        train['encounter_id'] = predictions['encounter_id']
    elif not np.array_equal(predictions['encounter_id'].to_numpy(),
                            train['encounter_id'].to_numpy()):
        # columns are assigned by row position, not merged on encounter_id, so
        # a file with different rows or ordering would silently corrupt features
        raise ValueError(f'{file}: encounter_id rows do not match {files[0]}')
    train[file] = predictions['hospital_death']

In [16]:
# read every submission file into its own column of `test`
for file in files:
    predictions = pd.read_csv(f'../submissions/{file}')
    if file == files[0]:
        # the first file supplies the row identifiers
        test['encounter_id'] = predictions['encounter_id']
    elif not np.array_equal(predictions['encounter_id'].to_numpy(),
                            test['encounter_id'].to_numpy()):
        # columns are assigned by row position, not merged on encounter_id, so
        # a file with different rows or ordering would silently corrupt features
        raise ValueError(f'{file}: encounter_id rows do not match {files[0]}')
    test[file] = predictions['hospital_death']

In [17]:
# model inputs: every column of `train` except the identifier
excluded_feats = ['encounter_id']
features = train.columns[~train.columns.isin(excluded_feats)].tolist()

In [18]:
### PARAMETERS

# settings
cores = -1
seed  = 999   # rebinds the global `seed`; used for LightGBM and the fold splitter

# cross-validation
num_folds = 10
shuffle   = False

# number of rounds
max_rounds = 10000   # upper bound on boosting rounds (passed as n_estimators)
stopping   = 100     # early-stopping patience, in rounds
verbose    = 100     # evaluation log period, in rounds

# LGB parameters
lgb_params = {
    'boosting_type':     'gbdt',
    'objective':         'binary',
    'metric':            'auc',
    'bagging_fraction':  0.5,
    'feature_fraction':  0.5,
    'lambda_l1':         0.1,
    'lambda_l2':         0.1,
    'min_split_gain':    0,
    'min_child_weight':  0.1,
    'min_child_samples': 20,
    'silent':            True,
    'verbosity':         -1,
    'learning_rate':     0.001,
    'max_depth':         5,
    'num_leaves':        64,
    'scale_pos_weight':  1,
    'n_estimators':      max_rounds,
    'nthread' :          cores,
    'random_state':      seed,
}


# data partitioning
# random_state is only meaningful when shuffling; recent scikit-learn versions
# raise a ValueError if it is set while shuffle is False
folds = StratifiedKFold(n_splits     = num_folds,
                        shuffle      = shuffle,
                        random_state = seed if shuffle else None)

In [19]:
# containers filled by the cross-validation loop:
# fitted models, test and out-of-fold prediction vectors, feature importances
clfs         = list()
importances  = pd.DataFrame()
preds_oof    = np.zeros(len(train))
preds_test   = np.zeros(len(test))

In [20]:
### CROSS-VALIDATION LOOP
# Fits one LightGBM classifier per fold on the prediction columns in `features`,
# writes each fold's validation predictions into preds_oof and adds 1/n_splits
# of each fold's test predictions to preds_test.
# NOTE(review): the recorded run shows per-fold AUCs around 0.91 but a pooled
# OOF AUC of 0.872. Presumably folds stopping at very different iterations
# yield differently scaled probabilities - confirm before relying on the pooled score.
cv_start  = time.time()
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, y)):
    
    # data partitioning (positional indices from the splitter)
    trn_x, trn_y = train[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train[features].iloc[val_idx], y.iloc[val_idx]
    
    # label encoding
    trn_x, val_x, test_x = label_encoding(trn_x, val_x, test[features])
    
    # print data dimensions
    print('Data shape:', trn_x.shape, val_x.shape)
           
    # train lightGBM
    clf = lgb.LGBMClassifier(**lgb_params) 
    clf = clf.fit(trn_x, trn_y, 
                  eval_set              = [(trn_x, trn_y), (val_x, val_y)], 
                  eval_metric           = "auc", 
                  early_stopping_rounds = stopping,
                  verbose               = verbose)
    clfs.append(clf)
    
    # predict validation from the best iteration
    best_iter = clf.best_iteration_
       
    # predictions
    preds_oof[val_idx]    = clf.predict_proba(val_x,  num_iteration = best_iter)[:, 1]
    preds_test           += clf.predict_proba(test_x, num_iteration = best_iter)[:, 1] / folds.n_splits 

    ## importance
    fold_importance_df = pd.DataFrame()
    fold_importance_df['Feature'] = features
    fold_importance_df['Importance'] = clf.feature_importances_
    fold_importance_df['Fold'] = n_fold + 1
    importances = pd.concat([importances, fold_importance_df], axis = 0)
    
    # print performance
    # val_y is the positional slice taken above; y[val_idx] would look the
    # positions up as index labels, which breaks once y's index is not 0..n-1
    print('----------------------')
    print('FOLD%2d: AUC = %.6f' % (n_fold + 1, roc_auc_score(val_y, preds_oof[val_idx])))
    print('----------------------')
    print('')
    
    
# print overall performance    
cv_perf = roc_auc_score(y, preds_oof)
print('--------------------------------')
print('- OOF AUC = %.6f' % cv_perf)
print('- CV TIME = {:.2f} min'.format((time.time() - cv_start) / 60))
print('--------------------------------')

Data shape: (82541, 13) (9172, 13)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.91221	valid_1's auc: 0.910757
[200]	training's auc: 0.912626	valid_1's auc: 0.910894
Early stopping, best iteration is:
[142]	training's auc: 0.912462	valid_1's auc: 0.911031
----------------------
FOLD 1: AUC = 0.911031
----------------------

Data shape: (82541, 13) (9172, 13)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.912345	valid_1's auc: 0.906639
[200]	training's auc: 0.912703	valid_1's auc: 0.906686
[300]	training's auc: 0.912927	valid_1's auc: 0.906757
Early stopping, best iteration is:
[248]	training's auc: 0.912805	valid_1's auc: 0.906869
----------------------
FOLD 2: AUC = 0.906869
----------------------

Data shape: (82541, 13) (9172, 13)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.910653	valid_1's auc: 0.921932
Early stopping, best iteration is:
[12]	training's auc: 0.91

In [21]:
############ RECHECK PERFORMANCE  

# recompute the AUC of the out-of-fold predictions against y, rounded to 5 decimals
print(np.round(roc_auc_score(y_true = y, y_score = preds_oof), decimals = 5))

0.87205


In [22]:
# file name: <model><number of features>_<first five decimals of the OOF AUC>
model = 'stack_lgb'
# fixed-point formatting keeps trailing zeros, so the tag is always five digits
# (str(round(...)) turned a score of 0.9 into '9' instead of '90000')
perf  = '{:.6f}'.format(cv_perf)[2:7]
name  = model + str(len(features)) + '_' + perf
name

'stack_lgb13_87205'

In [24]:
# write the out-of-fold predictions next to the training identifiers and preview them
oof = train[['encounter_id']].assign(hospital_death = preds_oof)
oof.to_csv(''.join(('../oof_preds/', str(name), '.csv')), index = False)
oof.head()

Unnamed: 0,encounter_id,hospital_death
0,66154,0.086424
1,114252,0.139284
2,119783,0.075141
3,79267,0.082083
4,92056,0.075657


In [25]:

# write the fold-averaged test predictions next to the test identifiers and preview them
sub = test[['encounter_id']].assign(hospital_death = preds_test)
sub.to_csv(''.join(('../submissions/', str(name), '.csv')), index = False)
sub.head()

Unnamed: 0,encounter_id,hospital_death
0,2,0.080843
1,5,0.083022
2,7,0.080735
3,8,0.090118
4,10,0.128115
