In [98]:

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('dark_background')
%matplotlib inline

import os
import time
import datetime
import random
import multiprocessing
import pickle

import scipy.stats

import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')


from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.preprocessing import RobustScaler, LabelEncoder, MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, confusion_matrix

import lightgbm as lgb

In [99]:
############ HELPER FUNCTIONS

import functions
import importlib
importlib.reload(functions)
from functions import *

In [100]:
############ RANDOMNESS

# seed function
def seed_everything(seed = 42):
    """Seed the Python and NumPy global random number generators.

    Also exports ``seed`` as the ``PYTHONHASHSEED`` environment variable.
    NOTE(review): that variable is normally read at interpreter start-up, so
    it presumably only matters for child processes -- confirm if hash
    determinism inside this kernel is required.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# set seed
seed = 42
seed_everything(seed)

### IMPORT

In [141]:
############ DATA IMPORT

# id data
train = pd.read_csv('../raw/training.csv')
test  = pd.read_csv('../raw/unlabeled.csv')


# check dimensions (one line per frame)
print(train.shape)
print(test.shape)

# keep only the rows that have a label; a boolean mask is selected directly
# instead of negating it with unary minus
train = train[train['hospital_death'].notnull()]

(110369, 186)
(110369, 186)


In [142]:
#train = reduce_mem_usage(train)
#test  = reduce_mem_usage(test)

In [143]:
# preview the first five labelled rows
train.head(n = 5)

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_id,icu_stay_type,icu_type,pre_icu_los_days,readmission_status,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2_apache,ph_apache,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,ventilated_apache,wbc_apache,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_invasive_max,h1_mbp_invasive_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,d1_albumin_max,d1_albumin_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_so
dium_max,d1_sodium_min,d1_wbc_max,d1_wbc_min,h1_albumin_max,h1_albumin_min,h1_bilirubin_max,h1_bilirubin_min,h1_bun_max,h1_bun_min,h1_calcium_max,h1_calcium_min,h1_creatinine_max,h1_creatinine_min,h1_glucose_max,h1_glucose_min,h1_hco3_max,h1_hco3_min,h1_hemaglobin_max,h1_hemaglobin_min,h1_hematocrit_max,h1_hematocrit_min,h1_inr_max,h1_inr_min,h1_lactate_max,h1_lactate_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_sodium_max,h1_sodium_min,h1_wbc_max,h1_wbc_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,apache_4a_hospital_death_prob,apache_4a_icu_death_prob,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154.0,25312.0,118.0,0.0,68.0,22.73,0.0,Caucasian,M,180.3,Floor,Floor,92.0,admit,CTICU,0.541667,0.0,73.9,2.3,113.0,502.01,0.0,0.0,0.4,31.0,2.51,,3.0,6.0,0.0,4.0,168.0,118.0,27.4,0.0,40.0,,,,,36.0,134.0,39.3,,0.0,14.1,46.0,32.0,68.0,37.0,68.0,37.0,119.0,72.0,66.0,40.0,89.0,46.0,89.0,46.0,34.0,10.0,100.0,74.0,122.0,64.0,131.0,73.0,131.0,73.0,39.9,37.2,,,68.0,63.0,68.0,63.0,119.0,108.0,,,86.0,85.0,86.0,85.0,26.0,18.0,100.0,74.0,,,131.0,115.0,131.0,115.0,39.5,37.5,2.3,2.3,0.4,0.4,31.0,30.0,8.5,7.4,2.51,2.23,168.0,109.0,19.0,15.0,8.9,8.9,27.4,27.4,,,1.3,1.0,233.0,233.0,4.0,3.4,136.0,134.0,14.1,14.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.1,0.05,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,114252.0,59342.0,81.0,0.0,77.0,27.42,0.0,Caucasian,F,160.0,Floor,Floor,90.0,admit,Med-Surg ICU,0.927778,0.0,70.2,,108.0,203.01,0.0,0.0,,9.0,0.56,1.0,1.0,3.0,0.0,1.0,145.0,120.0,36.9,0.0,46.0,37.0,37.0,51.0,7.45,33.0,145.0,35.1,,1.0,12.7,,,95.0,31.0,95.0,31.0,118.0,72.0,,,120.0,38.0,120.0,38.0,32.0,12.0,100.0,70.0,,,159.0,67.0,159.0,67.0,36.3,35.1,,,61.0,48.0,61.0,48.0,114.0,100.0,,,85.0,57.0,85.0,57.0,31.0,28.0,95.0,70.0,,,95.0,71.0,95.0,71.0,36.3,36.3,1.6,1.6,0.5,0.5,11.0,9.0,8.6,8.0,0.71,0.56,145.0,128.0,27.0,26.0,11.3,11.1,36.9,36.1,1.3,1.3,3.5,3.5,557.0,487.0,4.2,3.8,145.0,145.0,23.3,12.7,,,,,9.0,9.0,8.6,8.6,0.56,0.56,145.0,143.0,27.0,27.0,11.3,11.3,36.9,36.9,1.3,1.3,3.5,3.5,557.0,557.0,4.2,4.2,145.0,145.0,12.7,12.7,37.0,37.0,7.45,7.45,51.0,51.0,54.8,51.0,37.0,37.0,7.45,7.45,51.0,51.0,51.0,51.0,0.47,0.29,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,119783.0,50777.0,118.0,0.0,25.0,31.95,0.0,Caucasian,F,172.7,Emergency Department,Accident & Emergency,93.0,admit,Med-Surg ICU,0.000694,0.0,95.3,,122.0,703.03,0.0,0.0,,,,,3.0,6.0,0.0,5.0,,102.0,,0.0,68.0,,,,,37.0,,36.7,,0.0,,,,88.0,48.0,88.0,48.0,96.0,68.0,,,102.0,68.0,102.0,68.0,21.0,8.0,98.0,91.0,,,148.0,105.0,148.0,105.0,37.0,36.7,,,88.0,58.0,88.0,58.0,96.0,78.0,,,91.0,83.0,91.0,83.0,20.0,16.0,98.0,91.0,,,148.0,124.0,148.0,124.0,36.7,36.7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,79267.0,46918.0,118.0,0.0,81.0,22.64,1.0,Caucasian,F,165.1,Operating Room,Operating Room / Recovery,92.0,admit,CTICU,0.000694,0.0,61.7,,203.0,1206.03,1.0,0.0,,,,0.6,4.0,6.0,0.0,5.0,185.0,114.0,25.9,1.0,60.0,30.0,30.0,142.0,7.39,4.0,,34.8,,1.0,8.0,62.0,30.0,48.0,42.0,48.0,42.0,116.0,92.0,92.0,52.0,84.0,84.0,84.0,84.0,23.0,7.0,100.0,95.0,164.0,78.0,158.0,84.0,158.0,84.0,38.0,34.8,62.0,44.0,62.0,44.0,,,100.0,96.0,92.0,71.0,92.0,71.0,,,12.0,11.0,100.0,99.0,136.0,106.0,136.0,106.0,,,35.6,34.8,,,,,,,,,,,185.0,88.0,,,11.6,8.9,34.0,25.9,1.6,1.1,,,198.0,43.0,5.0,3.5,,,9.0,8.0,,,,,,,,,,,,,,,11.6,11.6,34.0,34.0,1.6,1.1,,,43.0,43.0,,,,,8.8,8.8,37.0,27.0,7.44,7.34,337.0,102.0,342.5,236.666667,36.0,33.0,7.37,7.34,337.0,265.0,337.0,337.0,0.04,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,92056.0,34377.0,33.0,0.0,19.0,,0.0,Caucasian,M,188.0,,Accident & Emergency,91.0,admit,Med-Surg ICU,0.073611,0.0,,,119.0,601.01,0.0,0.0,,,,,,,,,,60.0,,0.0,103.0,,,,,16.0,,36.7,,0.0,,,,99.0,57.0,99.0,57.0,89.0,60.0,,,104.0,90.0,104.0,90.0,18.0,16.0,100.0,96.0,,,147.0,120.0,147.0,120.0,37.2,36.7,,,99.0,68.0,99.0,68.0,89.0,76.0,,,104.0,92.0,104.0,92.0,,,100.0,100.0,,,130.0,120.0,130.0,120.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


In [144]:
#pd.DataFrame(train.isnull().sum())

In [145]:
# separate the target from the predictors
# (not idempotent: re-running this cell fails once 'hospital_death' is gone)
y     = train['hospital_death']
train = train.drop(columns = 'hospital_death')

In [146]:
# first-day range (max - min) of selected vitals, added to both frames
for vital in ('d1_diasbp', 'd1_diasbp_invasive', 'd1_diasbp_noninvasive', 'd1_heartrate'):
    for frame in (train, test):
        frame[vital + '_span'] = frame[vital + '_max'] - frame[vital + '_min']

In [147]:
#train['apache_prob_sum'] = train[['apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob']].sum(axis=1)
#test['apache_prob_sum']  = test[['apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob']].sum(axis=1)

# product of the two APACHE death probabilities; each frame uses only its own
# columns (the test feature must not be computed from training rows)
train['apache_prob_prod'] = train['apache_4a_hospital_death_prob'] * train['apache_4a_icu_death_prob']
test['apache_prob_prod']  = test['apache_4a_hospital_death_prob'] * test['apache_4a_icu_death_prob']

In [148]:
############ FEATURES

# drop bad features
#excluded_feats = ["ethnicity", "gender", "hospital_admit_source", "icu_admit_source", "icu_stay_type", "icu_type", "apache_3j_bodysystem", "apache_2_bodysystem"]
excluded_feats = ['encounter_id', 'patient_id', 'readmission_status']#, 'hospital_id', 'icu_id']

# every remaining column, in its original order, is used as a model input
# NOTE(review): 'Unnamed: 0' (looks like a saved row index) is not excluded and
# therefore ends up in `features` -- confirm that this is intended
keep_mask = ~train.columns.isin(excluded_feats)
features  = train.columns[keep_mask].tolist()
print(train[features].shape)

(91713, 187)


In [149]:
# treat the hospital identifier as categorical in both frames
# NOTE(review): each frame is cast on its own, so the two category sets are
# derived independently -- verify that nothing downstream relies on shared codes
for frame in (train, test):
    frame['hospital_id'] = frame['hospital_id'].astype('category')

In [150]:
############ PARAMETERS
# seed used for the CV split and for LightGBM
# NOTE(review): this replaces the value 42 that seed_everything() was called
# with earlier; the global `random` / NumPy generators stay seeded with 42
seed = 999
# cores
cores = 20
# cross-validation
num_folds = 10
shuffle   = True

# number of trees
max_rounds = 10000
stopping   = 200
verbose    = 250

# LGB parameters
# ('silent' was dropped: it duplicated 'verbosity': -1 and is a deprecated
#  argument of the scikit-learn wrapper)
lgb_params = {
    'boosting_type':     'gbdt',
    'objective':         'binary',
    'metric':            'auc',
    # NOTE(review): no 'bagging_freq' is set; per the LightGBM parameter docs
    # row bagging is only performed when bagging_freq > 0, so this fraction
    # presumably has no effect -- confirm whether subsampling was intended
    'bagging_fraction':  0.9,
    'feature_fraction':  0.9,
    'lambda_l1':         0.1,
    'lambda_l2':         0.1,
    'min_split_gain':    0,
    'min_child_weight':  0.1,
    'min_child_samples': 10,
    'verbosity':         -1,
    'learning_rate':     0.01,
    'max_depth':         5,
    # NOTE(review): a tree of depth 5 has at most 2**5 = 32 leaves, so the
    # limit of 64 below can never be reached
    'num_leaves':        64,
    'scale_pos_weight':  1,
    'n_estimators':      max_rounds,
    'nthread' :          cores,
    'random_state':      seed,
}


# data partitioning
folds = StratifiedKFold(n_splits = num_folds, random_state = seed, shuffle = shuffle)
#folds = GroupKFold(n_splits = num_folds)
#folds = model_selection.TimeSeriesSplit(n_splits = 10)

# SMOTE settings
#from imblearn.over_sampling import SMOTE
#sm = SMOTE(random_state = seed, n_jobs = cores, sampling_strategy = 0.05)

In [151]:

############ PLACEHOLDERS

# fitted models and per-fold feature importances
clfs = []
importances = pd.DataFrame()

# prediction buffers: one slot per test row / per training row
preds_test   = np.zeros(len(test))
preds_oof    = np.zeros(len(train))

In [None]:
############ CROSS-VALIDATION LOOP

def _fill_with_group_mean(reference, target, column, keys):
    """Return ``target[column]`` with NaNs replaced by group means from ``reference``.

    The mean of ``column`` is computed on ``reference`` for every combination
    of ``keys`` and looked up for each row of ``target`` by key *values*, not
    by row index, so the result is aligned with ``target.index``. Rows whose
    key combination does not occur in ``reference`` (or contains NaN) keep
    their NaN.
    """
    group_means = (reference.groupby(keys)[column].mean()
                            .rename('_group_mean')
                            .reset_index())
    # a left merge keeps one output row per target row, in the original order
    looked_up = target[keys].merge(group_means, on = keys, how = 'left')['_group_mean']
    return target[column].fillna(pd.Series(looked_up.to_numpy(), index = target.index))


group_keys       = ['ethnicity', 'age', 'gender']
fold_importances = []

cv_start  = time.time()
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, y)):

    # data partitioning (explicit copies: the frames are modified below)
    trn_x, trn_y = train[features].iloc[trn_idx].copy(), y.iloc[trn_idx]
    val_x, val_y = train[features].iloc[val_idx].copy(), y.iloc[val_idx]
    test_x       = test[features].copy()

    # Fill Na: body measurements get the mean of their ethnicity/age/gender
    # group, always estimated on the training fold only
    for column in ['weight', 'height', 'bmi']:
        trn_x[column]  = _fill_with_group_mean(trn_x, trn_x,  column, group_keys)
        val_x[column]  = _fill_with_group_mean(trn_x, val_x,  column, group_keys)
        test_x[column] = _fill_with_group_mean(trn_x, test_x, column, group_keys)

    for column in trn_x.select_dtypes('object').columns:
        trn_x[column]  = trn_x[column].fillna('')
        val_x[column]  = val_x[column].fillna('')
        test_x[column] = test_x[column].fillna('')

    # label encoding
    trn_x, val_x, test_x = label_encoding(trn_x, val_x, test_x)

    # remember the column labels: the scaler below returns bare arrays
    feature_names = list(pd.DataFrame(trn_x).columns)

    ## scale data (fitted on the training fold, applied to all three parts;
    ## the model inputs keep positional integer column labels)
    scaler = MinMaxScaler()
    trn_x  = pd.DataFrame(scaler.fit_transform(trn_x))
    val_x  = pd.DataFrame(scaler.transform(val_x))
    test_x = pd.DataFrame(scaler.transform(test_x))

    ## add noise to train to reduce overfitting
    trn_x += np.random.normal(0, 0.01, trn_x.shape)

    # print data dimensions
    print('Data shape:', trn_x.shape, val_x.shape)

    # train lightGBM
    # NOTE(review): the `early_stopping_rounds` / `verbose` fit arguments are
    # kept from the original; newer LightGBM releases expect callbacks instead
    # -- confirm against the installed version
    clf = lgb.LGBMClassifier(**lgb_params)
    clf = clf.fit(trn_x, trn_y,
                  eval_set              = [(trn_x, trn_y), (val_x, val_y)],
                  eval_metric           = 'auc',
                  early_stopping_rounds = stopping,
                  verbose               = verbose)
    clfs.append(clf)

    # find the best iteration
    best_iter = clf.best_iteration_

    # save predictions (validation and test are both scored on scaled data,
    # i.e. on the same representation the model was trained on)
    preds_oof[val_idx] = clf.predict_proba(val_x,  num_iteration = best_iter)[:, 1]
    preds_test        += clf.predict_proba(test_x, num_iteration = best_iter)[:, 1] / folds.n_splits

    # importance, labelled with the column names captured before scaling
    fold_importances.append(pd.DataFrame({
        'Feature':    feature_names,
        'Importance': clf.feature_importances_,
        'Fold':       n_fold + 1,
    }))

    # print performance (val_y is the positional slice of y for this fold)
    print('--------------------------------')
    print('FOLD%2d: AUC = %.6f' % (n_fold + 1, roc_auc_score(val_y, preds_oof[val_idx])))
    print('--------------------------------')
    print('')

    # clear memory
    del trn_x, trn_y, val_x, val_y
    gc.collect()

# append all fold importances in a single concatenation
importances = pd.concat([importances] + fold_importances, axis = 0)
    
    
# print overall performance: AUC over all out-of-fold predictions and the
# wall-clock time elapsed since cv_start
cv_perf    = roc_auc_score(y, preds_oof)
cv_minutes = (time.time() - cv_start) / 60
rule       = '--------------------------------'
print(rule)
print('- OOF AUC = %.6f' % cv_perf)
print('- CV TIME = {:.2f} min'.format(cv_minutes))
print(rule)

### EVALUATION

In [None]:
############ RECHECK PERFORMANCE

# recompute the out-of-fold AUC and show it with five decimals
oof_auc = roc_auc_score(y, preds_oof)
print(np.round(oof_auc, 5))


############ TRACK RESULTS

In [None]:
############ VARIABLE IMPORTANCE

# rank features by their mean importance across folds and keep the top ones
top_feats = 300
mean_importance = importances[['Feature', 'Importance']].groupby('Feature').mean()
cols = mean_importance.sort_values(by = 'Importance', ascending = False).head(top_feats).index
importance = importances.loc[importances['Feature'].isin(cols)]

# plot variable importance (per-fold rows of the selected features) and save it
plot_data = importance.sort_values(by = 'Importance', ascending = False)
plt.figure(figsize = (10, 150))
sns.barplot(x = 'Importance', y = 'Feature', data = plot_data)
plt.tight_layout()
plt.savefig('../var_importance.pdf')

### SUBMISSION

In [None]:
# file name: model tag plus characters 2..6 of the rounded CV AUC string
# (i.e. up to five digits after the leading "0.")
model      = 'lgb_v25'
auc_string = str(round(cv_perf, 6))
perf       = auc_string[2:7]
name       = '_'.join((model, perf))
name

In [None]:
# export OOF preds: one row per training encounter with its out-of-fold score
oof      = train[['encounter_id']].assign(hospital_death = preds_oof)
oof_path = '../oof_preds/' + str(name) + '.csv'
oof.to_csv(oof_path, index = False)
oof.head()

In [None]:

# export submission: one row per test encounter with its fold-averaged score
sub      = test[['encounter_id']].assign(hospital_death = preds_test)
sub_path = '../submissions/' + str(name) + '.csv'
sub.to_csv(sub_path, index = False)
sub.head()