In [2]:
!pip install h2o

Collecting h2o
[?25l  Downloading https://files.pythonhosted.org/packages/27/09/c10dda3ad7633c7d12c04f739de82e181c52b9c4efc5133470a542b6d942/h2o-3.28.0.2.tar.gz (126.2MB)
[K     |████████████████████████████████| 126.2MB 92kB/s 
Collecting colorama>=0.3.8
  Downloading https://files.pythonhosted.org/packages/c9/dc/45cdef1b4d119eb96316b3117e6d5708a08029992b2fee2c143c7a0a5cc5/colorama-0.4.3-py2.py3-none-any.whl
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.28.0.2-py2.py3-none-any.whl size=126306423 sha256=e0ed8ad935246e745461eed66c91aee0bcdcae91f2b5353ddcb131e0ada87765
  Stored in directory: /root/.cache/pip/wheels/aa/ef/05/cc37b576425ec5a47be07cc42aa60c6e8b3fc21119808a6b63
Successfully built h2o
Installing collected packages: colorama, h2o
Successfully installed colorama-0.4.3 h2o-3.28.0.2


In [0]:
import h2o
from h2o.automl import H2OAutoML

In [0]:
###############################
#                             #
#        ENCODE FACTORS       #
#                             #
###############################

# performs label encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
def label_encoding(df_train, df_valid, df_test):
    """Label-encode every object-typed column of ``df_train`` in all three frames.

    For each such column a single ``LabelEncoder`` is fitted on the values of
    the train, validation and test frames pooled together, so a given category
    receives the same integer code in each frame.

    The frames are modified in place and also returned.

    Returns:
        tuple: ``(df_train, df_valid, df_test)`` with the encoded columns.
    """
    encoder = LabelEncoder()
    frames = (df_train, df_valid, df_test)

    # The column list is taken from the training frame only and is fixed
    # before any column is overwritten.
    for column in df_train.select_dtypes('object').columns:
        pooled_values = [value for frame in frames for value in frame[column].values]
        encoder.fit(pooled_values)
        for frame in frames:
            frame[column] = encoder.transform(list(frame[column].values))

    return df_train, df_valid, df_test

from sklearn import base
class KFoldTargetEncoderTrain(base.BaseEstimator,
                               base.TransformerMixin):
    """Out-of-fold target (mean) encoding of one categorical training column.

    Each row receives the mean of the target computed on the *other* folds for
    that row's category. Categories that do not occur in the other folds
    (including missing values) fall back to the global target mean.

    Parameters:
        colnames (str): name of the categorical column to encode.
        targetName (str): name of the target column; must be present in ``X``.
        n_fold (int): number of contiguous (unshuffled) folds.
        verbosity (bool): if true, print the correlation between the new
            feature and the target.
        discardOriginal_col (bool): if true, the original categorical column
            ``colnames`` is dropped from the returned frame.
    """

    def __init__(self,colnames,targetName,
                  n_fold=5, verbosity=True,
                  discardOriginal_col=False):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        """Stateless: all the work happens in ``transform``."""
        return self

    def transform(self,X):
        """Add the column ``<colnames>_Kfold_Target_Enc`` to ``X`` and return it.

        Note that the new column is written into the frame that was passed in.

        Raises:
            AssertionError: if the configured names are not strings or are
                not columns of ``X``.
        """
        assert isinstance(self.targetName, str)
        assert isinstance(self.colnames, str)
        assert self.colnames in X.columns
        assert self.targetName in X.columns

        mean_of_target = X[self.targetName].mean()
        # random_state only has a meaning together with shuffle=True; recent
        # scikit-learn releases raise when it is combined with shuffle=False.
        kf = KFold(n_splits = self.n_fold, shuffle = False)
        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
        X[col_mean_name] = np.nan
        enc_pos = X.columns.get_loc(col_mean_name)

        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            fold_means = X_tr.groupby(self.colnames)[self.targetName].mean()
            # Positional assignment keeps this correct even when the index of
            # X is not unique or not sorted.
            X.iloc[val_ind, enc_pos] = np.asarray(
                X_val[self.colnames].map(fold_means), dtype=float)

        # Single fill after all folds; assignment (rather than a chained
        # in-place fillna) also works when pandas copy-on-write is enabled.
        X[col_mean_name] = X[col_mean_name].fillna(mean_of_target)

        if self.verbosity:
            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,self.targetName, np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))
        if self.discardOriginal_col:
            # Drop the encoded source column, not the target.
            X = X.drop(self.colnames, axis=1)
        return X

class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    """Apply a target encoding learned on the training frame to another frame.

    Parameters:
        train (DataFrame): training frame that already contains both the
            categorical column and its encoded counterpart.
        colNames (str): name of the categorical column.
        encodedName (str): name of the encoded column in ``train``; the same
            name is used for the column added to the transformed frame.
    """

    def __init__(self,train,colNames,encodedName):

        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        """Stateless: the mapping is derived from ``self.train`` in ``transform``."""
        return self

    def transform(self,X):
        """Return a copy of ``X`` with the encoded column added.

        Each category is mapped to the mean of its encoded values in the
        training frame. Categories that are absent from the training frame,
        and missing values, receive the overall mean of the training encoded
        column, so the result is always numeric.
        """
        category_means = self.train.groupby(self.colNames)[self.encodedName].mean()
        fallback = self.train[self.encodedName].mean()

        encoded = X[self.colNames].map(category_means).astype(float).fillna(fallback)

        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()
        X[self.encodedName] = encoded
        return X

In [5]:
# GOOGLE COLAB SETUP

# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os

# Absolute path (Drive is mounted at /content/drive above): a relative path
# only resolves on the first run, because the working directory has already
# changed when the cell is executed again.
os.chdir('/content/drive/My Drive/Colab Notebooks/WIDS/WIDS/')

In [0]:

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('dark_background')
%matplotlib inline

import os
import time
import datetime
import random
import multiprocessing
import pickle

import scipy.stats

import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')


from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.metrics import log_loss, roc_auc_score, confusion_matrix

import lightgbm as lgb

In [0]:
import sys

# Absolute path: relative sys.path entries are resolved against the current
# working directory, which has already been moved into the project folder.
codes_dir = '/content/drive/My Drive/Colab Notebooks/WIDS/WIDS/codes'
if codes_dir not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(codes_dir)

In [0]:
############ RANDOMNESS

# seed function
def seed_everything(seed = 42):
    """Seed the global ``random`` and NumPy generators and export PYTHONHASHSEED.

    Args:
        seed: value passed to both generators; its string form is stored in
            the ``PYTHONHASHSEED`` environment variable.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ.update({'PYTHONHASHSEED': str(seed)})

# set seed
seed = 42
seed_everything(seed)

### IMPORT

In [10]:
############ DATA IMPORT

# id data
train = pd.read_csv('./raw/training_v2.csv')
test  = pd.read_csv('./raw/unlabeled.csv')


# check dimensions
print(train.shape)
print(test.shape)

# Keep only labelled rows. notna() states the intent directly (unary minus on
# a boolean mask is not a reliable negation), and the index is reset because
# later cells mix positional fold indices with label-based lookups.
train = train[train['hospital_death'].notna()].reset_index(drop = True)

(91713, 186)
(39308, 186)


In [0]:
# Per-row count of missing predictors. The target column is left out of the
# count: it is never missing in train (unlabelled rows were removed) but the
# unlabeled file appears to carry it empty (both files load with 186 columns),
# which would shift the feature by one between train and test.
train['NAs'] = train.drop(columns = ['hospital_death'], errors = 'ignore').isnull().sum(axis=1)
test['NAs']  = test.drop(columns = ['hospital_death'], errors = 'ignore').isnull().sum(axis=1)

In [0]:
# Treat the identifier columns as categorical (object) rather than numeric.
for id_col in ('hospital_id', 'icu_id'):
    train[id_col] = train[id_col].astype('object')
    test[id_col]  = test[id_col].astype('object')

In [13]:
# Target-encode every object-typed column of train (this includes hospital_id
# and icu_id, which were cast to object above).
for feature in train.select_dtypes('object').columns:    
    # Out-of-fold encoding on train; adds '<feature>_Kfold_Target_Enc'.
    targetc = KFoldTargetEncoderTrain(feature,'hospital_death',n_fold=10)
    train = targetc.fit_transform(train)

    # The test encoder reads the encoded column just added to train, so it
    # has to run after the train step for the same feature.
    test_targetc = KFoldTargetEncoderTest(train,
                                          feature,
                                          f'{feature}_Kfold_Target_Enc')
    test = test_targetc.fit_transform(test)

Correlation between the new feature, hospital_id_Kfold_Target_Enc and, hospital_death is 0.061815698066015896.
Correlation between the new feature, ethnicity_Kfold_Target_Enc and, hospital_death is -0.021778031915991588.
Correlation between the new feature, gender_Kfold_Target_Enc and, hospital_death is -0.02454474750657809.
Correlation between the new feature, hospital_admit_source_Kfold_Target_Enc and, hospital_death is 0.09601692398142708.
Correlation between the new feature, icu_admit_source_Kfold_Target_Enc and, hospital_death is 0.10751265094405518.
Correlation between the new feature, icu_id_Kfold_Target_Enc and, hospital_death is 0.07331137852646111.
Correlation between the new feature, icu_stay_type_Kfold_Target_Enc and, hospital_death is -9.430062769421791e-05.
Correlation between the new feature, icu_type_Kfold_Target_Enc and, hospital_death is 0.0382226407880874.
Correlation between the new feature, apache_3j_bodysystem_Kfold_Target_Enc and, hospital_death is 0.122919143131

In [0]:
# Separate the target from the predictors; from here on `train` no longer
# contains 'hospital_death'.
y     = train['hospital_death']
train = train.drop('hospital_death', axis=1)

In [0]:
############ FEATURES

# drop bad features
excluded_feats = ['encounter_id', 'patient_id', 'readmission_status', 'hospital_id', 'icu_id']
# also exclude every remaining object-typed column of train
excluded_feats.extend(list(train.select_dtypes('object').columns))

In [0]:
# Product of the two APACHE death probabilities. Each frame must use its own
# columns: multiplying a test column by a train column aligns on the row index
# and pairs unrelated patients.
train['apache_prob_prod'] = train['apache_4a_hospital_death_prob'] * train[ 'apache_4a_icu_death_prob']
test['apache_prob_prod'] = test['apache_4a_hospital_death_prob'] * test[ 'apache_4a_icu_death_prob']

In [17]:
# Model inputs: all train columns, in their original order, minus the exclusions.
features = [col for col in train.columns if col not in excluded_feats]
#features = ['apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob']
print(train[features].shape)

(91713, 184)


In [0]:
############ PARAMETERS

# cores
cores = -1
# cross-validation
num_folds = 10
shuffle   = True

# number of trees
# NOTE(review): `stopping` and `verbose` are not referenced by any other cell
# visible in this notebook.
max_rounds = 10000
stopping   = 200
verbose    = 250

# LGB parameters
# NOTE(review): the cross-validation cell below trains H2O AutoML and does not
# read this dictionary.
lgb_params = {
    'boosting_type':     'gbdt',
    'objective':         'binary',
    'metric':            'auc',
    'bagging_fraction':  0.9,
    'feature_fraction':  0.9,
    'lambda_l1':         0.1,
    'lambda_l2':         0.1,
    'min_split_gain':    0.1,
    'min_child_weight':  0,
    'min_child_samples': 10,
    'silent':            True,
    'verbosity':         -1,
    'learning_rate':     0.01,
    'max_depth':         5,
    'num_leaves':        64,
    'scale_pos_weight':  1,
    'n_estimators':      max_rounds,
    'nthread' :          cores,
    'random_state':      seed,
    #"device" : "gpu"
}

# data partitioning: stratified, shuffled folds seeded with `seed`
folds = StratifiedKFold(n_splits = num_folds, random_state = seed, shuffle = shuffle)
#folds = GroupKFold(n_splits = num_folds)
#folds = model_selection.TimeSeriesSplit(n_splits = 10)

# SMOTE settings
#from imblearn.over_sampling import SMOTE
#sm = SMOTE(random_state = seed, n_jobs = cores, sampling_strategy = 0.05)

In [0]:

############ PLACEHOLDERS

# placeholders
# NOTE(review): neither `clfs` nor `importances` is filled by the
# cross-validation cell visible in this notebook.
clfs = []
importances = pd.DataFrame()

# predictions
# one slot per test row (averaged over folds) and one per training row (out-of-fold)
preds_test   = np.zeros(test.shape[0])
preds_oof    = np.zeros(train.shape[0])

In [20]:
# Connect to a running H2O instance or, as in the output below, start a local one.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.5" 2019-10-15; OpenJDK Runtime Environment (build 11.0.5+10-post-Ubuntu-0ubuntu1.118.04); OpenJDK 64-Bit Server VM (build 11.0.5+10-post-Ubuntu-0ubuntu1.118.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.6/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp4zo1lbkr
  JVM stdout: /tmp/tmp4zo1lbkr/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp4zo1lbkr/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54323
Connecting to H2O server at http://127.0.0.1:54323 ... successful.


0,1
H2O cluster uptime:,06 secs
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.2
H2O cluster version age:,2 days
H2O cluster name:,H2O_from_python_unknownUser_71hfe8
H2O cluster total nodes:,1
H2O cluster free memory:,3 Gb
H2O cluster total cores:,2
H2O cluster allowed cores:,2


In [0]:
############ CROSS-VALIDATION LOOP

def _positive_class_proba(model, frame):
    """Return the positive-class probabilities of an H2O binomial model as a 1-D array.

    ``model.predict`` returns an H2OFrame (columns ``predict``, ``p0``, ``p1``
    for a 0/1 target), which cannot be written into a NumPy array directly.
    """
    return model.predict(frame).as_data_frame()['p1'].values


cv_start  = time.time()

# Start from clean buffers so that re-running this cell does not add the new
# test predictions on top of those of a previous run.
preds_test = np.zeros(test.shape[0])
preds_oof  = np.zeros(train.shape[0])

# H2O copy of the test features. The test preprocessing below does not depend
# on the fold, so the frame is parsed once and reused by every fold.
test_h2o = None

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, y)):

    # data partitioning (explicit copies: the steps below modify the frames in place)
    trn_x, trn_y = train[features].iloc[trn_idx].copy(), y.iloc[trn_idx]
    val_x, val_y = train[features].iloc[val_idx].copy(), y.iloc[val_idx]
    test_x       = test[features].copy()
        
    # Fill Na
    #trn_x['weight']  = trn_x['weight'].fillna(trn_x.groupby(['ethnicity','age','gender'])['weight'].transform('mean'))
    #val_x['weight']  = val_x['weight'].fillna(trn_x.groupby(['ethnicity','age','gender'])['weight'].transform('mean'))
    #test_x['weight'] = test_x['weight'].fillna(trn_x.groupby(['ethnicity','age','gender'])['weight'].transform('mean'))
    
    #trn_x['height']  = trn_x['height'].fillna(trn_x.groupby(['ethnicity','age','gender'])['height'].transform('mean'))
    #val_x['height']  = val_x['height'].fillna(trn_x.groupby(['ethnicity','age','gender'])['height'].transform('mean'))
    #test_x['height'] = test_x['height'].fillna(trn_x.groupby(['ethnicity','age','gender'])['height'].transform('mean'))
    
    #trn_x['bmi']  = trn_x['bmi'].fillna(trn_x.groupby(['ethnicity','age','gender'])['bmi'].transform('mean'))
    #val_x['bmi']  = val_x['bmi'].fillna(trn_x.groupby(['ethnicity','age','gender'])['bmi'].transform('mean'))
    #test_x['bmi'] = test_x['bmi'].fillna(trn_x.groupby(['ethnicity','age','gender'])['bmi'].transform('mean'))
    
    for column in trn_x.select_dtypes('object').columns:
        trn_x[column] = trn_x[column].fillna('')
        val_x[column] = val_x[column].fillna('')
        test_x[column] = test_x[column].fillna('')
        
    # label encoding
    trn_x, val_x, test_x = label_encoding(trn_x, val_x, test_x)
       
    ## add noise to train to reduce overfitting
    trn_x += np.random.normal(0, 0.01, trn_x.shape)
    
    x_ = list(trn_x.columns)
    y_ = 'hospital_death'

    trn_h2o = h2o.H2OFrame(pd.concat([trn_y, trn_x], axis=1))
    val_h2o = h2o.H2OFrame(pd.concat([val_y, val_x], axis=1))
    if test_h2o is None:
        test_h2o = h2o.H2OFrame(test_x)

    # a factor target makes H2O treat the task as classification
    trn_h2o[y_] = trn_h2o[y_].asfactor()
    val_h2o[y_] = val_h2o[y_].asfactor()

    # print data dimensions
    print('Data shape:', trn_h2o.shape, val_h2o.shape)
    #print('Data shape:', trn_y.shape, val_y.shape)    
    # train H2O AutoML
    aml = H2OAutoML(max_models = 1, seed = seed, stopping_metric = 'auc')
    aml.train(x = x_, y = y_, training_frame = trn_h2o)
    

    # save predictions
    preds_oof[val_idx] = _positive_class_proba(aml.leader, val_h2o)
    preds_test        += _positive_class_proba(aml.leader, test_h2o) / folds.n_splits 

    
    # print performance (val_y is the positional slice matching val_idx)
    print('--------------------------------')
    print('FOLD%2d: AUC = %.6f' % (n_fold + 1, roc_auc_score(val_y, preds_oof[val_idx])))
    print('--------------------------------')
    print('')
        
    # clear memory
    del trn_x, trn_y, val_x, val_y, trn_h2o, val_h2o
    gc.collect()
    
    
# print overall performance    
cv_perf = roc_auc_score(y, preds_oof)
print('--------------------------------')
print('- OOF AUC = %.6f' % cv_perf)
print('- CV TIME = {:.2f} min'.format((time.time() - cv_start) / 60))
print('--------------------------------')

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Data shape: (82541, 185) (9172, 185)
AutoML progress: |██

### EVALUATION

In [0]:
############ RECHECK PERFORMANCE  

# check performance
# AUC of the out-of-fold predictions over all training rows, rounded to 5 decimals
print(np.round(roc_auc_score(y, preds_oof), 5))


############ TRACK RESULTS

In [0]:
############ VARIABLE IMPORTANCE

# load importance    
top_feats = 300

if importances.empty:
    # `importances` starts as an empty frame; without collected rows the
    # 'Feature'/'Importance' lookup below would raise a KeyError.
    print('No variable importances were collected; skipping the importance plot.')
else:
    # keep the `top_feats` features with the highest mean importance
    cols = importances[['Feature', 'Importance']].groupby('Feature').mean().sort_values(by = 'Importance', ascending = False)[0:top_feats].index
    importance = importances.loc[importances.Feature.isin(cols)]

    # plot variable importance
    plt.figure(figsize = (10, 150))
    sns.barplot(x = 'Importance', y = 'Feature', data = importance.sort_values(by = 'Importance', ascending = False))
    plt.tight_layout()
    plt.savefig('./var_importance.pdf')

### SUBMISSION

In [0]:
# file name
model = 'h2o_v1'
# First five decimals of the CV score. Fixed-point formatting keeps the tag at
# a constant width; str(round(...)) drops trailing zeros and can switch to
# scientific notation for very small values.
perf  = '{:.6f}'.format(cv_perf)[2:7]
name  = model + '_' + perf
name

In [0]:
# export OOF preds
oof = pd.DataFrame({'encounter_id': train['encounter_id'], 'hospital_death': preds_oof})
# make sure the target folder exists so the export cannot fail after a long CV run
os.makedirs('./oof_preds', exist_ok = True)
oof.to_csv('./oof_preds/' + str(name) + '.csv', index = False)
oof.head()

In [0]:

# export submission
sub = pd.DataFrame({'encounter_id': test['encounter_id'], 'hospital_death': preds_test})
# make sure the target folder exists so the export cannot fail after a long CV run
os.makedirs('./submissions', exist_ok = True)
sub.to_csv('./submissions/' + str(name) + '.csv', index = False)
sub.head()