In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
import random 
import seaborn as sns

from os import listdir
from tqdm import tqdm
from os.path import isfile

import sklearn
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler 
from sklearn.model_selection import StratifiedKFold,GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
#from sklearn.impute import SimpleImputer
from sklearn import preprocessing

#from bayes_opt import BayesianOptimization
#from bayes_opt.observer import JSONLogger
#from bayes_opt.event import Events
#from bayes_opt.util import load_logs

import fastai
import torch
#import xlearn as xl
#import autokeras as ak

from fastai.basic_data import load_data
from fastai.tabular import *
from fastai.callbacks import *

from torch import nn
import torch.nn.functional as F

import statsmodels.imputation.mice as smi

# Report the versions of the libraries this notebook was executed with.
for label, module in (("pandas", pd), ("numpy", np), ("sklearn", sklearn)):
    print(label + ":", module.__version__)
print()
for label, module in (("lightgbm", lgb), ("xgboost", xgb), ("catboost", cb),
                      ("fastai", fastai), ("torch", torch)):
    print(label + ":", module.__version__)
#print("xlearn:", xl.__version__)

pandas: 0.25.3
numpy: 1.18.1
sklearn: 0.22.1

lightgbm: 2.3.1
xgboost: 0.90
catboost: 0.21
fastai: 1.0.60
torch: 1.4.0


<h1>Data preprocessing</h1>

In [2]:
# Load the training and test tables from the competition input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/wids2020/train6.csv', '../input/wids2020/test6.csv')
)

In [3]:
# Merge near-duplicate hospital admit sources into shared buckets, applying the
# same mapping to the training and test frames.
ADMIT_SOURCE_GROUPS = {
    'Other ICU': 'ICU',
    'ICU to SDU': 'SDU',
    'Step-Down Unit (SDU)': 'SDU',
    'Other Hospital': 'Other',
    'Observation': 'Recovery Room',
    'Acute Care/Floor': 'Acute Care',
}
for frame in (train, test):
    frame['hospital_admit_source'] = frame['hospital_admit_source'].replace(ADMIT_SOURCE_GROUPS)

In [4]:
# Group the three cardiac ICU variants under one label in both frames.
CARDIAC_ICU_GROUPS = {
    'CCU-CTICU': 'Grpd_CICU',
    'CTICU': 'Grpd_CICU',
    'Cardiac ICU': 'Grpd_CICU',
}
for frame in (train, test):
    frame['icu_type'] = frame['icu_type'].replace(CARDIAC_ICU_GROUPS)

In [5]:
# Categorical columns = every object-dtype (string) column plus the known categorical
# features. Detection goes by dtype rather than by the type of the first row, so a
# column whose first value is missing is not overlooked. dict.fromkeys drops the
# duplicates that arise when a column appears in both sources, keeping first-seen order.
_known_cat_cols = ['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source',
                   'icu_stay_type', 'icu_type', 'apache_3j_bodysystem',
                   'Glasglow_comma_score_t', 'weightclass']
cat_cols = list(dict.fromkeys(
    list(train.select_dtypes(include='object').columns) + _known_cat_cols
))

In [6]:
# Keep the target as a plain array and remove it from the feature frame.
y = train['hospital_death'].values
train = train.drop(columns=['hospital_death'])

In [7]:
# Integer-encode every string-valued categorical column in train and test.
le = preprocessing.LabelEncoder()

for col in cat_cols:
    # Only object-dtype columns still hold strings; a column that is already numeric
    # (including one encoded on an earlier pass because cat_cols listed it twice) is skipped.
    if train[col].dtype != object:
        continue
    print(col)

    # Missing values become their own 'NaN' category; astype(str) keeps the labels
    # uniformly typed so the encoder can sort them.
    train[col] = train[col].fillna('NaN').astype(str)
    test[col] = test[col].fillna('NaN').astype(str)

    # Fit on the union of both frames: fitting on train alone makes transform()
    # raise ValueError for any label that occurs only in test. When test has no
    # unseen labels the resulting codes are the same as a train-only fit.
    le.fit(np.concatenate([train[col].values, test[col].values]))
    train[col] = le.transform(train[col].values)
    test[col] = le.transform(test[col].values)

ethnicity
gender
hospital_admit_source
icu_admit_source
icu_stay_type
icu_type
apache_3j_bodysystem
Glasglow_comma_score_t
weightclass


In [8]:
# Columns removed from both frames before modelling.
DROP_COLS = [
    'readmission_status',
    'encounter_id',
    'patient_id',
    'apache_4a_hospital_death_prob',
    'apache_4a_icu_death_prob',
    'd1_calcium_min',
    'd1_glucose_min',
]
train = train.drop(columns=DROP_COLS)
test = test.drop(columns=DROP_COLS)

In [9]:
#train.to_csv('train_clean.csv', index=None)
#test.to_csv('test_clean.csv', index=None)

In [10]:
# Snapshot the feature frames; `train` / `test` later receive prediction columns,
# which must not leak into the model inputs.
X, X_test = train.copy(), test.copy()

<h1>LGBM, CAT, XGB</h1>

In [11]:
!mkdir ./output

In [12]:
%%time

arch = "lgb"

train[arch] = 0
test[arch] = 0

rounds = 10000
early_stop_rounds = 300

params = {'objective': 'binary',
          'boosting_type': 'gbrt',
          'metric': 'auc',
          'seed': 42,
          'reg_alpha': 0, 
          'reg_lambda': 60, 
          'max_depth': 9,
          'num_leaves': 70,
          'learning_rate': 0.01,
          'min_split_gain': 0.02,
          'min_child_samples': 150,
          'min_child_weight': 0.02,
          'bagging_freq': 1,
          'bagging_fraction': 0.9,
          'feature_fraction': 0.8,
          'bagging_seed': 42,
          'verbose': -1,
          'n_jobs': -1}


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    
    X_train = X.iloc[train_index]
    X_valid = X.iloc[valid_index]

    y_train = y[train_index]
    y_valid = y[valid_index]

    d_train = lgb.Dataset(X_train, y_train)
    d_valid = lgb.Dataset(X_valid, y_valid)    

    model = lgb.train(params,
                      d_train,
                      num_boost_round=rounds,
                      valid_sets=[d_train, d_valid],
                      valid_names=['train','valid'],
                      early_stopping_rounds=early_stop_rounds,
                      categorical_feature=cat_cols,
                      verbose_eval=0) 

    joblib.dump(model, "./output/lgb_"+str(i)+".pkl")

    y_pred = model.predict(X_valid)
    train.loc[valid_index, arch] = y_pred
    auc = roc_auc_score(y_valid, y_pred)
    print(i, "ROC AUC:", round(auc, 5))

    test[arch] += model.predict(X_test)
    
test[arch] /= 5

print()
print("OOF ROC AUC:", round(roc_auc_score(y, train[arch]), 5))
print()

0 ROC AUC: 0.90997
1 ROC AUC: 0.90947
2 ROC AUC: 0.90489
3 ROC AUC: 0.90662
4 ROC AUC: 0.90849

OOF ROC AUC: 0.90785

CPU times: user 44min 25s, sys: 7.03 s, total: 44min 32s
Wall time: 44min 41s


In [13]:
%%time

arch = "cat"

train[arch] = 0
test[arch] = 0
#val[arch] = 0

early_stop_rounds = 100

params = {
          'loss_function': 'Logloss',
          'eval_metric':'AUC',
          'random_seed': 42
         }

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    
    X_train = X.iloc[train_index]
    X_valid = X.iloc[valid_index]

    y_train = y[train_index]
    y_valid = y[valid_index]
    
    trn_data = Pool(X_train, y_train)
    val_data = Pool(X_valid, y_valid)
    
    clf = CatBoostClassifier(**params)
    clf.fit(trn_data,
            eval_set=val_data,
            use_best_model=True,
            early_stopping_rounds=early_stop_rounds,
            verbose=0)
    
    y_pred = clf.predict_proba(X_valid)[:, 1]
    train.loc[valid_index, arch] = y_pred
    auc = roc_auc_score(y_valid, y_pred)
    print(i, "ROC AUC:", round(auc, 5))

    test[arch] += clf.predict_proba(Pool(X_test))[:,1]
    #val[arch]  += clf.predict_proba(Pool(X_val))[:, 1]
    
test[arch] /= 5
#val[arch] /= 5

print()
print("OOF ROC AUC:", round(roc_auc_score(y, train[arch]), 5))
print()

0 ROC AUC: 0.90423
1 ROC AUC: 0.90383
2 ROC AUC: 0.90029
3 ROC AUC: 0.90451
4 ROC AUC: 0.90586

OOF ROC AUC: 0.90367

CPU times: user 17min 58s, sys: 37.8 s, total: 18min 36s
Wall time: 9min 52s


In [14]:
%%time

arch = "xgb"

train[arch] = 0
test[arch] = 0
#val[arch] = 0

rounds = 10000
early_stop_rounds = 100

params = {'eval_metric': 'auc',
          #'booster': 'gbtree',
          'tree_method': 'hist',
          'objective': 'binary:logistic',
          'subsample': 0.9,
          'colsample_bytree': 0.3,
          'eta': 0.01,
          'max_depth': 6,
          'seed': 42,
          'verbosity': 0}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    
    X_train = X.iloc[train_index]
    X_valid = X.iloc[valid_index]

    y_train = y[train_index]
    y_valid = y[valid_index]
    
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)    

    model = xgb.train(params,
                      d_train,
                      rounds,
                      [(d_train, 'train'), (d_valid, 'eval')],
                      early_stopping_rounds=early_stop_rounds,
                      verbose_eval=0) 
    
    best = model.best_iteration + 1
    
    y_pred = model.predict(d_valid, ntree_limit=best)
    train.loc[valid_index, arch] = y_pred
    auc = roc_auc_score(y_valid, y_pred)
    print(i, "ROC AUC:", round(auc, 5))
    
    test[arch] += model.predict(xgb.DMatrix(X_test), ntree_limit=best)
    #val[arch]  = model.predict(xgb.DMatrix(X_val), ntree_limit=best)

test[arch] /= 5
#val[arch] /= 5

print()
print("OOF ROC AUC:", round(roc_auc_score(y, train[arch]), 5))
print()

0 ROC AUC: 0.90855
1 ROC AUC: 0.90969
2 ROC AUC: 0.90534
3 ROC AUC: 0.90817
4 ROC AUC: 0.91026

OOF ROC AUC: 0.9083

CPU times: user 34min 14s, sys: 6.56 s, total: 34min 21s
Wall time: 34min 27s


<h1>FastAI</h1>

In [15]:
def auroc_score(input, target):
    """Return the ROC AUC of column 1 of `input` scored against `target`.

    Both arguments are moved to the CPU and converted to NumPy arrays before
    being handed to `roc_auc_score`.
    """
    positive_scores = input.cpu().numpy()[:, 1]
    labels = target.cpu().numpy()
    return roc_auc_score(labels, positive_scores)

class AUROC(Callback):
    """Callback that adds an 'AUROC' metric computed over the non-training batches of each epoch.

    Raw model outputs and targets of every non-training batch are collected, and at
    the end of the epoch the softmaxed outputs are scored with `auroc_score`.
    """
    _order = -20

    def __init__(self, learn, **kwargs):
        self.learn = learn

    def on_train_begin(self, **kwargs):
        # Register the extra metric column with the recorder.
        self.learn.recorder.add_metric_names(['AUROC'])

    def on_epoch_begin(self, **kwargs):
        # Start every epoch with empty buffers.
        self.output = []
        self.target = []

    def on_batch_end(self, last_target, last_output, train, **kwargs):
        if train:
            return
        self.output.append(last_output)
        self.target.append(last_target)

    def on_epoch_end(self, last_metrics, **kwargs):
        # Nothing was collected (no non-training batches ran): report nothing.
        if not self.output:
            return None
        probabilities = F.softmax(torch.cat(self.output), dim=1)
        targets = torch.cat(self.target)
        return add_metrics(last_metrics, [auroc_score(probabilities, targets)])
        
class FocalLoss(nn.Module):
    """Binary focal loss: ``alpha * (1 - pt) ** gamma * BCE`` with ``pt = exp(-BCE)``.

    Args:
        logits: if True, `inputs` are raw logits and
            `binary_cross_entropy_with_logits` is used; otherwise `inputs` are
            probabilities and `binary_cross_entropy` is used.
        alpha: constant scale applied to the loss (default 0.1).
        gamma: focusing exponent applied to ``1 - pt`` (default 2).
        reduction: 'none' (default) returns the per-element loss, 'mean' / 'sum'
            reduce it to a scalar.

    Raises:
        ValueError: if `reduction` is not one of 'none', 'mean', 'sum'.
    """

    def __init__(self, logits=False, alpha=0.1, gamma=2, reduction='none'):
        super(FocalLoss, self).__init__()
        if reduction not in ('none', 'mean', 'sum'):
            raise ValueError("reduction must be 'none', 'mean' or 'sum', got %r" % (reduction,))
        self.logits = logits
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss of `inputs` against `targets` (same shape, float)."""
        if self.logits:
            BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        else:
            BCE_loss = F.binary_cross_entropy(inputs, targets, reduction='none')
        # pt is the probability assigned to the true class; (1 - pt) ** gamma
        # down-weights elements that are already well classified.
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

        if self.reduction == 'mean':
            return F_loss.mean()
        if self.reduction == 'sum':
            return F_loss.sum()
        return F_loss

In [16]:
# fastai builds its data from one frame holding features and target, so the target
# is attached to X here.
# WARNING: from this point on X contains 'hospital_death'; re-running the
# tree-model cells after this one would train them on the target.
X['hospital_death'] = y

dep_var = 'hospital_death'

# Network / training hyper-parameters.
emb_drop = 0.5   # dropout on the embeddings
ps = 0.5         # dropout passed to tabular_learner
bs = 1024        # batch size
wd = 0.1         # weight decay

procs = [FillMissing, Categorify, Normalize]

# Columns handed to fastai as categorical.
cat_names = [
    'gender', 'ethnicity', 'hospital_admit_source', 'icu_admit_source',
    'icu_stay_type', 'icu_type', 'apache_3j_bodysystem',
    'apache_2_diagnosis', 'apache_3j_diagnosis', 'icu_id',
    'elective_surgery', 'apache_post_operative', 'arf_apache',
    'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache',
    'gcs_verbal_apache', 'intubated_apache',
    'aids', 'cirrhosis', 'leukemia', 'diabetes_mellitus', 'hepatic_failure',
    'immunosuppression', 'lymphoma',
    'solid_tumor_with_metastasis', 'apache_3j_diagnosis_root', 'SIRS',
    'temp_flag', 'heart_rate_flag', 'resp_flag', 'wbc_apache_flag',
    'Glasglow_comma_score_t',
    'weightclass',
]

In [17]:
%%time

arch = "nn"

train[arch] = 0
test[arch] = 0

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    
    data = TabularDataBunch.from_df('.', X, 
                                    dep_var=dep_var,
                                    cat_names=cat_names,
                                    valid_idx=valid_index, 
                                    procs=procs, 
                                    bs=bs, 
                                    test_df=X_test)

    learn = tabular_learner(data, 
                            layers=[1500, 750], 
                            emb_drop=emb_drop, 
                            ps = ps, metrics=accuracy) 
    learn.loss_fn = FocalLoss()
    
    learn.fit_one_cycle(10, slice(1e-2), callbacks=[AUROC(learn),
                                               SaveModelCallback(learn, every='improvement',
                                               monitor='AUROC', name='bestmodel_groupk_fold{}'.format(i))], 
                        wd=wd)
    learn.load('bestmodel_groupk_fold{}'.format(i))

    preds = learn.get_preds(ds_type=DatasetType.Valid)
    y_pred = [float(preds[0][i][1]) for i in range(len(preds[0]))]
    
    train.loc[valid_index, arch] = y_pred
    auc = roc_auc_score(y[valid_index], y_pred)
    print(i, "ROC AUC:", round(auc, 5))
    tmp_test = learn.get_preds(ds_type=DatasetType.Test)
    test[arch] += [float(i[1]) for i in tmp_test[0]]

test[arch] /= 5

print()
print("OOF ROC AUC:", round(roc_auc_score(y, train[arch]), 5))
print()

epoch,train_loss,valid_loss,accuracy,AUROC,time
0,0.510381,0.350366,0.907812,0.698808,00:11
1,0.296735,0.220101,0.925476,0.862175,00:10
2,0.244253,0.200495,0.931036,0.896245,00:10
3,0.216899,0.207251,0.930546,0.901434,00:10
4,0.200971,0.186583,0.929455,0.901841,00:10
5,0.18847,0.188581,0.932454,0.901116,00:10
6,0.181792,0.195367,0.931091,0.902684,00:10
7,0.176473,0.184963,0.933435,0.901967,00:10
8,0.169461,0.186548,0.933217,0.901933,00:10
9,0.167198,0.185983,0.933163,0.901682,00:10


Better model found at epoch 0 with AUROC value: 0.6988077944810388.
Better model found at epoch 1 with AUROC value: 0.8621751922650719.
Better model found at epoch 2 with AUROC value: 0.8962445177505024.
Better model found at epoch 3 with AUROC value: 0.901433695876685.
Better model found at epoch 4 with AUROC value: 0.9018410106184898.
Better model found at epoch 6 with AUROC value: 0.9026844742091163.


0 ROC AUC: 0.90268


epoch,train_loss,valid_loss,accuracy,AUROC,time
0,0.511974,0.290263,0.918716,0.761764,00:10
1,0.29398,0.230624,0.927111,0.89028,00:10
2,0.234657,0.209383,0.927166,0.892237,00:10
3,0.213086,0.204071,0.931909,0.900134,00:10
4,0.196143,0.191404,0.930655,0.898795,00:11
5,0.187175,0.18979,0.931309,0.900728,00:10
6,0.179974,0.188444,0.932127,0.898291,00:10
7,0.174599,0.187687,0.932563,0.901608,00:10
8,0.169913,0.187597,0.932617,0.900992,00:10
9,0.165061,0.18696,0.932781,0.900801,00:10


Better model found at epoch 0 with AUROC value: 0.7617640706673079.
Better model found at epoch 1 with AUROC value: 0.8902800413703476.
Better model found at epoch 2 with AUROC value: 0.8922368972540884.
Better model found at epoch 3 with AUROC value: 0.900134333016221.
Better model found at epoch 5 with AUROC value: 0.9007277502461265.
Better model found at epoch 7 with AUROC value: 0.9016078878055472.


1 ROC AUC: 0.90161


epoch,train_loss,valid_loss,accuracy,AUROC,time
0,0.518979,0.294951,0.922859,0.766496,00:10
1,0.295889,0.229282,0.926511,0.832724,00:10
2,0.23602,0.206702,0.929346,0.888793,00:10
3,0.214458,0.198353,0.930273,0.884361,00:10
4,0.202259,0.193409,0.930927,0.895784,00:11
5,0.190903,0.195123,0.9318,0.899672,00:10
6,0.183258,0.18578,0.933762,0.900844,00:10


Better model found at epoch 0 with AUROC value: 0.7664961999285367.
Better model found at epoch 1 with AUROC value: 0.8327244122742081.
Better model found at epoch 2 with AUROC value: 0.8887934075808448.
Better model found at epoch 4 with AUROC value: 0.8957838316419836.
Better model found at epoch 5 with AUROC value: 0.8996718565546521.
Better model found at epoch 6 with AUROC value: 0.9008440289652739.
Better model found at epoch 9 with AUROC value: 0.9054799078927127.


3 ROC AUC: 0.90548


epoch,train_loss,valid_loss,accuracy,AUROC,time
0,0.511923,0.311996,0.907262,0.783223,00:10
1,0.305458,0.235734,0.9234,0.821927,00:10
2,0.238334,0.192632,0.929288,0.896198,00:10
3,0.216002,0.189264,0.930815,0.902411,00:10
4,0.20268,0.186926,0.930487,0.901405,00:10
5,0.192685,0.192944,0.930815,0.902166,00:10
6,0.183392,0.186795,0.932178,0.90465,00:10
7,0.178594,0.184163,0.93136,0.903778,00:10
8,0.171128,0.185183,0.932178,0.904612,00:10
9,0.167339,0.185609,0.932287,0.904127,00:10


Better model found at epoch 0 with AUROC value: 0.7832233306194988.
Better model found at epoch 1 with AUROC value: 0.8219270610370035.
Better model found at epoch 2 with AUROC value: 0.8961983523472006.
Better model found at epoch 3 with AUROC value: 0.9024109277307443.
Better model found at epoch 6 with AUROC value: 0.9046495114475785.


4 ROC AUC: 0.90465



OOF ROC AUC: 0.90254

CPU times: user 7min 46s, sys: 43.9 s, total: 8min 30s
Wall time: 9min 50s


<h1>Stacking</h1>

In [18]:
# Base models whose out-of-fold predictions feed the stacker.
models = ["cat", "lgb", "xgb", "nn"]

# Pairwise correlation between the base models' out-of-fold predictions.
train[models].corr()

Unnamed: 0,cat,lgb,xgb,nn
cat,1.0,0.959177,0.96746,0.921263
lgb,0.959177,1.0,0.984737,0.93308
xgb,0.96746,0.984737,1.0,0.939055
nn,0.921263,0.93308,0.939055,1.0


In [19]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
arch = "stack"

train[arch] = 0
test[arch] = 0

for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    
    X_train = train.loc[train_index, models]
    X_valid = train.loc[valid_index, models]

    y_train = y[train_index]
    y_valid = y[valid_index]
    
    reg = LogisticRegression(C=1,
                             solver="newton-cg", 
                             penalty="l2", 
                             n_jobs=-1, 
                             max_iter=100).fit(X_train, y_train) 
    
    y_pred = reg.predict_proba(X_valid)[:,1]
    train.loc[valid_index, arch] = y_pred
    print(i, "ROC AUC:", round(roc_auc_score(y_valid, y_pred), 5))
    
    test[arch] += reg.predict_proba(test[models])[:, 1]
    #val[arch]  += reg.predict_proba(val[models])[:, 1]

test[arch] /= 5
#val[arch] /= 5

print()
print("OOF ROC AUC:", round(roc_auc_score(y, train[arch]), 5))
print()

0 ROC AUC: 0.91125
1 ROC AUC: 0.91131
2 ROC AUC: 0.90762
3 ROC AUC: 0.9106
4 ROC AUC: 0.91196

OOF ROC AUC: 0.91039

CPU times: user 672 ms, sys: 707 ms, total: 1.38 s
Wall time: 3.46 s


In [20]:
# Each base model's share of the stacker's coefficient sum, in percent, in the
# order of `models`. `reg` is the model left over from the last stacking fold.
stack_weights = reg.coef_[0]
print('Coefs:', stack_weights / np.sum(stack_weights) * 100)

Coefs: [10.265807 46.586713  9.703621 33.443859]


In [21]:
# Attach the stacked test predictions to the submission template.
submit = pd.read_csv('../input/widsdatathon2020/unlabeled.csv')

# encounter_id is no longer present in `test`, so predictions can only be matched
# to submission rows by position. A length mismatch would otherwise yield silent
# NaNs through index alignment, so fail loudly instead.
# NOTE(review): assumes unlabeled.csv lists rows in the same order as test6.csv - confirm.
assert len(submit) == len(test), "unlabeled.csv and the test predictions differ in length"
submit['hospital_death'] = test['stack'].values

submit[['encounter_id', 'hospital_death']].head()

Unnamed: 0,encounter_id,hospital_death
0,2,0.029814
1,5,0.030623
2,7,0.031312
3,8,0.059847
4,10,0.257472


In [22]:
# Write the two-column submission file without the row index.
submission_cols = ['encounter_id', 'hospital_death']
submit[submission_cols].to_csv('./submission_stack.csv', index=False)