## Import Data and Libraries

In [1]:
!git clone https://github.com/laura-health/cbms2020/
!pip install catboost
!pip install lightgbm
!pip install missingpy

Cloning into 'cbms2020'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 9 (delta 2), reused 4 (delta 0), pack-reused 0[K
Unpacking objects: 100% (9/9), done.
Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/94/ec/12b9a42b2ea7dfe5b602f235692ab2b61ee1334ff34334a15902272869e8/catboost-0.22-cp36-none-manylinux1_x86_64.whl (64.4MB)
[K     |████████████████████████████████| 64.4MB 41kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.22
Collecting missingpy
[?25l  Downloading https://files.pythonhosted.org/packages/b5/be/998d04d27054b58f0974b5f09f8457778a0a72d4355e0b7ae877b6cfb850/missingpy-0.2.0-py3-none-any.whl (49kB)
[K     |████████████████████████████████| 51kB 4.0MB/s 
[?25hInstalling collected packages: missingpy
Successfully installed missingpy-0.2.0


## Load Data and Libraries

In [1]:
import pandas as pd
import os
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import random
#from missingpy import MissForest
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
import torch
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, f1_score
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
dataset = pd.read_csv("heg_sample_data.csv") #normalized dataset
dataset.drop('Unnamed: 0', axis=1, inplace=True)
dataset.shape, dataset.columns

((13652, 72),
 Index(['days_from_entrance', 'age', 'document.sexo', 'UTI',
        'delta_collect_timestamp_t-t1', 'delta_collect_timestamp_t1-t2',
        'delta_collect_timestamp_t2-t3', 'delta_collect_timestamp_t3-t4',
        'document.freq_cardiaca(t)', 'document.freq_cardiaca(t-1)',
        'document.freq_cardiaca(t-2)', 'document.freq_cardiaca(t-3)',
        'document.freq_cardiaca(t-4)', 'document.freq_respiratoria(t)',
        'document.freq_respiratoria(t-1)', 'document.freq_respiratoria(t-2)',
        'document.freq_respiratoria(t-3)', 'document.freq_respiratoria(t-4)',
        'document.glicemia_capilar(t)', 'document.glicemia_capilar(t-1)',
        'document.glicemia_capilar(t-2)', 'document.glicemia_capilar(t-3)',
        'document.glicemia_capilar(t-4)', 'document.pa_diastolica(t)',
        'document.pa_diastolica(t-1)', 'document.pa_diastolica(t-2)',
        'document.pa_diastolica(t-3)', 'document.pa_diastolica(t-4)',
        'document.pa_sistolica(t)', 'document.pa_si

In [3]:
dataset

Unnamed: 0,days_from_entrance,age,document.sexo,UTI,delta_collect_timestamp_t-t1,delta_collect_timestamp_t1-t2,delta_collect_timestamp_t2-t3,delta_collect_timestamp_t3-t4,document.freq_cardiaca(t),document.freq_cardiaca(t-1),...,delta_document.pa_sistolica_t3-t4,delta_document.sat_o2_t-t1,delta_document.sat_o2_t1-t2,delta_document.sat_o2_t2-t3,delta_document.sat_o2_t3-t4,delta_document.temperatura_t-t1,delta_document.temperatura_t1-t2,delta_document.temperatura_t2-t3,delta_document.temperatura_t3-t4,outcome
0,0.481160,-0.430375,1.0,0.0,0.181480,0.282895,0.397339,0.387741,-0.130331,0.337252,...,0.107772,0.202479,0.194332,0.432788,0.012866,-0.144525,-0.024481,0.152412,0.007067,0.0
1,1.837695,0.230175,1.0,0.0,1.201338,1.826415,1.625187,2.170093,1.451289,1.601233,...,-0.021933,0.060615,0.114810,0.672236,-1.299337,-0.144525,-0.024481,0.152412,0.007067,0.0
2,0.226810,-0.719366,1.0,0.0,2.049497,4.031576,1.846655,2.285346,0.301020,0.403377,...,0.139667,0.060615,0.114810,0.672236,-1.299337,-0.144525,-0.024481,0.152412,0.007067,0.0
3,-0.366674,1.716413,1.0,0.0,1.038056,0.686616,1.107306,0.877358,0.684443,0.823398,...,-0.101163,0.060615,0.114810,0.672236,-1.299337,-0.144525,-0.024481,0.152412,0.007067,0.0
4,-0.366674,1.262285,0.0,0.0,0.879474,0.416499,0.427891,0.405360,0.492732,-0.537812,...,-0.021933,0.060615,0.114810,0.672236,-1.299337,-0.144525,-0.024481,0.152412,0.007067,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13647,-0.281891,0.643019,1.0,0.0,0.200458,0.235890,-0.313999,-0.015181,-1.184744,-0.732271,...,0.029118,1.636497,-0.833785,0.424839,0.011498,0.584695,-0.350259,-0.725431,-0.005497,0.0
13648,0.142026,1.097147,0.0,0.0,0.056699,0.282506,0.406512,0.300970,-0.274114,-0.148895,...,-0.676939,0.004101,0.829661,-0.001954,0.439924,-0.444251,-0.350259,0.012286,0.657215,0.0
13649,-0.281891,0.106322,0.0,0.0,0.139795,0.114485,0.240037,0.387741,-0.513754,-1.218417,...,0.272103,0.412200,-0.002062,-2.135920,-0.034930,0.422230,0.526740,-0.214704,-0.985154,0.0
13650,-0.281891,1.386138,1.0,0.0,-0.243519,-0.186716,-0.200702,-0.262950,-0.178259,0.094178,...,0.240936,-0.812096,-0.417923,-0.001954,0.011498,-1.310733,-0.240634,1.657963,-0.336854,0.0


In [4]:
# params
seed = 42
#num_folds = 5
scoring = "roc_auc"

def seed_everything(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        
seed_everything(seed)

## Setup Expetiments

In [5]:
X = dataset.drop(["outcome"], axis = 1)
Y = dataset["outcome"]
X.shape, Y.shape

((13652, 71), (13652,))

In [6]:
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
classifiers = {
    'TabNet': TabNetClassifier(),
    'XGBoost' : XGBClassifier(learning_rate=0.1, n_estimators=100,random_state=7),
    'LogReg': LogisticRegression(solver='liblinear', multi_class='ovr'),
    'D.Tree': DecisionTreeClassifier(),
    'RForest': RandomForestClassifier(n_estimators = 50),
    'CatBoos': CatBoostClassifier(learning_rate=0.1,n_estimators=100,random_state=7,task_type='GPU',verbose = False),
    'Naive': GaussianNB(),
    'Light': lgb.LGBMClassifier()
}

Device used : cuda


## Run Basic Experiments

In [33]:
import warnings
warnings.filterwarnings('ignore')
for c in classifiers:
  start = time.time()
  model = classifiers[c]
  scores = cross_val_score(model, X, Y, cv=kfold, scoring='roc_auc')
  scores_f1 = cross_val_score(model, X, Y, cv=kfold, scoring='f1')
  print (c + '\t', round(scores.mean(),4), '(' + str(round(scores_f1.mean(),4)) + ')', round(time.time() - start,2), 's')
  #print (c + '\t', scores, '(' scores_f1  ')', round(time.time() - start,2), 's')

Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights wi

## Cross Validation by Windowing

In [47]:
cols = ['age', 'document.sexo', 'UTI', 'days_from_entrance']
t_cols = [c for c in dataset.columns if '4)' in c and (not 'time' in c)]

for i in [4,3,2,1,0]:
  
  if i == 4: cols.extend(t_cols)
  if i == 0:
    tN_cols = [c for c in dataset.columns if ('t)' in c or '_t-' in c) and (not 'time' in c)]
    cols.extend(tN_cols)
  else: 
    tN_cols = [c for c in dataset.columns if ('t-'+str(i) in c or '_t'+str(i) in c) and (not 'time' in c)]
    cols.extend(tN_cols)

  cols = list(set(cols))
  print('Number of Columns:', len(cols), 'Exam(s):', 5-i)
  print(cols)

  X_W = dataset[cols]
  Y_W = dataset["outcome"]

  for c in classifiers:
    start = time.time()
    model = classifiers[c]
    scores = cross_val_score(model, X_W, Y_W, cv=kfold, scoring='roc_auc')
    print ('\t' + c + '\t', round(scores.mean(),4), '(+-' + str(round(scores.std(),4)) + ')', round(time.time() - start,2), 's')

Number of Columns: 11 Exam(s): 1
['UTI', 'document.pa_diastolica(t-4)', 'document.glicemia_capilar(t-4)', 'document.sexo', 'document.pa_sistolica(t-4)', 'document.freq_cardiaca(t-4)', 'document.sat_o2(t-4)', 'days_from_entrance', 'document.freq_respiratoria(t-4)', 'age', 'document.temperatura(t-4)']
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed, last training weights will be used.
Device used : cuda
No early stopping will be performed,

In [7]:
# TabNet
kf = KFold(n_splits=10, random_state=7, shuffle=True)
CV_score_array    =[]


def f1_loss(y_true:torch.Tensor, y_pred:torch.Tensor, is_training=False) -> torch.Tensor:
    '''Calculate F1 score. Can work with gpu tensors
    
    The original implmentation is written by Michal Haltuf on Kaggle.
    
    Returns
    -------
    torch.Tensor
        `ndim` == 1. 0 <= val <= 1
    
    Reference
    ---------
    - https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric
    - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
    - https://discuss.pytorch.org/t/calculating-precision-recall-and-f1-score-in-case-of-multi-label-classification/28265/6
    
    '''
    assert y_true.ndim == 1
    assert y_pred.ndim == 1 or y_pred.ndim == 2
    
    if y_pred.ndim == 2:
        y_pred = y_pred.argmax(dim=1)
        
    
    tp = (y_true * y_pred).sum().to(torch.float32)
    tn = ((1 - y_true) * (1 - y_pred)).sum().to(torch.float32)
    fp = ((1 - y_true) * y_pred).sum().to(torch.float32)
    fn = (y_true * (1 - y_pred)).sum().to(torch.float32)
    
    epsilon = 1e-7
    
    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)
    
    f1 = 2* (precision*recall) / (precision + recall + epsilon)
    f1.requires_grad = is_training
    return f1

X = dataset.drop(["outcome"], axis = 1)
y = dataset["outcome"]

X = X.to_numpy()
y = y.to_numpy()
for train_index, test_index in kf.split(X,y):
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    tb_cls = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-3),
                       scheduler_params={"step_size":10, "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='sparsemax' #entmax' # "sparsemax"
                       )
    tb_cls.fit(X_train,y_train,
               eval_set=[(X_train, y_train), (X_valid, y_valid)],
               eval_name=['train', 'valid'],
               eval_metric=['auc'],
               max_epochs=1000 , patience=100,
               batch_size=20, drop_last=False)            
    CV_score_array.append(tb_cls.best_cost)
  

Device used : cuda
epoch 0  | loss: 0.5657  | train_auc: 0.49042 | valid_auc: 0.51875 |  0:00:16s
epoch 1  | loss: 0.40318 | train_auc: 0.56461 | valid_auc: 0.62865 |  0:00:33s
epoch 2  | loss: 0.37938 | train_auc: 0.64173 | valid_auc: 0.68767 |  0:00:49s
epoch 3  | loss: 0.36708 | train_auc: 0.69222 | valid_auc: 0.72342 |  0:01:06s
epoch 4  | loss: 0.36052 | train_auc: 0.71391 | valid_auc: 0.74866 |  0:01:22s
epoch 5  | loss: 0.35031 | train_auc: 0.74264 | valid_auc: 0.74223 |  0:01:38s
epoch 6  | loss: 0.3442  | train_auc: 0.74641 | valid_auc: 0.73756 |  0:01:54s
epoch 7  | loss: 0.33756 | train_auc: 0.74848 | valid_auc: 0.75101 |  0:02:10s
epoch 8  | loss: 0.33469 | train_auc: 0.75479 | valid_auc: 0.76444 |  0:02:26s
epoch 9  | loss: 0.32958 | train_auc: 0.76572 | valid_auc: 0.76633 |  0:02:42s
epoch 10 | loss: 0.321   | train_auc: 0.76845 | valid_auc: 0.77774 |  0:02:58s
epoch 11 | loss: 0.32632 | train_auc: 0.76648 | valid_auc: 0.77687 |  0:03:15s
epoch 12 | loss: 0.31971 | train_

In [8]:
y_pred = tb_cls.predict(X_valid)

In [10]:
from sklearn.metrics import classification_report, f1_score

print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

         0.0       0.92      0.99      0.95      1215
         1.0       0.79      0.30      0.43       150

    accuracy                           0.91      1365
   macro avg       0.85      0.65      0.69      1365
weighted avg       0.91      0.91      0.90      1365

