In [1]:
import numpy as np
import pandas as pd
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchtext.data import BucketIterator

random.seed(696)

## Load and split dataset

In [2]:
patients = pd.read_csv('patient_data.csv')
patients = patients.fillna(method='backfill')
patients = patients.fillna(patients.mean())

In [3]:
def get_split_indices(pos_len, neg_len, ratios):
    train, val, test = ratios[0], ratios[1], ratios[2]
    pos_tr, neg_tr = int(round(pos_len * train)), int(round(neg_len * train))
    pos_val, neg_val = int(round(pos_len * val)), int(round(neg_len * val))
    pos_test, neg_test = int(round(pos_len * test)), int(round(neg_len * test))
    return ((pos_tr, pos_val, pos_test), (neg_tr, neg_val, neg_test))
    
    
def split_dataset(patients, ratios):
    positive = patients[patients['SepsisLabel'] == 1]['pid'].unique().tolist()
    negative = [i for i in range(1, 5000+1) if i not in positive]
    random.shuffle(positive)
    random.shuffle(negative)
    pos_idx, neg_idx = get_split_indices(len(positive), len(negative), ratios)
    
    train = positive[0:pos_idx[0]] + negative[0:neg_idx[0]]
    val = positive[pos_idx[0]:pos_idx[0] + pos_idx[1]] + negative[neg_idx[0]:neg_idx[0] + neg_idx[1]]
    test = positive[pos_idx[0] + pos_idx[1]:] + negative[neg_idx[0] + neg_idx[1]:]
    
    train_dict, val_dict, test_dict = {}, {}, {}
    for pid in train:
        train_dict[pid] = patients[patients['pid'] == pid]
    for pid in val:
        val_dict[pid] = patients[patients['pid'] == pid]
    for pid in test:
        test_dict[pid] = patients[patients['pid'] == pid]
    
    return train_dict, val_dict, test_dict

## Choose patient observation windows

In [4]:
def process_patient(patient, max_len, window_marker=70):
    patient = patient.reset_index()
    obs_len = patient.shape[0]
    
    if(patient[patient['SepsisLabel'] == 1].shape[0]):
        sepsis_idx = list(patient[patient['SepsisLabel']==1].index)[0]
        if obs_len > max_len: return process_longer_obs(patient, sepsis_idx, max_len, marker=window_marker)
        if obs_len < max_len: return process_shorter_obs(patient, sepsis_idx, max_len)
    else:
        if obs_len > max_len:
            return patient.iloc[0:max_len, 2:]
        if obs_len < max_len:
            p = patient.iloc[:, :]
            for i in range(max_len - obs_len):
                p = p.append(patient.iloc[-1, :])
            return p.iloc[:, 2:]
    return patient.iloc[:, 2:]

def process_shorter_obs(patient, sepsis_idx, max_len):
    p = pd.DataFrame()
    p = p.append(patient)
    for i in range(max_len - patient.shape[0]):
        p = p.append(p.iloc[-1, :])
    return p.reset_index().iloc[:, 3:]
        
def process_longer_obs(patient, sepsis_idx, max_len, marker=70):
    p = pd.DataFrame()
    avail_before = sepsis_idx - 1
    avail_after = patient.shape[0] - sepsis_idx
    need_before = int(max_len * marker/100)
    need_after = int(max_len * (100 - marker)/100) - 1
   
    if avail_before >= need_before and avail_after >= need_after:
        p = p.append(patient.iloc[avail_before - need_before:avail_before+1, :])
        p = p.append(patient.iloc[sepsis_idx, :])
        p = p.append(patient.iloc[sepsis_idx+1 : sepsis_idx + need_after, :])
    
    elif avail_before >= need_before and avail_after <= need_after:
        p = p.append(patient.iloc[avail_before - need_before:avail_before+1, :])
        p = p.append(patient.iloc[sepsis_idx, :])
        p = p.append(patient.iloc[sepsis_idx + 1:, :])
        for i in range(max_len - p.shape[0]):
            p = p.append(p.iloc[-1, :])
    
    elif avail_before <= need_before and avail_after >= need_after:
        p = p.append(patient.iloc[0:avail_before, :])
        p = p.append(patient.iloc[sepsis_idx, :])
        p = p.append(patient.iloc[sepsis_idx+1 : sepsis_idx + need_after, :])
        for i in range(max_len - p.shape[0]):
            p = p.concat([p.iloc[0, :], p], ignore_index = True)
    
    return p.reset_index().iloc[:, 3:]

## Create DataLoaders

In [5]:
class PatientDataset(Dataset):
    def __init__(self, patient_dict, max_obs_len, window_marker):
        self.patient_dict = patient_dict
        self.num_patients = len(patient_dict)
        self.pids = list(patient_dict.keys())
        self.max_obs_len = max_obs_len
        self.window_marker = window_marker
        
    def __len__(self):
        return self.num_patients
    
    def __getitem__(self, idx):
        patient = self.patient_dict[self.pids[idx]]
        patient = process_patient(patient, self.max_obs_len, self.window_marker)
        patient_features = torch.FloatTensor(patient.iloc[:, 1:-1].values)
        patient_labels = torch.FloatTensor(patient['SepsisLabel'])
        self.num_patients -= 1
        return patient_features, patient_labels

In [6]:
def data_loader(patient_dict, max_obs_len, batch_size, shuffle=True, window_marker=70):
    return DataLoader(PatientDataset(patient_dict, max_obs_len, window_marker), batch_size, shuffle)

## Model training and evaluation setup

In [7]:
def confusion_matrix(prediction, truth):
    confusion_vector = prediction/truth
    true_positives = torch.sum(confusion_vector == 1).item()
    false_positives = torch.sum(confusion_vector == float('inf')).item()
    true_negatives = torch.sum(torch.isnan(confusion_vector)).item()
    false_negatives = torch.sum(confusion_vector == 0).item()
    return true_positives, false_positives, true_negatives, false_negatives

def check_accuracy(model, loader, group):
    print('Checking ' + group + ' accuracy!')
    num_correct = 0
    num_samples = 0
    tp, fp, tn, fn, precision, recall, f1 = 0, 0, 0, 0, 0 ,0, 0
    model.eval()
    for t, (x, y) in enumerate(loader):
        scores = model(x)
        rounded_preds = torch.round(torch.sigmoid(scores))
        num_correct += (rounded_preds == y).sum()
        num_samples += y.size(0) * y.size(1)
        tp_t, fp_t, tn_t, fn_t = confusion_matrix(rounded_preds, y)
        tp += tp_t
        fp += fp_t
        tn += tn_t
        fn += fn_t

    acc = float(num_correct) / num_samples
    print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))
    print('TP = ', tp, ', FP = ', fp, ', TN = ', tn, ', FN = ', fn)
    if tp != 0:
        precision = tp/(tp + fp)
        recall = tp/(tp + fn)
        f1 = 2 * ((precision * recall)/(precision + recall))
    print('Precision = ', precision, ', Recall = ', recall, ', F1 Score = ', f1)
    print()
    return 100*acc

In [8]:
def train(model, optimizer, loss_fn, train_dict, val_dict, max_obs_len, batch_size, epochs=1, print_every=50, window_marker=70):
    train_history, val_history = [], []
    for e in range(epochs):
        print('Epoch: ', e+1)
        for t, (x, y) in enumerate(data_loader(train_dict, max_obs_len, batch_size, window_marker)):
            model.train()
            scores = model(x)
            loss = loss_fn(scores, y)
            
            if (t + 1) % print_every == 0:
                print('t = %d, loss = %.4f' % (t + 1, loss.item()))
                
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        train_history.append(check_accuracy(model, data_loader(train_dict, max_obs_len, batch_size, window_marker), 'train'))
        val_history.append(check_accuracy(model, data_loader(val_dict, max_obs_len, batch_size, window_marker), 'val'))
        print()
    return (train_history, val_history)

In [9]:
def get_pos_weight(patient_dict):
    subset = patients[patients['pid'].isin(list(patient_dict.keys()))]
    total_samples = len(subset)
    pos_samples = subset[subset['SepsisLabel'] == 1]['pid'].count()
    return total_samples/pos_samples

## Simple LSTM model

In [10]:
class SimpleLSTM(nn.Module):
    def __init__(self, feature_dim, hidden_dim, out_dim):
        super().__init__()
        self.rnn = nn.RNN(feature_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, out_dim)
        
    def forward(self, x):
        output, hidden = self.rnn(x)
        fc_out = self.fc(hidden.squeeze(0))        
        return fc_out

In [11]:
def test_run(config):
    train_dict, val_dict, test_dict = split_dataset(patients, config['ratios'])
    feature_dim, hidden_dim, output_dim = config['feature_dim'], config['hidden_dim'], config['output_dim']
    model = SimpleLSTM(feature_dim, hidden_dim, output_dim)
    optimizer = optim.Adam(model.parameters(), lr=config['lr_rate'])
    criterion = nn.BCEWithLogitsLoss()
    if config['pos_weight'] is not None:
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([config['pos_weight']]))
    
    train_hist, val_hist = train(model, optimizer, criterion, 
                                     train_dict, val_dict, config['max_obs_len'], config['batch_size'], 
                                     epochs=config['epochs'], window_marker=config['window_marker'])
    return model, train_hist, val_hist

In [13]:
model_config = {
    'ratios': (.7, .2, .1),
    'feature_dim': 39,
    'hidden_dim': 128,
    'output_dim': 30,
    'max_obs_len': 30,
    'batch_size': 16,
    'lr_rate': 1e-3,
    'pos_weight': None,
    'epochs': 5,
    'window_marker': 70
}

test_run(model_config)

Epoch:  1
t = 50, loss = 0.0474
t = 100, loss = 0.1204
t = 150, loss = 0.0166
t = 200, loss = 0.0338
Checking train accuracy!
Got 101962 / 105000 correct (97.11)
TP =  0 , FP =  1 , TN =  101962 , FN =  3037
Precision =  0 , Recall =  0 , F1 Score =  0

Checking val accuracy!
Got 29100 / 30000 correct (97.00)
TP =  0 , FP =  0 , TN =  29100 , FN =  900
Precision =  0 , Recall =  0 , F1 Score =  0


Epoch:  2
t = 50, loss = 0.0768
t = 100, loss = 0.0274
t = 150, loss = 0.1263
t = 200, loss = 0.0373
Checking train accuracy!
Got 101963 / 105000 correct (97.11)
TP =  0 , FP =  0 , TN =  101963 , FN =  3037
Precision =  0 , Recall =  0 , F1 Score =  0

Checking val accuracy!
Got 29100 / 30000 correct (97.00)
TP =  0 , FP =  0 , TN =  29100 , FN =  900
Precision =  0 , Recall =  0 , F1 Score =  0


Epoch:  3
t = 50, loss = 0.0270
t = 100, loss = 0.0815
t = 150, loss = 0.0763
t = 200, loss = 0.0770
Checking train accuracy!
Got 101963 / 105000 correct (97.11)
TP =  0 , FP =  0 , TN =  101963 ,

(SimpleLSTM(
   (rnn): RNN(39, 128, batch_first=True)
   (fc): Linear(in_features=128, out_features=30, bias=True)
 ),
 [97.10666666666667,
  97.10761904761904,
  97.10761904761904,
  97.10761904761904,
  97.10761904761904],
 [97.0, 97.0, 97.0, 97.0, 97.0])

In [15]:
model_config = {
    'ratios': (.7, .2, .1),
    'feature_dim': 39,
    'hidden_dim': 128,
    'output_dim': 30,
    'max_obs_len': 30,
    'batch_size': 16,
    'lr_rate': 1e-3,
    'pos_weight': 17,
    'epochs': 30,
    'window_marker': 70
}

model, train_hist, val_hist = test_run(model_config)

Epoch:  1
t = 50, loss = 0.3756
t = 100, loss = 2.2152
t = 150, loss = 0.5475
t = 200, loss = 0.3556
Checking train accuracy!
Got 79128 / 105000 correct (75.36)
TP =  1470 , FP =  24283 , TN =  77658 , FN =  1589
Precision =  0.05708072845882033 , Recall =  0.4805491990846682 , F1 Score =  0.10204081632653063

Checking val accuracy!
Got 22593 / 30000 correct (75.31)
TP =  422 , FP =  6916 , TN =  22171 , FN =  491
Precision =  0.05750885799945489 , Recall =  0.46221248630887185 , F1 Score =  0.1022906314386135


Epoch:  2
t = 50, loss = 0.7533
t = 100, loss = 1.5338
t = 150, loss = 0.7460
t = 200, loss = 0.4481
Checking train accuracy!
Got 84400 / 105000 correct (80.38)
TP =  1171 , FP =  18712 , TN =  83229 , FN =  1888
Precision =  0.05889453301815621 , Recall =  0.38280483818241257 , F1 Score =  0.10208351495074536

Checking val accuracy!
Got 24041 / 30000 correct (80.14)
TP =  327 , FP =  5373 , TN =  23714 , FN =  586
Precision =  0.057368421052631575 , Recall =  0.358159912376779

t = 50, loss = 1.2236
t = 100, loss = 0.3639
t = 150, loss = 0.5560
t = 200, loss = 0.4862
Checking train accuracy!
Got 101952 / 105000 correct (97.10)
TP =  501 , FP =  490 , TN =  101451 , FN =  2558
Precision =  0.5055499495459133 , Recall =  0.16377901274926446 , F1 Score =  0.2474074074074074

Checking val accuracy!
Got 29043 / 30000 correct (96.81)
TP =  128 , FP =  172 , TN =  28915 , FN =  785
Precision =  0.4266666666666667 , Recall =  0.140197152245345 , F1 Score =  0.21104699093157464


Epoch:  18
t = 50, loss = 0.2991
t = 100, loss = 0.4107
t = 150, loss = 0.3103
t = 200, loss = 1.2483
Checking train accuracy!
Got 96365 / 105000 correct (91.78)
TP =  1109 , FP =  6685 , TN =  95256 , FN =  1950
Precision =  0.14228894021041827 , Recall =  0.36253677672441975 , F1 Score =  0.20436745600294848

Checking val accuracy!
Got 27488 / 30000 correct (91.63)
TP =  244 , FP =  1843 , TN =  27244 , FN =  669
Precision =  0.1169142309535218 , Recall =  0.2672508214676889 , F1 Score =  0

In [13]:
model_config = {
    'ratios': (.8, .1, .1),
    'feature_dim': 39,
    'hidden_dim': 256,
    'output_dim': 50,
    'max_obs_len': 50,
    'batch_size': 32,
    'lr_rate': 1e-4,
    'pos_weight': 17,
    'epochs': 30,
    'window_marker': 70
}

model, train_hist, val_hist = test_run(model_config)

Epoch:  1
t = 50, loss = 1.2850
t = 100, loss = 1.4239
Checking train accuracy!
Got 167206 / 200000 correct (83.60)
TP =  1730 , FP =  27836 , TN =  165476 , FN =  4958
Precision =  0.058513157004667526 , Recall =  0.2586722488038278 , F1 Score =  0.09543774480057374

Checking val accuracy!
Got 20886 / 25000 correct (83.54)
TP =  198 , FP =  3497 , TN =  20688 , FN =  617
Precision =  0.053585926928281465 , Recall =  0.24294478527607363 , F1 Score =  0.0878048780487805


Epoch:  2
t = 50, loss = 1.6523
t = 100, loss = 1.5878
Checking train accuracy!
Got 171241 / 200000 correct (85.62)
TP =  1670 , FP =  23741 , TN =  169571 , FN =  5018
Precision =  0.06571957026484593 , Recall =  0.24970095693779903 , F1 Score =  0.10405308576591171

Checking val accuracy!
Got 21357 / 25000 correct (85.43)
TP =  192 , FP =  3020 , TN =  21165 , FN =  623
Precision =  0.05977584059775841 , Recall =  0.23558282208588957 , F1 Score =  0.09535634467345419


Epoch:  3
t = 50, loss = 0.7810
t = 100, loss = 

KeyboardInterrupt: 