In [1]:
import pickle5 as pickle

# Libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


# Evaluation

from sklearn.metrics import f1_score, multilabel_confusion_matrix, accuracy_score, classification_report
import seaborn as sns

In [2]:
CURR_PATH = os.getcwd()
PKL_PATH = CURR_PATH+'/pickles/'
DF_PATH = CURR_PATH +'/dataframes/'
TXT_DIR_TRAIN = CURR_PATH + '/txt_files/train'
destination_folder = CURR_PATH + '/training'

In [3]:
TXT_DIR_TRAIN = '/home/svetlana.maslenkova/LSTM/txt_files/train'

In [4]:
from tokenizers import  Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import os
from os.path import exists
import glob

# Training the tokenizer
if exists(CURR_PATH + '/aki_prediction/tokenizer.json'):
    print(f'Tokenizer is loaded from {CURR_PATH}')
    tokenizer = Tokenizer.from_file(CURR_PATH + '/aki_prediction/tokenizer.json')
else:
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(special_tokens=["[PAD]", "[UNK]"], min_frequency=3)
    files = glob.glob(TXT_DIR_TRAIN+'/*')
    tokenizer.train(files, trainer)
    os.environ["TOKENIZERS_PARALLELISM"] = "false"






In [19]:
tokenizer.save('/home/svetlana.maslenkova/LSTM' + '/tokenizer.json')

In [5]:
tokenizer.get_vocab_size()

21675

In [6]:
output = tokenizer.encode('D72190')
output.tokens

['D', '72190']

### Loading 

In [5]:
def load_checkpoint(load_path, model, optimizer):

    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    model.load_state_dict(state_dict['model_state_dict'])
    optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    
    return state_dict['valid_loss']

In [21]:
max_length = {'demographics':5, 'lab_tests':400, 'vitals':200, 'medications':255}

# Baseline model

In [9]:
# baseline model
class MyDataset(Dataset):

    def __init__(self, df, tokenizer, max_length, max_days):
        self.df = df
        self.tokenizer = tokenizer
        self.max_days = max_days
        self.max_len_demo = max_length['demographics']
        self.max_len_labs = max_length['lab_tests']
        self.max_len_vitals = max_length['vitals']
        self.max_len_meds = max_length['medications']
        
    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):

        return self.make_matrices(idx, self.max_days)
    
    def tokenize(self, text, max_length):
        
        output = self.tokenizer.encode(text)
        # padding and truncation
        if len(output.ids) < max_length:
            len_missing_token = max_length - len(output.ids)
            padding_vec = [self.tokenizer.token_to_id('[PAD]') for _ in range(len_missing_token)]
            token_output = [*output.ids, *padding_vec]
        elif len(output.ids) > max_length:
            token_output = output.ids[:max_length]
        else:
            token_output = output.ids
        
        return token_output

    def make_matrices(self, idx, max_days):
        info_demo = self.df.demographics_in_visit.values[idx][0]
        info_med = self.df.medications_in_visit.values[idx]
        info_vitals = self.df.vitals_in_visit.values[idx]
        info_labs = self.df.lab_tests_in_visit.values[idx]
        aki_status = self.df.aki_status_in_visit.values[idx]
        days = list(self.df.days.values[idx])

        aki_happened = False
        labels = []
        info_med_list = []
        info_vitals_list = []
        info_labs_list = []

        for day in range(max_days+1):
            # if aki happened - pad labels and info for next days 
            if (day not in days) or (aki_happened==True):
                labels.append(0)
                info_med_list.append(self.tokenize('', self.max_len_meds))
                info_vitals_list.append(self.tokenize('', self.max_len_vitals))
                info_labs_list.append(self.tokenize('', self.max_len_labs))

            else:
                i = days.index(day)
                # indicate that aki happened
                if aki_status[i] == 1:
                    aki_happened = True

                labels.append(aki_status[i])
                if str(info_med[i]) == 'nan':
                    info_med_list.append(self.tokenize('[PAD]', self.max_len_meds))
                else:
                    info_med_list.append(self.tokenize(info_med[i], self.max_len_meds))

                if str(info_vitals[i]) == 'nan':
                    info_vitals_list.append(self.tokenize('[PAD]', self.max_len_vitals))
                else:
                    info_vitals_list.append(self.tokenize(info_vitals[i], self.max_len_vitals))

                if str(info_labs[i]) == 'nan':
                    info_labs_list.append(self.tokenize('[PAD]', self.max_len_labs))
                else:
                    info_labs_list.append(self.tokenize(info_labs[i], self.max_len_labs))
                    
        info_demo = self.tokenize(info_demo,  self.max_len_demo)

        #make tensors
        tensor_demo = torch.tensor(info_demo, dtype=torch.int32)
        tensor_med = torch.tensor(info_med_list, dtype=torch.int32)
        tensor_vitals = torch.tensor(info_vitals_list, dtype=torch.int32)
        tensor_labs = torch.tensor(info_labs_list, dtype=torch.int32)
        tensor_labels = torch.tensor(labels, dtype=torch.int32)
        return (tensor_demo, tensor_med, tensor_vitals, tensor_labs), tensor_labels



class LSTM_model(nn.Module):

    def __init__(self, H=128, max_length=max_length, max_day=13):
        super(LSTM_model, self).__init__()

		# Hyperparameters
        self.max_day = max_day
        L = (self.max_day+1) * (256 + 256 + 512) + 1280
        self.H = H
        self.max_length = max_length

        self.embedding = nn.Embedding(vocab_size, embedding_size)

        self.fc_med = nn.Linear(max_length['medications'] * 2 * self.H, 256)  #65,280
        self.fc_vit = nn.Linear(max_length['vitals'] * 2 * self.H, 256)   #51,200
        self.fc_lab = nn.Linear(max_length['lab_tests'] * 2 * self.H, 512) #102,400

        self.lstm_day = nn.LSTM(input_size=embedding_size,
                            hidden_size=self.H,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)

        self.fc_1 = nn.Linear((self.max_day+1) * (256 + 256 + 512) + 1280, 2048)

        self.lstm_adm = nn.LSTM(input_size=2048,
                            hidden_size=self.H,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=False)

        self.drop = nn.Dropout(p=0.5)

        self.fc_2 = nn.Linear(self.H, (self.max_day+1))
        
        # self.sigmoid = nn.Sigmoid()


    def forward(self, tensor_demo, tensor_med, tensor_vitals, tensor_labs):

        batch_size = tensor_med.size()[0]
        days = self.max_day + 1

        out_emb_med_demo = self.embedding(tensor_demo.squeeze(1))
        output_lstm_day_demo, _ = self.lstm_day(out_emb_med_demo)
        full_output = output_lstm_day_demo.reshape(batch_size, self.max_length['demographics']* 2 * self.H)


        for d in range(days):
            # if i > 1:
            #     break
            # embedding layer applied to all tensors
            out_emb_med = self.embedding(tensor_med[:, d, :].squeeze(1))
            out_emb_vitals = self.embedding(tensor_vitals[:, d, :].squeeze(1))
            out_emb_labs =  self.embedding(tensor_labs[:, d, :].squeeze(1))
            # lstm layer applied to embedded tensors
            output_lstm_day_med = self.fc_med(\
                                    self.lstm_day(out_emb_med)[0]\
                                        .reshape(batch_size, max_length['medications'] * 2 * self.H))

            output_lstm_day_vitals = self.fc_vit(\
                                        self.lstm_day(out_emb_vitals)[0]\
                                            .reshape(batch_size,  max_length['vitals'] * 2 * self.H))

            output_lstm_day_labs = self.fc_lab(\
                                    self.lstm_day(out_emb_labs)[0]\
                                        .reshape(batch_size, max_length['lab_tests']* 2 * self.H))
                                        
            # concatenate for all 26 days
            full_output = torch.cat((full_output, \
                                        output_lstm_day_med,\
                                            output_lstm_day_vitals,\
                                                output_lstm_day_labs), dim=1)
        
        # print('full_output size: ', full_output.size())
        output = self.fc_1(full_output)
        output, _ = self.lstm_adm(output)
        output = self.drop(output)
        output = self.fc_2(output)
        output = torch.squeeze(output, 1)
        # output = self.sigmoid(output)

        return output


class LSTM_model(nn.Module):

    def __init__(self, max_length, pred_window, vocab_size, H=128,  max_day=7, embedding_size=200):
        super(LSTM_model, self).__init__()

		# Hyperparameters
        self.max_day = max_day
        self.pred_window = pred_window
        L = (self.max_day+1) * (256 + 256 + 512) + 1280
        self.H = H
        self.max_length = max_length
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size

        # self.embedding = pretrained_model
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)
    

        self.fc_med = nn.Linear(max_length['medications'] * 2 * self.H, 256)  #65,280
        self.fc_vit = nn.Linear(max_length['vitals'] * 2 * self.H, 256)   #51,200
        self.fc_lab = nn.Linear(max_length['lab_tests'] * 2 * self.H, 512) #102,400

        self.lstm_day = nn.LSTM(input_size=embedding_size,
                            hidden_size=self.H,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)

        self.fc_1 = nn.Linear((self.max_day - self.pred_window) * (256 + 256 + 512) + max_length['demographics']*2*H, 2048)

        self.lstm_adm = nn.LSTM(input_size=2048,
                            hidden_size=self.H,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=False)

        self.drop = nn.Dropout(p=0.5)

        self.fc_2 = nn.Linear(self.H, (self.max_day - self.pred_window))
        
        # self.sigmoid = nn.Sigmoid()


    def forward(self, tensor_demo, tensor_med, tensor_vitals, tensor_labs):

        batch_size = tensor_med.size()[0]
        days = self.max_day

        out_emb_med_demo = self.embedding(tensor_demo.squeeze(1))
        output_lstm_day_demo, _ = self.lstm_day(out_emb_med_demo)
        full_output = output_lstm_day_demo.reshape(batch_size, self.max_length['demographics']* 2 * self.H)


        for d in range(days - self.pred_window):
            # embedding layer applied to all tensors
            out_emb_med = self.embedding(tensor_med[:, d, :].squeeze(1))
            out_emb_vitals = self.embedding(tensor_vitals[:, d, :].squeeze(1))
            out_emb_labs =  self.embedding(tensor_labs[:, d, :].squeeze(1))
            # lstm layer applied to embedded tensors
            output_lstm_day_med = self.fc_med(\
                                    self.lstm_day(out_emb_med)[0]\
                                        .reshape(batch_size, max_length['medications'] * 2 * self.H))

            output_lstm_day_vitals = self.fc_vit(\
                                        self.lstm_day(out_emb_vitals)[0]\
                                            .reshape(batch_size,  max_length['vitals'] * 2 * self.H))

            output_lstm_day_labs = self.fc_lab(\
                                    self.lstm_day(out_emb_labs)[0]\
                                        .reshape(batch_size, max_length['lab_tests']* 2 * self.H))
                                        
            # concatenate for all * days
            full_output = torch.cat((full_output, \
                                        output_lstm_day_med,\
                                            output_lstm_day_vitals,\
                                                output_lstm_day_labs), dim=1)
        
        # print('full_output size: ', full_output.size())
        output = self.fc_1(full_output)
        output, _ = self.lstm_adm(output)
        output = self.drop(output)
        output = self.fc_2(output)
        output = torch.squeeze(output, 1)
        # if self.criterion == 'BCELoss':
        #     output = self.sigmoid(output)

        return output

# Pretrained model

In [10]:
# pretrained model
max_length = {'demographics':5, 'lab_tests':400, 'vitals':200, 'medications':255}
class MyDataset(Dataset):

    def __init__(self, df, tokenizer, max_length, max_days, pred_window):
        self.df = df
        self.tokenizer = tokenizer
        self.max_days = max_days
        self.pred_window = pred_window
        self.max_len_demo = max_length['demographics']
        self.max_len_labs = max_length['lab_tests']
        self.max_len_vitals = max_length['vitals']
        self.max_len_meds = max_length['medications']
        
    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):

        return self.make_matrices(idx, self.max_days)
    
    def tokenize(self, text, max_length):
        
        output = self.tokenizer.encode(text)
        # padding and truncation
        if len(output.ids) < max_length:
            len_missing_token = max_length - len(output.ids)
            padding_vec = [self.tokenizer.token_to_id('[PAD]') for _ in range(len_missing_token)]
            token_output = [*output.ids, *padding_vec]
        elif len(output.ids) > max_length:
            token_output = output.ids[:max_length]
        else:
            token_output = output.ids
        
        return token_output

    def make_matrices(self, idx, max_days):
        info_demo = self.df.demographics_in_visit.values[idx][0]
        info_med = self.df.medications_in_visit.values[idx]
        info_vitals = self.df.vitals_in_visit.values[idx]
        info_labs = self.df.lab_tests_in_visit.values[idx]
        aki_status = self.df.aki_status_in_visit.values[idx]
        days = self.df.days.values[idx]

        aki_happened = False
        labels = []
        info_med_list = []
        info_vitals_list = []
        info_labs_list = []

        for day in range(days[0], days[0]+ max_days - self.pred_window):
            if day not in days:
                labels.append(0)
                info_med_list.append(self.tokenize('', self.max_len_meds))
                info_vitals_list.append(self.tokenize('', self.max_len_vitals))
                info_labs_list.append(self.tokenize('', self.max_len_labs))
            else:
                i = days.index(day)
                
                if (day + self.pred_window) not in days:
                    labels.append(0)
                else:              
                    if np.isnan(aki_status[i + self.pred_window]):
                        labels.append(0)
                    else:
                        labels.append(aki_status[i + self.pred_window])

                if str(info_med[i]) == 'nan':
                    info_med_list.append(self.tokenize('[PAD]', self.max_len_meds))
                else:
                    info_med_list.append(self.tokenize(info_med[i], self.max_len_meds))

                if str(info_vitals[i]) == 'nan':
                    info_vitals_list.append(self.tokenize('[PAD]', self.max_len_vitals))
                else:
                    info_vitals_list.append(self.tokenize(info_vitals[i], self.max_len_vitals))

                if str(info_labs[i]) == 'nan':
                    info_labs_list.append(self.tokenize('[PAD]', self.max_len_labs))
                else:
                    info_labs_list.append(self.tokenize(info_labs[i], self.max_len_labs))
                    
        info_demo = self.tokenize(info_demo,  self.max_len_demo)

        #make tensors
        tensor_demo = torch.tensor(info_demo, dtype=torch.int32)
        tensor_med = torch.tensor(info_med_list, dtype=torch.int32)
        tensor_vitals = torch.tensor(info_vitals_list, dtype=torch.int32)
        tensor_labs = torch.tensor(info_labs_list, dtype=torch.int32)
        tensor_labels = torch.tensor(labels, dtype=torch.int32)
        return (tensor_demo, tensor_med, tensor_vitals, tensor_labs), tensor_labels
   
class EHR_Embedding(nn.Module):
    def __init__(self, embedding_size, vocab_size=15463, drop=0.1):
        super(EHR_Embedding, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        
        self.projection = nn.Sequential(
            nn.ReLU(),
            nn.Linear(in_features=embedding_size, out_features=embedding_size)
        )
        self.drop = nn.Dropout(p=drop)
        
    def forward(self, tensor_demo, tensor_med, tensor_vitals, tensor_labs):   
        batch_size = tensor_med.size()[0]

        # first traansformation
        emb_demo_X = self.drop(self.embedding(tensor_demo.squeeze(1)))
        emb_med_X = self.drop(self.embedding(tensor_med[:,:].squeeze(1)))
        emb_vitals_X = self.drop(self.embedding(tensor_vitals[:,:].squeeze(1)))
        emb_labs_X =  self.drop(self.embedding(tensor_labs[:,:].squeeze(1)))

        projection_demo_X = self.projection(emb_demo_X)
        projection_med_X = self.projection(emb_med_X)
        projection_vitals_X = self.projection(emb_vitals_X)
        projection_labs_X = self.projection(emb_labs_X)

        embedding_X = (emb_demo_X, emb_med_X, emb_vitals_X, emb_labs_X)
        projection_X = (projection_demo_X, projection_med_X, projection_vitals_X, projection_labs_X)

        # second transformation
        emb_demo_Y = self.drop(self.embedding(tensor_demo.squeeze(1)))
        emb_med_Y = self.drop(self.embedding(tensor_med[:,:].squeeze(1)))
        emb_vitals_Y = self.drop(self.embedding(tensor_vitals[:,:].squeeze(1)))
        emb_labs_Y =  self.drop(self.embedding(tensor_labs[:,:].squeeze(1)))

        projection_demo_Y = self.projection(emb_demo_Y)
        projection_med_Y = self.projection(emb_med_Y)
        projection_vitals_Y = self.projection(emb_vitals_Y)
        projection_labs_Y = self.projection(emb_labs_Y)

        embedding_Y = (emb_demo_Y, emb_med_Y, emb_vitals_Y, emb_labs_Y)
        projection_Y = (projection_demo_Y, projection_med_Y, projection_vitals_Y, projection_labs_Y)

        return embedding_X, projection_X, embedding_Y, projection_Y

## Testing

In [None]:
# device = 'cuda'

res = evaluate(test_model, test_loader, threshold=0.5, log_res=False)

In [None]:
def evaluate(model, test_loader, threshold=0.5, log_res=True):
    
    stacked_labels = torch.tensor([]).to(device)
    stacked_preds = torch.tensor([]).to(device)
    model.eval()
    step = 1
    correct_preds = []

    with torch.no_grad():
        for (tensor_demo, tensor_med, tensor_vitals, tensor_labs), tensor_labels in test_loader:
            print(f'Step {step}/{len(test_loader)}' )
            labels = tensor_labels.to(device)
            demo = tensor_demo.to(device)
            med = tensor_med.to(device)
            vitals = tensor_vitals.to(device)
            labs = tensor_labs.to(device)

            output = model(demo, med, vitals, labs)
            output = nn.Sigmoid()(output)
            output = (output > threshold).int()

            # stacking labels and predictions
            stacked_labels = torch.cat([stacked_labels, labels], dim=0, )
            stacked_preds = torch.cat([stacked_preds, output], dim=0, )

            get_list_correct_preds(output, labels, correct_preds)
            step += 1

    # calculate accuracy
    acc = np.round(np.sum(correct_preds) / len(correct_preds), 2)
    # transfer to device
    stacked_labels = stacked_labels.cpu().detach().numpy()
    stacked_preds = stacked_preds.cpu().detach().numpy()
    # get classification metrics for all samples in the test set
    classification_report_res = classification_report(stacked_labels, stacked_preds, zero_division=0, output_dict=True)
    # classification_report_res.update({'epoch':epoch+1})

    if log_res:
        for key, value in classification_report_res.items():
            # wandb.log({key:value, 'epoch':epoch+1})
            wandb.log({key+'test':value})
        # wandb.log({'epoch':epoch+1, 'accuracy':acc})
        
    return classification_report_res, acc

In [None]:
sigmoid = nn.Sigmoid()
threshold = 0.5
correct_preds = []
stacked_labels = torch.tensor([]).to(device)
stacked_preds = torch.tensor([]).to(device)
step = 1
for (tensor_demo, tensor_med, tensor_vitals, tensor_labs), tensor_labels in test_loader:
    print(f'step {step}/{len(test_loader)}')
    labels = tensor_labels.to(device)
    demo = tensor_demo.to(device)
    med = tensor_med.to(device)
    vitals = tensor_vitals.to(device)
    labs = tensor_labs.to(device)

    output = test_model(demo, med, vitals, labs)
    output = sigmoid(output)
    # get_list_correct_preds(output, labels, correct_preds,  threshold)
    output = (output > threshold).int()
    # stacking labels and predictions
    stacked_labels = torch.cat([stacked_labels, labels], dim=0, )
    stacked_preds = torch.cat([stacked_preds, output], dim=0, )
    step += 1
    # if step > 9:break
    

# Fixed model

In [6]:
#paths
print('Filtering admissions...')
CURR_PATH = os.getcwd()
PKL_PATH = CURR_PATH+'/pickles/'
DF_PATH = CURR_PATH +'/dataframes/'

# loading the data
with open(DF_PATH + 'pid_train_df_finetuning_6days_aki.pkl', 'rb') as f:
    pid_train_df = pickle.load(f)

with open(DF_PATH + 'pid_val_df_finetuning_6days_aki.pkl', 'rb') as f:
    pid_val_df = pickle.load(f)

with open(DF_PATH + 'pid_test_df_finetuning_6days_aki.pkl', 'rb') as f:
    pid_test_df = pickle.load(f)

observing_window = 3 

train_admissions = []
for adm in pid_train_df.hadm_id.unique():   
    if ({1,2,3,4}.issubset(set(pid_train_df[pid_train_df.hadm_id==adm].days.values[0])) or \
        {-1,0,1,2}.issubset(set(pid_train_df[pid_train_df.hadm_id==adm].days.values[0]))or \
            {0,1,2,3}.issubset(set(pid_train_df[pid_train_df.hadm_id==adm].days.values[0]))) and \
        (len(pid_train_df[pid_train_df.hadm_id==adm].days.values[0])>3) and\
            sum(pid_train_df[pid_train_df.hadm_id==adm].aki_status_in_visit.values[0][:observing_window])==0:
        train_admissions.append(adm)

val_admissions = []
for adm in pid_val_df.hadm_id.unique():   
    if ({1,2,3,4}.issubset(set(pid_val_df[pid_val_df.hadm_id==adm].days.values[0])) or \
        {-1,0,1,2}.issubset(set(pid_val_df[pid_val_df.hadm_id==adm].days.values[0]))or \
            {0,1,2,3}.issubset(set(pid_val_df[pid_val_df.hadm_id==adm].days.values[0]))) and \
        (len(pid_val_df[pid_val_df.hadm_id==adm].days.values[0])>3) and\
            sum(pid_val_df[pid_val_df.hadm_id==adm].aki_status_in_visit.values[0][:observing_window])==0:
        val_admissions.append(adm)

test_admissions = []
for adm in pid_test_df.hadm_id.unique():   
    if ({1,2,3,4}.issubset(set(pid_test_df[pid_test_df.hadm_id==adm].days.values[0])) or \
        {-1,0,1,2}.issubset(set(pid_test_df[pid_test_df.hadm_id==adm].days.values[0]))or \
            {0,1,2,3}.issubset(set(pid_test_df[pid_test_df.hadm_id==adm].days.values[0]))) and \
        (len(pid_test_df[pid_test_df.hadm_id==adm].days.values[0])>3) and\
            sum(pid_test_df[pid_test_df.hadm_id==adm].aki_status_in_visit.values[0][:observing_window])==0:
        test_admissions.append(adm)

print('train_admissions', len(train_admissions))
print('val_admissions', len(val_admissions))
print('test_admissions', len(test_admissions))

pid_train_df = pid_train_df[pid_train_df.hadm_id.isin(train_admissions)]
pid_val_df = pid_val_df[pid_val_df.hadm_id.isin(val_admissions)]
pid_test_df = pid_test_df[pid_test_df.hadm_id.isin(test_admissions)]

Filtering admissions...


FileNotFoundError: [Errno 2] No such file or directory: '/home/svetlana.maslenkova/LSTM/aki_prediction/other/dataframes/pid_train_df_finetuning_6days_aki.pkl'

In [27]:
min_frequency = 5
print('Training tokenizer...')
os.environ["TOKENIZERS_PARALLELISM"] = "true"
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[PAD]", "[UNK]"], min_frequency=min_frequency)
files = glob.glob(TXT_DIR_TRAIN+'/*')
tokenizer.train(files, trainer)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
print(f'Vocab size is {tokenizer.get_vocab_size()}')

Training tokenizer...



Vocab size is 28577


In [9]:
output = tokenizer.encode('D72190')
output.tokens

['D', '72190']

In [106]:
class MyDataset(Dataset):

    def __init__(self, df, tokenizer, max_length_day=400, pred_window=2, observing_window=3):
        self.df = df
        self.tokenizer = tokenizer
        self.observing_window = observing_window
        self.pred_window = pred_window
        self.max_length_day = max_length_day
        self.max_length_diags = 30

        
    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):

        return self.make_matrices(idx)
    
    def tokenize(self, text, max_length):
        
        try:
            output = self.tokenizer.encode(text)
        except:
            print(type(text), text, max_length)
            output = self.tokenizer.encode(text)

        # padding and truncation
        if len(output.ids) < max_length:
            len_missing_token = max_length - len(output.ids)
            padding_vec = [self.tokenizer.token_to_id('[PAD]') for _ in range(len_missing_token)]
            token_output = [*output.ids, *padding_vec]
        elif len(output.ids) > max_length:
            token_output = output.ids[:max_length]
        else:
            token_output = output.ids
        
        return token_output

    def make_matrices(self, idx):
        
        day_info = self.df.days_in_visit.values[idx]
        diagnoses_info = self.df.previous_diagnoses.values[idx][0]
        aki_status = self.df.aki_status_in_visit.values[idx]
        days = self.df.days.values[idx]
        # print(idx)

        labels = []
        day_info_list = []
        label = None

        for day in range(days[0], days[0] + self.observing_window + self.pred_window):
            # print('day', day)
            if day not in days:
                labels.append(0)
                day_info_list.append(self.tokenize('', self.max_length_day))
            else:
                i = days.index(day)
                
                if np.isfinite(aki_status[i]):                    
                    labels.append(aki_status[i])
                else:
                    labels.append(0)

                if (str(day_info[i]) == 'nan') or (day_info[i] == np.nan):
                    day_info_list.append(self.tokenize('[PAD]', self.max_length_day))
                else:
                    day_info_list.append(self.tokenize(day_info[i], self.max_length_day))


        if sum(labels[-self.pred_window:]) > 0:
            label = 1
        else:
            label = 0

        if (str(diagnoses_info) == 'nan') or (diagnoses_info == np.nan):
            diagnoses_info = self.tokenize('[PAD]', self.max_length_diags)
        else:
            diagnoses_info = self.tokenize(diagnoses_info, self.max_length_diags)

        #make tensors
        tensor_day = torch.tensor(day_info_list[:self.observing_window], dtype=torch.int64)
        tensor_diags = torch.tensor(diagnoses_info, dtype=torch.int64)
        # tensor_labels = torch.tensor(labels[- self.pred_window:], dtype=torch.float64)
        tensor_labels = torch.tensor(label, dtype=torch.float64)
    

        return tensor_day, tensor_diags, tensor_labels, idx


class EHR_MODEL(nn.Module):
    def __init__(self, max_length, vocab_size, device, pred_window=2, observing_window=3,  H=128, embedding_size=200):
        super(EHR_MODEL, self).__init__()

        self.observing_window = observing_window
        self.pred_window = pred_window
        self.H = H
        self.max_length = max_length
        self.max_length_diags = 30
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.device = device

        # self.embedding = pretrained_model
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)

        self.lstm_day = nn.LSTM(input_size=embedding_size,
                            hidden_size=self.H,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)

        self.fc_day = nn.Linear(self.max_length * 2 * self.H, 2048)

        self.fc_adm = nn.Linear(2048*self.observing_window +  self.max_length_diags * 2 * self.H, 2048)

        self.lstm_adm = nn.LSTM(input_size=2048,
                            hidden_size=self.H,
                            num_layers=2,
                            batch_first=True,
                            bidirectional=True)

        self.drop = nn.Dropout(p=0.5)

        self.fc_2 = nn.Linear(self.H*2, 1)

    def forward(self, tensor_day, tensor_diagnoses):

        batch_size = tensor_day.size()[0]

        # full_output = torch.tensor([]).to(device=self.device)
        out_emb_diags = self.embedding(tensor_diagnoses.squeeze(1))
        out_lstm_diags, _ = self.lstm_day(out_emb_diags)
        full_output = out_lstm_diags.reshape(batch_size, self.max_length_diags * 2 * self.H)
        

        for d in range(self.observing_window):
            # embedding layer applied to all tensors [16,400,200]
            out_emb = self.embedding(tensor_day[:, d, :].squeeze(1))
            # print('out_emb', out_emb.size())

            # lstm layer applied to embedded tensors
            output_lstm_day= self.fc_day(\
                                    self.lstm_day(out_emb)[0]\
                                        .reshape(batch_size, self.max_length * 2 * self.H))

            # print('output_lstm_day', output_lstm_day.size())                   
            # concatenate for all * days
            full_output = torch.cat([full_output, output_lstm_day], dim=1) # [16, 768]

        # print('full_output size: ', full_output.size(), '\n')
        output = self.fc_adm (full_output)
        # print('output after fc_adm size: ', output.size(), '\n')
        output, _ = self.lstm_adm(output)
        # print('output after lstm_adm', output.size())
        output = self.drop(output)
        output = self.fc_2(output)
        # print('output after fc_2', output.size())
        output = torch.squeeze(output, 1)

        # output = nn.Sigmoid()(output)

        return output

In [11]:
# with open(DF_PATH + 'pid_test_df.pkl', 'rb') as f:
#    pid_test_df = pickle.load(f)
# print('test set shape: ', pid_test_df.shape)

# with open(DF_PATH + 'pid_test_df_finetuning.pkl', 'rb') as f:
#    pid_test_df = pickle.load(f)

with open(DF_PATH + 'pid_test_df_finetuning_6days_aki.pkl', 'rb') as f:
    pid_test_df = pickle.load(f)

print('test set shape: ', pid_test_df.shape)

test set shape:  (1755, 10)


In [103]:
from torch.utils.data import DataLoader

vocab_size = tokenizer.get_vocab_size() #+ 1
embedding_size = 200
dimension = 128
BATCH_SIZE = 64

frac = 1
train_dataset = MyDataset(pid_train_df.sample(frac=frac), tokenizer=tokenizer, max_length_day=400)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = MyDataset(pid_val_df.sample(frac=frac), tokenizer=tokenizer, max_length_day=400)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = MyDataset(pid_test_df.sample(frac=frac), tokenizer=tokenizer, max_length_day=400)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

Load trained model and evaluate

In [None]:
# test_model = LSTM_model(max_day=13).to(device)
# test_model = LSTM_model(max_length=max_length, H=128, max_day=max_days, pred_window=pred_window, vocab_size=vocab_size).to(device)
# optimizer = optim.Adam(test_model.parameters(), lr=0.00001)

# load_checkpoint(file_path + '/model.pt', test_model, optimizer)


In [27]:
def get_list_correct_preds(output, target, correct_preds):
    # correct_preds - the list of size (number of samples) where 1 when predicted value 
    # is equal to target, othrwise 0.

    # predicted = (output > threshold).int()
    predicted = output
    for i in range(target.size(0)):
        true = target[i].cpu().numpy()
        pred = predicted[i].cpu().numpy()

        if any(target[i] == 1):
            idx_of_one = np.where(true==1)[0][0]
            if (pred[idx_of_one] == 1) & all(pred[:idx_of_one] == 0):
                correct_preds.append(1)
            else:
                correct_preds.append(0)
        
        else:
            if all(pred == 0):
                correct_preds.append(1)
            else:
                correct_preds.append(0)        
                       
    return correct_preds

In [14]:
def evaluate(model, test_loader, device, threshold=0.5, log_res=True):
    model = model.to(device)
    stacked_labels = torch.tensor([]).to(device)
    stacked_preds = torch.tensor([]).to(device)
    
    model.eval()
    step = 1
    with torch.no_grad():
        for tensor_day, tensor_diags, tensor_labels, idx in test_loader:
            # print(f'Step {step}/{len(test_loader)}' )
            labels = tensor_labels.to(device)
            day_info = tensor_day.to(device)
            tensor_diags = tensor_diags.to(device)

            output = model(day_info, tensor_diags)
            output = nn.Sigmoid()(output)
            output = (output > threshold).int()

            # stacking labels and predictions
            stacked_labels = torch.cat([stacked_labels, labels], dim=0, )
            stacked_preds = torch.cat([stacked_preds, output], dim=0, )

            step += 1

    # calculate accuracy
    acc = torch.round(torch.sum(stacked_labels==stacked_preds) / len(stacked_labels), decimals=2)
    # transfer to device
    stacked_labels = stacked_labels.cpu().detach().numpy()
    stacked_preds = stacked_preds.cpu().detach().numpy()
    # get classification metrics for all samples in the test set
    classification_report_res = classification_report(stacked_labels, stacked_preds, zero_division=0, output_dict=True)
    print(classification_report(stacked_labels, stacked_preds, zero_division=0, output_dict=False))
    if log_res:
        for k_day, v_day in classification_report_res.items():
            if k_day is not 'accuracy':
                for k, v in v_day.items():
                    if k is not 'support':
                        wandb.log({"test_" + k + k_day : v})
                        # print("test_" + k +'_'+ k_day, v)
            else:
                # print('accuracy', v_day)
                wandb.log({"test_" + k_day: v_day})

    return classification_report_res, acc

### Testing

In [137]:
res = evaluate(test_model, test_loader, device=device, threshold=0.3, log_res=False)

              precision    recall  f1-score   support

           0       0.04      0.01      0.02       146
           1       0.19      0.12      0.15       193

   micro avg       0.15      0.07      0.10       339
   macro avg       0.12      0.07      0.08       339
weighted avg       0.13      0.07      0.09       339
 samples avg       0.02      0.02      0.02       339



In [30]:
res[0].keys()

dict_keys(['0', '1', '2', '3', '4', '5', 'micro avg', 'macro avg', 'weighted avg', 'samples avg'])

In [None]:
logs = {}
for k_day, v_day in res[0].items():
    if k_day is not 'epoch':
        for k, v in v_day.items():
            print("test_" + k + "_day_" + k_day, v)
            print(f'-------------------')
    #     v_day.update({'day':k_day})

### Test trained model and compare against randomly initialized model

Trained model

In [15]:
file_path = '/l/users/svetlana.maslenkova/models/finetuning/embeddings/FT_PREemb_DIAGS_11k_lr1e-05_h128_pw2_ow3_wd0__drop0.4'
test_model = EHR_MODEL(vocab_size=vocab_size, max_length=400, device=device, pred_window=2, observing_window=3).to(device)
optimizer = optim.Adam(test_model.parameters(), lr=0.00001)
load_checkpoint(file_path + '/model.pt', test_model, optimizer)

Model loaded from <== /l/users/svetlana.maslenkova/models/finetuning/embeddings/FT_PREemb_DIAGS_11k_lr1e-05_h128_pw2_ow3_wd0__drop0.4/model.pt


0.4763745665550232

In [129]:
from sklearn.metrics import precision_recall_curve, f1_score, recall_score, precision_score, confusion_matrix
import wandb

def evaluate(model, test_loader, device, threshold=None, log_res=False):
    model = model.to(device)
    stacked_labels = torch.tensor([]).to(device)
    stacked_probs = torch.tensor([]).to(device)
    
    model.eval()
    step = 1
    with torch.no_grad():
        for tensor_day, tensor_diags, tensor_labels, idx in test_loader:
            # print(f'Step {step}/{len(test_loader)}' )
            labels = tensor_labels.to(device)
            day_info = tensor_day.to(device)
            tensor_diags = tensor_diags.to(device)

            probs = test_model(day_info, tensor_diags)
            probs = nn.Sigmoid()(probs)
            # output = (probs > threshold).int()

            # stacking labels and predictions
            stacked_labels = torch.cat([stacked_labels, labels], dim=0, )
            # stacked_preds = torch.cat([stacked_preds, output], dim=0, )
            stacked_probs = torch.cat([stacked_probs, probs], dim=0, )
            step += 1
            
    # transfer to device
    if device=='cpu':
        stacked_labels = stacked_labels.cpu().detach().numpy()
        stacked_probs = stacked_probs.cpu().detach().numpy()

    if threshold==None:
        if stacked_labels.ndim > 1:
            precision, recall, thresholds = precision_recall_curve(stacked_labels[:].sum(axis=1)>0, np.max(stacked_probs, axis=1))
        else:
            precision, recall, thresholds = precision_recall_curve(stacked_labels, stacked_probs)
            
        # convert to f score
        fscore = (2 * precision * recall) / (precision + recall)
        # locate the index of the largest f score
        ix = np.argmax(np.nan_to_num(fscore))
        threshold = np.round(thresholds[ix], 2)
        print('Best Threshold=%.2f, F-Score=%.2f' % (threshold, fscore[ix]))

    stacked_preds = (stacked_probs > threshold).astype(int)
    if stacked_labels.ndim > 1:
        y_true = (stacked_labels[:].sum(axis=1)>0) 
        y_pred = (stacked_preds[:].sum(axis=1)>0)
    else:
        y_true = stacked_labels
        y_pred = stacked_preds

    accuracy = np.round(accuracy_score(y_true, y_pred), 2)
    print(f'Accuracy: {accuracy}')

    f1_score_ = np.round(f1_score(y_true, y_pred, pos_label=1, average='binary'), 2)
    print(f'F1: ', f1_score_)

    recall_score_ = np.round(recall_score(y_true, y_pred, pos_label=1, average='binary'), 2)
    print(f'Recall: ', recall_score_)

    precision_score_ = np.round(precision_score(y_true, y_pred, pos_label=1, average='binary'), 2)
    print(f'Precision: ', precision_score_)

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity =  np.round(tn / (tn + fp), 2)
    print(f'Specificity: ', specificity)

    print(f'Confusion matrix:\n', confusion_matrix(y_true, y_pred))

    if log_res:
        wandb.log({'test_accuracy':accuracy, 'test_f1_score':f1_score_, 'test_recall_score':recall_score_, 'test_precision_score':precision_score_, 'test_specificity':specificity})

    # get classification metrics for all samples in the test set
    classification_report_res = classification_report(stacked_labels, stacked_preds, zero_division=0, output_dict=True)
    print(classification_report(stacked_labels, stacked_preds, zero_division=0, output_dict=False))
    # if log_res:
    #     for k_day, v_day in classification_report_res.items():
    #         if k_day is not 'accuracy':
    #             for k, v in v_day.items():
    #                 if k is not 'support':
    #                     wandb.log({"test_" + k + k_day : v})
    #                     # print("test_" + k +'_'+ k_day, v)
    #         else:
    #             # print('accuracy', v_day)
    #             wandb.log({"test_" + k_day: v_day})

    return {'accuracy':accuracy, 'f1_score':f1_score_, 'recall_score':recall_score_, 'precision_score':precision_score_, 'specificity':specificity}

In [None]:
result = evaluate(test_model, test_loader, device='cpu', log_res=False)

In [26]:
sigmoid = nn.Sigmoid()
stacked_labels = torch.tensor([]).to(device)
stacked_probs = torch.tensor([]).to(device)
step = 1
for tensor_day, tensor_diags, tensor_labels, idx in test_loader:
    labels = tensor_labels.to(device)
    day_info = tensor_day.to(device)
    tensor_diags = tensor_diags.to(device)

    probs = test_model(day_info, tensor_diags)
    probs = nn.Sigmoid()(probs)

    # stacking labels and predictions
    stacked_labels = torch.cat([stacked_labels, labels], dim=0, )
    stacked_probs = torch.cat([stacked_probs, probs], dim=0, )
    step += 1
    # if step > 9:break
    

In [27]:
stacked_labels = stacked_labels.cpu().detach().numpy()
# stacked_preds = stacked_preds.cpu().detach().numpy()
stacked_probs = stacked_probs.cpu().detach().numpy()

In [81]:
precision, recall, thresholds = precision_recall_curve(stacked_labels[:].sum(axis=1)>0, np.max(stacked_probs, axis=1), pos_label=1)
# convert to f score
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(np.nan_to_num(fscore))
threshold = np.round(thresholds[ix], 2)
print('Best Threshold=%.2f, F-Score=%.2f' % (threshold, fscore[ix]))

Best Threshold=0.17, F-Score=0.56


  This is separate from the ipykernel package so we can avoid doing imports until


In [76]:
np.round(thresholds[ix], 2)

0.17

In [72]:
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix
# threshold = 0.2
stacked_preds = (stacked_probs > threshold).astype(int)
y_true = (stacked_labels[:].sum(axis=1)>0) 
y_pred = (stacked_preds[:].sum(axis=1)>0)
a = y_true  == y_pred
accuracy = np.round(a.sum() / len(a) , 2)
print(accuracy)
print(np.round(accuracy_score(y_true, y_pred), 2))

f1_score_ = np.round(f1_score(y_true, y_pred, pos_label=1, average='binary'), 2)
print(f'F1 is ', f1_score_)

recall_score_ = np.round(recall_score(y_true, y_pred, pos_label=1, average='binary'), 2)
print(f'Recall is ', recall_score_)

precision_score_ = np.round(precision_score(y_true, y_pred, pos_label=1, average='binary'), 2)
print(f'Precision is ', precision_score_)


tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
specificity =  np.round(tn / (tn + fp), 2)
print(f'Specificity is ', specificity)


0.5
0.5
F1 is  0.55
Recall is  0.85
Precision is  0.41
Specificity is  0.29


In [184]:
# trained model
a = (stacked_labels[:].sum(axis=1)>0) == (stacked_preds[:].sum(axis=1)>0)
print(a.sum())
print(len(a))
print(a.sum() / len(a))

960
1755
0.5470085470085471


In [185]:
# results for 24, 24h prediction
print(classification_report(stacked_labels, stacked_preds, zero_division=0))

              precision    recall  f1-score   support

           0       0.36      0.57      0.44       472
           1       0.31      0.70      0.43       449

   micro avg       0.33      0.63      0.44       921
   macro avg       0.34      0.63      0.44       921
weighted avg       0.34      0.63      0.44       921
 samples avg       0.19      0.22      0.20       921



In [186]:
np.round(np.sum((stacked_labels.sum(axis=1)>0)==(stacked_preds.sum(axis=1)>0)) / len(stacked_labels), decimals=2)

0.55

In [187]:
# results for 48h prediction
# 0.2
print(classification_report(stacked_labels[:].sum(axis=1)>0, stacked_preds[:].sum(axis=1)>0))

              precision    recall  f1-score   support

       False       0.72      0.46      0.56      1106
        True       0.43      0.69      0.53       649

    accuracy                           0.55      1755
   macro avg       0.57      0.58      0.55      1755
weighted avg       0.61      0.55      0.55      1755



In [188]:
from sklearn.metrics import confusion_matrix

confusion_matrix(stacked_labels[:].sum(axis=1)>0, stacked_preds[:].sum(axis=1)>0)

array([[510, 596],
       [199, 450]])

In [152]:
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(stacked_labels[:].sum(axis=1)>0, np.max(stacked_probs, axis=1))
# convert to f score
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(np.nan_to_num(fscore))
print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))


Best Threshold=0.190138, F-Score=0.549


In [201]:
(stacked_probs > threshold).astype(int)

array([[1, 1],
       [1, 1],
       [1, 1],
       ...,
       [1, 1],
       [1, 1],
       [1, 1]])

In [202]:
from matplotlib import pyplot
# plot the roc curve for the model
testy = stacked_labels[:].sum(axis=1)>0
no_skill = len(testy[testy==1]) / len(testy)
pyplot.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
pyplot.plot(recall, precision, marker='.', label='Model')
pyplot.scatter(recall[ix], precision[ix], marker='o', color='black', label='Best')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.title('PR Curve')
pyplot.legend()
# show the plot
pyplot.show()

AttributeError: 'Figure' object has no attribute 'plot'

<Figure size 432x288 with 0 Axes>

Randomly initialized model

In [130]:
test_model = EHR_MODEL(vocab_size=vocab_size, max_length=400, device=device, pred_window=2, observing_window=3).to(device)
optimizer = optim.Adam(test_model.parameters(), lr=0.00001)
    

In [131]:
result = evaluate(test_model, test_loader, device='cuda', log_res=False)

RuntimeError: No CUDA GPUs are available

In [114]:
tensor_day, tensor_diags, tensor_labels, idx = next(iter(test_loader))
output = test_model(day_info, tensor_diags)

In [117]:
stacked_labels = torch.cat([tensor_labels, tensor_labels], dim=0, )
stacked_labels.size()

torch.Size([128])

In [119]:
output = output.cpu().detach().numpy()

In [120]:
tensor_labels.ndim

1

In [93]:
# random model
a = (stacked_labels[:].sum(axis=1)>0) == (stacked_preds[:].sum(axis=1)>0)
print(a.sum())
print(len(a))
print(a.sum() / len(a))

1135
1755
0.6467236467236467


In [61]:
print(classification_report(stacked_labels[:].sum(axis=1)>0, stacked_preds[:].sum(axis=1)>0, zero_division=0))

              precision    recall  f1-score   support

       False       0.00      0.00      0.00      1106
        True       0.37      1.00      0.54       649

    accuracy                           0.37      1755
   macro avg       0.18      0.50      0.27      1755
weighted avg       0.14      0.37      0.20      1755



In [78]:
stacked_preds[:15]

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 1.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]], dtype=float32)

In [79]:
stacked_labels[:15]

array([[0., 0.],
       [1., 1.],
       [0., 0.],
       [0., 0.],
       [1., 1.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [0., 0.]])

In [136]:
print(np.sum(stacked_labels))
print(np.sum(stacked_preds))

339.0
172.0


It looks ike the model puts zeroes all the time. Now we are going to check the train dataset to see how balanced it is.

In [276]:
def calculate_class_weights(data_loader):
    i = 0
    for tensor_day, tensor_diags, tensor_labels, idx in data_loader:
        if i == 0:
            labels = np.array([]).reshape(0, tensor_labels.size(-1))
        labels = np.vstack([labels, tensor_labels], )
        i += 1
    train_stacked_labels = labels.T
    n_pos = np.sum(train_stacked_labels, axis=-1)
    n_neg = train_stacked_labels.shape[-1] - n_pos
    weights = np.round(n_neg / n_pos, 2)
    
    return weights

In [299]:
i = 0
for tensor_day, tensor_diags, tensor_labels, idx in test_loader:
    if i == 0:
        labels = np.array([])
    labels = np.hstack([labels, tensor_labels], ) if labels.size else tensor_labels
    i += 1

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 64 and the array at index 1 has size 27

In [296]:
labels.shape

(1755,)

In [294]:
tensor_labels.shape

torch.Size([27])

In [271]:
print(labels.shape)
train_stacked_labels = (labels[:].sum(axis=1)>0).T
n_pos = np.sum(train_stacked_labels, axis=-1)
n_neg = train_stacked_labels.shape[-1] - n_pos
weights = np.round(n_neg / n_pos, 2)
weights

(0, 64)


  """


nan

In [277]:
calculate_class_weights(train_loader)

array([2.32, 2.77, 2.02, 2.07, 2.46, 2.13, 2.07, 2.53, 2.02, 2.39, 1.91,
       2.32, 2.46, 2.53, 2.19, 1.68, 2.53, 2.95, 2.19, 2.32, 2.02, 1.96,
       2.25, 2.32, 2.32, 2.25, 1.96, 2.25, 2.39, 2.02, 2.53, 2.46, 3.05,
       2.69, 2.02, 2.02, 2.46, 2.25, 1.77, 3.15, 1.86, 2.02, 2.95, 1.96,
       2.02, 1.96, 2.32, 1.91, 2.61, 1.96, 1.77, 2.32, 2.69, 2.61, 2.61,
       3.26, 3.05, 2.69, 1.81, 2.95, 1.72, 3.05, 2.13, 2.69])

In [152]:
labels = np.array([]).reshape(0,2)
for tensor_day, tensor_diags, tensor_labels, idx in train_loader:
    labels = np.concatenate([labels, tensor_labels], axis=0, )
    

In [153]:
train_stacked_labels = labels.reshape(2, len(labels))
print(train_stacked_labels.shape)
print('The number of positive examples for the 1st day: {}, for the 2nd day: {}'.format(np.sum(train_stacked_labels, axis=1)[0], np.sum(train_stacked_labels, axis=1)[1]))
print(f'The total number of samples: {len(labels)}')
print('The ratio for the 1st day: {}, the 2nd day: {}'.format(np.round(np.sum(train_stacked_labels, axis=1)[0]/len(labels),2), np.round(np.sum(train_stacked_labels, axis=1)[1]/len(labels),2)))

(2, 10624)
The number of positive examples for the 1st day: 2084.0, for the 2nd day: 1984.0
The total number of samples: 10624
The ratio for the 1st day: 0.2, the 2nd day: 0.19


In [41]:
unique, counts = np.unique(np.where(stacked_labels==1)[1], return_counts=True)
counts_aki_day = dict(zip(unique, counts))
counts_aki_day


{0: 135,
 1: 71,
 2: 40,
 3: 39,
 4: 32,
 5: 22,
 6: 16,
 7: 8,
 8: 10,
 9: 12,
 10: 8,
 11: 7,
 12: 6,
 13: 3}

In [42]:
unique, counts = np.unique(np.where(stacked_preds==1)[1], return_counts=True)
counts_aki_day_preds = dict(zip(unique, counts))
counts_aki_day_preds

{0: 13, 2: 1, 4: 1, 5: 2, 6: 1, 8: 1, 9: 2}

In [37]:
preds_ids = np.c_[np.where(stacked_preds==1)]
labels_ids = np.c_[np.where(stacked_labels==1)]

In [38]:
print(len(preds_ids))
print(len(labels_ids))


21
409


In [39]:
print(classification_report(stacked_labels, stacked_preds, zero_division=0, output_dict=False))

              precision    recall  f1-score   support

           0       0.85      0.48      0.61       316
           1       0.84      0.60      0.70       464
           2       0.82      0.55      0.66       430
           3       0.79      0.53      0.63       400
           4       0.79      0.48      0.60       376
           5       0.54      0.24      0.33       331

   micro avg       0.79      0.49      0.61      2317
   macro avg       0.77      0.48      0.59      2317
weighted avg       0.78      0.49      0.60      2317
 samples avg       0.32      0.28      0.28      2317



In [40]:
stacked_labels[:10]

tensor([[0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [0., 1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [41]:
stacked_preds[:10]

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [4]:
with open(DF_PATH + 'train_df_pretraining.pkl', 'rb') as f:
    pid_train_df = pickle.load(f)
print(f'Train dataset is loaded from <=== {DF_PATH}train_df_pretraining.pkl')
pid_train_df.head()

Train dataset is loaded from <=== /home/svetlana.maslenkova/LSTM/dataframes/train_df_pretraining.pkl


Unnamed: 0,subject_id,hadm_id,day_id,demographics,lab_tests,medications,vitals
1411116,10467237,20000019,0,HISPANIC/LATINO F 76.0,Hematology Blood hematocrit {26.5} %; Hematolo...,PNEUMOcoccal 23-valent polysaccharide vaccine ...,temp {98.0} heartrate {65.0} resprate {16.0} o...
1411117,10467237,20000019,1,HISPANIC/LATINO F 76.0,Hematology Blood hematocrit {28.1} %; Hematolo...,PNEUMOcoccal 23-valent polysaccharide vaccine ...,
1411118,10467237,20000019,2,HISPANIC/LATINO F 76.0,Hematology Blood hematocrit {23.9} %; Hematolo...,PNEUMOcoccal 23-valent polysaccharide vaccine ...,
306556,16925328,20000024,0,WHITE F 92.0,Hematology Blood hematocrit {32.1} %; Hematolo...,OxyCODONE (Immediate Release) {2.5} mg ; Gabap...,temp {98.2} heartrate {53.0} resprate {18.0} o...
306557,16925328,20000024,1,WHITE F 92.0,,Sodium Chloride 0.9% Flush {3} mL ; Heparin {...,


In [5]:
with open('/home/svetlana.maslenkova/LSTM/dataframes/pid_train_df_finetuning.pkl', 'rb') as f:
    pid_train_df_finetuning = pickle.load(f)
print(f'Train dataset is loaded from <=== {DF_PATH}pid_train_df_finetuning.pkl')
pid_train_df_finetuning.head()


Train dataset is loaded from <=== /home/svetlana.maslenkova/LSTM/dataframes/pid_train_df_finetuning.pkl


Unnamed: 0,subject_id,hadm_id,demographics_in_visit,lab_tests_in_visit,medications_in_visit,vitals_in_visit,days_in_visit,aki_status_in_visit,days
3,16679562,20001395,"[HISPANIC/LATINO M 73.0, HISPANIC/LATINO M 73....",[Hematology Blood hematocrit {51.2} %; Hematol...,[Influenza Vaccine Quadrivalent {0.5} mL ; Bis...,[temp {} heartrate {80.0} resprate {16.0} o2sa...,[HISPANIC/LATINO M 73.0$temp {} heartrate {80....,"[0, 0, 0, 1, 1, 0, 0, 1, 0]","[0, 1, 2, 3, 4, 5, 6, 7, 8]"
5,10189736,20001789,"[WHITE M 71.0, WHITE M 71.0, WHITE M 71.0, WHI...",[Hematology Blood hematocrit {29.4} %; Hematol...,[PNEUMOcoccal 23-valent polysaccharide vaccine...,[temp {98.7} heartrate {66.0} resprate {18.0} ...,[WHITE M 71.0$temp {98.7} heartrate {66.0} res...,"[0, 0, 0, 0, nan]","[0, 1, 2, 3, 4]"
9,13390157,20002497,"[HISPANIC/LATINO M 44.0, HISPANIC/LATINO M 44....","[nan, Hematology Blood hematocrit {41.4} %; He...","[Metoprolol Succinate XL {100} mg ; , Bag {1} ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",[HISPANIC/LATINO M 44.0$nan$nan$Metoprolol Suc...,"[nan, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, nan]","[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]"
10,10995547,20003740,"[WHITE M 69.0, WHITE M 69.0, WHITE M 69.0, WHI...","[nan, Hematology Blood hematocrit {30.9} %; He...",[Lansoprazole Oral Disintegrating Tab {30} mg ...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",[WHITE M 69.0$nan$nan$Lansoprazole Oral Disint...,"[nan, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0...","[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,..."
11,19657904,20004357,"[BLACK/AFRICAN AMERICAN F 77.0, BLACK/AFRICAN ...","[nan, Hematology Blood hematocrit {28.7} %; He...",[Aspirin {325} mg ; Amlodipine {5} mg ; Isosor...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan]",[BLACK/AFRICAN AMERICAN F 77.0$nan$nan$Aspirin...,"[nan, 0, 0, 1, 1, 0, nan, 0, nan]","[-1, 0, 1, 2, 3, 4, 5, 6, 7]"


In [303]:
class EHR_PRETRAINING(nn.Module):
    def __init__(self, max_length, vocab_size, device, pred_window=2, observing_window=3,  H=128, embedding_size=200, drop=0.6):
        super(EHR_PRETRAINING, self).__init__()

        self.observing_window = observing_window
        self.pred_window = pred_window
        self.H = H
        self.max_length = max_length
        self.max_length_diags = 30
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.device = device
        self.drop = drop

        # self.embedding = pretrained_model
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)

        self.lstm_day = nn.LSTM(input_size=embedding_size,
                            hidden_size=self.H,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)

        self.fc_day = nn.Linear(self.max_length * 2 * self.H, 2048)

        self.fc_adm = nn.Linear(2048*self.observing_window +  self.max_length_diags * 2 * self.H, 2048)

        self.lstm_adm = nn.LSTM(input_size=2048,
                            hidden_size=self.H,
                            num_layers=2,
                            batch_first=True,
                            bidirectional=True)

        self.drop = nn.Dropout(p=drop)

        # self.fc_2 = nn.Linear(self.H*2, 2)
        self.projection = nn.Sequential(
            nn.ReLU(),
            nn.Linear(in_features=self.H*2, out_features=256)
        )

    def forward(self, tensor_day, tensor_diagnoses):

        batch_size = tensor_day.size()[0]

        # full_output = torch.tensor([]).to(device=self.device)
        out_emb_diags = self.embedding(tensor_diagnoses.squeeze(1))
        out_lstm_diags, _ = self.lstm_day(out_emb_diags)
        full_output = out_lstm_diags.reshape(batch_size, self.max_length_diags * 2 * self.H)
        

        for d in range(self.observing_window):
            # embedding layer applied to all tensors [16,400,200]
            out_emb = self.embedding(tensor_day[:, d, :].squeeze(1))
            # print('out_emb', out_emb.size())

            # lstm layer applied to embedded tensors
            output_lstm_day= self.drop(self.fc_day(\
                                    self.lstm_day(out_emb)[0]\
                                        .reshape(batch_size, self.max_length * 2 * self.H)))

            # print('output_lstm_day', output_lstm_day.size())                   
            # concatenate for all * days
            full_output = torch.cat([full_output, output_lstm_day], dim=1) # [16, 768]

        # print('full_output size: ', full_output.size(), '\n')
        output = self.fc_adm(full_output)
        # print('output after fc_adm size: ', output.size(), '\n')
        output_vector, _ = self.lstm_adm(output)
        
        # the fisrt transformation
        output_vector_X = self.drop(output_vector)
        projection_X = self.projection(output_vector_X)
        # the second transformation
        output_vector_Y = self.drop(output_vector)
        projection_Y = self.projection(output_vector_Y)

        return output_vector_X, projection_X, output_vector_Y, projection_Y

In [304]:
model = EHR_PRETRAINING(max_length=400, vocab_size=vocab_size, device=device)

In [307]:
output_vector_X, projection_X, output_vector_Y, projection_Y = model(tensor_day, tensor_diags)

In [309]:
projection_Y

tensor([[-0.0077, -0.0525, -0.0229,  ..., -0.0229, -0.0504, -0.0723],
        [-0.0111, -0.0547,  0.0023,  ..., -0.0253, -0.0226, -0.0172],
        [-0.0487, -0.0492,  0.0155,  ..., -0.0660, -0.0363, -0.0215],
        ...,
        [ 0.0125, -0.0496, -0.0211,  ...,  0.0148, -0.0285, -0.0610],
        [ 0.0076, -0.0459, -0.0400,  ..., -0.0580, -0.0201, -0.0605],
        [ 0.0199, -0.0673, -0.0606,  ..., -0.0248, -0.0150, -0.0662]],
       grad_fn=<AddmmBackward0>)