In [11]:
# IMPORTS
from distutils.command.config import config
import pickle5 as pickle

# Libraries
import matplotlib.pyplot as plt
import pandas as pd
import wandb
import os
from os.path import exists

# Evaluation
from sklearn.metrics import classification_report, precision_recall_curve, auc, roc_auc_score, \
    accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

import wandb

# Tokenization
from tokenizers import  Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Punctuation, Whitespace
from tokenizers.normalizers import Lowercase
from tokenizers import pre_tokenizers, normalizers
from tokenizers.processors import BertProcessing
import glob

# data
import numpy as np
import matplotlib.pyplot as plt

#torch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from torch.utils.data import WeightedRandomSampler
from transformers import AutoTokenizer, AutoModel

In [2]:
# Save and Load Functions
def save_checkpoint(save_path, model, optimizer, valid_loss):

    if save_path == None:
        return
    
    state_dict = {'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'valid_loss': valid_loss}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_checkpoint(load_path, model, optimizer, device):

    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    model.load_state_dict(state_dict['model_state_dict'])
    optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    
    return state_dict['valid_loss']


def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

In [3]:
small_dataset = True
CURR_PATH = os.getcwd()
PKL_PATH = CURR_PATH+'/pickles/'
DF_PATH = '/home/svetlanamaslenkova/Documents/AKI_deep/LSTM/dataframes/'

# loading the data
with open(DF_PATH + 'pid_train_df_finetuning.pkl', 'rb') as f:
    pid_train_df = pickle.load(f)

with open(DF_PATH + 'pid_val_df_finetuning.pkl', 'rb') as f:
    pid_val_df = pickle.load(f)

with open(DF_PATH + 'pid_test_df_finetuning.pkl', 'rb') as f:
    pid_test_df = pickle.load(f)

In [4]:
#paths
diagnoses = 'icd'
min_frequency = 10

destination_folder = '/l/users/svetlana.maslenkova/models' + '/pretraining/three_stages/'
# destination_folder = '/home/svetlanamaslenkova/Documents/AKI_deep/training/'

if diagnoses=='icd':
        TOKENIZER_PATH = CURR_PATH  + '/tokenizer.json'
        TXT_DIR_TRAIN = CURR_PATH + '/txt_files/train'
elif diagnoses=='titles':
        TOKENIZER_PATH = CURR_PATH + '/tokenizer_titles.json'
        TXT_DIR_TRAIN = CURR_PATH + '/txt_files/titles_diags'

# Training the tokenizer
if exists(TOKENIZER_PATH):
        tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
        print(f'Tokenizer is loaded from ==> {TOKENIZER_PATH}/tokenizer.json. Vocab size is {tokenizer.get_vocab_size()}')
else:
        print('Training tokenizer...')
        os.environ["TOKENIZERS_PARALLELISM"] = "true"
        tokenizer = Tokenizer(BPE(unk_token="UNK"))
        tokenizer.normalizer = normalizers.Sequence([Lowercase()])
        # tokenizer.pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=False), Punctuation( behavior = 'removed')])
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Punctuation(behavior = 'isolated')])

        trainer = BpeTrainer(special_tokens=["<s>", "</s>", "PAD", "UNK", "$"], min_frequency=min_frequency)

        files = glob.glob(TXT_DIR_TRAIN+'/*')
        tokenizer.train(files, trainer)
        tokenizer.post_processor = BertProcessing(
                ("</s>", tokenizer.token_to_id("</s>")),
                ("<s>", tokenizer.token_to_id("<s>")), 
                )
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        print(f'Vocab size is {tokenizer.get_vocab_size()}')

Training tokenizer...



Vocab size is 5


In [5]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_length_day=400, diags='icd', pred_window=2, observing_window=2):
        self.df = dataframe
        self.tokenizer = tokenizer
        self.observing_window = observing_window
        self.pred_window = pred_window
        self.max_length_day = max_length_day
        self.diags = diags

        if self.diags == 'titles':
            self.max_length_diags = 400
        else:
            self.max_length_diags = 30
        
        self.max_len = 512

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        hadm_id = self.df.hadm_id.values[idx]
        if self.diags == 'titles':
            diagnoses_info = self.df.previous_diags_titles.values[idx][0]
        else:
            diagnoses_info = self.df.previous_diags_icd.values[idx][0]
        
        day_info = self.df.days_in_visit.values[idx]
        days = self.df.days.values[idx]
        AKI_2_status = self.df.AKI_2_in_visit.values[idx]
        AKI_3_status = self.df.AKI_3_in_visit.values[idx]
        day_info_list = []
        AKI_2_labels = []
        AKI_3_labels = []

        for day in range(0, self.observing_window + self.pred_window):

            if day not in days:
                day_info_list.append('')
                AKI_2_labels.append(0)
                AKI_3_labels.append(0)
            else:
                i = days.index(day)

                if np.isfinite(AKI_2_status[i]):                    
                    AKI_2_labels.append(AKI_2_status[i])
                else:
                    AKI_2_labels.append(0)

                if np.isfinite(AKI_3_status[i]):                    
                    AKI_3_labels.append(AKI_3_status[i])
                else:
                    AKI_3_labels.append(0)

                if (str(day_info[i]) == 'nan') or (day_info[i] == np.nan):
                    day_info_list.append('')
                else:
                    day_info_list.append(day_info[i])
        # diagnoses
        if (str(diagnoses_info) == 'nan') or (diagnoses_info == np.nan):
            diagnoses_info = '' + '$'
        else:
            diagnoses_info = diagnoses_info + '$'

        if sum(AKI_3_labels[-self.pred_window:]) > 0:
            AKI_2 = 1
            AKI_3 = 1
        elif sum(AKI_2_labels[-self.pred_window:]) > 0:
            AKI_2 = 1
            AKI_3 = 0
        else:
            AKI_2 = 0
            AKI_3 = 0
        
        self.text = ' '.join([*[diagnoses_info], *day_info_list[:self.observing_window]]).lower()

        inputs = self.tokenizer.encode_plus(
            self.text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'labels': torch.tensor([AKI_2, AKI_3], dtype=torch.float64)
        }

In [16]:
from transformers import AutoTokenizer, AutoModel

device = 'cpu'
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT").to(device)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
BATCH_SIZE = 4
frac = 1
device = 'cuda'

train_dataset = MyDataset(pid_train_df.sample(frac=frac), tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = MyDataset(pid_val_df.sample(frac=frac), tokenizer=tokenizer)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = MyDataset(pid_test_df.sample(frac=frac), tokenizer=tokenizer)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [7]:
out = next(iter(train_loader))

ids = out['ids'].to(device, dtype = torch.long)
mask = out['mask'].to(device, dtype = torch.long)
token_type_ids = out['token_type_ids'].to(device, dtype = torch.long)
labels = out['labels'].to(device, dtype = torch.float)

In [87]:
torch.cuda.empty_cache()

In [12]:
output = model(ids, attention_mask = mask, token_type_ids = token_type_ids)

In [24]:
output[0].size()

torch.Size([7, 512, 768])

In [9]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        self.l2 = torch.nn.Dropout(0.3)
        self.lstm = nn.LSTM(input_size=768,
                            hidden_size=128,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        self.l3 = torch.nn.Linear(256, 2)
    
    def forward(self, ids, mask, token_type_ids):
        output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output = self.l2(output[1]).unsqueeze(1)
        output, _ = self.lstm(output)
        output = self.l3(output).squeeze(1)
        return output


In [12]:
ft_model = BERTClass().to(device)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [30]:
outputs = ft_model(ids, mask, token_type_ids)

In [13]:
def train(model, 
        optimizer,
        train_loader,
        valid_loader,
        file_path,
        device='cpu',
        num_epochs=5,
        criterion = nn.BCELoss(),
        pos_weight = torch.tensor([]),
        best_valid_loss = float("Inf"),
        dimension=128,
        epoch_patience=15,
        threshold=None,
        scheduler=None):

    # initialize running values
    running_loss = 0.0
    running_acc = 0.0
    valid_running_loss = 0.0
    valid_running_acc = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    train_acc_list = []
    valid_acc_list = []
    global_steps_list = []
    stop_training = 0

    sigmoid_fn = nn.Sigmoid()

    if criterion == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss()
        criterion.pos_weight = pos_weight.to(device)
        use_sigmoid=False
    else:
        criterion = nn.BCELoss()
        use_sigmoid = True

    # training loop
    for epoch in range(num_epochs):  
        stacked_labels = torch.tensor([]).to(device)
        stacked_preds = torch.tensor([]).to(device)

        model.train()
        for data in train_loader:
            # transferring everything to GPU
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            labels = data['labels'].to(device, dtype = torch.float)

            output = model(ids, mask, token_type_ids)

            if use_sigmoid:
                loss = criterion(sigmoid_fn(output), labels.type(torch.float32))
            else:
                loss = criterion(output, labels.type(torch.float32))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()            
            global_step += 1
            wandb.log({'step_train_loss': loss.item(), 'global_step': global_step})
            
        # calculate accuracy
        epoch_train_accuracy = torch.round(torch.sum(stacked_labels==stacked_preds) / len(stacked_labels), decimals=2)
        if scheduler is not None:
            scheduler.step()
            print(f'Learning rate is {get_lr(optimizer)}')

        model.eval()
        stacked_labels = torch.tensor([]).to(device)
        stacked_probs = torch.tensor([]).to(device)
        with torch.no_grad():
            # validation loop
            for data in valid_loader:
                ids = data['ids'].to(device, dtype = torch.long)
                mask = data['mask'].to(device, dtype = torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
                labels = data['labels'].to(device, dtype = torch.float)
                
                output = model(ids, mask, token_type_ids)

                if use_sigmoid:
                    loss = criterion(sigmoid_fn(output), labels.type(torch.float32))
                else:
                    loss = criterion(output, labels.type(torch.float32))

                valid_running_loss += loss.item()
                probs = sigmoid_fn(output)

                # stacking labels and predictions
                stacked_labels = torch.cat([stacked_labels, labels], dim=0)
                stacked_probs = torch.cat([stacked_probs, probs], dim=0, )

        # transfer to device
        stacked_labels = stacked_labels.cpu().detach().numpy()
        stacked_probs = stacked_probs.cpu().detach().numpy()
        # valid loss
        epoch_average_train_loss = running_loss / len(train_loader)  
        epoch_average_valid_loss = valid_running_loss / len(valid_loader)

        train_loss_list.append(epoch_average_train_loss)
        valid_loss_list.append(epoch_average_valid_loss)
        stages = ['AKI 2', 'AKI 2,3']
        for w in range(stacked_labels.ndim):
            stage = stages[w]
            precision, recall, thresholds = precision_recall_curve(stacked_labels.T[w], stacked_probs.T[w])
            precision, recall, thresholds = np.round(precision, 2), np.round(recall,2), np.round(thresholds,2)
            
            # convert to f score
            fscore = np.round((2 * precision * recall) / (precision + recall), 2)
            # locate the index of the largest f score
            ix = np.argmax(np.nan_to_num(fscore))
            threshold = np.round(thresholds[ix], 2)
            stacked_preds = (stacked_probs.T[w] > threshold).astype(int)
            y_true = stacked_labels.T[0]
            y_pred = stacked_preds
            f1_score_ = np.round(f1_score(y_true, y_pred, pos_label=1, average='binary', zero_division=0), 2)
            recall_score_ = np.round(recall_score(y_true, y_pred, pos_label=1, average='binary', zero_division=0), 2)
            tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
            specificity =  np.round(tn / (tn + fp), 2)
            pr_auc = np.round(auc(recall, precision), 2)
            wandb.log({'val_f1_score_' + stage: f1_score_, 'val_recall_score_'+stage:recall_score_, \
                        'val_specificity'+stage:specificity, 'val_pr_auc'+stage:pr_auc,\
                            'epoch': epoch+1})

        global_steps_list.append(global_step)
        wandb.log({'epoch_average_train_loss': epoch_average_train_loss,
                    'epoch_average_valid_loss': epoch_average_valid_loss,
                    'epoch': epoch+1})

        # resetting running values
        running_loss = 0.0                
        valid_running_loss = 0.0
        
        # print progress
        print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                        epoch_average_train_loss, epoch_average_valid_loss))      

        # checkpoint
        if best_valid_loss > epoch_average_valid_loss:
            best_valid_loss = epoch_average_valid_loss
            save_checkpoint(file_path + '/model.pt', model, optimizer, best_valid_loss)
            stop_training = 0
        else:
            stop_training +=1
        
        if stop_training == epoch_patience:
            break


# save_metrics(file_path + '/metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')

In [17]:
LR = 0.00001
file_path = '/home/svetlanamaslenkova/Documents/AKI_deep/LSTM/training/test_model'
optimizer = optim.Adam(ft_model.parameters(), lr=LR)

NameError: name 'ft_model' is not defined

In [41]:
file_path

'/home/svetlanamaslenkova/Documents/AKI_deep/LSTM/training/test_model'

In [26]:
train_params = {'model':ft_model, 
                'optimizer':optimizer,
                'train_loader':train_loader,
                'valid_loader':val_loader,
                'file_path':file_path,
                'device':device,
                'num_epochs':1,
                'criterion':nn.BCELoss(),
                'pos_weight' : torch.tensor([]),
                'best_valid_loss' : float("Inf"),
                'dimension':128,
                'epoch_patience':15,
                'threshold':None,
                'scheduler':None  
}

In [43]:
wandb.init(project='test', name='test_bert', mode='online')
train(**train_params)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmaslenkovas[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  recall = tps / tps[-1]
  recall = tps / tps[-1]


ValueError: not enough values to unpack (expected 4, got 1)

In [31]:
stacked_labels = torch.tensor([]).to(device)
stacked_probs = torch.tensor([]).to(device)
for data in val_loader:
    ids = data['ids'].to(device, dtype = torch.long)
    mask = data['mask'].to(device, dtype = torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
    labels = data['labels'].to(device, dtype = torch.float)
    
    output = ft_model(ids, mask, token_type_ids)

    probs = nn.Sigmoid()(output)

    # stacking labels and predictions
    stacked_labels = torch.cat([stacked_labels, labels], dim=0)
    stacked_probs = torch.cat([stacked_probs, probs], dim=0, )

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 23.64 GiB total capacity; 21.90 GiB already allocated; 31.81 MiB free; 22.00 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [48]:
# transfer to device
stacked_labels = stacked_labels.cpu().detach().numpy()
stacked_probs = stacked_probs.cpu().detach().numpy()

stages = ['AKI 2', 'AKI 2,3']
for w in range(stacked_labels.ndim):
    stage = stages[w]
    precision, recall, thresholds = precision_recall_curve(stacked_labels.T[w], stacked_probs.T[w])
    precision, recall, thresholds = np.round(precision, 2), np.round(recall,2), np.round(thresholds,2)
    
    # convert to f score
    fscore = np.round((2 * precision * recall) / (precision + recall), 2)
    # locate the index of the largest f score
    ix = np.argmax(np.nan_to_num(fscore))
    threshold = np.round(thresholds[ix], 2)
    stacked_preds = (stacked_probs.T[w] > threshold).astype(int)
    y_true = stacked_labels.T[0]
    y_pred = stacked_preds
    f1_score_ = np.round(f1_score(y_true, y_pred, pos_label=1, average='binary', zero_division=0), 2)
    recall_score_ = np.round(recall_score(y_true, y_pred, pos_label=1, average='binary', zero_division=0), 2)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity =  np.round(tn / (tn + fp), 2)
    pr_auc = np.round(auc(recall, precision), 2)


  recall = tps / tps[-1]
  recall = tps / tps[-1]


ValueError: not enough values to unpack (expected 4, got 1)