In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import  LabelEncoder
from tqdm.auto import tqdm
import random
import ast
import os
import dill
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoConfig,AutoModel
import json
import torch
import torch.nn.functional as F
from ast import literal_eval
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import optim
from sklearn.model_selection import train_test_split
import torch.nn as nn
import time

In [2]:
# Seed the three RNG sources the notebook relies on (PyTorch, Python's `random`, NumPy)
# so that repeated runs start from the same random state.
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

def seed_worker(worker_id):
    """Seed NumPy and Python's `random` inside a DataLoader worker.

    Meant to be passed as `worker_init_fn` to `torch.utils.data.DataLoader`.

    Args:
        worker_id: Index of the worker, supplied by the DataLoader (unused).
    """
    # Fold torch's 64-bit seed into the 32-bit range accepted by np.random.seed.
    worker_seed = torch.initial_seed() % 2**32
    # NumPy is imported as `np` in this notebook; `numpy.random.seed` raised NameError.
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# Dedicated, explicitly seeded generator; it is passed as `generator=g` to the DataLoaders below.
g = torch.Generator()
g.manual_seed(0)

<torch._C.Generator at 0x7fb2b8655cd0>

# Load dataframes

##### Let's load all the datasets from the competition:

In [3]:
# Root folder of the input datasets; every input path below is built by prefixing it.
DIR = '/kaggle/input/'

In [4]:
# Load the five competition CSV files into DataFrames.
features = pd.read_csv(DIR+"nbme-score-clinical-patient-notes/features.csv")
patient_notes = pd.read_csv(DIR+"nbme-score-clinical-patient-notes/patient_notes.csv")
test = pd.read_csv(DIR+"nbme-score-clinical-patient-notes/test.csv")
train= pd.read_csv(DIR+"nbme-score-clinical-patient-notes/train.csv")
sample_submission= pd.read_csv(DIR+"nbme-score-clinical-patient-notes/sample_submission.csv")


##### The features.csv file contain all the features:

In [5]:
features.head()

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


##### The patient_notes.csv file contains all the patient history notes:

In [6]:
patient_notes.head()

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


As we can see, the training set contains the annotation feature, which holds the strings of medical features detected in the patient history, and the corresponding location, which is the target that our model aims to detect. The id feature is simply the result of concatenating the case_num, pn_num and feature_num values together.


In [7]:
train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724']
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693']
2,00016_002,0,16,2,['chest pressure'],['203 217']
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']"
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258']


The test.csv dataset contains only information about the case number, the patient number and the feature number. The feature number will have to be identified in the corresponding text.


In [8]:
test.head()

Unnamed: 0,id,case_num,pn_num,feature_num
0,00016_000,0,16,0
1,00016_001,0,16,1
2,00016_002,0,16,2
3,00016_003,0,16,3
4,00016_004,0,16,4


##### This is an example of the format of the submission on Kaggle:

In [9]:
sample_submission.head()

Unnamed: 0,id,location
0,00016_000,0 100
1,00016_001,
2,00016_002,200 250;300 400
3,00016_003,
4,00016_004,75 110


##### Merging the train and test datasets with the patient notes and features datasets:

In [10]:
# Attach the note text (pn_history) and the feature description (feature_text) to every row.
# `merge` defaults to an inner join, so rows without a matching note or feature are dropped.
# NOTE: `train` and `test` are overwritten in place, so re-running this cell merges a second time.
train = train.merge(patient_notes,on=['case_num','pn_num']).merge(features,on=['case_num','feature_num'])
test = test.merge(patient_notes,on=['case_num','pn_num']).merge(features,on=['case_num','feature_num'])

In [11]:
train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,pn_history,feature_text
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],HPI: 17yo M presents with palpitations. Patien...,Family-history-of-MI-OR-Family-history-of-myoc...
1,00041_000,0,41,0,[],[],17 Y/O M CAME TO THE CLINIC C/O HEART POUNDING...,Family-history-of-MI-OR-Family-history-of-myoc...
2,00046_000,0,46,0,['father: heart attack'],['824 844'],Mr. Cleveland is a 17yo M who was consented by...,Family-history-of-MI-OR-Family-history-of-myoc...
3,00082_000,0,82,0,['Father MI'],['622 631'],17 yo M w/ no cardiac or arrhythmia PMH presen...,Family-history-of-MI-OR-Family-history-of-myoc...
4,00100_000,0,100,0,['Dad-MI'],['735 741'],HPI: Dillon Cleveland is an otherwise healthy ...,Family-history-of-MI-OR-Family-history-of-myoc...


In [12]:
test.head()

Unnamed: 0,id,case_num,pn_num,feature_num,pn_history,feature_text
0,00016_000,0,16,0,HPI: 17yo M presents with palpitations. Patien...,Family-history-of-MI-OR-Family-history-of-myoc...
1,00016_001,0,16,1,HPI: 17yo M presents with palpitations. Patien...,Family-history-of-thyroid-disorder
2,00016_002,0,16,2,HPI: 17yo M presents with palpitations. Patien...,Chest-pressure
3,00016_003,0,16,3,HPI: 17yo M presents with palpitations. Patien...,Intermittent-symptoms
4,00016_004,0,16,4,HPI: 17yo M presents with palpitations. Patien...,Lightheaded


# Hyperparameters

##### For the choice of the epochs, lr and batch size we followed the guidance of the BERT paper and tried the following combinations: 
-  ##### Batch size: 16, 32
-  ##### Learning rate (Adam): 5e-5, 3e-5, 2e-5
-  ##### Number of epochs: 2, 3, 4

The rules of the competition do not allow internet access during the final submission. Thus, when this code runs on Kaggle, the BERT model is loaded from a Kaggle dataset to which it was previously uploaded, and so the model_name in the hyperparameters is set to: '../input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased'.
Otherwise, when internet can be used the model_name is simply 'bert-base-uncased'. 


In [13]:
# Central configuration dictionary read by the preprocessing, model and training cells.
hyperparameters = {
    'max_length': 512,  # fixed token length every note is padded/truncated to
    'padding': 'max_length',
    'return_offsets_mapping': True,
    'truncation': 'only_second',
    # NOTE(review): the encode_plus calls below hard-code padding/truncation/offset options
    # instead of reading the three entries above - confirm which values are intended.
    'model_name': '../input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased',
    'dropout': 0.1,  # dropout probability applied before the classification layer
    'lr': [5e-5, 3e-5, 2e-5],  # learning rates tried by the training cell
    'val_size': 0.2,  # fraction of the data held out when VAL is True
    'seed': 999,  # random_state of the train/validation split
    'batch_size': [16,32],  # batch sizes tried by the training cell
    'epochs': [2,3,4]  # epoch counts tried by the training cell
}

# BERT Tokenizer

In [14]:
# Folder holding previously generated artifacts (tokenizer, label encoder, arrays, weights).
# When it exists, the cells below load from it instead of recomputing.
DATA_PATH = DIR + 'nbmemodel'
DATA_EXISTS = os.path.exists(DATA_PATH)

I fine-tuned the bert-base-uncased model. The BERT model requires a specific input format, so we first use the BERT tokenizer to produce it through the AutoTokenizer class. Quoting the Hugging Face tutorial: "AutoClasses are here so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary, that will directly create a class of the relevant architecture".

In [15]:
# Reuse the cached tokenizer/config when the artifact folder is available; otherwise
# build them from the base checkpoint and cache them for later runs.
# NOTE(review): `normalization=True` is forwarded as an extra tokenizer kwarg - confirm
# that the BERT tokenizer actually uses it.
if DATA_EXISTS:
    tokenizer = AutoTokenizer.from_pretrained(DATA_PATH+"/my_tokenizer/",normalization=True)
    config = AutoConfig.from_pretrained(DATA_PATH+"/my_tokenizer/config.json")
else:
    tokenizer = AutoTokenizer.from_pretrained(hyperparameters['model_name'],normalization=True)
    config = AutoConfig.from_pretrained(hyperparameters['model_name'])
    # Save into the working directory, like the other generated artifacts
    # (label_encoder.dill, *.npy, *.pth), instead of the filesystem root '/my_tokenizer'.
    tokenizer.save_pretrained('my_tokenizer')
    config.save_pretrained('my_tokenizer')

# Label Encoding

##### There are 144 unique classes: the 143 medical features plus one extra "empty" class (-1). As we can see, however, the feature numbers are not contiguous.

In [16]:
# Sentinel class meaning "this token belongs to no feature"; it is placed first in the class list.
EMPTY =  -1
CLASSES = [EMPTY,]+features.feature_num.unique().tolist()
print("Unique classes: ", CLASSES)
print("\nNumber of unique classes: ", len(CLASSES))

Unique classes:  [-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 700, 701, 702, 703, 704, 705, 706, 707, 708, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916]

Number of unique classes:  144


Each feature number is encoded into unique labels that go from 1 to 143 plus the label 0 that signals the "empty feature". This will be used when the patient_history text will be tokenized and the scope of the model will be to classify each token, if the label is 0 it means that the corresponding token is not part of any medical feature while if the label is a number between 1-143 that token is a part of the medical feature. 

In [17]:
if DATA_EXISTS:
    # NOTE(review): dill, like pickle, can execute arbitrary code while loading;
    # only load encoder files that come from a trusted source.
    with open(DATA_PATH+"/label_encoder.dill",'rb') as encoder_file:
        label_encoder = dill.load(encoder_file)
else:
    # Fit a fresh encoder that maps every class to a contiguous integer label.
    label_encoder = LabelEncoder()
    label_encoder.fit(CLASSES)
    # The context manager guarantees the file is flushed and closed after dumping.
    with open('label_encoder.dill','wb') as encoder_file:
        dill.dump(label_encoder, encoder_file)

# Encoded label of the feature each row asks about.
train['TARGET']= label_encoder.transform(train['feature_num'])
test['TARGET']= label_encoder.transform(test['feature_num'])
N_CLASSES = len(label_encoder.classes_)
# Encoded value of the "empty" class, used as the default token label.
EMPTY_IDX = label_encoder.transform([EMPTY,])[0]
print(f"Empty label: {EMPTY_IDX}")

Empty label: 0


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


# Auxiliary functions

In [18]:
def decode_location(locations):
    """Parse the string form of a location list into sorted (start, end) integer tuples.

    Brackets and quotes are stripped, commas are treated like semicolons, and
    every remaining non-empty "start end" chunk becomes one tuple. The result
    is ordered by start position.
    """
    # One pass: delete [ ] ' and turn every ',' into ';'.
    normalised = locations.translate(str.maketrans(",", ";", "[]'"))
    spans = []
    for chunk in normalised.split(";"):
        if not chunk:
            continue
        begin, finish = chunk.split()
        spans.append((int(begin), int(finish)))
    spans.sort(key=lambda span: span[0])
    return spans

def decode_metrics(locations):
    """Expand a location list literal into the unique character indices it covers.

    Args:
        locations: String holding a Python list literal of "start end" items;
            one item may contain several spans separated by ';'.

    Returns:
        A sorted 1-D integer array of the distinct indices in
        [start, end) over all spans; an empty array when there are no spans.

    Raises:
        ValueError: If a span is not made of exactly two integers.
    """
    covered = []
    for location in ast.literal_eval(locations):
        if location:
            for loc in location.split(';'):
                x, y = loc.split()
                covered.append(np.arange(int(x), int(y)))
    # Handle "no spans" explicitly instead of relying on a bare `except: pass`
    # around np.concatenate, which would also hide unrelated failures.
    if not covered:
        return np.array([], dtype=int)
    # np.unique de-duplicates and returns the indices in a deterministic (sorted) order.
    return np.unique(np.concatenate(covered))

def decode_position(pos):
    """Render a list of (start, end) spans in the submission format: "s1 e1;s2 e2;..."."""
    rendered = []
    for span in pos:
        as_text = np.array(span).astype(str)
        rendered.append(" ".join(as_text))
    return ";".join(rendered)


def translate(preds,targets_to_row_ids,offsets):
    """Turn per-token label predictions into character spans for every requested feature.

    For each note, every maximal run of consecutive tokens predicted with one of
    the note's target labels becomes a (start, end) character span, using the
    tokenizer offsets of the real tokens inside the run.

    Args:
        preds: Sequence of per-note label sequences (one predicted label per token).
        targets_to_row_ids: Per note, a dict mapping a target label to the row id
            that asks for it.
        offsets: Per note, the (char_start, char_end) pair of every token.

    Returns:
        DataFrame with columns "id" and "location", one row per requested id;
        "location" is in the submission format and empty when nothing was found.
    """
    all_ids = []
    all_pos = []

    for k in range(len(preds)):
        offset = offsets[k]
        pred = preds[k]

        targets_to_ids = targets_to_row_ids[k]
        prediction = {targets_to_ids[t]:[] for t in targets_to_ids}
        # Bound the scan by the data itself rather than by a global constant.
        n_tokens = min(len(pred), len(offset))
        i = 0
        while i < n_tokens:
            label = pred[i]

            # Skip "empty" tokens and labels that were not requested for this note.
            if label == EMPTY_IDX or label not in targets_to_ids:
                i += 1
                continue

            key = targets_to_ids[label]
            start, end = None, None
            # Consume the whole run of tokens carrying this label.
            while i < n_tokens and pred[i] == label:
                tok_start, tok_end = offset[i]
                # Special and padding tokens carry a zero-width offset; they must
                # not move the span boundaries. Previously a run ending on such a
                # token produced end == 0, which dropped the span and aborted the
                # rest of the note.
                if tok_end > tok_start:
                    if start is None:
                        start = tok_start
                    end = tok_end
                i += 1
            if start is not None:
                prediction[key].append((start,end))

        for key in prediction:
            all_ids.append(key)
            all_pos.append(decode_position(prediction[key]))
    df = pd.DataFrame({
        "id":all_ids,
        "location": all_pos
    })
    return df


# Preparing the inputs for the model

In the following cell we use the encode_plus function from the Hugging Face transformers library. This takes in input a text, in this case the patient history and it tokenizes it. Therefore, each word is mapped to a unique index that corresponds to a word in the tokenizer vocabulary. Of course, not all words are present in the Bert vocabulary so some words are broken into subwords. Moreover, there are some special characters that are added: [SEP] and [CLS]. The [SEP] token is added to indicate the end of a sentence and the other one is a special classification token. Bert has also two additional constraints: each sentence must be padded or truncated to a fixed length and the maximum sentence length is 512 tokens. For our task, we use the maximum possible length since there are no reasons to truncate the text and padding will not cause any issues since the dataset is quite small. The [PAD] characters are thus special tokens that indicate that those tokens are not 'expected' tokens but rather tokens added just to align the sequences to the same length. 
The encoder_plus returns two inputs that will be directly used by our model:
- the input_ids vector that is the vector of indexes described before in which each token is mapped to a index in the Bert vocabulary
- the attention masks vector which explicitly differentiate real tokens from [PAD] tokens.

Moreover, by setting the parameter "return_offsets_mapping", this function also returns a vector that specifies the start and end character positions of each token. This is very useful for building the ground truth vector, because it lets us check whether each token is part of an annotation in the patient history. If that is the case, the matching feature is assigned to the ground truth vector entry corresponding to that token; otherwise the entry is assigned zero, the empty feature. This procedure is done for the unique patient history texts, so the ground truth vector has shape (1000, 512), which becomes (1000, 512, 144) once one-hot encoded.

In [19]:
if not DATA_EXISTS:
    sequences, labels, masks,  offsets_list  = [], [], [], []
    row_ids = []
    targets = []

    # One iteration per patient note; all rows of a group share the same pn_history.
    for pn_num, group_df in tqdm(train.groupby('pn_num')):
        pn_history  = group_df.iloc[0].pn_history
        tokens = tokenizer.encode_plus(pn_history, max_length=hyperparameters['max_length'], padding='max_length', truncation=True, return_offsets_mapping=True)
        sequence = tokens['input_ids'] # vocabulary index of every token
        attention_mask = tokens['attention_mask'] # 0 marks a padded position
        offsets = tokens['offset_mapping'] # (char_start, char_end) of every token

        # Every token starts as "empty" and is overwritten when it falls inside an annotation.
        label = np.full(hyperparameters['max_length'], EMPTY_IDX)
        note_targets, note_row_ids = [], []
        label_empty = True

        for index, row in group_df.iterrows():
            target = row.TARGET # encoded feature this row asks about
            note_targets.append(target)
            note_row_ids.append(row.id)

            # Parse the annotation spans once per row instead of once per token.
            spans = decode_location(row.location)
            if not spans:
                continue

            for i, (w_start, w_end) in enumerate(offsets):
                if w_start >= w_end:
                    continue # zero-width offsets belong to special/padding tokens
                # Label the token when it lies completely inside one annotated span.
                if any(start <= w_start and w_end <= end for start, end in spans):
                    label[i] = target
                    label_empty = False

        # Keep every per-note list in lock-step: notes without any labelled token
        # are dropped from all of them. Previously `targets`/`row_ids` were appended
        # for every note, which could misalign targets_to_row_ids with sequences.
        if not label_empty:
            sequences.append(sequence)
            masks.append(attention_mask)
            labels.append(label)
            offsets_list.append(offsets)
            targets.append(note_targets)
            row_ids.append(note_row_ids)

    sequences = np.array(sequences).astype(np.int32)
    masks = np.array(masks).astype(np.uint8)
    labels = F.one_hot(torch.Tensor(np.array(labels)).long(), num_classes=N_CLASSES) # one-hot encoding of the labels
    labels = np.array(labels) # back to a NumPy array
    targets_to_row_ids = [dict(zip(a,b)) for a,b in zip(targets,row_ids)]

    # File objects (not bare names) are passed to np.save so the names are kept
    # exactly as written; the context managers close every handle.
    with open("masks.npy",'wb') as out_file:
        np.save(out_file, masks)
    with open("sequences.npy",'wb') as out_file:
        np.save(out_file, sequences)
    with open("labels.npy",'wb') as out_file:
        np.save(out_file, labels)
    with open("targets_to_row_ids.npy", 'wb') as out_file:
        np.save(out_file, targets_to_row_ids)
    with open("offsets_list", 'wb') as out_file:
        np.save(out_file, offsets_list)
else:
    with open(DATA_PATH+"/masks.npy",'rb') as in_file:
        masks = np.load(in_file)
    with open(DATA_PATH+"/sequences.npy",'rb') as in_file:
        sequences = np.load(in_file)
    with open(DATA_PATH+"/labels.npy",'rb') as in_file:
        labels = np.load(in_file)
    # The mapping is an object array of dicts, hence allow_pickle; only load trusted files.
    with open(DATA_PATH+"/targets_to_row_ids.npy", 'rb') as in_file:
        targets_to_row_ids = np.load(in_file, allow_pickle=True)
    with open(DATA_PATH+'/offsets_list', 'rb') as in_file:
        offsets_list = np.load(in_file)

We create a new dataframe with only the input that is required by the Bert model:

In [20]:
# One row per note: token ids, attention mask and one-hot token labels, stored as Python lists.
train_bert = pd.DataFrame({'sequence': sequences.tolist(),'mask': masks.tolist(), 'label': labels.tolist()})

# Validation Split

If both flag variables TRAIN and VAL are set to True, the training set is split into train and validation sets. If only TRAIN is True, the model is trained on the whole dataset to achieve the best training before the testing phase. When both TRAIN and VAL are set to False, the notebook is in the submission phase.

In [21]:
# Run-mode switches read by the split and training cells below.
TRAIN = False  # True: fine-tune the model; False: load saved weights for inference
VAL = False  # True: hold out a validation split and report validation metrics

In [22]:
# Hold out a validation subset and split the per-note side data with the same row indices.
# NOTE: `train_bert` is overwritten with its training part, so re-running this cell
# splits the already reduced frame again.
if VAL:
    train_bert, val_bert = train_test_split(train_bert, test_size=hyperparameters['val_size'], random_state=hyperparameters['seed'])
    # train_test_split keeps the original index labels, which are used here as positions.
    targets_to_row_ids_train, targets_to_row_ids_val = np.take(targets_to_row_ids, train_bert.index), np.take(targets_to_row_ids, val_bert.index)
    offsets_train, offsets_val =np.take(np.array(offsets_list), train_bert.index, axis = 0), np.take(np.array(offsets_list), val_bert.index, axis= 0)

# Define Dataset and Model classes

In [23]:
class CustomDataset(Dataset):
    """Dataset over a DataFrame that has 'sequence', 'mask' and 'label' columns."""

    def __init__(self, data, tokenizer, hyperparameters):
        self.data, self.tokenizer, self.hyperparameters = data, tokenizer, hyperparameters

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return (sequence, mask, label) of one row, each converted to a NumPy array."""
        row = self.data.iloc[index]
        return tuple(np.array(row[column]) for column in ("sequence", "mask", "label"))

In [24]:
class CustomModel(nn.Module):
    """Pretrained encoder followed by dropout and a per-token linear classifier."""

    def __init__(self, hyperparameters):
        """Build the model.

        Args:
            hyperparameters: Dict providing 'model_name' (checkpoint to load)
                and 'dropout' (dropout probability).
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(hyperparameters['model_name'])
        self.dropout = nn.Dropout(p=hyperparameters['dropout'])
        # Use the loaded encoder's own config instead of a notebook-level global.
        self.config = self.bert.config
        # Size the classifier from the encoder width rather than a hard-coded 768.
        self.fc1 = nn.Linear(self.config.hidden_size, N_CLASSES)

    def summary(self):
        """Return a text summary: the module tree plus parameter counts.

        The previous implementation called an undefined `summary` function and
        therefore always raised NameError.
        """
        n_params = sum(p.numel() for p in self.parameters())
        n_trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
        return f"{self}\nTotal parameters: {n_params:,}\nTrainable parameters: {n_trainable:,}"

    def forward(self, input_ids, attention):
        """Return per-token class logits for a batch of token ids and attention masks."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention)
        # outputs[0] is the encoder output that is fed to the classifier, token by token.
        logits = self.fc1(self.dropout(outputs[0]))
        return logits

# Training and validation functions

In [25]:
def train_model(model, dataloader, optimizer, criterion):
    '''
    Run one training epoch over `dataloader`.
        inputs : model, dataloader, optimizer, criterion
        outputs : accuracy (percent), precision, recall, f1_micro, train_loss

    Each batch is (input_ids, attention_mask, one-hot labels). Label 0 is
    treated as the "empty" class when counting positives and negatives.
    A metric whose denominator is zero is reported as 0.0.
    '''
    model.train()

    tp, tn, fp, fn = 0, 0, 0, 0
    train_loss = 0.0
    n_samples, n_tokens = 0, 0

    for batch in tqdm(dataloader):

        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        logits = model(input_ids, attention_mask)

        # The criterion expects class indices and logits laid out as (batch, classes, tokens).
        target_ids = torch.argmax(labels, dim = 2)
        loss = criterion(torch.permute(logits, (0,2,1)), target_ids)
        predicted = torch.argmax(logits,dim =  2).detach().cpu().numpy()
        labelled = target_ids.detach().cpu().numpy()

        # NOTE(review): a token predicted with the wrong non-empty class is counted
        # in none of tp/fp/fn - confirm whether precision/recall should penalise it.
        tp += np.sum((predicted == labelled) & (predicted != 0))
        tn += np.sum((predicted == labelled) & (predicted == 0))
        fp += np.sum((predicted != 0) & (labelled == 0))
        fn += np.sum((predicted == 0) & (labelled != 0))
        train_loss += (loss.item() * input_ids.size(0))
        # Count what was actually processed instead of reading a global DataFrame.
        n_samples += input_ids.size(0)
        n_tokens += labelled.size

        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), 1.0) # clip to avoid exploding gradients
        optimizer.step()

    # Explicit zero checks: NumPy integer division by zero yields NaN with a
    # warning rather than raising, so a try/except would never trigger.
    accuracy = 100.0 * (tp + tn) / n_tokens if n_tokens else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1_micro = tp / (tp + 0.5 * (fp + fn)) if (tp + fp + fn) else 0.0
    train_loss = train_loss / n_samples if n_samples else 0.0
    return accuracy, precision, recall, f1_micro, train_loss

In [26]:
def eval_model(model, dataloader, criterion):
    '''
    Evaluate the model over `dataloader` without updating its weights.
        inputs : model, dataloader, criterion
        outputs : accuracy (percent), precision, recall, f1_micro, val_loss

    Each batch is (input_ids, attention_mask, one-hot labels). Label 0 is
    treated as the "empty" class when counting positives and negatives.
    A metric whose denominator is zero is reported as 0.0.
    '''
    model.eval()

    tp, tn, fp, fn = 0, 0, 0, 0
    val_loss = 0.0
    n_samples, n_tokens = 0, 0

    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            logits = model(input_ids, attention_mask)

            # The criterion expects class indices and logits laid out as (batch, classes, tokens).
            target_ids = torch.argmax(labels, dim = 2)
            loss = criterion(torch.permute(logits, (0,2,1)), target_ids)
            predicted = torch.argmax(logits,dim =  2).cpu().numpy()
            labelled = target_ids.cpu().numpy()

            # NOTE(review): a token predicted with the wrong non-empty class is counted
            # in none of tp/fp/fn - confirm whether precision/recall should penalise it.
            tp += np.sum((predicted == labelled) & (predicted != 0))
            tn += np.sum((predicted == labelled) & (predicted == 0))
            fp += np.sum((predicted != 0) & (labelled == 0))
            fn += np.sum((predicted == 0) & (labelled != 0))
            val_loss += (loss.item() * input_ids.size(0))
            n_samples += input_ids.size(0)
            n_tokens += labelled.size

    # `accuracy` was returned but never computed before, so every call raised NameError.
    accuracy = 100.0 * (tp + tn) / n_tokens if n_tokens else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1_micro = tp / (tp + 0.5 * (fp + fn)) if (tp + fp + fn) else 0.0
    val_loss = val_loss / n_samples if n_samples else 0.0
    return accuracy, precision, recall, f1_micro, val_loss

# Training the model

To create a dataframe to test the results:

In [27]:
# Results table for the hyperparameter search, seeded with one placeholder row;
# the training cell appends one row per configuration when VAL is True.
df_old = pd.DataFrame([['','','', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], columns = ['batch_size', 'lr', 'epochs', 'train_acc', 'val_acc', 'train_loss', 'val_loss', 'train_prec', 'val_prec', 'train_rec', 'val_rec', 'train_f1', 'val_f1'])

The nested for-loops have been created just to test all possible combinations of hyperparameters. The best model is then saved. When the final training with all data is needed to be run then the loop can be commented out and the batch_size, epochs and lr set to the "right" values.

In [28]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CustomModel(hyperparameters).to(device)
# Bare expression so the cell still displays the model architecture.
model

Some weights of the model checkpoint at ../input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CustomModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [29]:
def _plot_metric(train_values, val_values, name, ylabel, filename):
    """Plot one metric's per-epoch training/validation curves and save them to `filename`."""
    fig, ax = plt.subplots()
    ax.plot(train_values, "-o", label=f'Training {name}')
    ax.plot(val_values, "-o", label=f'Validation {name}')
    ax.set_title(f'Training and validation {name}')
    ax.set_xlabel('epochs')
    ax.set_ylabel(ylabel)
    ax.legend()
    fig.savefig(filename, dpi=300)
    plt.show()
    # Release the figure so the grid search does not accumulate open figures.
    plt.close(fig)


if TRAIN:
    result_columns = ['batch_size', 'lr', 'epochs', 'train_acc', 'val_acc', 'train_loss', 'val_loss', 'train_prec', 'val_prec', 'train_rec', 'val_rec', 'train_f1', 'val_f1']
    # Best validation loss over the WHOLE grid, so "nbme_bert_v2.pth" ends up
    # holding the best model overall instead of the best epoch of the last run.
    best_loss = np.inf

    for lr in hyperparameters['lr']:
        for epochs in hyperparameters['epochs']:
            for batch_size in hyperparameters['batch_size']:

                # Start every configuration from the pretrained checkpoint; otherwise
                # each run would keep fine-tuning the weights left by the previous
                # one and the configurations would not be comparable.
                model = CustomModel(hyperparameters).to(device)

                training_data = CustomDataset(train_bert, tokenizer, hyperparameters)
                train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True, worker_init_fn=seed_worker,generator=g)

                # Define the loss function
                criterion = nn.CrossEntropyLoss()

                # Define the optimizer
                optimizer = optim.AdamW(model.parameters(), lr=lr)

                if VAL:
                    val_data = CustomDataset(val_bert, tokenizer, hyperparameters)
                    val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False, worker_init_fn=seed_worker,generator=g)

                since = time.time()
                # Per-epoch metric histories of the current configuration.
                train_acc, val_acc, train_loss, val_loss = [], [], [], []
                train_prec, val_prec, train_rec, val_rec = [], [], [], []
                train_f1, val_f1 = [], []

                for i in range(epochs):

                    print("Epoch: {}/{}".format(i + 1, epochs))
                    #TRAIN THE MODEL
                    t_acc, t_prec, t_rec, t_f1, t_loss = train_model(model, train_dataloader, optimizer, criterion)
                    train_acc.append(t_acc)
                    train_loss.append(t_loss)
                    train_prec.append(float(t_prec))
                    train_rec.append(float(t_rec))
                    train_f1.append(float(t_f1))
                    print(f"TRAINING. Loss: {t_loss:.2f};, Accuracy: {t_acc:.2f}; Precision: {t_prec:.2f}; Recall: {t_rec:.2f}; Micro-F1 score: {t_f1:.2f}")
                    #VALIDATE THE MODEL
                    if VAL:
                        v_acc, v_prec, v_rec, v_f1, v_loss = eval_model(model, val_dataloader, criterion)
                        val_acc.append(v_acc)
                        val_loss.append(v_loss)
                        val_prec.append(float(v_prec))
                        val_rec.append(float(v_rec))
                        val_f1.append(float(v_f1))
                        print(f"VALIDATION. Loss: {v_loss:.2f};, Accuracy: {v_acc:.2f}; Precision: {v_prec:.2f}; Recall: {v_rec:.2f}; Micro-F1 score: {v_f1:.2f}")

                        if v_loss < best_loss: #to save the model with best validation loss
                            best_loss = v_loss
                            torch.save(model.state_dict(), "nbme_bert_v2.pth")

                time_elapsed = time.time() - since
                print('Training completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

                if VAL:

                    #save the results of the last epoch to the results dataframe
                    df = pd.DataFrame([[str(batch_size), str(lr), str(epochs), train_acc[-1], val_acc[-1], train_loss[-1], val_loss[-1], train_prec[-1], val_prec[-1], train_rec[-1], val_rec[-1], train_f1[-1], val_f1[-1]]], columns = result_columns)
                    df_old = pd.concat([df_old, df], join = 'inner')
                    df_old = df_old.sort_values(by = 'val_f1',ascending=False)
                    df_old.to_csv('df.csv')

                    # Plot the results (each file is overwritten by the next configuration)
                    _plot_metric(train_loss, val_loss, 'loss', 'loss', 'loss.png')
                    _plot_metric(train_acc, val_acc, 'accuracy', 'accuracy', 'acc.png')
                    _plot_metric(train_prec, val_prec, 'precision', 'precision', 'prec.png')
                    _plot_metric(train_rec, val_rec, 'recall', 'recall', 'recall.png')
                    _plot_metric(train_f1, val_f1, 'F1-Score', 'f1-score', 'f1score.png')

                else: #if training on the whole dataset without validating
                    torch.save(model.state_dict(), "new_model.pth")
else: #testing phase
    model.load_state_dict(torch.load(DATA_PATH + "/new_model.pth", map_location = device))

# Testing

##### Creating the ids vector, the attention mask vector and the offset mapping vector for the test set:

In [30]:
# Per-note model inputs for the test set, plus the label -> row-id mapping of every note.
test_sequences, test_masks, test_offsets = [], [], []
row_ids, targets = [], []

for pn_num, note_rows in tqdm(test.groupby('pn_num')):
    # All rows of a group share one note text, so it is tokenised once.
    test_tokens = tokenizer.encode_plus(
        note_rows.iloc[0].pn_history,
        max_length=hyperparameters['max_length'],
        padding='max_length',
        truncation=True,
        return_offsets_mapping=True,
    )

    # Requested feature labels of this note and the submission ids that ask for them.
    note_targets, note_row_ids = [], []
    for index, row in note_rows.iterrows():
        note_targets.append(row.TARGET)
        note_row_ids.append(row.id)
    targets.append(note_targets)
    row_ids.append(note_row_ids)

    test_sequences.append(test_tokens['input_ids'])
    test_masks.append(test_tokens['attention_mask'])
    test_offsets.append(test_tokens['offset_mapping'])

test_sequences = np.array(test_sequences).astype(np.int32)
test_masks = np.array(test_masks).astype(np.uint8)
targets_to_row_ids_test = [dict(zip(labels_of_note, ids_of_note)) for labels_of_note, ids_of_note in zip(targets, row_ids)]
test_bert = pd.DataFrame({'sequence': test_sequences.tolist(), 'mask': test_masks.tolist()})

  0%|          | 0/1 [00:00<?, ?it/s]

##### The SubmissionDataset class is similar to the CustomDataset class except that it does not return any ground truth in the \__getitem\__ method.

In [31]:
class SubmissionDataset(Dataset):
    """Dataset over a DataFrame with 'sequence' and 'mask' columns (no labels)."""

    def __init__(self, data, tokenizer, config):
        self.data, self.tokenizer, self.config = data, tokenizer, config

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return (sequence, mask) of one row, each converted to a NumPy array."""
        row = self.data.iloc[index]
        return np.array(row["sequence"]), np.array(row["mask"])

In [32]:
# Wrap the test inputs; shuffle=False keeps the batches in the row order of test_bert.
submission_data = SubmissionDataset(test_bert, tokenizer, hyperparameters)
submission_dataloader = DataLoader(submission_data, batch_size=4, shuffle=False, worker_init_fn=seed_worker,generator=g)

In [33]:
model.eval()

logits_list = []

# Inference only: disabling autograd avoids building a graph for every batch.
with torch.no_grad():
    for batch in tqdm(submission_dataloader):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        logits = model(input_ids, attention_mask)
        logits_list.append(logits.cpu().numpy())

logits = np.concatenate(logits_list, axis = 0)
# Collapse the class dimension: one predicted label (0 to 143) per token.
preds = np.argmax(logits, -1)

  0%|          | 0/1 [00:00<?, ?it/s]

The logits output by the model have a first dimension equal to the number of unique patient histories in the test set, a second dimension equal to the sequence length (512) and a last dimension equal to the number of classes (144); applying np.argmax() over the last dimension leaves one predicted label per token. The translate() function is then applied to take the prediction vectors and output the corresponding location of every target feature in the test set. Every row is a different feature to be searched for, and many rows share the same patient history text. Since this model outputs the predictions of the different features all together in the same vector, the translate() function splits the prediction vector into the format required by the competition.

In [34]:
# Convert token-level predictions into character spans, one row per requested id.
# NOTE(review): the hard-coded 512 matches hyperparameters['max_length'] above - keep them in sync.
sub = translate(preds.reshape(len(preds),512),targets_to_row_ids_test,test_offsets).sort_values('id')

sub.to_csv('submission.csv',index=False)
sub.head()

Unnamed: 0,id,location
0,00016_000,696 724
1,00016_001,668 693
2,00016_002,203 217
3,00016_003,70 91
4,00016_004,
