The first two commands install the required dependencies

In [None]:
!pip install gpustat transformers



In [None]:
!pip install seqeval[gpu]



In [None]:
# Run this to import the required libraries
import torch
import pandas as pd
import numpy as np
import re

from torch import cuda
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from seqeval.metrics import classification_report
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [None]:
# Checks to see if a GPU is available. Otherwise, device is set to CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

cuda


In [None]:
#The training and validation data that is used to fine-tune BERT is located at the URLs below
train_url = 'https://raw.githubusercontent.com/cambridgeltl/MTL-Bioinformatics-2016/master/data/JNLPBA/train.tsv'
validation_url = 'https://raw.githubusercontent.com/cambridgeltl/MTL-Bioinformatics-2016/master/data/JNLPBA/test.tsv'

In [None]:
# Utilize the pandas library to convert the training and validaton data from a csv file into a pandas dataframe.
# Additionally, the data is sanitized using regex.
df_train = pd.read_csv(train_url, sep = '\t', names=['Word', 'Tag'])
df_validation = pd.read_csv(validation_url, sep = '\t', names=['Word', 'Tag'])
df_train = df_train.replace(r'^\-DOCSTART\-*$', float('NaN'), regex = True)
df_validation = df_validation.replace(r'^\-DOCSTART\-$', float('NaN'), regex = True)
df_train.dropna(inplace = True)
df_validation.dropna(inplace = True)
df_train['Sentence #'] = (df_train['Word'] == '.').cumsum()
df_train = df_train[df_train['Word'] != '.']
df_validation['Sentence #'] = (df_validation['Word'] == '.').cumsum()
df_validation = df_validation[df_validation['Word'] != '.']

In [None]:
df_train

Unnamed: 0,Word,Tag,Sentence #
1,IL-2,B-DNA,0
2,gene,I-DNA,0
3,expression,O,0
4,and,O,0
5,NF-kappa,B-protein,0
...,...,...,...
446884,stimuli,O,16695
446885,allowing,O,16695
446886,nuclear,O,16695
446887,NF-KB,B-protein,16695


In [None]:
# Run this cell to view the first 5 rows of the pandas dataframe.
print(df_train.head())
print(df_validation.head())

         Word        Tag  Sentence #
1        IL-2      B-DNA           0
2        gene      I-DNA           0
3  expression          O           0
4         and          O           0
5    NF-kappa  B-protein           0
             Word        Tag  Sentence #
1          Number          O           0
2              of          O           0
3  glucocorticoid  B-protein           0
4       receptors  I-protein           0
5              in          O           0


In [None]:
print("Number of tags: {}".format(len(df_train.Tag.unique())))

Number of tags: 11


In [None]:
# Displays the frequency of each tag in the training dataset.
frequencies = df_train.Tag.value_counts()
frequencies

O              329041
B-protein       27803
I-protein       22434
I-DNA           14067
B-DNA            8481
I-cell_type      8131
I-cell_line      6619
B-cell_type      6191
B-cell_line      3430
I-RNA            1348
B-RNA             844
Name: Tag, dtype: int64

In [None]:
# Displays the frequency of the 5 unique labels, which includes the number of I and B tags that appear in the training set.
tags = {}
for tag, count in zip(frequencies.index, frequencies):
    if tag != "O":
        new_tag = re.sub(".\-", '', tag)
        if new_tag not in tags.keys():
            tags[new_tag] = count
        else:
            tags[new_tag] += count
    continue

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('protein', 50237), ('DNA', 22548), ('cell_type', 14322), ('cell_line', 10049), ('RNA', 2192)]


In [None]:
# Creates a dictionary mapping labels to indices and vice versa.
labels_to_ids = {k: v for v, k in enumerate(df_train.Tag.unique())}
ids_to_labels = {v: k for v, k in enumerate(df_train.Tag.unique())}
labels_to_ids

{'B-DNA': 0,
 'B-RNA': 9,
 'B-cell_line': 7,
 'B-cell_type': 5,
 'B-protein': 3,
 'I-DNA': 1,
 'I-RNA': 10,
 'I-cell_line': 8,
 'I-cell_type': 6,
 'I-protein': 4,
 'O': 2}

In [None]:
sentence = []

In [None]:
# Creates a column in the pandas dataframe for sentences and the label that corresponds to each word in a given sentence.
df_train['sentence'] = df_train[['Word','Tag', 'Sentence #']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
df_train['word_labels'] = df_train[['Word','Tag', 'Sentence #']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))
df_validation['sentence'] = df_validation[['Word','Tag', 'Sentence #']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
df_validation['word_labels'] = df_validation[['Word','Tag', 'Sentence #']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))
print(df_train.head())
print(df_validation.head())

         Word        Tag  Sentence #  \
1        IL-2      B-DNA           0   
2        gene      I-DNA           0   
3  expression          O           0   
4         and          O           0   
5    NF-kappa  B-protein           0   

                                            sentence  \
1  IL-2 gene expression and NF-kappa B activation...   
2  IL-2 gene expression and NF-kappa B activation...   
3  IL-2 gene expression and NF-kappa B activation...   
4  IL-2 gene expression and NF-kappa B activation...   
5  IL-2 gene expression and NF-kappa B activation...   

                                         word_labels  
1  B-DNA,I-DNA,O,O,B-protein,I-protein,O,O,B-prot...  
2  B-DNA,I-DNA,O,O,B-protein,I-protein,O,O,B-prot...  
3  B-DNA,I-DNA,O,O,B-protein,I-protein,O,O,B-prot...  
4  B-DNA,I-DNA,O,O,B-protein,I-protein,O,O,B-prot...  
5  B-DNA,I-DNA,O,O,B-protein,I-protein,O,O,B-prot...  
             Word        Tag  Sentence #  \
1          Number          O           0   
2   

In [None]:
# Removes columns other than sentence and word_labels
df_train = df_train[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
df_validation = df_validation[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
print(df_train.head())
print(df_validation.head())
print(df_validation.shape)

                                            sentence  \
0  IL-2 gene expression and NF-kappa B activation...   
1  Activation of the CD28 surface receptor provid...   
2  In primary T lymphocytes we show that CD28 lig...   
3  Delineation of the CD28 signaling cascade was ...   
4  Our data suggest that lipoxygenase metabolites...   

                                         word_labels  
0  B-DNA,I-DNA,O,O,B-protein,I-protein,O,O,B-prot...  
1  O,O,O,B-protein,I-protein,I-protein,O,O,O,O,O,...  
2  O,B-cell_type,I-cell_type,I-cell_type,O,O,O,B-...  
3  O,O,O,B-protein,O,O,O,O,O,O,B-protein,I-protei...  
4  O,O,O,O,B-protein,I-protein,O,O,O,O,O,O,B-prot...  
                                            sentence  \
0  Number of glucocorticoid receptors in lymphocy...   
1  The study demonstrated a decreased level of gl...   
2  In the lymphocytes with a high GR number , dex...   
3  On the other hand , a decreased GR number resu...   
4  These data showed that the sensitivity of lymp... 

In [None]:
 print(max([len(sent.split()) for sent in list(df_train['sentence'])]))

165


Hyperparameters

In [None]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 1
EPOCHS = 9
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
# This class prepares the sentences in the dataset for the model, which makes sentences the same size by adding a padding. 
# Also, this class takes care of tag alignment since NER deals with wordpiece tokenization.
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        
        sentence = self.data.sentence[index].strip()
        word_labels = self.data.word_labels[index].split(",")
        encoding = self.tokenizer(sentence,
                             return_offsets_mapping=True, 
                             padding='max_length', 
                             truncation=True, 
                             max_length=self.max_len)
        labels = [labels_to_ids[label] for label in word_labels] 
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:

            encoded_labels[idx] = labels[i]
            i += 1

        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

  def __len__(self):
        return self.len


In [None]:
# This cell splits the training data into 80 percent for training and 20 percent for testing, and utilizes the dataset class for preparing the data.
# The validation set is also prepared using the dataset class.
train_size = 0.8
train_dataset = df_train.sample(frac=train_size,random_state=200)
test_dataset = df_train.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
vali_ = df_validation.sample(frac=0.999,random_state=200)
validation_dataset  = vali_.reset_index(drop=True)

print("FULL Dataset: {}".format(df_train.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))
print("VALIDATION Dataset: {}".format(validation_dataset.shape))


training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)
validation_set = dataset(validation_dataset, tokenizer, MAX_LEN)

FULL Dataset: (16648, 2)
TRAIN Dataset: (13318, 2)
TEST Dataset: (3330, 2)


In [None]:
training_set[1]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([  101, 19962,  2121, 17701,  2213,  3223,  2019, 10109, 24665,  6064,
          1011,  8031,  5884,  1010,  6168, 13096,  1011,  2030,  2482,  8758,
          8516,  1011,  5536, 13100,  2020,  4487, 13102,  6132,  3085,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     

In [None]:
# This cell is responsible for creating the dataloaders, which will be used in the training and validation loops.
train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
test_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
vali_params = {'batch_size': VALID_BATCH_SIZE,'shuffle': True,'num_workers': 0 }


training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
validation_loader = DataLoader(validation_set, **vali_params)

In [None]:
# Creates the pre-trained BERT model that will be used in this fine-tuning process. Also, sets the dropout rate to 0.1 for the model, which can be adjusted.
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids), output_attentions = False, output_hidden_states = False, hidden_dropout_prob = 0.1, attention_probs_dropout_prob = 0.1)
model.to(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [None]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.type(torch.LongTensor) 
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss
print(input_ids)
print(attention_mask)
print(labels)

tr_logits = outputs[1]
tr_logits.shape

tensor([[  101,  2241,  2006,  2122,  1998,  2256,  3522,  9556,  1997,  8924,
          2090, 20912, 10464,  2015,  1998,  2771,  1011,  1016,  1011, 10751,
          6305,  1010,  2057, 16599,  2008, 20912, 10464,  2015,  2155,  2372,
         16913,  9869,  4962,  3670,  2076,  1048, 24335,  8458, 10085, 17250,
          2458,  2011, 14357,  5664,  2010,  5524, 26709,  3401,  3723, 25002,
         15420,  2000,  3563, 26512,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

torch.Size([1, 128, 11])

In [None]:
# Function for the training loop, which utilizes the Adam optimizer.

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    model.train()
    
    for idx, batch in enumerate(training_loader):
        
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        flattened_targets = labels.view(-1) 
        active_logits = tr_logits.view(-1, model.num_labels) 
        flattened_predictions = torch.argmax(active_logits, axis=1) 
 
        active_accuracy = labels.view(-1) != -100 

        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
  
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
       
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [None]:
# Trains the model for the specified number of epochs, which can be adjusted in the cell under the hyperparameters label above. 
# Other values can be altered as well, such as the training batch size and the learning rate.
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.534815788269043
Training loss per 100 training steps: 0.658555130225302
Training loss per 100 training steps: 0.4620791719624059
Training loss per 100 training steps: 0.39336670352226477
Training loss per 100 training steps: 0.3598541391704538
Training loss per 100 training steps: 0.3352563403604437
Training loss per 100 training steps: 0.31089410710670734
Training loss per 100 training steps: 0.2992704715294107
Training loss per 100 training steps: 0.2843752953766865
Training loss per 100 training steps: 0.27067649870884547
Training loss per 100 training steps: 0.262681759038166
Training loss per 100 training steps: 0.25430948641168105
Training loss per 100 training steps: 0.2503738223393211
Training loss per 100 training steps: 0.2430990893826129
Training loss per 100 training steps: 0.2372803666728943
Training loss per 100 training steps: 0.23193583616349028
Training loss per 100 training steps: 0.22772098679533537
Training l

In [None]:
# Function for the validation loop, which returns the predicted labels and the true labels.
def valid(model, testing_loader):
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            flattened_targets = labels.view(-1)
            active_logits = eval_logits.view(-1, model.num_labels) 
            flattened_predictions = torch.argmax(active_logits, axis=1)
          
            active_accuracy = labels.view(-1) != -100 
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [None]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 5.936446541454643e-05
Validation loss per 100 evaluation steps: 0.12108551286245088
Validation loss per 100 evaluation steps: 0.22197806821295943
Validation loss per 100 evaluation steps: 0.1491359233419958
Validation loss per 100 evaluation steps: 0.16564527935963697
Validation loss per 100 evaluation steps: 0.19575989486758205
Validation loss per 100 evaluation steps: 0.19700043662249395
Validation loss per 100 evaluation steps: 0.18635604732672734
Validation loss per 100 evaluation steps: 0.18838100004567618
Validation loss per 100 evaluation steps: 0.20531890940949676
Validation loss per 100 evaluation steps: 0.20443943529775258
Validation loss per 100 evaluation steps: 0.20359741131487202
Validation loss per 100 evaluation steps: 0.2093905877896747
Validation loss per 100 evaluation steps: 0.20204187268312004
Validation loss per 100 evaluation steps: 0.202484629981197
Validation loss per 100 evaluation steps: 0.2028650393226826
Validation 

In [None]:
# Classification report takes in the true labels and the predicted labels for comparison purposes. 
# Since the F1-score is the evaluation metric being used in this study that is the column to focus on in the table that is printed below.
labels = [labels]
predictions = [predictions]
print(classification_report(labels, predictions))

              precision    recall  f1-score   support

         DNA       0.63      0.56      0.59        34
         RNA       0.89      0.89      0.89         9
   cell_line       0.76      0.87      0.81        15
   cell_type       0.85      0.82      0.84        34
     protein       0.82      0.89      0.85       277

   micro avg       0.81      0.85      0.83       369
   macro avg       0.79      0.81      0.80       369
weighted avg       0.80      0.85      0.83       369



In [None]:
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'B-cell_type', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DNA', 'B-protein', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'B-protein', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'B-protein', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-RNA', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-prot

In [None]:
print(predictions[0])

['O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'B-cell_type', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DNA', 'B-protein', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'B-protein', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DNA', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [None]:
def valid(model, vali_loader):
    #evaluation Mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(vali_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            flattened_targets = labels.view(-1)
            active_logits = eval_logits.view(-1, model.num_labels) 
            flattened_predictions = torch.argmax(active_logits, axis=1)

            #only for active labels
            active_accuracy = labels.view(-1) != -100 
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [None]:
labels, predictions = valid(model, validation_loader)

Validation loss per 100 evaluation steps: 4.708655978902243e-05
Validation loss per 100 evaluation steps: 0.3304267713598291
Validation loss per 100 evaluation steps: 0.22286422231784977
Validation loss per 100 evaluation steps: 0.20108525166393895
Validation loss per 100 evaluation steps: 0.21848487316839552
Validation loss per 100 evaluation steps: 0.21895720448507805
Validation loss per 100 evaluation steps: 0.209347576550148
Validation loss per 100 evaluation steps: 0.21076531378882507
Validation loss per 100 evaluation steps: 0.20399476318780851
Validation loss per 100 evaluation steps: 0.22818158245967754
Validation loss per 100 evaluation steps: 0.23562624731544152
Validation loss per 100 evaluation steps: 0.24244618112414512
Validation loss per 100 evaluation steps: 0.26697692855520594
Validation loss per 100 evaluation steps: 0.26815188495196657
Validation loss per 100 evaluation steps: 0.2747635175974113
Validation loss per 100 evaluation steps: 0.287097430086951
Validation l

In [None]:
labels = [labels]
predictions = [predictions]

print(classification_report(labels, predictions))

              precision    recall  f1-score   support

         DNA       0.60      0.36      0.45        25
         RNA       0.75      0.60      0.67         5
   cell_line       0.15      0.31      0.20        13
   cell_type       0.72      0.63      0.67        83
     protein       0.78      0.82      0.80       279

   micro avg       0.72      0.74      0.73       405
   macro avg       0.60      0.54      0.56       405
weighted avg       0.74      0.74      0.73       405



In [None]:
O_ = labels[0].count('O')
B_DNA_ = labels[0].count('B-DNA')
I_DNA_ = labels[0].count("I-DNA")
B_RNA_ = labels[0].count("B-RNA")
I_RNA_ = labels[0].count("I-RNA")
B_cell_line_ = labels[0].count("B-cell_line")
I_cell_line_ = labels[0].count("I-cell_line")
B_cell_type_ = labels[0].count("B-cell_type")
I_cell_type_ = labels[0].count("I-cell_type")
B_protein_ = labels[0].count("B-protein")
I_protein_ = labels[0].count("I-protein")
print(O_, B_DNA_, B_RNA_, B_cell_line_, B_cell_type_, B_protein_, I_DNA_, I_RNA_, I_cell_line_, I_cell_type_, I_protein_)

3364 25 5 13 83 279 0 0 0 0 0


In [None]:
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-cell_type', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'B-cell_type', 'O', 'B-protein', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DNA', 'O', 'O', 'O', 'O', 'O', 'O', 'B-cell_line', 'O', 'O', 'O', 'B-protein', 'B-cell_type', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-cell_type', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'B-cell_type', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O

In [None]:
print(predictions[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'B-cell_type', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'B-cell_line', 'O', 'O', 'O', 'B-protein', 'B-cell_type', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-cell_type', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'B-cell_type', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-protein', 'O', 'O'