In [None]:
# Mount Google Drive at /content/drive; the paths used in the following cells
# all live under this mount point.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime - confirm
# before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Point GOOGLE_APPLICATION_CREDENTIALS at the JSON credentials file kept on the mounted Drive.
# NOTE(review): the path (including the key-file name) is hard-coded; consider supplying it
# from outside the notebook before sharing this file.
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/drive/My Drive/Colab Notebooks/temp/b08d5871a151.json"
# Echo the value from Python rather than through the `!echo` shell magic, so the cell is
# plain Python and the path (which contains spaces) is printed exactly as stored.
print(os.environ["GOOGLE_APPLICATION_CREDENTIALS"])

/content/drive/My Drive/Colab Notebooks/temp/b08d5871a151.json


In [None]:
# Base directory (on the mounted Drive) for the dataset workbook that is read and the
# CSV of predicted entities that is written. The trailing slash matters: file names
# are appended to this string by plain concatenation.
path = "/content/drive/My Drive/thesis_dataset/"

In [None]:
# install required packages
!pip install transformers
!pip install seqeval



In [None]:
# import required packages/modules
import pandas as pd
import numpy as np
import torch
import transformers
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
from keras.preprocessing.sequence import pad_sequences
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
import csv

print(f'Torch: {torch.__version__}, Transformers: {transformers.__version__}')


Torch: 1.6.0+cu101, Transformers: 3.1.0


In [None]:
# define constants
# length every wordpiece sequence is padded / truncated to
MAX_TOKENS = 64
# number of sequences per batch in all dataloaders
BATCH_SIZE = 32
# number of passes over the training dataloader
NR_EPOCHES = 20
# maximum gradient norm used for gradient clipping during training
MAX_NORM = 1.0
# name of the pretrained checkpoint used for both tokenizer and model
BERT_PRETRAIN_MODEL_NAME = "bert-base-cased"
# number of warm-up steps handed to the linear learning-rate scheduler
NR_WARM_STEPS = 0

In [None]:
# read the question dataset from the 'data' sheet of the Excel workbook
# The sheet is selected by keyword and no `encoding` argument is passed: `encoding` is not a
# documented read_excel parameter (recent pandas releases reject it with a TypeError).
df = pd.read_excel(path + "All_Questions_V1.xlsx", sheet_name='data')
df.head(1)

Unnamed: 0,SlNo,Question,Relation,NER_Tag,Q_Len,T_Len,Subject,Subject_URI,Relation_URI
0,1,what are the brand names of Metipranolol,brand,O O O O O O B-E,7,7,Metipranolol,http://bio2rdf.org/drugbank:DB01214,http://bio2rdf.org/drugbank_vocabulary:brand


In [None]:
# Two-stage split, stratified on 'Relation' both times:
#   1. hold out 20% of the full dataset as the test set,
#   2. take 10% of the remainder as the validation set; what is left is the training set.
rest, test = train_test_split(
    df, stratify=df['Relation'], test_size=0.2, random_state=0)
train, valid = train_test_split(
    rest, stratify=rest['Relation'], test_size=0.1, random_state=0)
print(f'Train:{len(train)}, Test: {len(test)}, Validation: {len(valid)}')

Train:406, Test: 114, Validation: 46


In [None]:
# use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [None]:
# NER tag vocabulary: the position of a tag in `tag_ids` is its numeric id,
# and `tag_dict` is the reverse lookup (tag string -> id)
tag_ids = ['O', 'B-E', 'I-E', 'PAD']
tag_dict = dict(zip(tag_ids, range(len(tag_ids))))
num_ner_tags = len(tag_dict)
print(num_ner_tags)

4


In [None]:
# create the tokenizer instance from the pretrained BERT checkpoint named in the constants cell;
# do_lower_case=False is passed explicitly (the checkpoint name is the "cased" variant)
tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAIN_MODEL_NAME, do_lower_case=False)

In [None]:
# process the question phrase to return tokens list
# process the NER_TAGs to match wordpieces of tokenizer
def process_tokens_labels(sent, labels, wordpiece_tokenizer=None):
    """Tokenize each word into wordpieces and expand the word-level tags to match.

    Args:
        sent: iterable of words of one question.
        labels: iterable of NER tags, one per word. Words and tags are paired
            with ``zip``, so surplus items of the longer input are ignored.
        wordpiece_tokenizer: optional object with a ``tokenize(word)`` method;
            when omitted the notebook-level ``tokenizer`` is used.

    Returns:
        ``(tokens_list, labels_list)``: two flat lists of equal length holding
        the wordpieces and the tag assigned to each of them.

    The first piece of a word keeps the word's tag. The remaining pieces of a
    word tagged ``B-X`` are tagged ``I-X`` (the matching inside tag must
    therefore exist in the tag vocabulary); for every other tag the pieces
    simply repeat it. Repeating ``B-X`` on every piece would make a single
    mention look like several entities to an IOB2-aware scorer, which treats
    each ``B-`` tag as the start of a new entity.
    """
    if wordpiece_tokenizer is None:
        wordpiece_tokenizer = tokenizer

    tokens_list = []
    labels_list = []
    for word, label in zip(sent, labels):
        pieces = wordpiece_tokenizer.tokenize(word)
        if not pieces:
            # nothing was produced for this word, so there is nothing to tag
            continue
        tokens_list.extend(pieces)

        # tag for the 2nd..nth piece of the same word
        continuation_label = "I-" + label[2:] if label.startswith("B-") else label
        labels_list.append(label)
        labels_list.extend([continuation_label] * (len(pieces) - 1))
    return tokens_list, labels_list

In [None]:
# process the question phrases to return Torch tensors
# process the NER_TAGs to return Torch tensors
def process_data(df_data):
  """Turn the 'Question' / 'NER_Tag' columns of ``df_data`` into model inputs.

  Both columns are split on whitespace, every word is broken into wordpieces
  by ``process_tokens_labels`` and the resulting id / tag-id sequences are
  padded or truncated (at the end) to ``MAX_TOKENS``.

  Args:
      df_data: object indexable by 'Question' and 'NER_Tag', each exposing
          ``.values`` with one whitespace-separated string per row.

  Returns:
      ``(input_ids, attn_masks, target_labels)`` as torch tensors: wordpiece
      ids padded with 0, a float mask that is 1.0 where the id is non-zero
      and 0.0 elsewhere, and tag ids padded with the id of the 'PAD' tag.
  """
  # id used to fill input sequences; the attention mask is derived from it
  pad_input_id = 0

  # process input data; the first example is echoed as a quick visual check,
  # which is skipped for an empty frame instead of failing on index 0
  words_list = [sentence.split() for sentence in df_data['Question'].values]
  if words_list:
    print(words_list[0])
  labels_list = [tag_value.split() for tag_value in df_data['NER_Tag'].values]
  if labels_list:
    print(labels_list[0])

  # wordpieces and per-wordpiece tags for every question
  tokens_with_labels = [process_tokens_labels(sentence, labels)
                        for sentence, labels in zip(words_list, labels_list)]

  # gets inputs_ids and attention masks
  tokens_list = [pieces for pieces, _ in tokens_with_labels]
  input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(pieces) for pieces in tokens_list],
                            maxlen=MAX_TOKENS, truncating="post", padding="post",
                            value=pad_input_id, dtype="long")
  # float32 keeps the dtype torch gives a nested list of Python floats
  attn_masks = (input_ids != pad_input_id).astype("float32")

  # process labels and convert to numbers
  new_labels_list = [piece_labels for _, piece_labels in tokens_with_labels]
  target_labels = pad_sequences([[tag_dict[lab] for lab in piece_labels] for piece_labels in new_labels_list],
                                maxlen=MAX_TOKENS, truncating="post", padding="post",
                                value=tag_dict["PAD"], dtype="long")

  return torch.tensor(input_ids), torch.tensor(attn_masks), torch.tensor(target_labels)


In [None]:
# process question phrases and NER_TAGs of each split to get Torch tensors
# (each call also prints the first question and its tags of that split)
train_input_ids, train_attn_masks, train_ner_tags  = process_data(train)
valid_input_ids, valid_attn_masks, valid_ner_tags  = process_data(valid)
test_input_ids, test_attn_masks, test_ner_tags  = process_data(test)

['which', 'life', 'forms', 'are', 'impacted', 'by', 'Marimastat']
['O', 'O', 'O', 'O', 'O', 'O', 'B-E']
['what', 'is', 'the', 'volume', 'of', 'distribution', 'for', 'Coagulation', 'factor', 'VIIa']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-E', 'I-E', 'I-E']
['Nitroglycerin', 'is', 'patented', 'under', 'which', 'number']
['B-E', 'O', 'O', 'O', 'O', 'O']


In [None]:
# Wrap the tensors of each split in a TensorDataset and build its dataloader.
# Training batches are drawn in random order.
train_dataset = TensorDataset(train_input_ids, train_attn_masks, train_ner_tags)
train_random_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_random_sampler, batch_size=BATCH_SIZE)

# NOTE: despite the "random" in the variable names, validation and test use a
# SequentialSampler, i.e. examples are visited in dataset order.
valid_dataset = TensorDataset(valid_input_ids, valid_attn_masks, valid_ner_tags)
valid_random_sampler = SequentialSampler(valid_dataset)
valid_dataloader = DataLoader(valid_dataset, sampler=valid_random_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_input_ids, test_attn_masks, test_ner_tags)
test_random_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_random_sampler, batch_size=BATCH_SIZE)

In [None]:
# create the token-classification model from the pretrained BERT checkpoint,
# with one output label per NER tag
model = BertForTokenClassification.from_pretrained(
    BERT_PRETRAIN_MODEL_NAME,
    num_labels=num_ner_tags,
    output_attentions=False,
    output_hidden_states=False,
)
# Move the parameters to the device selected earlier. Unlike an unconditional
# model.cuda(), this also works on a CPU-only runtime, matching the fallback in `device`.
# The trailing ';' suppresses the module repr in the cell output.
model.to(device);

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [None]:
# All parameters of the model - the pretrained BERT layers included - are handed to the
# optimizer in a single parameter group, so everything is fine-tuned with the same
# (small) learning rate.
parameters = list(model.named_parameters())
optimizer_parameters = [{"params": [weights for _name, weights in parameters]}]
optimizer = AdamW(optimizer_parameters, lr=3e-5)

# One scheduler step is taken per training batch, so the total number of steps is
# (batches per epoch) x (epochs).
train_steps = NR_EPOCHES * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=NR_WARM_STEPS, num_training_steps=train_steps)


In [None]:
# function to update the parameters during training
def model_training(train_dataloader):
    """Run one pass over ``train_dataloader`` and update the model parameters.

    For every batch the gradients are reset, the loss is back-propagated, the
    gradient norm is clipped to ``MAX_NORM`` and both optimizer and scheduler
    take one step.

    Returns:
        The sum of the per-batch losses (not averaged).
    """
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # each batch holds (input ids, attention mask, label ids)
        input_ids, attn_mask, labels = (tensor.to(device) for tensor in batch)

        model.zero_grad()
        # with labels supplied, the first output of the model is the loss
        batch_loss = model(input_ids, attention_mask=attn_mask, labels=labels)[0]
        batch_loss.backward()
        total_loss += batch_loss.item()

        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_NORM)
        optimizer.step()
        scheduler.step()

    return total_loss

In [None]:
# function to calculate evaluation metrics during validation
def model_validation(valid_dataloader):
    """Run the model in evaluation mode over ``valid_dataloader``.

    Returns:
        ``(valid_loss, pred_labels, act_labels, tokens)`` where
        ``valid_loss`` is the sum of the per-batch losses and the other three
        hold one entry per sequence: the predicted label ids (argmax over the
        last axis of the logits) as a list, the label ids taken from the
        batch as a numpy array, and the wordpiece strings of the input ids.
    """
    model.eval()
    total_loss = 0
    pred_labels, act_labels, tokens = [], [], []

    for batch in valid_dataloader:
        input_ids, attn_mask, labels = (tensor.to(device) for tensor in batch)

        # forward pass only - no gradients are tracked
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attn_mask, labels=labels)
        total_loss += outputs[0].item()

        # predicted label id per position
        batch_predictions = np.argmax(outputs[1].detach().cpu().numpy(), axis=2)
        for sequence_prediction in batch_predictions:
            pred_labels.append(list(sequence_prediction))

        # label ids as delivered by the dataloader
        act_labels.extend(labels.to('cpu').numpy())

        # wordpiece strings, one list per sequence
        for sequence_ids in input_ids:
            tokens.append(tokenizer.convert_ids_to_tokens(sequence_ids.to('cpu').numpy()))

    return total_loss, pred_labels, act_labels, tokens

In [None]:
# train the model for the required number of epochs
# len(dataloader) is the number of batches: model_training / model_validation return
# summed batch losses, so dividing by these counts yields the mean loss per batch.
num_train_samples = len(train_dataloader)
num_valid_samples = len(valid_dataloader)

for epoch_num in tqdm(range(NR_EPOCHES), desc="Training Progress"):
    train_loss = model_training(train_dataloader)
    valid_loss, pred_labels, act_labels, _tokens = model_validation(valid_dataloader)

    # calculate and print training loss
    training_loss = train_loss / num_train_samples
    print()
    print(f'Training loss: {training_loss}')

    # calculate and print validation loss, accuracy and F-Score
    validation_loss = valid_loss / num_valid_samples
    print(f'Validation loss: {validation_loss}')

    # One tag list per sentence (the input layout seqeval documents), keeping only the
    # positions whose actual tag is not PAD.
    pred_ner_tags = [[tag_ids[pred] for pred, act in zip(pred_label, act_label) if tag_ids[act] != "PAD"]
                     for pred_label, act_label in zip(pred_labels, act_labels)]
    act_ner_tags = [[tag_ids[act] for act in act_label if tag_ids[act] != "PAD"]
                    for act_label in act_labels]

    # seqeval metrics take (y_true, y_pred) in that order
    print(f'Validation Accuracy: {accuracy_score(act_ner_tags, pred_ner_tags)}')
    print(f'Validation F-Score: {f1_score(act_ner_tags, pred_ner_tags)}')
    print()

Training Progress:   5%|▌         | 1/20 [00:03<00:57,  3.00s/it]


Training loss: 0.7828136109388791
Validation loss: 0.458412230014801
Validation Accuracy: 0.8364565587734242
Validation F-Score: 0.7149321266968326



Training Progress:  10%|█         | 2/20 [00:06<00:54,  3.03s/it]


Training loss: 0.25770169725784886
Validation loss: 0.3284241482615471
Validation Accuracy: 0.8960817717206133
Validation F-Score: 0.7932692307692307



Training Progress:  15%|█▌        | 3/20 [00:09<00:51,  3.03s/it]


Training loss: 0.12093467666552617
Validation loss: 0.3407772034406662
Validation Accuracy: 0.9148211243611585
Validation F-Score: 0.8349514563106796



Training Progress:  20%|██        | 4/20 [00:12<00:48,  3.02s/it]


Training loss: 0.06365664604191597
Validation loss: 0.2238511461764574
Validation Accuracy: 0.9557069846678024
Validation F-Score: 0.8992248062015504



Training Progress:  25%|██▌       | 5/20 [00:15<00:45,  3.03s/it]


Training loss: 0.021288384396869402
Validation loss: 0.26278222166001797
Validation Accuracy: 0.9676320272572402
Validation F-Score: 0.9336734693877551



Training Progress:  30%|███       | 6/20 [00:18<00:42,  3.03s/it]


Training loss: 0.013857544202787371
Validation loss: 0.25906841456890106
Validation Accuracy: 0.9659284497444633
Validation F-Score: 0.926208651399491



Training Progress:  35%|███▌      | 7/20 [00:21<00:39,  3.05s/it]


Training loss: 0.00938059353771118
Validation loss: 0.27143427170813084
Validation Accuracy: 0.9642248722316865
Validation F-Score: 0.9238578680203046



Training Progress:  40%|████      | 8/20 [00:24<00:36,  3.01s/it]


Training loss: 0.006837617005937948
Validation loss: 0.21341476030647755
Validation Accuracy: 0.9744463373083475
Validation F-Score: 0.9492385786802031



Training Progress:  45%|████▌     | 9/20 [00:27<00:32,  3.00s/it]


Training loss: 0.0049143837442478305
Validation loss: 0.34676104225218296
Validation Accuracy: 0.9625212947189097
Validation F-Score: 0.9187817258883249



Training Progress:  50%|█████     | 10/20 [00:30<00:29,  3.00s/it]


Training loss: 0.003709298715246125
Validation loss: 0.3010765574872494
Validation Accuracy: 0.9625212947189097
Validation F-Score: 0.9215189873417721



Training Progress:  55%|█████▌    | 11/20 [00:33<00:26,  2.99s/it]


Training loss: 0.0029036627652553413
Validation loss: 0.29866110160946846
Validation Accuracy: 0.9659284497444633
Validation F-Score: 0.9316455696202532



Training Progress:  60%|██████    | 12/20 [00:36<00:24,  3.02s/it]


Training loss: 0.005248996218702255
Validation loss: 0.36577668227255344
Validation Accuracy: 0.9659284497444633
Validation F-Score: 0.9312977099236641



Training Progress:  65%|██████▌   | 13/20 [00:39<00:21,  3.01s/it]


Training loss: 0.0020497560169762718
Validation loss: 0.3264639712870121
Validation Accuracy: 0.969335604770017
Validation F-Score: 0.9411764705882353



Training Progress:  70%|███████   | 14/20 [00:42<00:17,  2.99s/it]


Training loss: 0.004078362439311325
Validation loss: 0.28822724521160126
Validation Accuracy: 0.969335604770017
Validation F-Score: 0.9411764705882353



Training Progress:  75%|███████▌  | 15/20 [00:45<00:14,  2.98s/it]


Training loss: 0.001904968387232377
Validation loss: 0.3195117451250553
Validation Accuracy: 0.9676320272572402
Validation F-Score: 0.9336734693877551



Training Progress:  80%|████████  | 16/20 [00:48<00:11,  2.97s/it]


Training loss: 0.002954080371777169
Validation loss: 0.34137845039367676
Validation Accuracy: 0.9676320272572402
Validation F-Score: 0.9336734693877551



Training Progress:  85%|████████▌ | 17/20 [00:51<00:08,  2.97s/it]


Training loss: 0.000720620195632084
Validation loss: 0.34836279414594173
Validation Accuracy: 0.9659284497444633
Validation F-Score: 0.9312977099236641



Training Progress:  90%|█████████ | 18/20 [00:53<00:05,  2.96s/it]


Training loss: 0.0008930886934439724
Validation loss: 0.33920054510235786
Validation Accuracy: 0.969335604770017
Validation F-Score: 0.9411764705882353



Training Progress:  95%|█████████▌| 19/20 [00:56<00:02,  2.95s/it]


Training loss: 0.000663855319054654
Validation loss: 0.33615018613636494
Validation Accuracy: 0.969335604770017
Validation F-Score: 0.9411764705882353



Training Progress: 100%|██████████| 20/20 [00:59<00:00,  3.00s/it]


Training loss: 0.001204238640359388
Validation loss: 0.33433468267321587
Validation Accuracy: 0.9676320272572402
Validation F-Score: 0.9360613810741688






In [None]:
# helpers and entry point to evaluate any given dataset

# tags that mark a wordpiece as part of an entity mention
_ENTITY_TAGS = ('B-E', 'I-E')


def _strip_padding(tokens, pred_labels, act_labels):
    """Return one (wordpieces, predicted tags, actual tags) triple per sentence,
    keeping only the positions whose actual tag is not PAD."""
    sentences = []
    for pieces, pred_label, act_label in zip(tokens, pred_labels, act_labels):
        kept = [(piece, tag_ids[pred], tag_ids[act])
                for piece, pred, act in zip(pieces, pred_label, act_label)
                if tag_ids[act] != "PAD"]
        sentences.append(([item[0] for item in kept],
                          [item[1] for item in kept],
                          [item[2] for item in kept]))
    return sentences


def _merge_wordpieces(pieces, pred_tags, act_tags):
    """Glue '##' continuation pieces onto the preceding piece; every merged token
    keeps the predicted / actual tag of its first piece."""
    merged, merged_pred, merged_act = [], [], []
    for piece, pred_tag, act_tag in zip(pieces, pred_tags, act_tags):
        if piece.startswith("##") and merged:
            merged[-1] += piece[2:]
        else:
            merged.append(piece)
            merged_pred.append(pred_tag)
            merged_act.append(act_tag)
    return merged, merged_pred, merged_act


def _first_entity_text(pieces, tags):
    """Rebuild the surface string of the first entity mention found in ``tags``."""
    entity = ""
    inside = False     # at least one piece has been added to the entity
    prev_tag = 'O'     # tag of the previous non-continuation piece
    for piece, tag in zip(pieces, tags):
        if piece.startswith("##"):
            # continuation pieces carry no decision of their own: they extend the
            # entity only while it is still being collected
            if inside:
                entity += piece[2:]
            continue

        if tag in _ENTITY_TAGS:
            # the pieces "'" and "s" are left out of the entity string
            if piece not in ("'", "s"):
                if piece == "-" or entity[-1:] == "-":
                    # hyphenated names are joined without spaces
                    entity += piece
                elif entity == "":
                    entity = piece
                else:
                    entity += " " + piece
                inside = True
        elif prev_tag in _ENTITY_TAGS:
            # the first mention has ended; stop so that nothing from the rest of the
            # sentence (e.g. '##' pieces of later words) is appended to it
            break
        prev_tag = tag
    return entity


def evaluate_model(dataloader, output_file='test_entities_final_v0.csv'):
    """Evaluate the model on ``dataloader``, print the results and save the predicted entities.

    Prints the mean loss per batch, seqeval accuracy / F-score / classification
    report, then the reconstructed tokens, tags and entity strings (predicted
    and actual) for visual comparison. Finally the predicted entity strings are
    written as a single CSV row to ``path + output_file``.

    Args:
        dataloader: dataloader yielding (input ids, attention mask, label ids) batches.
        output_file: name of the CSV file created below ``path``; the default is
            the file name this function has always written.

    Returns:
        None.
    """
    num_batches = len(dataloader)
    calc_loss, pred_labels, act_labels, tokens = model_validation(dataloader)
    sentences = _strip_padding(tokens, pred_labels, act_labels)

    # calculate and print validation loss, accuracy and F-Score
    final_loss = calc_loss / num_batches
    print(f'Loss: {final_loss}')
    # one tag list per sentence, passed as (y_true, y_pred): with the arguments swapped
    # the report would show precision and recall exchanged and the support of the predictions
    pred_ner_tags = [pred_tags for _, pred_tags, _ in sentences]
    act_ner_tags = [act_tags for _, _, act_tags in sentences]
    print(f'Accuracy: {accuracy_score(act_ner_tags, pred_ner_tags)}')
    print(f'F-Score: {f1_score(act_ner_tags, pred_ner_tags)}')
    print(classification_report(act_ner_tags, pred_ner_tags))
    print()

    # reconstruct tokens, labels and entities
    # print actual and predicted for visual comparison
    p_labels_list, a_labels_list, tokens_list, a_entities_list, p_entities_list = [], [], [], [], []
    for pieces, pred_tags, act_tags in sentences:
        merged, merged_pred, merged_act = _merge_wordpieces(pieces, pred_tags, act_tags)
        tokens_list.append(merged)
        p_labels_list.append(merged_pred)
        a_labels_list.append(merged_act)
        a_entities_list.append(_first_entity_text(pieces, act_tags))
        p_entities_list.append(_first_entity_text(pieces, pred_tags))

    print("Tokens List")
    print(tokens_list)
    print("Predicted Labels")
    print(p_labels_list)
    print("Actual Labels")
    print(a_labels_list)
    print("Predicted Entities")
    print(p_entities_list)
    print("Actual Entities")
    print(a_entities_list)

    # write the predicted entity strings to csv file (all entities in one row)
    with open(path + output_file, 'w', newline='') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerow(p_entities_list)


In [None]:
# report the metrics of the final model on the validation split
print('                 Validation Dataset Results                  ')
print("--------------------------------------------------------------")
evaluate_model(valid_dataloader)

                 Validation Dataset Results                  
--------------------------------------------------------------
Loss: 0.33433468267321587
Accuracy: 0.9676320272572402
F-Score: 0.9360613810741688
           precision    recall  f1-score   support

        E       0.96      0.92      0.94       200

micro avg       0.96      0.92      0.94       200
macro avg       0.96      0.92      0.94       200


Tokens List
[['what', 'is', 'the', 'volume', 'of', 'distribution', 'for', 'Coagulation', 'factor', 'VIIa'], ['what', 'is', 'exact', 'the', 'position', 'of', 'Ferritin', 'heavy', 'chain', 'on', 'a', 'chromosome'], ['how', 'the', 'interaction', 'of', 'Dronedarone', 'affects', 'other', 'drugs', "'", 's', 'actions'], ['which', 'is', 'the', 'kingdom', 'grouping', 'of', 'the', 'drug', 'Alpha', '-', 'Linolenic', 'Acid'], ['provide', 'the', 'general', 'activities', 'carried', 'out', 'by', 'Protein', 'S100', '-', 'A1'], ['Leukotriene', 'C4', 'synthase', 'is', 'encoded', 'by', 'which', '

In [None]:
# report the metrics of the final model on the held-out test split
print('                 Test Dataset Results                  ')
print("--------------------------------------------------------------")
evaluate_model(test_dataloader)

                 Test Dataset Results                  
--------------------------------------------------------------
Loss: 0.11270354269072413
Accuracy: 0.9806666666666667
F-Score: 0.954864593781344
           precision    recall  f1-score   support

        E       0.97      0.94      0.95       507

micro avg       0.97      0.94      0.95       507
macro avg       0.97      0.94      0.95       507


Tokens List
[['Nitroglycerin', 'is', 'patented', 'under', 'which', 'number'], ['which', 'companies', 'manufacture', 'Phenmetrazine'], ['list', 'all', 'synonyms', 'of', 'Nepafenac'], ['list', 'the', 'mixtures', 'that', 'contains', 'Hydrochlorothiazide'], ['which', 'is', 'the', 'transporter', 'for', 'Zafirlukast'], ['what', 'are', 'the', 'overdose', 'impacts', 'of', 'Theophylline'], ['what', 'is', 'the', 'general', 'function', 'of', 'enzme', 'Aldehyde', 'oxidase'], ['for', 'Vitamin', 'D3', 'receptor', 'provide', 'actual', 'isoelectric', 'point', 'value'], ['provide', 'the', 'kingdom', '

**References**

This notebook follows the examples from:


---

https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/

https://mccormickml.com/2019/07/22/BERT-fine-tuning/

http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

https://www.kaggle.com/nkaenzig/bert-tensorflow-2-huggingface-transformers

https://colab.research.google.com/drive/1ZQvuAVwA3IjybezQOXnrXMGAnMyZRuPU#scrollTo=tBa6vRHknSkv


---

