In [None]:
# install required packages
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 3.3MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 52.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 51.7MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB

In [None]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# set path for files
path = '/content/drive/My Drive/thesis_dataset/'

In [None]:
# import all required packages/modules
import csv
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import TFBertModel, TFPreTrainedModel
from transformers import BertTokenizer, BertConfig
from tensorflow.keras.models import load_model

In [None]:
# define constants
MAX_TOKENS = 64
BERT_PRETRAIN_MODEL_NAME = "bert-base-cased"
NR_EPOCHS = 15
BATCH_SIZE, BUFFER_SIZE = 32, 10000
REPEAT, PREFETCH = 5, 1

In [None]:
# read data from excel
df = pd.read_excel(path+"All_Questions_V1.xlsx",'data', encoding='utf-8') 
df.head(1)

Unnamed: 0,SlNo,Question,Relation,NER_Tag,Q_Len,T_Len,Subject,Subject_URI,Relation_URI
0,1,what are the brand names of Metipranolol,brand,O O O O O O B-E,7,7,Metipranolol,http://bio2rdf.org/drugbank:DB01214,http://bio2rdf.org/drugbank_vocabulary:brand


In [None]:
# split the full dataset into train, valid and test dataset
rest, test = train_test_split(df, test_size=0.2, random_state=0, 
                               stratify=df['Relation'])
train, valid = train_test_split(rest, test_size=0.1, random_state=0, 
                               stratify=rest['Relation'])
train_size, test_size, validation_size = len(train), len(test), len(valid)
print(f'Train:{train_size}, Test: {test_size}, Validation: {validation_size}')

Train:406, Test: 114, Validation: 46


In [None]:
# create instance of tokenzier from BERT pretrained model
tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAIN_MODEL_NAME, do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [None]:
# process the question phrase, labels to return input_ids, attention_masks, one-hot-encoded labels and label names
def process_data(df_data, tokenizer, max_tokens, train=False):
  # process labels only for training data
  if(train):
    df_class = pd.get_dummies(df_data, columns=["Relation"], prefix=[""], prefix_sep="" )
    df_class.head(1)
    column_names = df_class.columns.to_list()
    label_names = column_names[8:]
    onehot_labels =  df_class[label_names].values
  else:
    onehot_labels, label_names = [], []

  # process data and provide input_ids and attention_masks
  tokens_list = []
  attn_masks_list = []
  for question in tqdm(df_data['Question']):
      tokens = tokenizer.encode(question, max_length = max_tokens, truncation=True, add_special_tokens = True)
      tokens_list.append(tokens)
  # we use post padding for BERT
  padded_tokens_list = pad_sequences(tokens_list, maxlen=max_tokens, truncating="post", padding="post", dtype="long", value=0)

  # create atttion masks
  for tokens in padded_tokens_list:
      attn_masks = [int(token > 0) for token in tokens]
      attn_masks_list.append(attn_masks)

  return padded_tokens_list, np.asarray(attn_masks_list), np.asarray(onehot_labels), label_names

In [None]:
# process question phrases, labels to get input_ids, attention_masks for BERT input and onehot labels
train_input_ids, train_attention_masks, train_labels, labels = process_data(train, tokenizer, MAX_TOKENS, True)
valid_input_ids, valid_attention_masks, valid_labels, _  = process_data(valid, tokenizer, MAX_TOKENS, True)
num_class = len(labels)

HBox(children=(FloatProgress(value=0.0, max=406.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))




In [None]:
#Function for creating and updating parameters of dataset using input tensors
def load_dataset(dataset, train=True):
    dataset_loader = tf.data.Dataset.from_tensor_slices(dataset)
    if train:
      dataset_loader = dataset_loader.shuffle(buffer_size=BUFFER_SIZE)
      dataset_loader = dataset_loader.repeat(REPEAT)
      dataset_loader = dataset_loader.prefetch(PREFETCH)
    dataset_loader = dataset_loader.batch(BATCH_SIZE)
    return dataset_loader

In [None]:
# cerate dataset from BERT inputs
train_dataset_loader = load_dataset((train_input_ids, train_attention_masks, train_labels))
valid_dataset_loader = load_dataset((valid_input_ids, valid_attention_masks, valid_labels))

In [None]:
config_params = BertConfig.from_pretrained(BERT_PRETRAIN_MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




In [None]:
# create a class for relation clssifer
# adapt from BERT base model
# build a top classifier layer with input as CLS token output
class RelationClassifier(TFPreTrainedModel):    
    def __init__(self, base: TFBertModel, num_relations: int):
        super().__init__(config_params)
        self.base = base
        self.top_classifier = Dense(num_relations, activation='softmax')
        
    @tf.function
    def call(self, input_ids, attention_mask):
        outputs = self.base(input_ids, attention_mask=attention_mask, token_type_ids=None,
                               position_ids=None, head_mask=None)
        cls_token_output = outputs[1]
        cls_token_output = self.top_classifier(cls_token_output)
        return cls_token_output

In [None]:
# create a model from relation classifier class
bert_base_model = TFBertModel.from_pretrained(BERT_PRETRAIN_MODEL_NAME)
model = RelationClassifier(bert_base_model, num_class)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=526681800.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# define loss object, metrices, optimizer and training/validation steps
loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=False) 
train_loss, validation_loss = tf.keras.metrics.Mean(name='train_loss'), tf.keras.metrics.Mean(name='test_loss')
validation_accuracy = tf.keras.metrics.CategoricalAccuracy(name='accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-06, clipnorm=1)
steps_per_epoch = int(train_size / BATCH_SIZE)
validation_steps = int(validation_size / BATCH_SIZE)

In [None]:
# define function for training / validation of model in the epoch run
@tf.function
def model_training(model, input_ids, attn_masks, onehot_labels, train = True):
    act_labels = tf.dtypes.cast(onehot_labels, tf.float32)
    # while train loop, calculate loss and update all parameters for all layers
    if train:
      with tf.GradientTape() as tape:
          pred_labels = model(input_ids, attn_masks)
          training_loss = loss_object(act_labels, pred_labels)
      training_gradients = tape.gradient(training_loss, model.trainable_variables)
      optimizer.apply_gradients(zip(training_gradients, model.trainable_variables))
      train_loss(training_loss)
    # while validation loop, predict labels, calculate loss and accuracy  
    else:
      pred_labels = model(input_ids, attn_masks, training=train)
      valid_loss = loss_object(act_labels, pred_labels)
      validation_loss(valid_loss)
      validation_accuracy.update_state(act_labels, pred_labels)


In [None]:
# train and validate the model for number of epoches
for epoch_num in range(NR_EPOCHS):
    print(f'Epoch Number: {epoch_num+1}')
    for i, (input_ids, attn_masks, act_labels) in enumerate(tqdm(train_dataset_loader, total=steps_per_epoch)):
        model_training(model, input_ids, attn_masks, act_labels, train=True)        
    for i, (input_ids, attn_masks, act_labels) in enumerate(tqdm(valid_dataset_loader, total=validation_steps)):
        model_training(model, input_ids, attn_masks, act_labels, train=False)
    print(f'Training Loss: {train_loss.result()}')
    print(f'Validation Loss: {validation_loss.result()}')
    print(f'Validation Accuracy: {validation_accuracy.result().numpy()}')
    print(f'_______________________________________________________________________________')

Epoch Number: 1


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 2.5329275131225586
Validation Loss: 1.6255028247833252
Validation Accuracy: 0.695652186870575
_______________________________________________________________________________
Epoch Number: 2


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 1.4351942539215088
Validation Loss: 1.138523817062378
Validation Accuracy: 0.79347825050354
_______________________________________________________________________________
Epoch Number: 3


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 0.9733770489692688
Validation Loss: 0.9218280911445618
Validation Accuracy: 0.8333333134651184
_______________________________________________________________________________
Epoch Number: 4


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 0.7362219095230103
Validation Loss: 0.7922084331512451
Validation Accuracy: 0.85326087474823
_______________________________________________________________________________
Epoch Number: 5


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 0.5921546220779419
Validation Loss: 0.7159678936004639
Validation Accuracy: 0.865217387676239
_______________________________________________________________________________
Epoch Number: 6


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 0.4953460693359375
Validation Loss: 0.6554868817329407
Validation Accuracy: 0.8731883764266968
_______________________________________________________________________________
Epoch Number: 7


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 0.42580708861351013
Validation Loss: 0.6141793131828308
Validation Accuracy: 0.8788819909095764
_______________________________________________________________________________
Epoch Number: 8


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 0.3734295666217804
Validation Loss: 0.5848097801208496
Validation Accuracy: 0.883152186870575
_______________________________________________________________________________
Epoch Number: 9


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 0.33255207538604736
Validation Loss: 0.5550819635391235
Validation Accuracy: 0.8864734172821045
_______________________________________________________________________________
Epoch Number: 10


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 0.29975923895835876
Validation Loss: 0.5312504768371582
Validation Accuracy: 0.8891304135322571
_______________________________________________________________________________
Epoch Number: 11


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 0.2728655934333801
Validation Loss: 0.5106315016746521
Validation Accuracy: 0.8913043737411499
_______________________________________________________________________________
Epoch Number: 12


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 0.25040921568870544
Validation Loss: 0.49513867497444153
Validation Accuracy: 0.8931159377098083
_______________________________________________________________________________
Epoch Number: 13


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 0.23137490451335907
Validation Loss: 0.4799872934818268
Validation Accuracy: 0.8946488499641418
_______________________________________________________________________________
Epoch Number: 14


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 0.2150350660085678
Validation Loss: 0.4706234633922577
Validation Accuracy: 0.8959627151489258
_______________________________________________________________________________
Epoch Number: 15


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Training Loss: 0.20085462927818298
Validation Loss: 0.45843735337257385
Validation Accuracy: 0.8971014618873596
_______________________________________________________________________________


In [None]:
# try one question and find the predicted relation
question = (['what is the salt of choloroform'])
df_test = pd.DataFrame(question, columns=['Question'])
test_steps = int(len(df_test) / BATCH_SIZE)
test_input_ids, test_attention_masks, _, _ = process_data(df_test, tokenizer, MAX_TOKENS, False)
test_dataset_loader = load_dataset((test_input_ids, test_attention_masks),False)

for i, (token_ids, masks) in enumerate(tqdm(test_dataset_loader, total=test_steps)):
    predictions = model(token_ids, attention_mask=masks).numpy()
    print(predictions)
    max_col = np.argmax(predictions)
    print(max_col)
    print(np.max(predictions))
    print(labels[max_col])

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

[[1.7716702e-04 7.8135054e-05 1.6244044e-04 3.3401779e-05 4.0652692e-05
  2.2005233e-05 8.0369151e-05 4.0218907e-05 9.3911149e-05 1.1763857e-05
  8.8326042e-06 1.7994009e-05 1.2415894e-05 1.3998369e-04 2.6034726e-05
  1.6278293e-05 3.0289475e-05 2.6828042e-04 6.8706704e-06 4.2061758e-04
  2.7349130e-05 2.7504459e-04 2.8732017e-04 1.3724630e-04 2.8017434e-05
  1.8412741e-04 5.1300307e-05 5.5622164e-05 9.9646866e-01 2.2988497e-05
  3.5998470e-04 4.5021352e-05 2.5953957e-05 3.9508162e-05 9.8811164e-05
  1.5767527e-04 4.7757399e-05]]
28
0.99646866
salt



In [None]:
# define function for evaluating any given dataset
def evaluate(df_test):
  # create input for BERT Model
  test_steps = int(len(df_test) / BATCH_SIZE)
  test_input_ids, test_attention_masks, _, _ = process_data(df_test, tokenizer, MAX_TOKENS, False)
  test_dataset_loader = load_dataset((test_input_ids, test_attention_masks),False)

  # predict the relations
  pred_labels =[]
  for i, (token_ids, masks) in enumerate(tqdm(test_dataset_loader, total=test_steps)):
      predictions = model(token_ids, attention_mask=masks).numpy()
      for i in range(len(predictions)):
        max_col = np.argmax(predictions[i])
        pred_labels.append(labels[max_col])
  # print actual and predicted relations      
  print(df_test['Relation'].values.tolist())
  print(pred_labels)
  # write to csv file
  with open(path+'test_relations_v2.csv', 'w', newline='') as myfile:
     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
     wr.writerow(pred_labels)
  # calculate and print accuracy
  print(accuracy_score(df_test['Relation'].values.tolist(),pred_labels))


In [None]:
print(f'--------------------   Validation Dataset   --------------------')
evaluate(valid)

--------------------   Validation Dataset   --------------------


HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


['volume-of-distribution', 'locus', 'ddi-interactor-in', 'kingdom', 'general-function', 'gene-name', 'biotransformation', 'patent', 'food-interaction', 'mixture', 'mixture', 'packager', 'synonym', 'clearance', 'affected-organism', 'route-of-elimination', 'group', 'locus', 'category', 'volume-of-distribution', 'product', 'target', 'theoretical-pi', 'general-function', 'kingdom', 'molecular-weight', 'transporter', 'protein-binding', 'toxicity', 'product', 'pharmacology', 'brand', 'manufacturer', 'specific-function', 'organism', 'mechanism-of-action', 'dosage', 'salt', 'indication', 'cellular-location', 'protein-binding', 'half-life', 'substructure', 'indication', 'gene-name', 'ingredient']
['volume-of-distribution', 'locus', 'ddi-interactor-in', 'kingdom', 'general-function', 'gene-name', 'route-of-elimination', 'patent', 'food-interaction', 'mixture', 'mixture', 'packager', 'synonym', 'clearance', 'affected-organism', 'clearance', 'group', 'locus', 'category', 'volume-of-distribution',

In [None]:
print(f'--------------------   Testing Dataset   --------------------')
evaluate(test)

--------------------   Testing Dataset   --------------------


HBox(children=(FloatProgress(value=0.0, max=114.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


['patent', 'manufacturer', 'synonym', 'mixture', 'transporter', 'toxicity', 'general-function', 'theoretical-pi', 'kingdom', 'group', 'indication', 'pharmacology', 'gene-name', 'target', 'synonym', 'general-function', 'patent', 'cellular-location', 'route-of-elimination', 'general-function', 'substructure', 'category', 'toxicity', 'patent', 'product', 'substructure', 'salt', 'general-function', 'half-life', 'group', 'brand', 'indication', 'mechanism-of-action', 'synonym', 'affected-organism', 'gene-name', 'volume-of-distribution', 'affected-organism', 'product', 'indication', 'volume-of-distribution', 'mixture', 'locus', 'mixture', 'packager', 'half-life', 'molecular-weight', 'ingredient', 'specific-function', 'ddi-interactor-in', 'cellular-location', 'molecular-weight', 'protein-binding', 'organism', 'dosage', 'organism', 'locus', 'volume-of-distribution', 'manufacturer', 'transporter', 'molecular-weight', 'mechanism-of-action', 'theoretical-pi', 'product', 'route-of-elimination', 's

In [None]:
model.summary()

Model: "relation_classifier"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
tf_bert_model (TFBertModel)  multiple                  108310272 
_________________________________________________________________
dense (Dense)                multiple                  28453     
Total params: 108,338,725
Trainable params: 108,338,725
Non-trainable params: 0
_________________________________________________________________


**References**

Followed Examples from


---

https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/

https://mccormickml.com/2019/07/22/BERT-fine-tuning/

http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

https://www.kaggle.com/nkaenzig/bert-tensorflow-2-huggingface-transformers

https://colab.research.google.com/drive/1ZQvuAVwA3IjybezQOXnrXMGAnMyZRuPU#scrollTo=tBa6vRHknSkv


---

