In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# dataset directory referenced by `path` is reachable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Base directory (on the mounted Drive) for the input workbook and the pickle
# outputs. File names are appended with plain string concatenation, so the
# trailing slash is required.
path = '/content/drive/My Drive/thesis_dataset/'

In [None]:
# install required packages
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 3.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 12.7MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 26.6MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K 

In [None]:
# import all required packages/modules
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import TFBertModel, BertConfig
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
import pickle
tf.__version__

'2.3.0'

In [None]:
# define constants
# Fixed sequence length: questions are truncated/padded to this many token ids,
# and the word-level embeddings and label sequences are padded to the same length.
MAX_TOKENS = 32
# Name of the pretrained checkpoint used for the tokenizer, the config and the model.
BERT_PRETRAIN_MODEL_NAME = "bert-base-cased"

In [None]:
# Read the questions from the 'data' sheet of the Excel workbook.
# `sheet_name` is passed by keyword, and the former `encoding` argument is
# dropped: it is not a `read_excel` parameter and raises a TypeError on
# current pandas releases.
df = pd.read_excel(path+"All_Questions_V1.xlsx", sheet_name='data')
df.head(1)

Unnamed: 0,SlNo,Question,Relation,NER_Tag,Q_Len,T_Len,Subject,Subject_old,Subject_URI_old,Subject_URI,Relation_URI
0,1,what are the brand names of Metipranolol,brand,O O O O O O B-E,7,7,Metipranolol,Metipranolol,http://bio2rdf.org/drugbank:DB01214,http://bio2rdf.org/drugbank:DB01214,http://bio2rdf.org/drugbank_vocabulary:brand


In [None]:
# Split the full dataset in two steps: first hold out 20% as the test set,
# then take 10% of the remainder as the validation set. Both splits are
# stratified on the 'Relation' column and use a fixed random_state so the
# partition is reproducible.
rest, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
    stratify=df['Relation'],
)
train, valid = train_test_split(
    rest,
    test_size=0.1,
    random_state=0,
    stratify=rest['Relation'],
)

# Report the number of rows that ended up in each partition.
train_size = len(train)
test_size = len(test)
validation_size = len(valid)
print(f'Train:{train_size}, Test: {test_size}, Validation: {validation_size}')

Train:406, Test: 114, Validation: 46


In [None]:
# Create the tokenizer from the pretrained BERT checkpoint. Lower-casing is
# disabled (do_lower_case=False), in line with the cased checkpoint name.
tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAIN_MODEL_NAME, do_lower_case=False)

In [None]:
# Turn question strings into fixed-length BERT inputs (token ids + attention masks)
def process_questions(questions, tokenizer):
    """Encode questions into padded token ids and matching attention masks.

    Args:
        questions: iterable of question strings.
        tokenizer: object whose ``encode`` method turns one string into a list
            of token ids (called with ``max_length``, ``truncation`` and
            ``add_special_tokens``).

    Returns:
        A pair ``(input_ids, attention_masks)``. ``input_ids`` is the result of
        ``pad_sequences`` with ``maxlen=MAX_TOKENS``; ``attention_masks`` is a
        numpy array holding 1 where the token id is greater than 0 and 0
        elsewhere.
    """
    # Encode every question, showing a progress bar while doing so.
    encoded_questions = [
        tokenizer.encode(text, max_length = MAX_TOKENS, truncation=True, add_special_tokens = True)
        for text in tqdm(questions)
    ]

    # Post-padding is used for the BERT inputs; the word embeddings derived
    # from them later are pre-padded for the LSTM inputs.
    input_ids = pad_sequences(encoded_questions, maxlen=MAX_TOKENS, truncating="post", padding="post", dtype="long", value=0)

    # Attention masks: padding positions carry id 0, every other id is > 0.
    attention_rows = [[int(token_id > 0) for token_id in row] for row in input_ids]

    return input_ids, np.asarray(attention_rows)

In [None]:
# Encode the 'Question' column of each split into the two arrays fed to BERT:
# padded token ids (input_ids) and the matching attention masks.
train_input_ids, train_attention_masks  = process_questions(train['Question'], tokenizer)
valid_input_ids, valid_attention_masks = process_questions(valid['Question'], tokenizer)
test_input_ids, test_attention_masks = process_questions(test['Question'], tokenizer)

HBox(children=(FloatProgress(value=0.0, max=406.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=114.0), HTML(value='')))




In [None]:
# Build the pretrained BERT model. The configuration is loaded with
# output_hidden_states switched on so that the model output includes the
# per-layer hidden states that the embedding extraction below relies on.
config_params = BertConfig.from_pretrained(
    BERT_PRETRAIN_MODEL_NAME,
    output_hidden_states=True,
)
model = TFBertModel.from_pretrained(
    BERT_PRETRAIN_MODEL_NAME,
    config=config_params,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Run each split through the BERT model. Every split is passed in a single
# call (no mini-batching), with the token ids and attention masks supplied as
# a dict keyed by "input_ids" and "attention_mask".
train_outputs = model({"input_ids": train_input_ids, "attention_mask": train_attention_masks})                    
valid_outputs = model({"input_ids": valid_input_ids, "attention_mask": valid_attention_masks})
test_outputs = model({"input_ids": test_input_ids, "attention_mask": test_attention_masks})

In [None]:
# Extract the hidden states from the BERT outputs.
# Element 2 of each output is taken to be the tuple of per-layer hidden states
# (presumably present because output_hidden_states=True was set on the config;
# confirm against the transformers version in use). The [1:] slice drops the
# first entry, presumably the embedding-layer output, so only the encoder
# layers remain - TODO confirm.
train_hidden_states = train_outputs[2][1:]
valid_hidden_states = valid_outputs[2][1:]
test_hidden_states = test_outputs[2][1:]

In [None]:
# function to extract word-level embeddings from the BERT hidden states
def get_embeddings(df_data, input_ids, hidden_states):
  """Build word-level embeddings by summing the last four hidden layers.

  For every sentence the token ids are converted back to tokens; "[CLS]",
  "[SEP]" and "[PAD]" are skipped, and word pieces starting with "##" are
  merged into the preceding word (their text is joined and their vectors are
  added). Each sentence is then left-padded with zero vectors up to
  MAX_TOKENS entries.

  Args:
    df_data: kept for backward compatibility with existing callers; not used.
    input_ids: 2-D sequence of token ids, one row per sentence, in the same
      order and with the same positions as the rows of ``hidden_states``.
    hidden_states: sequence of at least four per-layer arrays/tensors indexed
      as ``layer[sentence, position]``.

  Returns:
    ``(new_sentences, embeddings)``: a list of merged-token lists, and a list
    (one item per sentence) of MAX_TOKENS vectors, each a plain list of floats.

  NOTE(review): tokens that the tokenizer splits off without a "##" prefix
  stay separate entries, so the result may not line up one-to-one with
  whitespace-split NER tags - confirm against the dataset.
  """
  special_tokens = ("[CLS]", "[SEP]", "[PAD]")

  # Sum the last four layers once for the whole batch instead of once per token.
  summed_layers = (np.asarray(hidden_states[-1]) +
                   np.asarray(hidden_states[-2]) +
                   np.asarray(hidden_states[-3]) +
                   np.asarray(hidden_states[-4]))
  hidden_size = summed_layers.shape[-1]

  new_sentences = []
  embeddings = []
  for idx_sent, input_id in enumerate(input_ids):
    tokens = tokenizer.convert_ids_to_tokens(input_id)
    words = []
    word_vectors = []
    for idx_tkn, token in enumerate(tokens):
      if token in special_tokens:
        continue
      # `tokens` still contains [CLS] at position 0, so idx_tkn already is the
      # token's position in the hidden states. The previous `idx_tkn+1` read
      # the vector of the *next* token.
      vector = summed_layers[idx_sent, idx_tkn]
      if token.startswith("##") and words:
        # Join word pieces: extend the previous word and add the vectors.
        words[-1] = words[-1] + token[2:]
        word_vectors[-1] = word_vectors[-1] + vector
      else:
        # Regular token, or a "##" piece with no preceding word to attach to.
        words.append(token)
        word_vectors.append(vector)

    # Pre-pad with zero vectors so every sentence has MAX_TOKENS entries.
    new_embed = [np.zeros(shape=(hidden_size,)).tolist()
                 for _ in range(MAX_TOKENS - len(word_vectors))]
    new_embed.extend(vector.tolist() for vector in word_vectors)

    new_sentences.append(words)
    embeddings.append(new_embed)
  return new_sentences, embeddings

In [None]:
# Build the merged-token sentences and the padded word-level embeddings for
# the train, valid and test splits from their token ids and hidden states.
train_sentences, train_embeddings = get_embeddings(train, train_input_ids, train_hidden_states)
valid_sentences, valid_embeddings = get_embeddings(valid, valid_input_ids, valid_hidden_states)
test_sentences, test_embeddings = get_embeddings(test, test_input_ids, test_hidden_states)

In [None]:
# function to write an object to disk using pickle
def to_pickle(file_name, data_strcuture_name):
  """Pickle an object to the file ``path + file_name``.

  Args:
    file_name: name of the output file; it is appended to the module-level
      ``path`` string.
    data_strcuture_name: the object to serialize.
  """
  # The context manager closes the file even when pickle.dump raises, which
  # the previous explicit open()/close() pair did not guarantee.
  with open(path+file_name, 'wb') as outfile:
    pickle.dump(data_strcuture_name, outfile)

In [None]:
# Export the embeddings of each split to its own pickle file under `path`.
to_pickle('train_embeddings_file_v2', train_embeddings)
to_pickle('valid_embeddings_file_v2', valid_embeddings)
to_pickle('test_embeddings_file_v2', test_embeddings)

In [None]:
# NER tag vocabulary; 'PAD' is the label used for padding positions.
tags = ['O', 'B-E', 'I-E', 'PAD']
# Map every tag to its position in the list above.
tag_dict = {tag: index for index, tag in enumerate(tags)}
num_tags = len(tags)
num_tags

4

In [None]:
# Convert the NER tag strings of a split into padded one-hot label arrays
def get_labels(df_data,max_len):
  """Encode the 'NER_Tag' column as pre-padded, one-hot label sequences.

  Args:
    df_data: frame with an 'NER_Tag' column of whitespace-separated tag names.
    max_len: length every label sequence is padded to.

  Returns:
    A list with one entry per row of ``df_data``; each entry is the
    ``to_categorical`` encoding (``num_tags`` classes) of that row's padded
    tag indices.
  """
  # Translate every tag name into its integer index.
  encoded_tags = []
  for tag_string in df_data['NER_Tag'].values:
    encoded_tags.append([tag_dict[tag_name] for tag_name in tag_string.split()])

  # Pad at the front with the 'PAD' index up to max_len.
  padded_rows = pad_sequences(maxlen=max_len, sequences=encoded_tags, padding="pre", value=tag_dict["PAD"])

  # One-hot encode each padded sequence.
  return [to_categorical(row, num_classes=num_tags) for row in padded_rows]

In [None]:
# Build the target labels for the train, valid and test splits, padded to
# MAX_TOKENS entries per question.
y_train = get_labels(train, MAX_TOKENS)
y_valid = get_labels(valid, MAX_TOKENS)
y_test = get_labels(test, MAX_TOKENS)

In [None]:
# Export the target labels of each split to its own pickle file under `path`.
to_pickle('y_train_file_v2', y_train)
to_pickle('y_valid_file_v2', y_valid)
to_pickle('y_test_file_v2', y_test)

In [None]:
# Check that every embedding vector has the expected length in all three
# splits. The check now fails loudly and names the offending entry, instead of
# printing a bare 'error' for the validation split only.
EXPECTED_EMBEDDING_DIM = 768
for split_name, split_embeddings in (('train', train_embeddings),
                                     ('valid', valid_embeddings),
                                     ('test', test_embeddings)):
    for sent_idx, sentence_vectors in enumerate(split_embeddings):
        for tok_idx, vector in enumerate(sentence_vectors):
            assert len(vector) == EXPECTED_EMBEDDING_DIM, (
                f'{split_name}[{sent_idx}][{tok_idx}] has length {len(vector)}, '
                f'expected {EXPECTED_EMBEDDING_DIM}')
  

**References**

This notebook follows examples from:


---

https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/

https://mccormickml.com/2019/07/22/BERT-fine-tuning/

http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

https://www.kaggle.com/nkaenzig/bert-tensorflow-2-huggingface-transformers

https://colab.research.google.com/drive/1ZQvuAVwA3IjybezQOXnrXMGAnMyZRuPU#scrollTo=tBa6vRHknSkv


---

