## test finetuned model

In [18]:
import torch
import sys
sys.path.insert(1, '/content/SRL-for-BioBERT/')
from transformers import BertTokenizer

In [19]:
text = ["in addition, deletion of the distal tor box (box1) abolished torc induction whereas the presence of a dna fragment starting three bases upstream from box1 suffices for normal torc expression."]
import sys
sys.path.insert(1, '../')
from infer_pipeline import inferPipeline
from utils.data_utils import TaskType
pipe = inferPipeline(modelPath='./output/multi_task_model_9_13050.pt', maxSeqLen=50)

data = pipe.infer(text, ['conllsrl'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


taskName :  odict_values(['conllsrl'])
allTasksList :  [{'data_task_id': 0, 'data_': [{'uid': 0, 'label': [12, 2, 13, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], 'token_id': [101, 178, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'type_id': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'mask': [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}], 'data_task_type': <TaskType.NER: 3>, 'data_task_name': 'conllsrl'}]
inferDataLoader :  <torch.utils.data.dataloader.DataLoader object at 0x7f2580d23f10>


Eval: 100%|██████████| 1/1 [00:00<00:00,  2.40it/s]


In [17]:
print(data)

[{'Query': 'in addition, deletion of the distal tor box (box1) abolished torc induction whereas the presence of a dna fragment starting three bases upstream from box1 suffices for normal torc expression.', 'conllsrl': [('A2', 'i')]}]


In [2]:
import json
import torch

def read_data(readPath):

    with open(readPath, 'r', encoding = 'utf-8') as file:
        taskData = []
        for i, line in enumerate(file):
            sample = json.loads(line)
            taskData.append(sample)

    return taskData

In [3]:
from models.model import multiTaskModel

def load_finetuned_model(model_file):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Load finetuned model 
    loadedDict = torch.load(model_file, map_location=torch.device(device))
    
    taskParams = loadedDict['task_params']
    allParams = {}
    allParams['task_params'] = taskParams
    allParams['gpu'] = torch.cuda.is_available()
    
    # dummy values
    allParams['num_train_steps'] = 10
    allParams['warmup_steps'] = 0
    allParams['learning_rate'] = 2e-5
    allParams['epsilon'] = 1e-8

   
    return allParams, loadedDict

  from .autonotebook import tqdm as notebook_tqdm
2024-04-10 14:54:28.212188: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-10 14:54:28.955121: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
import os
from MLM.custom_dataset import CustomDataset
def get_embedding_finetuned(dataDir, readFile, model, wriDir):
    data = read_data(os.path.join(dataDir, readFile))
    line = data[0]

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Convert inputs to PyTorch tensors
    attention_mask = torch.tensor([int(x) for x in line['attention_mask']], device=device)
    tokens_tensor = torch.tensor([int(x) for x in line['token_id']],  device=device)
    segments_tensors = torch.tensor([int(x) for x in line['token_type_ids']],  device=device)
    
    test_dataset = CustomDataset(
        data_path=dataDir,
        file_name=readFile)
    
    
    with torch.no_grad():
        outputs_model, _ = model.network(test_dataset['atte'], segments_tensors, attention_mask, 0, 'conllsrl')
       
        last_hidden_states = outputs_model[0]
        hidden_states = outputs_model[2]
    
    # sum of last four layer
    word_embed_5 = torch.stack(hidden_states[-4:]).sum(0)
    
    # concatenate last four layers
    word_embed_6 = torch.cat([hidden_states[i] for i in [-1,-2,-3,-4]], dim=-1)
    
    return last_hidden_states, word_embed_5, word_embed_6


## get word embedding of SRL data from fine-tuned SRL model

In [9]:
dataDir = '/mnt/c/Users/Phat Pham/Documents/THESIS/SRL-for-BioBERT/data/coNLL_tsv/bert-base-uncased_prepared_data/'
readFile = 'ner_conll_testa_abolish.json'
wriDir = 'test_get_embedding_finetuned'
last_hidden_states_finetuned, sum_4_finetuned, concat_4_finetuned = get_embedding_finetuned(dataDir, readFile, wriDir)

In [9]:


import torch
import os
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
# from torchsummary import summary
bertmodel = BertModel.from_pretrained('dmis-lab/biobert-base-cased-v1.2', output_hidden_states =True)

def get_embedding(dataDir, readFile, wriDir):
    data = read_data(os.path.join(dataDir, readFile))
    
    line = data[0]
    
    tokens_id = line['token_id']
    segments_id = line['type_id']
    
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([tokens_id])
    segments_tensors = torch.tensor([segments_id])
    
    
    with torch.no_grad():
        outputs = bertmodel(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]
        last_hidden_states = outputs[0]
 
    # sum of last four layer
    word_embed_5 = torch.stack(hidden_states[-4:]).sum(0)
    
    # concatenate last four layers
    word_embed_6 = torch.cat([hidden_states[i] for i in [-1,-2,-3,-4]], dim=-1)
    
    return last_hidden_states, word_embed_5, word_embed_6
  

In [10]:
dataDir = '/mnt/c/Users/Phat Pham/Documents/THESIS/SRL-for-BioBERT/data/coNLL_tsv/bert-base-uncased_prepared_data/'
readFile = 'ner_conll_testa_abolish.json'
wriDir = 'test_get_embedding_finetuned'
vec_origin = get_embedding(dataDir, readFile, wriDir)

In [11]:
last_hidden_states_original, sum_4_origin, concat_4_origin = vec_origin
print(last_hidden_states_original)

tensor([[[-0.2609, -0.4492, -0.4217,  ...,  0.1371,  0.2317,  0.3307],
         [-0.3257, -0.4465, -0.4151,  ...,  0.2235,  0.2166,  0.3605],
         [-0.2726, -0.4443, -0.3247,  ...,  0.1279,  0.2824,  0.2297],
         ...,
         [-0.2117, -0.4122, -0.2803,  ...,  0.1804,  0.2409,  0.2266],
         [-0.2186, -0.4060, -0.2899,  ...,  0.1875,  0.2226,  0.2187],
         [-0.2050, -0.4040, -0.3002,  ...,  0.1869,  0.2205,  0.2061]]])


In [12]:
def cosine_module_2_tensors(tensor1, tensor2):
    
    # Compute the dot product
    dot_product = torch.dot(tensor1, tensor2)

    # Compute the L2 (Euclidean) norms
    norm_tensor1 = torch.norm(tensor1)
    norm_tensor2 = torch.norm(tensor2)

    # Calculate the cosine similarity
    consine = dot_product / (norm_tensor1 * norm_tensor2)
   
    module = 1 - (torch.abs(norm_tensor1 - norm_tensor2)/(norm_tensor1 + norm_tensor2))
   
    cosine_module = 1.0/2 * (module + 1.0/2 * (1 + consine))

    return cosine_module

def cosine_module_similarity(sen1, sen2):
    assert len(sen1) == len(sen2), "Sentence lengths are not equal"
        
    res_vec = []
    for word in range(len(sen1)):
        res_vec.append(cosine_module_2_tensors(sen1[word], sen2[word]))
    return res_vec

cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
cosine = cos(last_hidden_states_finetuned[0], last_hidden_states_original[0])

cosine_module = cosine_module_similarity( last_hidden_states_finetuned[0], last_hidden_states_original[0])
print("consine similarity: ", cosine)
print("\n cosine module similarity: ", cosine_module)

consine similarity:  tensor([ 0.0375,  0.0090,  0.0093, -0.0042, -0.0196,  0.0314,  0.0029,  0.0089,
         0.0180, -0.0548, -0.0023,  0.0181, -0.0236, -0.0281, -0.0224, -0.0169,
         0.0271,  0.0265, -0.0211, -0.0581,  0.0118, -0.0579,  0.0051,  0.0080,
        -0.0038,  0.0017, -0.0394, -0.0407, -0.0222,  0.0089, -0.0360, -0.0107,
         0.0179, -0.0420,  0.0141, -0.0371, -0.0299, -0.0056,  0.0307, -0.0025,
        -0.0150, -0.0117, -0.0291, -0.0209, -0.0225, -0.0178, -0.0140, -0.0125,
        -0.0098, -0.0041])

 cosine module similarity:  [tensor(0.6693), tensor(0.6673), tensor(0.6644), tensor(0.6619), tensor(0.6611), tensor(0.6740), tensor(0.6676), tensor(0.6692), tensor(0.6780), tensor(0.6552), tensor(0.6658), tensor(0.6753), tensor(0.6629), tensor(0.6596), tensor(0.6656), tensor(0.6603), tensor(0.6769), tensor(0.6638), tensor(0.6610), tensor(0.6517), tensor(0.6649), tensor(0.6522), tensor(0.6682), tensor(0.6692), tensor(0.6670), tensor(0.6620), tensor(0.6566), tensor(0.6

In [13]:
print(concat_4_finetuned.shape)

torch.Size([1, 50, 3072])


In [14]:
cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
cosine_sum = cos(sum_4_finetuned[0], sum_4_origin[0])

cosine_module_sum = cosine_module_similarity(sum_4_finetuned[0], sum_4_origin[0])
print("consine similarity: ", cosine_sum)
print("\n cosine module similarity: ", cosine_module_sum)

consine similarity:  tensor([ 3.7041e-02, -8.0849e-03, -1.5281e-02, -6.0440e-02,  6.9533e-05,
         3.8828e-03, -2.7961e-03,  2.1425e-03,  6.4395e-03, -7.3029e-02,
        -1.7443e-02, -2.1238e-03, -5.1611e-02,  4.6082e-03,  6.4568e-03,
         3.2407e-02,  5.3661e-03, -3.9898e-03, -8.4192e-03, -3.9222e-02,
         2.8195e-02, -7.2102e-02, -6.2373e-03,  8.1167e-03,  1.3320e-02,
         2.0194e-02, -1.2999e-02, -7.9552e-03, -1.3428e-02, -3.5969e-03,
        -3.0939e-03,  1.7212e-02,  8.1992e-03, -3.2678e-03,  2.6840e-03,
        -1.7231e-02,  2.2295e-02,  3.4076e-02,  3.3302e-03,  5.1790e-02,
         3.4035e-02,  3.1593e-02,  1.4438e-02,  1.9211e-02,  2.6862e-02,
         2.7005e-02,  4.8388e-02,  4.1161e-02,  4.5212e-02,  4.2074e-02])

 cosine module similarity:  [tensor(0.7330), tensor(0.7265), tensor(0.7188), tensor(0.7114), tensor(0.7284), tensor(0.7259), tensor(0.7263), tensor(0.7278), tensor(0.7317), tensor(0.7068), tensor(0.7191), tensor(0.7298), tensor(0.7172), tensor(0.7

In [15]:
cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
cosine_concat = cos(concat_4_finetuned[0], concat_4_origin[0])

cosine_module_concat = cosine_module_similarity(concat_4_finetuned[0], concat_4_origin[0])
print("consine similarity: ", cosine_concat)
print("\n cosine module similarity: ", len(cosine_module_concat))

consine similarity:  tensor([ 0.0400, -0.0112, -0.0197, -0.0628, -0.0042, -0.0002, -0.0054, -0.0032,
        -0.0009, -0.0712, -0.0191, -0.0032, -0.0481,  0.0017,  0.0033,  0.0271,
         0.0074,  0.0015, -0.0034, -0.0312,  0.0221, -0.0648, -0.0038,  0.0113,
         0.0158,  0.0163, -0.0092, -0.0076, -0.0116,  0.0004,  0.0019,  0.0156,
         0.0137, -0.0011,  0.0035, -0.0148,  0.0225,  0.0364,  0.0191,  0.0472,
         0.0315,  0.0295,  0.0148,  0.0173,  0.0250,  0.0236,  0.0444,  0.0380,
         0.0406,  0.0374])

 cosine module similarity:  50


# calculate cosine similarity

In [None]:
from transformers import BertConfig, BertModel, BertTokenizer

data = read_data('/mnt/c/Users/Phat Pham/Documents/THESIS/SRL-for-BioBERT/data/coNLL_tsv/bert-base-uncased_prepared_data/ner_conll_testa_abolish.json')

print("data shape: ", len(data))  # 280
tokens_id = data[0]['token_id']
segments_id = data[0]['type_id']


# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([tokens_id])
segments_tensors = torch.tensor([segments_id])

# Load pre-trained model (weights)
model = BertModel.from_pretrained('dmis-lab/biobert-base-cased-v1.2',
                                output_hidden_states = True # Whether the model returns all hidden-states.
                                )
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

with torch.no_grad():

    outputs = model(tokens_tensor, segments_tensors)
    print("outputs: ", len(outputs)) # 2
    hidden_states = outputs[2]
    last_hidden_states = outputs[0]


print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")  #13
layer_i = 0

print ("Number of batches:", len(hidden_states[layer_i])) #1
batch_i = 0

print ("Number of tokens:", len(hidden_states[layer_i][batch_i])) #50
token_i = 0

print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))     # each token 768 features


# `hidden_states` is a Python list.
print('      Type of hidden_states: ', type(hidden_states)) # <class 'tuple'>

# Each layer in the list is a torch tensor.
print('Tensor shape for each layer: ', hidden_states[0].size()) # torch.Size([1, 50, 768])


# `token_vecs` is a tensor with shape [50 x 768]
token_vecs = hidden_states[-2][0]

# Calculate the average of all 22 token vectors.
sentence_embedding = torch.mean(token_vecs, dim=0)
print ("Our final sentence embedding vector of shape:", sentence_embedding.size()) # torch.Size([768])

data shape:  280
outputs:  3
Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 1
Number of tokens: 50
Number of hidden units: 768
      Type of hidden_states:  <class 'tuple'>
Tensor shape for each layer:  torch.Size([1, 50, 768])
Our final sentence embedding vector of shape: torch.Size([768])


In [None]:
import pickle
with open('models/outputs.pkl', 'rb') as f:
    token_vecs_cat = pickle.load(f)

print("token_vecs_cat: ", len(token_vecs_cat)) # 50
print(token_vecs_cat['uid']) # (3072,)


token_vecs_cat:  2
['22110', '22111', '22112', '22113', '22114', '22115', '22116', '22117', '22118', '22119', '22120', '22121', '22122', '22123', '22124', '22125', '22126', '22127', '22128', '22129', '22130', '22131', '22132', '22133', '22134', '22135', '22136', '22137', '22138', '22139', '22140', '22141']


# read pickle file for test word present in the model

In [None]:
import pandas as pd
import torch
import pickle

data_file = './vecs_ner_conll_testa_abolish.pkl'
with open(data_file, 'rb') as f:
    data = pickle.load(f)

data['vec']
df = pd.DataFrame(data['vec'])

In [None]:
df.shape

(50, 3072)