In [52]:
import json
import os
import torch
import pandas as pd
from pathlib import Path
import sys
sys.path.insert(1, '../')
from infer_pipeline import inferPipeline
from logger_ import make_logger
from datetime import datetime
from utils.tranform_functions import read_data
# Run on the first CUDA GPU when torch reports one, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Logging setup: every run writes into its own folder, named after the start
# time (day_month-hour_minute) and created relative to the working directory.
now = datetime.now()
logDir = now.strftime("%d_%m-%H_%M")
os.makedirs(logDir, exist_ok=True)

logger = make_logger(
    name='get_word_represent',
    debugMode=True,
    logFile=os.path.join(logDir, 'get_word_represent.log'),
    silent=True,
)
logger.info("logger created.")


# model_path = Path('./output/multi_task_model_9_13050.pt')
# pipe = inferPipeline(model_path, logger)


In [53]:
def get_sum_avg(last_hidden_state, pos_tag_id):
    """Pool the hidden states of the tagged tokens of every sequence in a batch.

    For each sequence, the token positions whose ``pos_tag_id`` is non-zero are
    selected, and their hidden-state vectors are reduced over the token axis:
    once with a sum and once with a mean.

    Args:
        last_hidden_state: tensor indexed as [sequence][token][feature],
            e.g. 32 x 128 x 768.
        pos_tag_id: tensor indexed as [sequence][token], e.g. 32 x 128.
            A value of 0 marks a token that is left out of the pooling.

    Returns:
        Tuple ``(b_sum_vector, b_avg_vector)``: two lists holding one 1-D
        tensor per sequence, each as long as the feature axis of
        ``last_hidden_state``.

    Raises:
        AssertionError: if a pooled vector does not have the feature size of
            ``last_hidden_state``.

    Note:
        A sequence without any non-zero tag produces an all-zero sum vector
        and an all-NaN average vector (mean over an empty selection).
    """
    # Take the feature size from the input instead of hard-coding 768, so the
    # function also works with encoders of a different hidden size.
    expected_shape = torch.Size([last_hidden_state.shape[-1]])

    b_sum_vector = []
    b_avg_vector = []

    for index in range(len(pos_tag_id)):
        masked_index = torch.where(pos_tag_id[index] != 0)
        # Select the tagged tokens once and reuse them for both reductions.
        selected_states = last_hidden_state[index][masked_index]
        sum_vector_present = torch.sum(selected_states, 0)
        avg_vector_present = torch.mean(selected_states, 0)

        assert sum_vector_present.shape == avg_vector_present.shape == expected_shape

        b_sum_vector.append(sum_vector_present)
        b_avg_vector.append(avg_vector_present)
    return b_sum_vector, b_avg_vector

In [55]:
def get_word_represent(model, dataloader):
    '''Collect pooled word representations for every example of a dataloader.

    The model is switched to eval mode and run without gradients. For each
    batch, its first output is pooled by ``get_sum_avg`` over the tokens whose
    pos tag id is non-zero.

    Returns a DataFrame with one row per example and the columns ``uid``,
    ``sum_vector`` and ``avg_vector`` (the two vector columns hold tensors).
    '''
    model.eval()

    uids = []
    sum_vectors = []
    avg_vectors = []

    for raw_batch in dataloader:
        # Move tensors to the module-level `device`; other items pass through.
        moved = tuple(
            item.to(device) if isinstance(item, torch.Tensor) else item
            for item in raw_batch
        )
        # Batch layout: origin_uid, token_id, type_id, mask, label, pos_tag_id, masked_id
        origin_uid, token_id, type_id, mask, label, pos_tag_id, _ = moved
        uids.extend(origin_uid)

        with torch.no_grad():
            # NOTE(review): arguments are passed positionally as
            # (token_id, type_id, mask) — confirm this matches the forward()
            # signature of the model that is passed in.
            last_hidden_states = model(token_id, type_id, mask)[0]
            batch_sum, batch_avg = get_sum_avg(last_hidden_states, pos_tag_id)

            # Alternative kept from the draft: sum of the last 4 layers
            # hidden_states = outputs_model[2]
            # word_embed = torch.stack(hidden_states[-4:]).sum(0)
            # sum_vector, avg_vector = get_sum_avg(word_embed, pos_tag_id)

            sum_vectors.extend(batch_sum)
            avg_vectors.extend(batch_avg)

    df = pd.DataFrame()
    df['uid'] = uids
    df['sum_vector'] = sum_vectors
    df['avg_vector'] = avg_vectors
    return df
            

In [56]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from mlm_utils.pertured_dataset import PerturbedDataset
from mlm_utils.transform_func import check_data_dir, get_files
from mlm_utils.metric_func import cosine_sim, cosine_module, ele_wise_sub


def cal_all_files(pipe, dataDir, embDir, wriDir):
    """Compare freshly inferred word vectors with stored ones, file by file.

    For every (data file, embedding file) pair this runs ``pipe.infer`` on the
    data file, pools the model outputs with ``get_word_represent``, loads the
    stored vectors from ``embDir`` and writes one CSV of comparison metrics
    into ``wriDir``.

    Args:
        pipe: object exposing ``infer(dataDir, file)``, ``model`` and
            ``dataloader``.
        dataDir: directory listed with ``get_files`` and handed to
            ``pipe.infer``.
        embDir: ``Path`` of the directory with the stored vectors; each file
            must provide a ``sum_vector`` column once loaded via ``read_data``.
        wriDir: ``Path`` of the output directory; passed to ``check_data_dir``
            with ``auto_create=True``.

    Returns:
        The metrics DataFrame of the last processed file, or an empty
        DataFrame when there is no file pair to process.
    """
    files = get_files(dataDir)
    emb_files = get_files(embDir)

    check_data_dir(wriDir, auto_create=True)

    # NOTE(review): files are paired purely by position, and zip() silently
    # drops the surplus of the longer listing — this assumes both directories
    # list the same file names in the same order; confirm against get_files.
    file_pairs = list(zip(files, emb_files))

    # Defined up front so the final `return` also works when nothing is processed.
    df_metric = pd.DataFrame()

    # A materialised list gives tqdm a length, so it can show n/total and an ETA.
    for file, emb_file in tqdm(file_pairs):
        print("file", file, "emb_file", emb_file)
        df_metric = pd.DataFrame()

        pipe.infer(dataDir, file)
        df = get_word_represent(pipe.model, pipe.dataloader)

        emb_df = pd.DataFrame(read_data(embDir/emb_file))

        # convert tensor to list
        new_vectors = df['sum_vector'].apply(lambda x: x.tolist())
        # NOTE(review): rows are compared by position, not by uid — presumably
        # emb_df is stored in the same example order as df; verify.
        ref_vectors = emb_df['sum_vector']

        # calculate Hypothesis 1 and Hypothesis 2 in Task 1.1
        df_metric['uid'] = df['uid']
        df_metric['cos_sim'] = pd.Series(map(cosine_sim, new_vectors, ref_vectors))
        df_metric['cos_mo'] = pd.Series(map(cosine_module, new_vectors, ref_vectors, df_metric['cos_sim']))
        df_metric['ele_sub_neg'] = pd.Series(map(lambda x, y: ele_wise_sub(x, y, negate=True), new_vectors, ref_vectors))

        df_metric['cos_dif'] = 1 - np.abs(df_metric['cos_sim'])
        df_metric['cos_mo_dif'] = 1 - np.abs(df_metric['cos_mo'])
        df_metric['ele_sub'] = pd.Series(map(ele_wise_sub, new_vectors, ref_vectors))

        # save df to csv; ".json" is stripped so "x.json" is written as "x.csv"
        df_metric.to_csv(wriDir/f'{file}.csv'.replace('.json', ''), index=False)

        print(f'{file} done')
    return df_metric
    

In [57]:
from pathlib import Path
# bertmodel = BertModel.from_pretrained('dmis-lab/biobert-base-cased-v1.2', output_hidden_states =True)
# Build the project's inference pipeline; cal_all_files drives it through
# pipe.infer(), pipe.model and pipe.dataloader.
# NOTE(review): only the logger is passed here, so which checkpoint gets loaded
# is decided inside inferPipeline — confirm it is the intended model.
pipe = inferPipeline(logger)

## Calculate the difference between word representations from the original model and the fine-tuned model

In [58]:

# Input files that are run through the pipeline (relative to the working directory).
dataDir = Path('./data_mlm/process_folder/mlm_output')
# Output directory for the per-file metric CSVs.
# NOTE(review): this is an absolute, machine-specific path while the two other
# directories are relative — consider deriving it from a shared base directory.
wriDir = Path('/mnt/c/Users/Phat Pham/Documents/THESIS/SRLPredictionEasel/MLM/data_mlm/process_folder/PAS_knowledge')
# Stored word vectors that the freshly inferred ones are compared against.
embDir = Path('./data_mlm/process_folder/word_represent_finetuned_model')
# Long-running call (about 2.5 hours in the recorded output); `df` ends up
# holding the value returned by cal_all_files.
df = cal_all_files(pipe, dataDir, embDir, wriDir)                                 


0it [00:00, ?it/s]

file mlm_abolish_full.json emb_file mlm_abolish_full.json


1it [01:59, 119.30s/it]

mlm_abolish_full.json done
file mlm_alter_full.json emb_file mlm_alter_full.json


2it [04:56, 153.60s/it]

mlm_alter_full.json done
file mlm_begin_1_full.json emb_file mlm_begin_1_full.json


3it [09:47, 216.14s/it]

mlm_begin_1_full.json done
file mlm_begin_2_full.json emb_file mlm_begin_2_full.json


4it [16:07, 280.73s/it]

mlm_begin_2_full.json done
file mlm_block_full.json emb_file mlm_block_full.json


5it [20:25, 272.64s/it]

mlm_block_full.json done
file mlm_catalyse_full.json emb_file mlm_catalyse_full.json


6it [24:14, 257.88s/it]

mlm_catalyse_full.json done
file mlm_confer_full.json emb_file mlm_confer_full.json


7it [29:52, 284.01s/it]

mlm_confer_full.json done
file mlm_decrease_1_full.json emb_file mlm_decrease_1_full.json


8it [35:46, 306.17s/it]

mlm_decrease_1_full.json done
file mlm_decrease_2_full.json emb_file mlm_decrease_2_full.json


9it [36:51, 230.87s/it]

mlm_decrease_2_full.json done
file mlm_delete_full.json emb_file mlm_delete_full.json


10it [38:39, 192.82s/it]

mlm_delete_full.json done
file mlm_develop_full.json emb_file mlm_develop_full.json


11it [42:53, 211.52s/it]

mlm_develop_full.json done
file mlm_disrupt_full.json emb_file mlm_disrupt_full.json


12it [44:20, 173.90s/it]

mlm_disrupt_full.json done
file mlm_eliminate_full.json emb_file mlm_eliminate_full.json


13it [46:09, 154.13s/it]

mlm_eliminate_full.json done
file mlm_encode_full.json emb_file mlm_encode_full.json


14it [48:22, 147.87s/it]

mlm_encode_full.json done
file mlm_express_full.json emb_file mlm_express_full.json


15it [51:22, 157.53s/it]

mlm_express_full.json done
file mlm_generate_full.json emb_file mlm_generate_full.json


16it [53:12, 142.99s/it]

mlm_generate_full.json done
file mlm_inhibit_full.json emb_file mlm_inhibit_full.json


17it [56:19, 156.31s/it]

mlm_inhibit_full.json done
file mlm_initiate_full.json emb_file mlm_initiate_full.json


18it [1:00:46, 189.71s/it]

mlm_initiate_full.json done
file mlm_lead_full.json emb_file mlm_lead_full.json


19it [1:03:24, 180.04s/it]

mlm_lead_full.json done
file mlm_lose_full.json emb_file mlm_lose_full.json


20it [1:08:00, 208.78s/it]

mlm_lose_full.json done
file mlm_modify_full.json emb_file mlm_modify_full.json


21it [1:13:31, 245.50s/it]

mlm_modify_full.json done
file mlm_mutate_full.json emb_file mlm_mutate_full.json


22it [1:25:15, 383.12s/it]

mlm_mutate_full.json done
file mlm_proliferate_full.json emb_file mlm_proliferate_full.json


23it [1:29:32, 345.36s/it]

mlm_proliferate_full.json done
file mlm_recognize_full.json emb_file mlm_recognize_full.json


24it [1:31:16, 273.06s/it]

mlm_recognize_full.json done
file mlm_result_full.json emb_file mlm_result_full.json


25it [1:37:48, 308.52s/it]

mlm_result_full.json done
file mlm_skip_full.json emb_file mlm_skip_full.json


26it [1:43:03, 310.64s/it]

mlm_skip_full.json done
file mlm_splice_1_full.json emb_file mlm_splice_1_full.json


27it [1:48:55, 322.84s/it]

mlm_splice_1_full.json done
file mlm_splice_2_full.json emb_file mlm_splice_2_full.json


28it [1:53:35, 310.04s/it]

mlm_splice_2_full.json done
file mlm_transcribe_full.json emb_file mlm_transcribe_full.json


29it [2:00:23, 339.62s/it]

mlm_transcribe_full.json done
file mlm_transform_1_full.json emb_file mlm_transform_1_full.json


30it [2:02:15, 271.33s/it]

mlm_transform_1_full.json done
file mlm_transform_2_full.json emb_file mlm_transform_2_full.json


31it [2:05:55, 255.88s/it]

mlm_transform_2_full.json done
file mlm_translate_1_full.json emb_file mlm_translate_1_full.json


32it [2:11:47, 284.51s/it]

mlm_translate_1_full.json done
file mlm_translate_2_full.json emb_file mlm_translate_2_full.json


33it [2:17:11, 296.59s/it]

mlm_translate_2_full.json done
file mlm_translate_3_full.json emb_file mlm_translate_3_full.json


34it [2:25:23, 355.03s/it]

mlm_translate_3_full.json done
file mlm_truncate_full.json emb_file mlm_truncate_full.json


35it [2:33:45, 263.59s/it]

mlm_truncate_full.json done





In [None]:
# cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
# last_hidden_states = outputs_model[0]
# hidden_states = outputs_model[2]

# # sum of last four layer
# word_embed_5 = torch.stack(hidden_states[-4:]).sum(0)

# # concatenate last four layers
# word_embed_6 = torch.cat([hidden_states[i] for i in [-1,-2,-3,-4]], dim=-1)

# return last_hidden_states, word_embed_5, word_embed_6


# Draft: calculate cosine similarity

In [None]:
from transformers import BertConfig, BertModel, BertTokenizer

# Draft cell: feed one prepared example through BioBERT and inspect the
# structure of the returned hidden states.
# NOTE(review): absolute, machine-specific path; the folder name mentions
# "bert-base-uncased" while the model loaded below is the cased BioBERT —
# confirm the token ids were produced with a matching vocabulary.
data = read_data('/mnt/c/Users/Phat Pham/Documents/THESIS/SRL-for-BioBERT/data/coNLL_tsv/bert-base-uncased_prepared_data/ner_conll_testa_abolish.json')

print("data shape: ", len(data))  # 280
tokens_id = data[0]['token_id']
segments_id = data[0]['type_id']


# Convert inputs to PyTorch tensors (wrapped in a list to add the batch axis)
tokens_tensor = torch.tensor([tokens_id])
segments_tensors = torch.tensor([segments_id])

# Load pre-trained model (weights)
model = BertModel.from_pretrained('dmis-lab/biobert-base-cased-v1.2',
                                output_hidden_states = True # Whether the model returns all hidden-states.
                                )
# Put the model in "evaluation" mode (disables dropout) for a plain forward pass.
model.eval()

with torch.no_grad():

    # Pass the inputs by keyword: the second positional parameter of
    # BertModel.forward is `attention_mask`, so passing the segment ids
    # positionally would use them as the attention mask.
    # NOTE(review): no attention mask is supplied, so every position is
    # attended to — supply one if the example contains padding.
    outputs = model(input_ids=tokens_tensor, token_type_ids=segments_tensors)
    print("outputs: ", len(outputs)) # 3
    hidden_states = outputs[2]
    last_hidden_states = outputs[0]


print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")  #13
layer_i = 0

print ("Number of batches:", len(hidden_states[layer_i])) #1
batch_i = 0

print ("Number of tokens:", len(hidden_states[layer_i][batch_i])) #50
token_i = 0

print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))     # each token 768 features


# `hidden_states` is a Python tuple with one entry per layer.
print('      Type of hidden_states: ', type(hidden_states)) # <class 'tuple'>

# Each layer in the tuple is a torch tensor.
print('Tensor shape for each layer: ', hidden_states[0].size()) # torch.Size([1, 50, 768])


# `token_vecs` is the second-to-last layer of the single example: shape [50 x 768]
token_vecs = hidden_states[-2][0]

# Average all token vectors (50 for this example) into one sentence vector.
sentence_embedding = torch.mean(token_vecs, dim=0)
print ("Our final sentence embedding vector of shape:", sentence_embedding.size()) # torch.Size([768])

data shape:  280
outputs:  3
Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 1
Number of tokens: 50
Number of hidden units: 768
      Type of hidden_states:  <class 'tuple'>
Tensor shape for each layer:  torch.Size([1, 50, 768])
Our final sentence embedding vector of shape: torch.Size([768])
