## Import libraries

In [None]:
import os
import sys
import json
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp

sys.path.append('/kaggle/working/SRLPredictionEasel/MLM')
from mlm_utils.model_utils import TOKENIZER


## Load data

In [23]:

def read_and_convert_csv(file_path):
    '''Load the CSV at file_path with pandas and return the result of convert_to_dict on it.'''
    frame = pd.read_csv(file_path)
    records = convert_to_dict(frame)
    return records

def convert_to_dict(data):
    '''
    Turn a content-word DataFrame into a list of per-row dictionaries.

    The first four columns are read by position and stored under the keys
    uid, word_vector, sum_vector and avg_vector. Columns 1-3 hold Python
    expressions as text and are evaluated with `tensor` and `torch` in scope.

    NOTE(review): eval executes arbitrary code, so this must only be fed CSV
    files that come from a trusted source.
    '''
    # names made available to the evaluated cell text
    eval_scope = {'tensor': torch.tensor, 'torch': torch}

    records = []
    for row in data.itertuples(index=False):
        record = {"uid": row[0]}
        record["word_vector"] = eval(row[1], eval_scope)
        record["sum_vector"] = eval(row[2], eval_scope)
        record["avg_vector"] = eval(row[3], eval_scope)
        records.append(record)

    return records
                            
def read_data(readPath):
    '''Read a JSON-lines file (one JSON document per line) and return the decoded objects as a list.'''
    with open(readPath, 'r', encoding = 'utf-8') as handle:
        return [json.loads(raw_line) for raw_line in handle]

## Define metric functions

In [57]:
def cosine_module_2_tensors(tensor1, tensor2, cosine_func):
    '''
    Combine direction and magnitude agreement of two tensors into one score.
    Input:
        tensor1, tensor2: torch.Tensor, the two tensors to compare.
        cosine_func: callable applied to the two norm-divided tensors.
    Output:
        cosine_func(...) multiplied by 1 - |n1 - n2| / (n1 + n2), where n1 and n2
        are the torch.norm values of tensor1 and tensor2.
    '''
    magnitude_a, magnitude_b = torch.norm(tensor1), torch.norm(tensor2)

    # module term: equals 1 when both norms match and shrinks as they diverge
    magnitude_gap = torch.abs(magnitude_a - magnitude_b)
    magnitude_score = 1 - (magnitude_gap / (magnitude_a + magnitude_b))

    # cosine term, evaluated on the norm-divided inputs
    direction_score = cosine_func(tensor1 / magnitude_a, tensor2 / magnitude_b)

    return magnitude_score * direction_score


In [54]:
def cosine_sen_content_word(sum_tensor, avg_tensor, content_word_dict):
    '''
    Score one candidate content word against the sum vector of the sentence word.
    Input:
        sum_tensor: torch.Tensor, sum of the word vector.
        avg_tensor: torch.Tensor, average of the word vector. Kept in the signature
            for callers; it does not influence any returned value.
        content_word_dict: dict with at least the keys 'word_vector' and 'sum_vector'.
    Output:
        cosine_sum: float, cosine similarity between sum_tensor and the candidate's sum vector.
        cosine_module_sum: float, cosine_module_2_tensors score for the same pair.
        content_word_dict['word_vector']: the candidate's word vector, returned unchanged.
    '''
    # detach() is enough here: the tensor is only read, never modified in place
    content_tensor_sum = content_word_dict['sum_vector'].detach()

    cosine_func = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
    cosine_sum = cosine_func(sum_tensor / torch.norm(sum_tensor),
                             content_tensor_sum / torch.norm(content_tensor_sum)).item()

    # converted to a plain float so both scores have the documented type
    cosine_module_sum = cosine_module_2_tensors(sum_tensor, content_tensor_sum, cosine_func).item()

    return cosine_sum, cosine_module_sum, content_word_dict['word_vector']

In [46]:

def cosine_sim_sen_list_word(sum_tensor, avg_tensor, list_dict_content_word):
    '''
    Score every candidate content word against the sentence word and pick, for each
    of the two scores, the candidate with the lowest value and the candidate whose
    value is closest to 0.
    Input:
        sum_tensor: torch.Tensor, sum of the word vector.
        avg_tensor: torch.Tensor, average of the word vector.
        list_dict_content_word: non-empty list of dict (word_vector, sum_vector, avg_vector),
            the candidate content words.
    Output:
        replace_word_neg_cos_sum: word vector of the candidate with the lowest cosine.
        replace_word_neg_cos_module_sum: word vector of the candidate with the lowest cosine module.
        replace_word_0_cos_sum: word vector of the candidate with cosine closest to 0.
        replace_word_0_cos_module_sum: word vector of the candidate with cosine module closest to 0.
    Raises:
        IndexError: if list_dict_content_word is empty.
    '''
    list_result = [cosine_sen_content_word(sum_tensor, avg_tensor, candidate)
                   for candidate in list_dict_content_word]
    if not list_result:
        raise IndexError('list_dict_content_word is empty: no candidate word to choose from')

    def lowest(score_index):
        # min() keeps the first of several equal candidates, like a stable sort followed by [0]
        return min(list_result, key=lambda result: float(result[score_index]))

    def closest_to_zero(score_index):
        # ties on |score| go to the smaller (negative) score, then to the earlier candidate
        return min(list_result,
                   key=lambda result: (abs(float(result[score_index])), float(result[score_index])))

    # index 0 is the cosine score, index 1 the cosine module score, index 2 the word vector
    replace_word_neg_cos_sum = lowest(0)[2]
    replace_word_neg_cos_module_sum = lowest(1)[2]
    replace_word_0_cos_sum = closest_to_zero(0)[2]
    replace_word_0_cos_module_sum = closest_to_zero(1)[2]

    return replace_word_neg_cos_sum, replace_word_neg_cos_module_sum, replace_word_0_cos_sum, replace_word_0_cos_module_sum

## Find replacement words with the lowest cosine similarity

In [20]:
def find_new_word(predicate_data_chunk, chunkNumber, tempList, content_word_data):
    '''
    Pick replacement words for every sentence in the chunk and write the results,
    one JSON object per line, to 'new_data_<chunkNumber>.json' in the current
    working directory.
    Input:
        predicate_data_chunk: iterable of dict with the keys 'pos_tag_id', 'input_id',
            'avg_vector' and 'sum_vector'.
        chunkNumber: value used to build the output file name.
        tempList: list-like object; the output file name is appended to it once the
            file has been written.
        content_word_data: dict with the keys 'noun', 'verb', 'adj' and 'adv', each
            holding the candidate list passed to cosine_sim_sen_list_word.
    Raises:
        KeyError: if the pos_tag_id at the first masked position is not 1, 2, 3 or 4.
    '''
    name = 'new_data_{}.json'.format(str(chunkNumber))

    # pos_tag_id found at the masked position -> candidate list to search
    pos_id_to_category = {1: 'noun', 2: 'verb', 3: 'adj', 4: 'adv'}

    def generate_new_word_in_sen(predicate_data):
        pos_tag_ids = predicate_data['pos_tag_id']

        # masked positions are the ones with a non-zero pos_tag_id
        masked_index = torch.where(torch.tensor(pos_tag_ids) != 0)

        # numpy array so all masked positions can be selected at once
        predicate_np = np.array(predicate_data['input_id'])

        avg_tensor = torch.tensor(predicate_data['avg_vector'])
        sum_tensor = torch.tensor(predicate_data['sum_vector'])

        # the tag at the first masked position decides which candidate list is used
        first_pos_tag = pos_tag_ids[masked_index[0][0].item()]
        if first_pos_tag not in pos_id_to_category:
            # previously the exception object was created but never raised
            raise KeyError('pos_tag_id not in [1, 2, 3, 4]')
        candidates = content_word_data[pos_id_to_category[first_pos_tag]]

        (replace_word_neg_cos,
         replace_word_neg_cos_module,
         replace_word_0_cos,
         replace_word_0_cos_module) = cosine_sim_sen_list_word(sum_tensor, avg_tensor, candidates)

        feature = {"origin_id": predicate_data['input_id'],
                   "masked_index": masked_index[0].tolist(),
                   "masked_word": TOKENIZER.decode(predicate_np[masked_index[0]]),
                   "cos_neg": TOKENIZER.decode(replace_word_neg_cos),
                   "cos_0": TOKENIZER.decode(replace_word_0_cos),
                   "cos_module_neg": TOKENIZER.decode(replace_word_neg_cos_module),
                   "cos_module_0": TOKENIZER.decode(replace_word_0_cos_module),
                   "pos_tag_id": pos_tag_ids}
        return feature

    # features are produced and written one at a time, so the chunk's results
    # are never all held in memory
    with open(name, 'w') as wf:
        for predicate_data in tqdm(predicate_data_chunk):
            feature = generate_new_word_in_sen(predicate_data)
            wf.write('{}\n'.format(json.dumps(feature)))

    # publish the file name only after the file is complete and closed
    tempList.append(name)

In [14]:
def multi_process_gen_data(predicate_file, dict_content_word, wriDir):
    '''
    Create perturbed data for one predicate file using multiple processes and
    save the combined result to a single file in wriDir.
    Input:
        predicate_file: path of a JSON-lines file; every line decodes to a list of
            samples, and all lists are flattened into one sample list.
        dict_content_word: candidate content words, forwarded to find_new_word.
        wriDir: output directory. The output file is named after predicate_file
            with every 'mlm_' removed from the file name.
    '''
    print("Preprocessing file... ", predicate_file)
    predicates_data = read_data(predicate_file)
    predicates_data = [item for sublist in predicates_data for item in sublist]

    # MULTI PROCESSING
    man = mp.Manager()

    # shared list to store all temp files written by processes
    tempFilesList = man.list()

    # leave one core free, but keep at least one worker on a single-core machine
    numProcess = max(1, mp.cpu_count() - 1)
    # ceiling division: with floor division the last len % numProcess samples
    # were never handed to any worker
    chunkSize = max(1, -(-len(predicates_data) // numProcess))

    print('Data Size: ', len(predicates_data))
    print('number of threads: ', numProcess)

    processes = []
    # one worker per non-empty chunk; the final chunk may be shorter than chunkSize
    for i, start in enumerate(range(0, len(predicates_data), chunkSize)):
        dataChunk = predicates_data[start : start + chunkSize]

        p = mp.Process(target = find_new_word, args = (dataChunk, i, tempFilesList, dict_content_word))

        p.start()
        processes.append(p)

    for pr in processes:
        pr.join()

    # a worker that died never registers its temp file, so its chunk would be
    # missing from the output without any other sign
    failed = [pr.name for pr in processes if pr.exitcode != 0]
    if failed:
        print('WARNING: worker processes exited abnormally, output is incomplete: ', failed)

    # os.path.join works whether or not wriDir ends with a separator
    wrtPath = os.path.join(wriDir, predicate_file.split('/')[-1].replace('mlm_', ''))

    # combining the files written by multiple processes into a single final file
    # (files are merged in the order the workers finished)
    with open(wrtPath, 'w') as f:
        for file in tempFilesList:
            with open(file, 'r') as r:
                for line in r:
                    sample = json.loads(line)
                    f.write('{}\n'.format(json.dumps(sample)))
            os.remove(file)

    print("Done file", predicate_file)

## Load the four content-word lists (noun, verb, adjective, adverb)

In [6]:
# CSV file holding the candidate content words of each part-of-speech category
file_paths = dict(
    noun="/kaggle/input/masked-data/list_content_word_v2/NOUN.csv",
    verb="/kaggle/input/masked-data/list_content_word_v2/VERB.csv",
    adj="/kaggle/input/masked-data/list_content_word_v2/ADJ.csv",
    adv="/kaggle/input/masked-data/list_content_word_v2/ADV.csv",
)

# category name -> converted rows of that category's CSV file
dict_content_word = dict(zip(file_paths, map(read_and_convert_csv, file_paths.values())))


In [8]:
# number of candidates per category, in the order noun, verb, adj, adv
tuple(len(dict_content_word[category]) for category in ('noun', 'verb', 'adj', 'adv'))

(42549, 22676, 13854, 4537)

In [14]:
from mlm_utils.transform_func import get_files, encode_text
from pathlib import Path

# Output folder for the perturbed data; it is created when it does not exist yet
wriDir = '/kaggle/working/pertured_data/'
Path(wriDir).mkdir(parents=True, exist_ok=True)

# Every file listed by get_files for the input folder is processed in turn
dataDir = '/kaggle/input/masked-data/word_present_each_file/'
files = get_files(dataDir)

for file in files:
    multi_process_gen_data(predicate_file=dataDir + file,
                           dict_content_word=dict_content_word,
                           wriDir=wriDir)

2024-05-14 15:11:12.600124: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-14 15:11:13.237953: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-14 15:11:18.315892: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
  from .autonotebook import tqdm as notebook_tqdm


## Insert the replacement words into the sentence

In [None]:
def read_data(readPath):
    '''Decode every line of the UTF-8 file at readPath as JSON and return the results as a list.'''
    with open(readPath, 'r', encoding = 'utf-8') as source:
        decoded_lines = list(map(json.loads, source))

    return decoded_lines

def generate_data_and_write_json(dataDir, wriDir, lableDir):
    '''
    Build perturbed sentences from the replacement-word files, attach the original
    labels and write the encoded samples as JSON lines.
    Input:
        dataDir: directory with the replacement-word files; every line holds the keys
            'origin_id', 'masked_word', 'cos_neg', 'cos_0', 'cos_module_neg', 'cos_module_0'.
        wriDir: output directory, created if missing. One file named
            'pertured_data_<data file name>' is written per data file.
        lableDir: directory with the label files; every line holds the keys
            'uid', 'token_id' and 'label'.
    NOTE(review): data and label files are paired by position in the two get_files
    listings, so both listings must come back in matching order - confirm.
    '''

    def gen_data_line(line):
        origin_sen = TOKENIZER.decode(torch.tensor(line['origin_id']), skip_special_tokens = True)

        # drop duplicate replacement words; dict.fromkeys keeps first-seen order, so
        # the output is reproducible (set iteration order changes between runs)
        new_word_list = list(dict.fromkeys(
            [line['cos_neg'], line['cos_0'], line['cos_module_neg'], line['cos_module_0']]))

        # only the first occurrence of the masked word is replaced
        masked_data_set = [origin_sen.replace(line['masked_word'], new_word, 1)
                           for new_word in new_word_list]
        # NOTE(review): the list of sentences is serialised with str() and later
        # encoded as one text - confirm this is what encode_text expects
        return line['origin_id'], str(masked_data_set)

    def token_key(token_ids):
        # falsy ids (e.g. 0) are dropped before sequences are compared
        return tuple(filter(None, token_ids))

    files_data = get_files(dataDir)
    files_label = get_files(lableDir)

    # created after listing the inputs so a fresh output folder inside dataDir
    # cannot show up in files_data
    os.makedirs(wriDir, exist_ok=True)

    for data_f, label_f in tqdm(zip(files_data, files_label)):
        # the output name depends on the current data file, so it has to be built
        # inside the loop (it used to read data_f before the loop assigned it)
        write_file = "pertured_data_{}".format(data_f)

        data = read_data(os.path.join(dataDir, data_f))
        label_data = read_data(os.path.join(lableDir, label_f))
        masked_data_file = [gen_data_line(line) for line in data]

        # index the label lines once per file instead of rescanning all of them
        # for every sample
        labels_by_tokens = {}
        for label_line in label_data:
            labels_by_tokens.setdefault(token_key(label_line['token_id']), []).append(label_line)

        tmp_list = []

        # Get origin label from label file
        print("Process file", data_f)
        for (origin_id, pertured_text) in masked_data_file:
            for label_line in labels_by_tokens.get(token_key(origin_id), ()):
                tmp_list.append((label_line['uid'], origin_id, pertured_text, label_line['label']))

        with open(os.path.join(wriDir, write_file), 'w') as f:
            for (origin_uid, origin_id, pertured_text, label) in tmp_list:
                encoded_text = encode_text(pertured_text)
                sample = {
                    'origin_uid': origin_uid,
                    'origin_id': origin_id,
                    'label': label,
                    'token_id': encoded_text['input_ids'][0].tolist(),
                    'type_id': encoded_text['token_type_ids'][0].tolist(),
                    'mask': encoded_text['attention_mask'][0].tolist()}

                f.write('{}\n'.format(json.dumps(sample)))
        print("Done", data_f)

In [None]:
# Inputs: the replacement-word files and the label files they are matched against
dataDir = './data_mlm/pertured_data/'
labelDir = './data_mlm/process_folder/coNLL_tsv_json/modify_label_prepared_data/'

# Output folder for the encoded perturbed samples
wriDir = './data_mlm/pertured_data/masked_data_json_v2'

generate_data_and_write_json(dataDir, wriDir, labelDir)