## Import libraries

In [25]:
import os
import sys
import json
import torch
import numpy as np
from tqdm import tqdm
import multiprocessing as mp

from mlm_utils.model_utils import TOKENIZER
sys.path.append('../')

def read_data(readPath):
    """Load a JSON-lines file into a list.

    The file at ``readPath`` is opened as UTF-8 text and every line is
    parsed with ``json.loads``. The parsed objects are returned in file
    order.
    """
    with open(readPath, 'r', encoding='utf-8') as handle:
        return [json.loads(line) for line in handle]

## Load data

In [2]:
# def read_data(readPath):
#     '''read csv file and return list of json objects.'''
#     with open(readPath, 'r', encoding = 'utf-8') as file:
#         taskData = list(map(lambda x: json.loads(x), file))
       
#     return taskData

## Define metric functions

In [26]:
# def cosine_module_2_tensors(tensor1, tensor2, cosine_sim):
    
#     norm_tensor1 = torch.norm(tensor1)
#     norm_tensor2 = torch.norm(tensor2)
    
#     module_similarity = 1 - (torch.abs(norm_tensor1 - norm_tensor2) / (norm_tensor1 + norm_tensor2))
    
#     return module_similarity * cosine_sim
from numpy import dot
from numpy.linalg import norm
def cosine_sim(a, b):
    """Return the cosine similarity of ``a`` and ``b``.

    The value is ``dot(a, b)`` divided by the product of the two norms as
    computed by ``numpy.linalg.norm``. Zero-norm inputs are not guarded
    against.
    """
    magnitude_product = norm(a) * norm(b)
    return dot(a, b) / magnitude_product

def cosine_module_2_numpy(arr1, arr2, cosine_sim):
    """Scale a cosine similarity by how closely two vector norms agree.

    The scaling factor is ``1 - |n1 - n2| / (n1 + n2)`` where ``n1`` and
    ``n2`` are the ``numpy.linalg.norm`` values of ``arr1`` and ``arr2``.
    The parameter ``cosine_sim`` is the already-computed similarity value
    (a number, not the function of the same name) that gets multiplied by
    that factor.
    """
    magnitude_a = np.linalg.norm(arr1)
    magnitude_b = np.linalg.norm(arr2)

    gap = np.abs(magnitude_a - magnitude_b)
    total = magnitude_a + magnitude_b
    magnitude_agreement = 1 - (gap / total)

    return magnitude_agreement * cosine_sim


In [27]:
def cosine_sen_content_word(vector_tensor, type, content_word_dict):
    '''
    Compare one vector with the stored vector of a single content word.
    Input:
        vector_tensor: array-like, the vector to compare (passed straight to cosine_sim).
        type: str, 'sum' or 'avg'; selects content_word_dict['sum_vector'] or
              content_word_dict['avg_vector'] as the other operand.
        content_word_dict: dict with the keys 'word', 'sum_vector' and 'avg_vector'.
    Output:
        cosine: cosine similarity between the two vectors, rounded to 7 decimals.
        cosine_module: the same similarity scaled by cosine_module_2_numpy, rounded to 7 decimals.
        content_word_dict['word']: the content word entry, returned unchanged.
    Raises:
        ValueError: if type is neither 'sum' nor 'avg'.
    '''
    # NOTE: the parameter name `type` shadows the builtin; it is kept so that
    # existing keyword callers keep working.
    vector_keys = {'sum': 'sum_vector', 'avg': 'avg_vector'}
    if type not in vector_keys:
        # Previously an unknown value fell through both branches and failed
        # later with an UnboundLocalError on content_tensor.
        raise ValueError("type must be 'sum' or 'avg', got {!r}".format(type))

    content_vector = np.array(content_word_dict[vector_keys[type]])

    cosine = round(cosine_sim(vector_tensor, content_vector), 7)
    cosine_module = round(cosine_module_2_numpy(vector_tensor, content_vector, cosine), 7)

    return cosine, cosine_module, content_word_dict['word']

In [28]:
def cosine_sim_sen_list_word(vector_tensor, type, list_dict_content_word):
    '''
    Score every candidate content word against a vector and pick four of them:
    the lowest cosine, the lowest cosine-module, and the ones whose cosine /
    cosine-module is closest to 0.
    Input:
        vector_tensor: array-like, vector forwarded to cosine_sen_content_word.
        type: str, 'sum' or 'avg', forwarded to cosine_sen_content_word.
        list_dict_content_word: iterable of dicts, each with the keys 'word',
            'sum_vector' and 'avg_vector'.
    Output:
        dict with four (score, word) pairs:
            'neg_cos':        smallest cosine.
            'neg_cos_module': smallest cosine-module.
            'pos_cos':        cosine with the smallest absolute value.
            'pos_cos_module': cosine-module with the smallest absolute value.
    Raises:
        ValueError: if list_dict_content_word is empty.
    '''
    scored = [cosine_sen_content_word(vector_tensor, type, entry)
              for entry in list_dict_content_word]
    if not scored:
        raise ValueError('list_dict_content_word is empty; no candidate word to select')

    # min() returns the first minimal element, which is what taking element 0
    # of a stable ascending sort gave before, without sorting the whole list.
    neg_cos, _, replace_word_neg_cos = min(scored, key=lambda r: r[0])
    _, neg_cos_module, replace_word_neg_cos_module = min(scored, key=lambda r: r[1])

    # Closest to 0. The secondary key (the signed value) reproduces the former
    # tie-breaking: among equal absolute values the lower score wins.
    pos_cos, _, replace_word_pos_cos = min(scored, key=lambda r: (abs(r[0]), r[0]))
    _, pos_cos_module, replace_word_pos_cos_module = min(scored, key=lambda r: (abs(r[1]), r[1]))

    # return pairs of (score, word)
    return {'neg_cos': (neg_cos, replace_word_neg_cos),
            'neg_cos_module': (neg_cos_module, replace_word_neg_cos_module),
            'pos_cos': (pos_cos, replace_word_pos_cos),
            'pos_cos_module': (pos_cos_module, replace_word_pos_cos_module)}
            



## Find replacement words with the lowest and the closest-to-zero cosine similarity

In [29]:


def find_new_word(predicate_data_chunk, chunkNumber, tempList, content_word_data):
    '''
    Generate perturbed sentences for every record of one chunk and write them
    to two per-chunk temporary files in the current working directory.

    Input:
        predicate_data_chunk: iterable of dicts; the keys read here are
            'origin_uid', 'origin_id', 'pos_tag_id', 'sum_vector' and 'avg_vector'.
        chunkNumber: value embedded in the temp file names
            ('new_data_<n>.json' and 'cosine_res_<n>.csv').
        tempList: list-like object with append(); receives both temp file
            names once they have been completely written.
        content_word_data: dict with the keys 'noun', 'verb', 'adj' and 'adv',
            each holding the candidate list passed to cosine_sim_sen_list_word.
    Output:
        None. One JSON line per record goes to the .json file, and the two
        score dicts (sum, avg) of each record go to the .csv file, one per line.
    Raises:
        KeyError: if the first non-zero pos_tag_id of a record is not 1, 2, 3 or 4.
        ValueError: if a record has no non-zero pos_tag_id at all.
    '''
    name = 'new_data_{}.json'.format(str(chunkNumber))
    cosine_val_file = 'cosine_res_{}.csv'.format(str(chunkNumber))

    # pos_tag_id of the masked token -> key into content_word_data
    pos_tag_to_key = {1: 'noun', 2: 'verb', 3: 'adj', 4: 'adv'}
    vector_kinds = ('sum', 'avg')
    metric_names = ('neg_cos', 'neg_cos_module', 'pos_cos', 'pos_cos_module')
    # Order matches the JSON layout: sum_neg_cos, ..., avg_pos_cos_module.
    feature_keys = ['{}_{}'.format(kind, metric)
                    for kind in vector_kinds for metric in metric_names]

    def gen_new_sen(origin_id, masked_word, list_new_word):
        # Decode the sentence and the masked word once, then build one
        # sentence per replacement by swapping the first occurrence.
        origin_sen = TOKENIZER.decode(torch.tensor(origin_id), skip_special_tokens = True)
        masked_text = TOKENIZER.decode(masked_word)
        return [origin_sen.replace(masked_text, TOKENIZER.decode(new_word), 1)
                for new_word in list_new_word]

    def generate_new_word_in_sen(predicate_data):
        pos_tags = predicate_data['pos_tag_id']

        # positions whose pos tag is non-zero, i.e. the masked token(s)
        masked_index = torch.where(torch.tensor(pos_tags) != 0)
        if masked_index[0].numel() == 0:
            raise ValueError('no non-zero pos_tag_id in record {!r}'.format(
                predicate_data['origin_uid']))
        first_masked = masked_index[0][0].item()

        # The tag of the first masked position selects the candidate list.
        tag = pos_tags[first_masked]
        if tag not in pos_tag_to_key:
            # The original built this KeyError without raising it, which
            # surfaced later as an UnboundLocalError on content_word.
            raise KeyError('pos_tag_id not in [1, 2, 3, 4]')
        content_word = content_word_data[pos_tag_to_key[tag]]

        # one score dict per vector kind, in the order ('sum', 'avg')
        lisst = [cosine_sim_sen_list_word(np.array(predicate_data[kind + '_vector']), kind, content_word)
                 for kind in vector_kinds]

        # numpy array so the masked ids can be picked with the index tensor
        predicate_np = np.array(predicate_data['origin_id'])
        replacement_words = [scores[metric][1] for scores in lisst for metric in metric_names]
        list_new_sens = gen_new_sen(predicate_data['origin_id'],
                                    predicate_np[masked_index[0]],
                                    replacement_words)

        feature = {"origin_uid": predicate_data['origin_uid']}
        feature.update(zip(feature_keys, list_new_sens))
        feature["pos_tag_id"] = predicate_data['pos_tag_id']

        return feature, lisst

    # write the JSON lines and the raw score dicts side by side
    with open(name, 'w') as wf, open(cosine_val_file, 'w', newline='') as csv_file:
        for predicate_data in tqdm(predicate_data_chunk):
            feature, cosine_results = generate_new_word_in_sen(predicate_data)
            wf.write('{}\n'.format(json.dumps(feature)))

            # one line per score dict (sum first, then avg)
            for item in cosine_results:
                csv_file.write("%s\n" % (item,))

    # Register the files only after both are closed and fully flushed.
    tempList.append(name)
    tempList.append(cosine_val_file)

In [30]:
def multi_process_gen_data(predicate_file, dict_content_word, wriDir):
    '''
    Create perturbed data for one predicate file with several worker
    processes and merge the per-worker temp files into two final files.

    Input:
        predicate_file: path of the JSON-lines input file (loaded with read_data).
        dict_content_word: dict forwarded unchanged to find_new_word.
        wriDir: output root. The merged sentences go to
            <wriDir>/masked_data_text/<file name without 'mlm_'> and the merged
            cosine values to <wriDir>/cosine_val/cosine_val_<same name, .csv>.
    Output:
        None. Both output directories are created if missing; the temp files
        written by the workers are removed after merging.
    '''
    print("Preprocessing file... ", predicate_file)
    predicates_data = read_data(predicate_file)

    # MULTI PROCESSING
    man = mp.Manager()

    # shared list to store all temp files written by processes
    tempFilesList = man.list()
    # Keep at least one worker: cpu_count() - 1 is 0 on a single-core machine
    # and would make the chunk size computation divide by zero.
    numProcess = max(1, mp.cpu_count() - 1)

    chunkSize = len(predicates_data) // numProcess

    print('Data Size: ', len(predicates_data))
    print('number of threads: ', numProcess)

    # numProcess equally sized chunks plus one trailing chunk with the remainder
    chunks = [predicates_data[chunkSize * i: chunkSize * (i + 1)] for i in range(numProcess)]
    chunks.append(predicates_data[chunkSize * numProcess:])

    processes = []
    for chunk_index, dataChunk in enumerate(chunks):
        if not dataChunk:
            # an empty chunk would only produce two empty temp files
            continue
        p = mp.Process(target = find_new_word,
                       args = (dataChunk, chunk_index, tempFilesList, dict_content_word))
        p.start()
        processes.append(p)

    for pr in processes:
        pr.join()

    # A crashed worker never registers its files, so its chunk would be
    # silently missing from the merged output.
    failed = [pr.name for pr in processes if pr.exitcode != 0]
    if failed:
        print('WARNING: worker(s) exited abnormally, output is incomplete: ', failed)

    base_name = os.path.basename(predicate_file).replace('mlm_', '')
    json_dir = os.path.join(wriDir, 'masked_data_text')
    cosine_dir = os.path.join(wriDir, 'cosine_val')
    os.makedirs(json_dir, exist_ok=True)
    os.makedirs(cosine_dir, exist_ok=True)
    wrtPath = os.path.join(json_dir, '{}'.format(base_name))
    wrtCosine = os.path.join(cosine_dir, 'cosine_val_{}'.format(base_name.replace('.json', '.csv')))

    def chunk_index_of(temp_name):
        # 'new_data_3.json' / 'cosine_res_3.csv' -> 3
        return int(temp_name.rsplit('_', 1)[1].split('.', 1)[0])

    # Workers finish in arbitrary order; merging by chunk index keeps the
    # output in the same order as the input file.
    ordered_temp_files = sorted(tempFilesList, key=chunk_index_of)

    # combining the files written by multiple processes into the final files
    # Both targets are opened once: reopening the CSV in 'w' mode per temp
    # file (as before) kept only the last chunk's cosine values.
    with open(wrtPath, 'w') as f, open(wrtCosine, 'w', newline="") as write_file:
        for file in ordered_temp_files:
            with open(file, 'r') as r:
                if file.endswith('.json'):
                    for line in r:
                        sample = json.loads(line)
                        f.write('{}\n'.format(json.dumps(sample)))
                else:
                    # lines already end with a newline; copy them unchanged
                    for line in r:
                        write_file.write(line)
            os.remove(file)

    print("Done file", predicate_file)

## Load the four content-word lists (noun, verb, adj, adv)

In [31]:
import pandas as pd

def read_and_convert_to_dict(file_path):
    '''Read a CSV file and return one dict per row with the keys uid, word, sum_vector, avg_vector.

    Columns are taken by position: column 0 is stored as "uid" unchanged;
    columns 1, 2 and 3 are evaluated as Python expressions (with `tensor`
    and `torch` available) and stored as "word", "sum_vector" and
    "avg_vector".

    NOTE(review): eval() runs whatever expression the CSV cell contains, so
    this must only be used on trusted files.
    '''
    frame = pd.read_csv(file_path)

    # names the evaluated cells may refer to, e.g. "tensor([...])"
    namespace = {'tensor': torch.tensor, 'torch': torch}

    def parse(cell):
        return eval(cell, namespace)

    records = []
    for row in frame.itertuples(index=False):
        records.append({
            "uid": row[0],
            "word": parse(row[1]),
            "sum_vector": parse(row[2]),
            "avg_vector": parse(row[3]),
        })

    return records
# CSV file with the content words of each part of speech.
file_paths = dict(
    noun="../content_word_csv/NOUN.csv",
    verb="../content_word_csv/VERB.csv",
    adj="../content_word_csv/ADJ.csv",
    adv="../content_word_csv/ADV.csv",
)

# Parse every CSV once; the result is keyed by the same part-of-speech names.
dict_content_word = {
    pos_name: read_and_convert_to_dict(file_paths[pos_name])
    for pos_name in file_paths
}


: 

In [32]:
from mlm_utils.transform_func import get_files


# Output root for the generated files.
# Alternative used previously: './data_mlm/perturbed_data/'
wriDir = './data_mlm/perturbed_data/test_new_split/'

# Input folder; the file names below are appended to it.
dataDir = './data_mlm/process_folder/word_represent_finetuned_model/'

# Alternatives used previously:
#   files = get_files(dataDir)
#   files = ['mlm_lead_full.json', 'mlm_inhibit_full.json', 'mlm_express_full.json']
files = ['mlm_decrease_2_full.json']

# Only the first entry of `files` is processed.
for file in files[:1]:
    multi_process_gen_data(dataDir + file, dict_content_word, wriDir)

Preprocessing file...  ./data_mlm/process_folder/word_represent_finetuned_model/mlm_decrease_2_full.json
Data Size:  544
number of threads:  7


  6%|▋         | 5/77 [00:46<13:30, 11.26s/it]

### Convert perturbed text data to JSON files

In [2]:
# read masked_data_text
from pathlib import Path
import os
import numpy as np
import json
import sys
sys.path.append('..')
from mlm_utils.transform_func import encode_text

def create_data_perturbation(dataDir, labelDir, type_file):
    """Encode one perturbation variant of every masked-text file and save it.

    For each file in ``dataDir/masked_data_text`` the sentence stored under
    the key ``type_file`` is encoded with ``encode_text`` and written, one
    JSON object per line, to ``dataDir/<type_file>/<file name without
    '_full'>``. Labels are taken from the label file paired with the data
    file.

    Args:
        dataDir: root folder containing ``masked_data_text``; the output
            folder ``<type_file>`` is created inside it.
        labelDir: folder with the label files (JSON lines with the keys
            ``uid`` and ``label``).
        type_file: key of the perturbed sentence to encode (for example
            ``'sum_neg_cos'``); also the name of the output folder.

    Raises:
        KeyError: if a record has no label after the lookup.

    Data and label files are paired by position after sorting both
    directory listings by name.
    NOTE(review): this assumes both folders hold matching files whose names
    sort in the same order - confirm against the real folder contents.
    """
    def encode_sentence(sentence, type_file):
        encode_data = encode_text(sentence[type_file])
        return {'uid': sentence['origin_uid'],
                'token_id': encode_data['input_ids'][0].tolist(),
                'type_id': encode_data['token_type_ids'][0].tolist(),
                'mask': encode_data['attention_mask'][0].tolist(),
                'pos_tag_id': sentence['pos_tag_id'],
                'label': sentence['label']
                }

    dataMaskedDir = dataDir / Path('masked_data_text')

    wrtDir = dataDir / Path(type_file)
    # create directory if not exist
    os.makedirs(wrtDir, exist_ok=True)

    # os.listdir returns names in arbitrary order; sort both listings so the
    # pairing is deterministic instead of depending on the file system.
    data_files = sorted(os.listdir(dataMaskedDir))
    label_files = sorted(os.listdir(labelDir))
    if len(data_files) != len(label_files):
        print('WARNING: {} data files but {} label files; extra files are ignored'.format(
            len(data_files), len(label_files)))

    for data_file, label_file in zip(data_files, label_files):

        print(data_file, label_file)
        with open(os.path.join(dataMaskedDir, data_file), 'r') as f, \
                open(os.path.join(labelDir, label_file), 'r') as f1:
            data = [json.loads(line) for line in f]
            data_label = [json.loads(line) for line in f1]

        # Create a dictionary from data_label for quick lookup
        label_dict = {int(item['uid']): item['label'] for item in data_label}

        # assign label from data_label if uid matches
        # NOTE(review): the keys are ints, so an origin_uid stored as a
        # string never matches - confirm the type of origin_uid.
        for item in data:
            uid = item['origin_uid']
            if uid in label_dict:
                item['label'] = label_dict[uid]
            elif 'label' not in item:
                # encode_sentence would fail with a bare KeyError('label')
                raise KeyError('no label for origin_uid {!r} ({} / {})'.format(
                    uid, data_file, label_file))

        encoded = [encode_sentence(item, type_file) for item in data]

        # save data to json file
        out_path = os.path.join(wrtDir, '{}'.format(data_file).replace('_full', ''))
        with open(out_path, 'w') as out_file:
            for item in encoded:
                out_file.write('{}\n'.format(json.dumps(item)))
    
        


In [None]:
# Root of the perturbed data and folder holding the label files.
dataDir = Path('./data_mlm/perturbed_data')
labelDir = Path('./data_mlm/process_folder/coNLL_tsv_json/label_cu')

# Perturbation variants to encode; each gets its own output folder.
# NOTE(review): find_new_word also writes an 'avg_neg_cos' key that is not
# listed here - confirm whether leaving it out is intentional.
type_files = [
    'sum_neg_cos',
    'sum_pos_cos',
    'avg_pos_cos',
    'sum_neg_cos_module',
    'avg_neg_cos_module',
    'sum_pos_cos_module',
    'avg_pos_cos_module',
]

for type_file in type_files:
    print('Processing type file: ', type_file)
    create_data_perturbation(dataDir, labelDir, type_file)