In [1]:
import csv
import torch

def read_csv(file_name):
    with open(file_name, 'r') as f:
        reader = csv.reader(f)
        data = list(reader)
    return data


def apply_eval(data):
    """
    Apply eval function to all elements in a list.
    
    Args:
    - data (list): List of strings
    
    Returns:
    - list: List with elements evaluated
    """
    # Define required functions and modules
    eval_locals = {'tensor': torch.tensor, 'torch': torch}
    
    return [eval(item, eval_locals) for item in data]


def convert_to_dict(data):
    list_items = map(lambda x: apply_eval(x), data[1:])
    
    list_data_dict = [{"uid": items[0], 
                        "word_vector": items[1].clone().detach(), 
                        "sum_vector": items[2].clone().detach(), 
                        "avg_vector": items[3].clone().detach()} 
                      for items in list_items]
    
    return list_data_dict

In [2]:
# read json file and convert to dict
import json

def read_data(readPath):

    with open(readPath, 'r', encoding = 'utf-8') as file:
        taskData = []
        for i, line in enumerate(file):
            sample = json.loads(line)
            taskData.append(sample)

    return taskData


## calculate cosine 

In [26]:
import numpy as np
import torch
import sys
from sklearn.metrics.pairwise import cosine_similarity
sys.path.append('/mnt/c/Users/Phat Pham/Documents/THESIS/SRLPredictionEasel/MLM')
from mlm_utils.model_utils import TOKENIZER

def cosine_sen_content_word(sum_vector_tensor, content_word_dict):
    '''một từ với 1 content word
    trả ra consine similarity giữa sum_vector_tensor và sum_vector của content word
    '''
    # SUM VECTOR 
    sum_content_tensor = content_word_dict['sum_vector'].clone().detach()
    
    # cosine_val_sum = cosine_similarity(sum_vector_tensor.unsqueeze(0), sum_content_tensor.unsqueeze(0))[0][0] 
    second_cosine = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
    cosine_val_sum = second_cosine(sum_vector_tensor, sum_content_tensor).item()
   
    return cosine_val_sum, content_word_dict['word_vector']

 
def cosine_sim_sen_list_word(sum_vector_tensor, list_dict_content_word):
    '''tinh cosine trong 1 cau voi tat ca cac content word. trả ra từ có cosine gần -1 và từ có cosine gần 0'''
    list_result = map(lambda x: cosine_sen_content_word(sum_vector_tensor, x), list_dict_content_word)
    list_result = sorted(list_result, key = lambda x: x[0], reverse = True) # giam dan
    
    # cosine gan -1
    min_cosine_neg, replace_word_cosine_neg = list_result[-1]
              
    # consine gan 0
    sort_positive_cosine = list(filter(lambda x: x[0] > 0, list_result))
    min_cosine_0, replace_word_cosine_0 = sort_positive_cosine[-1]
     
    return replace_word_cosine_neg, replace_word_cosine_0
   
def replace_word_in_sentence(predicate_data, content_word_data):
    ''' 1 câu gốc thành 2 câu mới: cosine = -1 và cosine = 0
    '''
    masked_index = torch.where(torch.tensor(predicate_data['pos_tag_id']).clone().detach() != 0)

    predicate_np = np.array(predicate_data['input_id'])
    replace_word_cosine_neg = None
    replace_word_cosine_0 = None
    
    sum_vector_tensor = torch.tensor(predicate_data['sum_vector']).clone().detach()
    if predicate_data['pos_tag_id'][masked_index[0][0].item()] == 1:
        replace_word_cosine_neg, replace_word_cosine_0 = cosine_sim_sen_list_word(sum_vector_tensor, content_word_data['noun'])
    elif predicate_data['pos_tag_id'][masked_index[0][0].item()] == 2:
        replace_word_cosine_neg, replace_word_cosine_0 = cosine_sim_sen_list_word(sum_vector_tensor, content_word_data['verb'])
    elif predicate_data['pos_tag_id'][masked_index[0][0].item()] == 3:
        replace_word_cosine_neg, replace_word_cosine_0 = cosine_sim_sen_list_word(sum_vector_tensor, content_word_data['adj'])
    elif predicate_data['pos_tag_id'][masked_index[0][0].item()] == 4:
        replace_word_cosine_neg, replace_word_cosine_0 = cosine_sim_sen_list_word(sum_vector_tensor, content_word_data['adv'])
    else: 
        KeyError('pos_tag_id not in [1, 2, 3, 4]')
    
    print("masked index: ", masked_index[0])   
    print("type predicate", type(predicate_data['input_id']), type(predicate_data['input_id'][0]))
    print("predicate data", predicate_np[masked_index[0]]) 
    
   
    predicate_np[masked_index[0]] = replace_word_cosine_neg
    
    new_data_cosine_neg = predicate_np.tolist()
    
    predicate_np[masked_index[0]] = replace_word_cosine_0
    new_data_cosine_pos = predicate_np.tolist()
    
    return {"origin_id": predicate_data['input_id'], 
            "cosine_neg_id": new_data_cosine_neg, 
            "cosine_0_id": new_data_cosine_pos, 
            "pos_tag_id":predicate_data['pos_tag_id']}

def create_new_data_neg_sum(predicate_file, dict_content_word, wriDir):
    '''
    return 4 data cho moi file:
        sum_vector va cosine = -1: uid, origin_input_id, sum_neg_input_id, pos_tag_id
        sum_vector va cosine = 0: uid, origin_input_id, sum_0_input_id, pos_tag_id
        avg_vector va cosine = -1: uid, origin_input_id, avg_neg_input_id, pos_tag_id
        avg_vector va cosine = 0: uid, origin_input_id, avg_0_input_id, pos_tag_id
    
    '''
    predicates_data = read_data(predicate_file)
    predicates_data =  [item for sublist in predicates_data for item in sublist]
    
    new_data = map(lambda x: replace_word_in_sentence(x, dict_content_word), predicates_data[:50])
    
    # write to csv file with header uid, origin_sentence, cosine_neg_sentence, cosine_0_sentence, pos_tag_id
    with open(wriDir + 'sum_vector_{}'.format(predicate_file.split('/')[-1].replace('mlm_', '').replace('.json', '.csv')), 'w') as file:
        writer = csv.writer(file)
        for item in new_data:
            writer.writerow(item)
           


In [None]:
file_data = './word_present_each_file/mlm_abolish_full.json'
wriDir = './pertured_data/'

noun_file = read_csv('./list_content_word_v2/NOUN.csv')
verb_file = read_csv('./list_content_word_v2/VERB.csv')
adj_file = read_csv('./list_content_word_v2/ADJ.csv')
adv_file = read_csv('./list_content_word_v2/ADV.csv')

dict_content_word = {"noun": convert_to_dict(noun_file),
                        "verb": convert_to_dict(verb_file),
                        "adj": convert_to_dict(adj_file),
                        "adv": convert_to_dict(adv_file)}




In [27]:
create_new_data_neg_sum(file_data, dict_content_word, wriDir)

masked index:  tensor([7])
type predicate <class 'list'> <class 'int'>
predicate data 6468
masked index:  tensor([10])
type predicate <class 'list'> <class 'int'>
predicate data 1148
masked index:  tensor([11, 12, 13, 14])
type predicate <class 'list'> <class 'int'>
predicate data [  183 21977 26918 23767]
masked index:  tensor([16, 17])
type predicate <class 'list'> <class 'int'>
predicate data [27553  1179]


ValueError: shape mismatch: value array of shape (4,) could not be broadcast to indexing result of shape (2,)