In [2]:
import sys
import ast
import numpy as np
import pandas as pd
import dask.dataframe as dd
from tqdm import tqdm
from mlm_utils.metric_func import cosine_sim, cosine_module
from mlm_utils.transform_func import get_files
import multiprocessing as mp
import dask
from dask.delayed import delayed
sys.path.append('/mnt/c/Users/Phat Pham/Documents/THESIS/SRLPredictionEasel/MLM')


2024-05-15 23:56:31.621248: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-15 23:56:32.117782: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-15 23:56:35.436532: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def read_data_dask(readPath):
    """Open a line-delimited JSON file with dask.

    Args:
        readPath: Path handed straight to ``dd.read_json``.

    Returns:
        The object returned by ``dd.read_json(readPath, lines=True)``.
    """
    return dd.read_json(readPath, lines=True)


def load_df(df_dir):
    """Load the three parquet splits (``split_1`` .. ``split_3``) from a directory.

    The previous implementation ignored ``df_dir`` and always read from one
    hard-coded absolute path. The argument is now honoured; the old location
    is kept only as a fallback so existing calls keep working.

    Args:
        df_dir: Directory containing ``split_1.parquet``, ``split_2.parquet``
            and ``split_3.parquet``. When falsy (``None`` or ``""``) the
            historical hard-coded directory is used.

    Returns:
        A 3-tuple ``(df_split_1, df_split_2, df_split_3)`` of the objects
        returned by ``dd.read_parquet``.
    """
    import os  # local import: `os` is not among this notebook's top-level imports

    # Historical location, kept as the fallback for backward compatibility.
    default_dir = '/mnt/c/Users/Phat Pham/Documents/THESIS/SRLPredictionEasel/MLM/data_mlm/split_data/'
    base_dir = df_dir or default_dir

    return tuple(
        dd.read_parquet(os.path.join(base_dir, 'split_{}.parquet'.format(split_no)))
        for split_no in (1, 2, 3)
    )


def preprocess_df_predicate(df_predicate):
    """Attach a ``tag_id`` column derived from ``pos_tag_id``.

    For every row, ``tag_id`` is the first element of that row's
    ``pos_tag_id`` sequence that is not equal to 0, or 0 when there is none.
    The column is assigned onto ``df_predicate`` itself, and that same object
    is returned.

    Args:
        df_predicate: Frame with a ``pos_tag_id`` column whose ``apply``
            accepts a ``meta`` keyword (presumably a dask DataFrame -- confirm
            against the caller).

    Returns:
        ``df_predicate`` with the extra ``tag_id`` column.
    """
    def first_nonzero(lst):
        # Linear scan; stops at the first entry that differs from 0.
        for element in lst:
            if element != 0:
                return element
        return 0

    tag_ids = df_predicate['pos_tag_id'].apply(
        first_nonzero,
        meta=('pos_tag_id', 'int64'),
    )
    df_predicate['tag_id'] = tag_ids
    return df_predicate


In [4]:
def cosine_sim(a, b):
    """Cosine similarity of two vectors, rounded to 4 decimal places.

    Each argument may be a sequence of numbers (list, tuple, numpy array) or
    the string/bytes literal of one (e.g. ``"[0.1, 0.2]"``), which is parsed
    with ``ast.literal_eval``. Previously every non-``list`` argument was sent
    through ``ast.literal_eval``, which raised for numpy arrays and tuples.

    NOTE: this definition shadows the ``cosine_sim`` imported from
    ``mlm_utils.metric_func`` at the top of the notebook.

    Args:
        a: First vector, or its string literal.
        b: Second vector, or its string literal.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)`` rounded to 4 decimals. The result is
        NaN when either vector has zero norm (0 / 0).
    """
    # Only textual input needs parsing; real sequences are used as they are.
    if isinstance(a, (str, bytes)):
        a = ast.literal_eval(a)
    if isinstance(b, (str, bytes)):
        b = ast.literal_eval(b)

    return round(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)), 4)


def cosine_module(a, b, cosine_sum):
    """Scale a cosine similarity by how close the two vector norms are.

    The scaling factor is ``1 - |‖a‖ - ‖b‖| / (‖a‖ + ‖b‖)``: 1 when both
    vectors have the same norm, approaching 0 as the norms diverge.

    For consistency with ``cosine_sim``, string/bytes literals of a vector
    (e.g. ``"[0.1, 0.2]"``) are parsed with ``ast.literal_eval`` first;
    previously they reached ``np.linalg.norm`` unparsed and failed.

    NOTE: this definition shadows the ``cosine_module`` imported from
    ``mlm_utils.metric_func`` at the top of the notebook.

    Args:
        a: First vector, or its string literal.
        b: Second vector, or its string literal.
        cosine_sum: Cosine similarity of ``a`` and ``b`` computed by the caller.

    Returns:
        ``module_similarity * cosine_sum``. NaN when both norms are 0 (0 / 0).
    """
    if isinstance(a, (str, bytes)):
        a = ast.literal_eval(a)
    if isinstance(b, (str, bytes)):
        b = ast.literal_eval(b)

    norm_array1 = np.linalg.norm(a)
    norm_array2 = np.linalg.norm(b)

    module_similarity = 1 - (np.abs(norm_array1 - norm_array2) / (norm_array1 + norm_array2))

    return module_similarity * cosine_sum

In [5]:

def _vectors_to_matrix(column):
    """Stack a column of per-row vectors (sequences or their string literals) into a 2-D float array."""
    rows = [
        ast.literal_eval(vec) if isinstance(vec, (str, bytes)) else vec
        for vec in column
    ]
    if not rows:
        return np.empty((0, 0), dtype=float)
    return np.asarray(rows, dtype=float)


def _row_argmin_skipping_nan(matrix):
    """Return (column position, value) of every row's minimum, ignoring NaN entries."""
    masked = np.where(np.isnan(matrix), np.inf, matrix)
    positions = masked.argmin(axis=1)
    # An all-NaN row yields position 0 with a NaN value; callers check the value.
    values = matrix[np.arange(matrix.shape[0]), positions]
    return positions, values


def _pick_words(words, positions, values):
    """Select ``words[positions]``, using None wherever the matching value is NaN."""
    picked = words[positions]
    missing = np.isnan(values)
    if missing.any():
        picked = picked.astype(object)
        picked[missing] = None
    return picked


def compute_cosine_similarities(df_predicate, vector_type, df_content, metric = 'cosine'):
    """Find, for every predicate row, the content words with the lowest and the closest-to-zero similarity.

    The full (predicate x content) similarity matrix is computed in one
    vectorised numpy pass. The previous implementation created one
    ``dask.delayed`` task per (row, word) pair, which does not scale, and it
    relied on ``.at`` / label look-ups that fail on dask frames and on
    duplicate index labels. All look-ups are now positional.

    Args:
        df_predicate: Frame with a ``'<vector_type>_vector'`` column. A pandas
            frame is modified in place (columns added, see Returns). An object
            exposing ``.compute()`` (e.g. a dask DataFrame) is materialised
            first, in which case the caller's object is left untouched.
        vector_type: Prefix of the vector column, e.g. ``'sum'``.
        df_content: Frame with a ``'<vector_type>_vector'`` column and a
            ``'word'`` column; may also expose ``.compute()``.
        metric: ``'cosine'`` (cosine similarity rounded to 4 decimals) or
            ``'cosine_module'`` (that value scaled by
            ``1 - |‖a‖ - ‖b‖| / (‖a‖ + ‖b‖)``).

    Returns:
        ``(df_predicate, val_df)`` where

        * ``df_predicate`` gains ``neg_<metric>_<vector_type>`` (word with the
          minimum similarity) and ``pos_<metric>_<vector_type>`` (word with
          the minimum absolute similarity);
        * ``val_df`` holds ``neg_value_*``, ``neg_*``, ``pos_value_*`` and
          ``pos_*`` columns, in that order, for the same rows.

        Ties are resolved in favour of the first content row. Rows whose
        similarities are all NaN (zero-norm vectors) get ``None`` as word and
        NaN as value.

    Raises:
        ValueError: If ``metric`` is unknown or ``df_content`` has no rows.
    """
    if metric not in ('cosine', 'cosine_module'):
        raise ValueError("Invalid metric")

    # Accept lazily evaluated frames (anything exposing .compute()).
    if hasattr(df_predicate, 'compute'):
        df_predicate = df_predicate.compute()
    if hasattr(df_content, 'compute'):
        df_content = df_content.compute()

    vector_col = '{}_vector'.format(vector_type)
    predicate_vectors = _vectors_to_matrix(df_predicate[vector_col])
    content_vectors = _vectors_to_matrix(df_content[vector_col])
    if content_vectors.shape[0] == 0:
        raise ValueError("df_content is empty: there are no candidate words to compare against")

    if predicate_vectors.shape[0] == 0:
        similarities = np.empty((0, content_vectors.shape[0]), dtype=float)
    else:
        predicate_norms = np.linalg.norm(predicate_vectors, axis=1)
        content_norms = np.linalg.norm(content_vectors, axis=1)
        # Zero-norm vectors produce 0/0 -> NaN; that is handled below, so silence the warning.
        with np.errstate(divide='ignore', invalid='ignore'):
            similarities = np.round(
                np.dot(predicate_vectors, content_vectors.T)
                / np.outer(predicate_norms, content_norms),
                4,
            )
            if metric == 'cosine_module':
                norm_gap = np.abs(predicate_norms[:, None] - content_norms[None, :])
                norm_total = predicate_norms[:, None] + content_norms[None, :]
                similarities = (1 - norm_gap / norm_total) * similarities
    print("similarity shape: ", similarities.shape)

    words = df_content['word'].to_numpy()
    neg_col = "neg_{}_{}".format(metric, vector_type)
    neg_value_col = "neg_value_{}_{}".format(metric, vector_type)
    pos_col = "pos_{}_{}".format(metric, vector_type)
    pos_value_col = "pos_value_{}_{}".format(metric, vector_type)

    # cosine -1: the most dissimilar word (smallest similarity)
    print("create cosine -1...")
    neg_positions, neg_values = _row_argmin_skipping_nan(similarities)
    df_predicate[neg_col] = _pick_words(words, neg_positions, neg_values)
    df_predicate[neg_value_col] = neg_values

    # cosine 0: the most unrelated word (smallest absolute similarity)
    print("create cosine 0...")
    pos_positions, pos_values = _row_argmin_skipping_nan(np.abs(similarities))
    df_predicate[pos_col] = _pick_words(words, pos_positions, pos_values)
    df_predicate[pos_value_col] = pos_values  # absolute value

    # Values and words side by side for the caller; list selection yields a copy.
    val_df = df_predicate[[neg_value_col, neg_col, pos_value_col, pos_col]]

    # The value columns are only reported through val_df, not kept on df_predicate.
    df_predicate.drop([neg_value_col, pos_value_col], axis=1, inplace=True)
    return df_predicate, val_df



In [6]:
def select_noun_word(df_predicate, vector_type, dfs, metric = 'cosine'):
    """Pick, per predicate row, the most dissimilar word across several content frames.

    ``compute_cosine_similarities`` is run once per frame in ``dfs``; for each
    row the word whose ``neg_value_<metric>_<vector_type>`` is smallest over
    all frames is stored in a new ``neg_<metric>`` column.

    Fixes over the previous version:

    * the column filters used the literal text ``'{type}'`` (never formatted),
      so they matched nothing and the subsequent rename raised;
    * the per-row selection indexed a positional array by index *label*;
    * exactly three content frames were assumed.

    Args:
        df_predicate: Predicate frame passed on to ``compute_cosine_similarities``.
        vector_type: Prefix of the vector column, e.g. ``'sum'``.
        dfs: Iterable of content frames (any non-zero number).
        metric: Metric name forwarded to ``compute_cosine_similarities``.

    Returns:
        The predicate frame returned by the last ``compute_cosine_similarities``
        call, with the extra ``neg_<metric>`` column.

    Raises:
        ValueError: If ``dfs`` is empty.
    """
    # NOTE(review): this changes a process-wide pandas option; kept because the
    # original function set it and later cells may rely on it.
    pd.options.mode.copy_on_write = True

    value_col = 'neg_value_{}_{}'.format(metric, vector_type)
    word_col = 'neg_{}_{}'.format(metric, vector_type)

    value_columns = []
    word_columns = []
    for df in dfs:
        df_predicate, val_df = compute_cosine_similarities(df_predicate, vector_type, df, metric)
        value_columns.append(val_df[value_col].to_numpy(dtype=float))
        word_columns.append(val_df[word_col].to_numpy(dtype=object))

    if not value_columns:
        raise ValueError("dfs must contain at least one content DataFrame")

    # Shape (n_rows, n_frames): one column per content frame.
    values = np.column_stack(value_columns)
    words = np.column_stack(word_columns)

    # Position of the frame with the smallest value in every row; NaN never wins.
    best_frame = np.where(np.isnan(values), np.inf, values).argmin(axis=1)
    df_predicate["neg_{}".format(metric)] = words[np.arange(len(best_frame)), best_frame]

    return df_predicate

In [9]:
file_paths = {
        "noun": "./data_mlm/process_folder/list_content_word_v3/NOUN.json",
        "verb": "./data_mlm/process_folder/list_content_word_v3/VERB.json",
        "adj": "./data_mlm/process_folder/list_content_word_v3/ADJ.json",
        "adv": "./data_mlm/process_folder/list_content_word_v3/ADV.json",
        "predicate_dir": "./data_mlm/process_folder/word_present_each_file_v3/",
        "wri_dir": "./data_mlm/pertured_data/masked_data_parquet/",
        "df_dir": "/mnt/c/Users/Phat Pham/Documents/THESIS/SRLPredictionEasel/MLM/data_mlm/split_data/"
    }

# Number of predicate files to process. The loop used to end with an
# unconditional `break`, i.e. only the first file was handled; set this to
# None to process every file.
MAX_FILES = 1

df_verb = read_data_dask(file_paths["verb"])
df_adj = read_data_dask(file_paths["adj"])
df_adv = read_data_dask(file_paths["adv"])
df_noun = read_data_dask(file_paths["noun"])

# tag_id -> content-word frame, materialised once as pandas with a clean
# positional index. The frames are reused for every file, so they must not be
# deleted inside the loop (the old `del df_noun` etc. would raise NameError on
# the second file).
# NOTE(review): the id -> part-of-speech mapping follows the original branch
# order (1 noun, 2 verb, 3 adj, 4 adv) -- confirm against the tagger's id table.
content_by_tag = {
    tag_id: lazy_frame.compute().reset_index(drop=True)
    for tag_id, lazy_frame in {1: df_noun, 2: df_verb, 3: df_adj, 4: df_adv}.items()
}

files = list(get_files(file_paths["predicate_dir"]))
selected_files = files if MAX_FILES is None else files[:MAX_FILES]

for file in tqdm(selected_files):
    print("Processing file...", file)
    df_predicate = read_data_dask(file_paths["predicate_dir"] + file)
    df_predicate = preprocess_df_predicate(df_predicate)
    # Everything downstream is pandas code; the fresh positional index lets
    # sort_index() below restore the original row order.
    df_predicate = df_predicate.compute().reset_index(drop=True)

    results = []
    # Iterate over the mapping itself: `range(4)` stopped at 3, so the adverb
    # branch (tag 4) was never reached.
    for tag_id, df_content in content_by_tag.items():
        # .copy() so that the columns added downstream do not write into a slice.
        separated_df = df_predicate[df_predicate['tag_id'] == tag_id].copy()
        if separated_df.empty:
            continue
        # The function returns (frame, values); only the frame is written out.
        pertured_df = compute_cosine_similarities(separated_df, 'sum', df_content, metric = 'cosine')[0]
        results.append(pertured_df)

    if not results:
        print("No rows with a supported tag_id in", file)
        continue

    res_df = pd.concat(results, axis=0).sort_index()
    res_df.to_parquet(file_paths['wri_dir'] + file.replace("mlm_", "").split(".")[0] + ".parquet")
    del res_df

  0%|          | 0/35 [00:00<?, ?it/s]

Processing file... mlm_abolish_full.json


: 