In [None]:
import sys
import numpy as np
import pandas as pd

# Extend the module search path with the project's MLM directory.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable base directory so the notebook runs elsewhere.
sys.path.append('/mnt/c/Users/Phat Pham/Documents/THESIS/SRLPredictionEasel/MLM')


In [53]:
import ast
import dask.dataframe as dd

def read_data(readPath):
    """Load a JSON-lines source (one JSON record per line) into a pandas DataFrame.

    Args:
        readPath: Path or file-like object accepted by ``pd.read_json``.

    Returns:
        pandas.DataFrame with one row per input line.
    """
    return pd.read_json(readPath, lines=True)

def read_data_dask(readPath):
    """Open a JSON-lines source as a dask DataFrame via ``dd.read_json``.

    Args:
        readPath: Location handed straight to ``dd.read_json``.

    Returns:
        The dask DataFrame produced by ``dd.read_json(readPath, lines=True)``.
    """
    return dd.read_json(readPath, lines=True)

def cosine_sim(a, b):
    """Return the cosine similarity of two vectors, rounded to 4 decimal places.

    Args:
        a, b: Vectors given as a list, tuple, numpy array, or as the text of a
            Python literal (e.g. ``"[0.1, 0.2]"``), which is parsed with
            ``ast.literal_eval``.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` rounded to 4 decimals, or NaN when either
        vector has zero length (the ratio is undefined there).

    Raises:
        ValueError / SyntaxError: if a textual vector is not a valid literal.
    """
    def _to_array(vector):
        # Only text needs parsing: handing a tuple or ndarray to
        # ast.literal_eval raises ValueError instead of converting it.
        if isinstance(vector, (str, bytes)):
            vector = ast.literal_eval(vector)
        return np.asarray(vector, dtype=float)

    vec_a, vec_b = _to_array(a), _to_array(b)
    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denominator == 0:
        # Same value the 0/0 division used to produce, without the RuntimeWarning.
        return np.float64(np.nan)
    return round(np.dot(vec_a, vec_b) / denominator, 4)


def cosine_module(a, b, cosine_sum):
    """Scale ``cosine_sum`` by how closely the magnitudes of ``a`` and ``b`` agree.

    The scaling factor is ``1 - |‖a‖ - ‖b‖| / (‖a‖ + ‖b‖)``: it equals 1 when both
    vectors have the same norm and shrinks as the norms diverge.

    Args:
        a, b: Vectors accepted by ``np.linalg.norm``.
        cosine_sum: The value to be scaled.

    Returns:
        ``cosine_sum`` multiplied by the magnitude-agreement factor.
    """
    magnitude_a, magnitude_b = np.linalg.norm(a), np.linalg.norm(b)
    gap = np.abs(magnitude_a - magnitude_b)
    total = magnitude_a + magnitude_b
    return (1 - (gap / total)) * cosine_sum



In [54]:

def compute_cosine_similarities(df_predicate, vector_type, df_content, metric = 'cosine'):
    '''Attach the lowest-cosine and the closest-to-zero-cosine content word to each predicate row.

    Every vector in ``df_predicate['<vector_type>_vector']`` is scored with
    ``cosine_sim`` against every vector in ``df_content['<vector_type>_vector']``.
    For each predicate row two content rows are selected:

    * ``neg``: the content row with the minimum cosine similarity;
    * ``pos``: the content row whose cosine similarity has the smallest
      absolute value.

    Args:
        df_predicate: pandas DataFrame holding a ``'<vector_type>_vector'`` column.
            It is modified in place: ``neg_<metric>_<vector_type>`` and
            ``pos_<metric>_<vector_type>`` columns are written to it.
        vector_type: Prefix of the vector column to compare (e.g. ``'sum'``).
        df_content: pandas DataFrame holding ``'<vector_type>_vector'`` and
            ``'word'`` columns.
        metric: Label used only to build the output column names.

    Returns:
        ``(df_predicate, val_df)`` where ``df_predicate`` carries the two
        selected-word columns and ``val_df`` holds, in this order,
        ``neg_value_*``, ``neg_*``, ``pos_value_*`` and ``pos_*``.

    Raises:
        ValueError: if ``df_content`` has no rows while ``df_predicate`` does.
    '''
    vector_col = '{}_vector'.format(vector_type)
    neg_col = "neg_{}_{}".format(metric, vector_type)
    neg_value_col = "neg_value_{}_{}".format(metric, vector_type)
    pos_col = "pos_{}_{}".format(metric, vector_type)
    pos_value_col = "pos_value_{}_{}".format(metric, vector_type)

    n_predicates, n_content = len(df_predicate.index), len(df_content.index)
    if n_predicates and not n_content:
        raise ValueError("df_content is empty; there is no content word to compare against")

    def _parse_once(value):
        # cosine_sim() re-parses textual vectors on every call; parsing each
        # one a single time here avoids repeating that for every pair.
        # Anything that does not parse to a list is passed through untouched
        # so cosine_sim() keeps handling it exactly as before.
        if isinstance(value, str):
            parsed = ast.literal_eval(value)
            if isinstance(parsed, list):
                return parsed
        return value

    # Pull the columns out positionally instead of doing a label lookup per
    # pair, which is slow and breaks on a duplicated index.
    predicate_vectors = [_parse_once(v) for v in df_predicate[vector_col]]
    content_vectors = [_parse_once(v) for v in df_content[vector_col]] if n_predicates else []

    scores = np.array(
        [[cosine_sim(p_vec, c_vec) for c_vec in content_vectors] for p_vec in predicate_vectors],
        dtype=float,
    ).reshape(n_predicates, n_content)  # keeps a 2-D shape when there are no predicate rows
    similarities = pd.DataFrame(scores, index=df_predicate.index, columns=df_content.index)

    print("Similarities shape: ", similarities.shape)

    # Most negative cosine per predicate row.
    lowest_labels = similarities.idxmin(axis=1)
    df_predicate.loc[:, neg_col] = df_content.loc[lowest_labels]['word'].values
    df_predicate.loc[:, neg_value_col] = similarities.min(axis=1).values

    # Cosine closest to zero per predicate row (smallest absolute value).
    magnitudes = similarities.abs()
    nearest_zero_labels = magnitudes.idxmin(axis=1)
    df_predicate.loc[:, pos_col] = df_content.loc[nearest_zero_labels]['word'].values
    df_predicate.loc[:, pos_value_col] = magnitudes.min(axis=1).values

    # Hand the scores back separately, then remove them from the caller's frame
    # so only the selected words remain there.
    val_df = df_predicate[[neg_value_col, neg_col, pos_value_col, pos_col]]
    df_predicate.drop([neg_value_col, pos_value_col], axis=1, inplace=True)
    return df_predicate, val_df



In [None]:
def first_nonzero(lst):
    """Return the first element of ``lst`` that is not equal to 0.

    Falls back to 0 when ``lst`` is empty or contains only zeros.
    """
    for element in lst:
        if element != 0:
            return element
    return 0


def preprocess_df_predicate(df_predicate):
    """Add a ``tag_id`` column holding the first non-zero entry of each ``pos_tag_id`` value.

    The frame is modified in place and the same object is returned.
    """
    tag_ids = df_predicate['pos_tag_id'].apply(first_nonzero)
    df_predicate['tag_id'] = tag_ids
    return df_predicate



In [None]:
def find_new_word(df_predicate, df_noun, df_verb, df_adj, df_adv):
    """Annotate predicate rows with content words chosen from the table matching their ``tag_id``.

    Rows are grouped by ``tag_id`` (1 -> ``df_noun``, 2 -> ``df_verb``,
    3 -> ``df_adj``, 4 -> ``df_adv``); each group is scored against its table
    with ``compute_cosine_similarities`` on the ``'sum'`` vectors.

    Args:
        df_predicate: pandas DataFrame with a ``tag_id`` column plus the columns
            ``compute_cosine_similarities`` needs.
        df_noun, df_verb, df_adj, df_adv: Content-word tables, one per tag.

    Returns:
        The annotated groups concatenated in tag order (1, 2, 3, 4). Rows whose
        ``tag_id`` is not one of those values are not included.
    """
    content_by_tag = ((1, df_noun), (2, df_verb), (3, df_adj), (4, df_adv))

    annotated = []
    for tag_id, df_content in content_by_tag:
        # Copy the selection: compute_cosine_similarities writes columns into the
        # frame it is given, which must not target a slice of df_predicate.
        subset = df_predicate[df_predicate['tag_id'] == tag_id].copy()
        # The helper returns (annotated_frame, value_frame); only the annotated
        # frame can be concatenated (concatenating the tuples raised TypeError).
        enriched, _ = compute_cosine_similarities(subset, 'sum', df_content, metric = 'cosine')
        annotated.append(enriched)

    return pd.concat(annotated)

In [None]:
# Content-word files, keyed by lower-case part-of-speech name.
file_paths = {
    pos: "./data_mlm/process_folder/list_content_word_v2/{}.json".format(pos.upper())
    for pos in ("noun", "verb", "adj", "adv")
}

# Disabled loaders, kept for reference:
# df_noun = read_data_dask("./data_mlm/process_folder/list_content_word_v2/NOUN.json")
# df_verb = read_data_dask(file_paths["verb"])
# df_adj = read_data(file_paths["adj"])
# df_adv = read_data_dask(file_paths["adv"])


In [55]:
# Load the predicate rows of one file and derive their tag_id column.
predicate_file = "./data_mlm/process_folder/word_present_each_file/mlm_abolish_full.json"

df_predicate = read_data(predicate_file)

# preprocess_df_predicate() writes tag_id into the frame it receives and returns
# that same frame, so `df` and `df_predicate` refer to one object.
df = preprocess_df_predicate(df_predicate)
df_adv = read_data_dask(file_paths["adv"])
# NOTE(review): copy-on-write is switched on only after the frames above were
# created — confirm this ordering is intended; a config cell at the top of the
# notebook would make it explicit.
pd.options.mode.copy_on_write = True
# Keep only the rows whose tag_id is 2.
predicate_verb = df[df['tag_id'] == 2]

    
# Trial run against the first 40 rows of the file at file_paths["adv"].
# NOTE(review): `.iloc[:, :10]` keeps the first 10 *columns* (all rows) — confirm
# that a row limit was not intended.
adv, val_df = compute_cosine_similarities(predicate_verb.iloc[:, :10], vector_type='sum', df_content=df_adv.head(40), metric = 'cosine')


In [63]:
def split_dataframe(df, n_splits):
    """Open the first ``n_splits`` pre-split parquet files as dask DataFrames.

    The splits are not computed from ``df``; they are read from
    ``./split_data/split_<k>.parquet`` for ``k = 1 .. n_splits``.

    Args:
        df: Accepted for interface compatibility but not used.
        n_splits: Number of split files to open (previously ignored: exactly
            three files were always read, whatever value was passed).

    Returns:
        Tuple of ``n_splits`` dask DataFrames, in file order.

    Raises:
        ValueError: if ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be a positive integer, got {!r}".format(n_splits))

    return tuple(
        dd.read_parquet('./split_data/split_{}.parquet'.format(part))
        for part in range(1, n_splits + 1)
    )


def select_noun_word(df_predicate, vector_type, df_noun, metric = 'cosine'):
    """Pick, for every predicate row, the lowest-cosine content word across all splits.

    The content table is processed split by split (see ``split_dataframe``);
    each split yields its own lowest-cosine word and score per predicate row,
    and the word belonging to the smallest score over all splits is kept.

    Args:
        df_predicate: pandas DataFrame accepted by ``compute_cosine_similarities``.
            It is modified in place: a ``neg_<metric>`` column is added.
        vector_type: Prefix of the vector column to compare (e.g. ``'sum'``).
        df_noun: Forwarded to ``split_dataframe``.
        metric: Label used in the column names.

    Returns:
        ``df_predicate`` with the added ``neg_<metric>`` column.

    Note:
        Enables pandas copy-on-write globally as a side effect.
    """
    shards = split_dataframe(df_noun, 3)
    pd.options.mode.copy_on_write = True

    # Build the real column names; the earlier '{type}' placeholders were never
    # formatted, so no column matched and the rename that followed always raised.
    value_col = 'neg_value_{}_{}'.format(metric, vector_type)
    word_col = 'neg_{}_{}'.format(metric, vector_type)

    shard_values, shard_words = [], []
    for shard in shards:
        # Score against a copy so the per-split helper columns never land on
        # df_predicate itself.
        # NOTE(review): `.iloc[:, :10]` keeps the first 10 columns of the split
        # (all rows) — confirm that a row limit was not intended.
        val_df = compute_cosine_similarities(
            df_predicate.copy(), vector_type, shard.iloc[:, :10].compute(), metric
        )[1]
        shard_values.append(val_df[value_col].to_numpy(dtype=float))
        shard_words.append(val_df[word_col].tolist())

    # One column per split; argmin gives the *position* of the winning split for
    # each row, which is then used positionally (index labels are not positions).
    best_shard = np.column_stack(shard_values).argmin(axis=1)
    best_words = [shard_words[shard_pos][row_pos] for row_pos, shard_pos in enumerate(best_shard)]

    df_predicate.loc[:, "neg_{}".format(metric)] = pd.Series(
        best_words, index=df_predicate.index, dtype=object
    )
    return df_predicate

In [61]:

# Reload the predicate rows and rebuild the tag_id == 2 subset.
# NOTE(review): this repeats the loading steps of an earlier cell — consider
# keeping a single load cell.
predicate_file = "./data_mlm/process_folder/word_present_each_file/mlm_abolish_full.json"

df_predicate = read_data(predicate_file)
# `df` is the same object as `df_predicate`, which gains the tag_id column.
df = preprocess_df_predicate(df_predicate)
predicate_verb = df[df['tag_id'] == 2]


In [64]:
# df_noun is passed as None: select_noun_word forwards it to split_dataframe,
# which reads its parquet inputs from disk rather than using that argument.
# NOTE(review): `.iloc[:, :10]` keeps the first 10 columns, not the first 10 rows.
aaa = select_noun_word(predicate_verb.iloc[:, :10], 'sum', None, 'cosine')
aaa.head()

In [56]:
# Score the first 10 predicate rows (`[:10]` slices rows here) against the whole
# first parquet split, materialised in memory with .compute().
df_split_1 = dd.read_parquet('./split_data/split_1.parquet')
adv, val_df = compute_cosine_similarities(predicate_verb[:10], vector_type='sum', df_content=df_split_1.compute(), metric = 'cosine')

In [49]:
# Preview the first rows of `adv`.
# NOTE(review): this cell's execution count (49) is lower than that of the cell
# above it (56), so the output shown below may predate the latest `adv`.
adv.head()

Unnamed: 0,origin_uid,origin_id,pos_tag_id,word,sum_vector,avg_vector,tag_id,neg_cosine_sum,pos_cosine_sum
5,0,"[101, 170, 176, 118, 1106, 118, 170, 6468, 112...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[8632],"[-0.6510494351387021, -0.20481480658054302, -0...","[-0.6510494351387021, -0.20481480658054302, -0...",2,[17957],[17957]
17,2,"[101, 2393, 1643, 3246, 1110, 8632, 1118, 2791...","[0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[8632],"[-0.106601975858211, 0.20928417146205902, -1.9...","[-0.106601975858211, 0.20928417146205902, -1.9...",2,[2765],[2765]
18,2,"[101, 2393, 1643, 3246, 1110, 8632, 1118, 2791...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, ...",[19717],"[-0.282490104436874, 0.22874452173709803, -2.2...","[-0.282490104436874, 0.22874452173709803, -2.2...",2,[2765],[2765]
23,3,"[101, 2393, 1643, 3246, 1108, 8632, 1118, 2791...","[0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[8632],"[-0.212677419185638, 0.457284778356552, -2.437...","[-0.212677419185638, 0.457284778356552, -2.437...",2,[2765],[2765]
24,3,"[101, 2393, 1643, 3246, 1108, 8632, 1118, 2791...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, ...",[19717],"[-0.25142347812652505, 0.23143696784973103, -2...","[-0.25142347812652505, 0.23143696784973103, -2...",2,[2765],[2765]
