In [1]:
from base64 import encode
import os
import pandas as pd
import numpy as np
from typing import Callable, List
from tqdm import tqdm
from scipy.spatial import distance

In [2]:
word2vecpath = '../word2vec/W2V_150.txt'
ant_sym_path = '../antonym-synonym set'
visimpath = '../datasets/ViSim-400'
viconpath = '../datasets/ViCon-400'


In [3]:
words = []
vecs = []

dim = None
n_vocab = None

with open(word2vecpath, encoding='utf8') as f:
    for line in tqdm(f, f"loading {word2vecpath} to variables"):
        if not n_vocab:
            n_vocab = int(line)
        elif not dim:
            dim = int(line)
        else:
            line = line.replace('\n', '')
            words.append(line.split('  ')[0])
            vecs.append([float(i) for i in line.split('  ')[1].split()])
vecs = np.array(vecs)


def word2vec(a: str) -> np.array:
    try:
        idx = words.index(a)
        return vecs[idx]
    except:
        return np.zeros(dim)


def cosine(a: np.array, b: np.array) -> float:
    a = a / np.linalg.norm(a) if np.linalg.norm(a) != 0 else a
    b = b / np.linalg.norm(b) if np.linalg.norm(b) != 0 else b
    return a.dot(b)

#  Dot	Product Distance, Euclidean Distance, Dice Distance, Jaccard Distance.


def dot(a: np.array, b: np.array) -> float:
    return a.dot(b)


def euclid(a: np.array, b: np.array) -> float:
    return np.linalg.norm(a - b)


def dice(a: np.array, b: np.array) -> float:
    return distance.dice(a, b)


def jaccard(a: np.array, b: np.array) -> float:
    return distance.jaccard(a, b, w=None)


def sim(row, sim_f=cosine):
    vec1 = word2vec(row.iloc[0])
    vec2 = word2vec(row.iloc[1])

    return sim_f(vec1, vec2)


def topsimilar(
        w: str,
        vocab: List[str] = words,
        encoder: Callable = word2vec,
        distance_by: Callable = cosine,
        n: int = 5) -> list:
    input_encode = encoder(w)
    vocab_sim = [
        (other, distance_by(input_encode, encoder(other)))
        for other in tqdm(vocab, "Scanning vocab")
    ]
    vocab_sim.sort(key=lambda x: x[1], reverse=True)
    return vocab_sim[:n]


loading ../word2vec/W2V_150.txt to variables: 77023it [00:04, 18403.08it/s]


In [4]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

sim_pairs = pd.read_csv(visimpath + '/Visim-400.txt', sep="\t")

noun_pairs = pd.read_csv(viconpath + '/400_verb_pairs.txt', sep="\t")
verb_pairs = pd.read_csv(viconpath + '/400_verb_pairs.txt', sep="\t")
adj_pairs = pd.read_csv(viconpath + '/600_adj_pairs.txt', sep="\t")

dataset = pd.concat([noun_pairs, verb_pairs, adj_pairs])[
    ['Word1', 'Word2', 'Relation']]


def flatten(row):
    vec1 = word2vec(row.iloc[0])
    vec2 = word2vec(row.iloc[1])

    return np.array([vec1, vec2], dtype='float32')


x = np.array([i for i in dataset.apply(flatten, axis=1)])
y = enc.fit_transform(dataset['Relation'].values.reshape(-1, 1))

In [5]:
ant_pairs = pd.read_csv(ant_sym_path + '/Antonym_vietnamese.txt',
                        delim_whitespace=True, header=None, names=['Word1', 'Word2'])
syn_pairs = pd.read_csv(ant_sym_path + '/Synonym_vietnamese.txt',
                        delim_whitespace=True, header=None, names=['Word1', 'Word2'])


In [6]:
ant_word_1 = []
ant_word_2 = []

with open(ant_sym_path + '/Antonym_vietnamese.txt', encoding='utf8') as file:
    for line in file:
        line = line.replace('\n', '').split(' ')
        ant_word_1.append(line[0])
        ant_word_2.append(line[1])

In [7]:
syn_word_1 = []
syn_word_2 = []

with open(ant_sym_path + '/Synonym_vietnamese.txt', encoding='utf8') as file:
    for line in file:
        line = line.replace('\n', '').split(' ')
        syn_word_1.append(line[0])
        syn_word_2.append(line[1])

In [8]:
def get_synonym(word):
    if word in syn_word_1:
        return syn_word_2[syn_word_1.index(word)]
    if word in syn_word_2:
        return syn_word_1[syn_word_2.index(word)]
    else:
        return None

def get_antonym(word):
    if word in ant_word_1:
        return ant_word_2[ant_word_1.index(word)]
    if word in ant_word_2:
        return ant_word_1[ant_word_2.index(word)]
    else:
        return None

In [9]:
def get_syn_have_vec(word):
    scanned_syn = [word]
    syn_have_vec = False

    while word is not None:
        word = get_synonym(word)
        if word in scanned_syn:
            break
        else:
            scanned_syn.append(word)

        if np.any(word2vec(word)):
            syn_have_vec = True
            break

    return syn_have_vec, word2vec(word)

In [10]:
def vec_estimate(word):
    syn_have_vec, syn_vec = get_syn_have_vec(word)

    if syn_have_vec == False:
        return np.zeros(dim)
    
    ant_1 = get_antonym(word)
    while not np.any(word2vec(ant_1)):
        ant_1 = get_synonym(ant_1)

    if not np.any(word2vec(ant_1)):
        return np.zeros(dim)

    ant_2 = get_synonym(ant_1)
    while not np.any(word2vec(ant_2)):
        ant_2 = get_synonym(ant_2)

    if not np.any(word2vec(ant_2)):
        return np.zeros(dim)

    return syn_vec + word2vec(ant_1) - word2vec(ant_2)



# Task 1

In [11]:
distances = [('cosine', cosine)]
for name, function in distances:
    sim_pairs[f'sim-{name}'] = sim_pairs.apply(sim, axis=1, sim_f=function)
print(sim_pairs.to_string)


                   Word1                Word2 POS  Sim1  Sim2   STD  sim-cosine
0                   biến                 ngập   V  3.13  5.22  0.72   -0.004912
1            nhà_thi_đấu                  nhà   N  3.07  5.12  1.18    0.082523
2                   động                 tĩnh   V  0.60  1.00  0.95    0.277086
3                 khuyết                   ưu   N  0.20  0.33  0.40    0.176799
4                cõi_tục               cõi_âm   N  0.60  1.00  0.95    0.000000
5               thủ_pháp            biện_pháp   N  4.13  6.88  1.26    0.402366
6              kết_duyên            thành_hôn   V  5.27  8.78  1.06    0.463008
7               cấp_tiến              bảo_thủ   A  0.87  1.45  1.15    0.256947
8               nước_lớn            nguy_hiểm   N  1.07  1.78  1.12    0.185192
9                    hoa                  nhị   N  2.27  3.78  0.93    0.223070
10               bất_lợi            thuận_lợi   N  0.33  0.55  0.79    0.534891
11               phân_ly              su

In [12]:
def sim2word(word1, word2, sim_f=cosine):
    vec1 = word2vec(word1)
    if not np.any(vec1):
        print('vec1')
        vec1 = vec_estimate(word1)

    vec2 = word2vec(word2)
    if not np.any(vec2):
        print('vec2')
        vec2 = vec_estimate(word2)

    return sim_f(vec1, vec2)

In [13]:
sim2word('cõi_tục', 'trần_gian')

vec1


0.6957755493627643

In [14]:
word1 = sim_pairs['Word1'].to_list()
word2 = sim_pairs['Word2'].to_list()

In [15]:
cosine_sim = []

In [16]:
for i in tqdm(range(400)):
    print(sim2word(word1[i], word2[i]))

 11%|█▏        | 45/400 [00:00<00:00, 387.77it/s]

-0.004912339469670016
0.08252318329211772
0.27708595986827755
0.17679862835626714
vec1
0.6060296673640085
0.4023661291943061
0.46300840201407223
0.25694700889961236
0.18519202240211616
0.22306960452548544
0.5348913029963394
0.07770108930396213
vec2
0.0
vec1
0.0
0.6438835401838533
0.3610325527728171
0.059602370849219294
0.23895503536998705
0.16647413914742729
-0.17284007209619465
0.6417888034717878
0.28753527791057554
0.3881971694440945
0.25596472879600973
0.48043194238556175
0.47242629887420834
0.662409385234064
0.22523314998167998
vec1
0.0
0.7340683342360697
-0.013609544519483876
0.6878254420258126
0.5964044436335486
0.37544414221247063
0.38561358220504466
0.23305646417105497
-0.030257136838635332
0.0923981505332208
0.4812901949758729
0.4430288482597332
0.4276402056214263
0.34353721462857584
0.47775360404332445
0.16598255399110182
vec1
vec2
0.0
0.35188002162814613
vec2


 12%|█▏        | 46/400 [00:57<07:22,  1.25s/it] 


KeyboardInterrupt: 