# Define function

In [1]:
from typing import Callable, List
import numpy as np
from tqdm import tqdm
import pandas as pd
import string
from scipy.spatial import distance

word2vecpath = '../word2vec/W2V_150.txt'
visim = '../Datasets/ViSim-400'
vicon = '../Datasets/ViCon-400'
words = []
vecs = []
dim = None
n_vocab = None


def word2norm(a: str) -> str:
    table = str.maketrans(dict.fromkeys(
        string.punctuation))  # OR {key: None for key in string.punctuation}
    new_s = a.translate(table)
    return new_s


with open(word2vecpath, encoding='utf8') as f:
    for line in tqdm(f, f"loading {word2vecpath} to variables"):
        if not n_vocab:
            n_vocab = int(line)
        elif not dim:
            dim = int(line)
        else:
            line = line.replace('\n', '')
            words.append(word2norm(line.split('  ')[0]))
            vecs.append([float(i) for i in line.split('  ')[1].split()])
vecs = np.array(vecs)


def word2vec(a: str) -> np.array:
    try:
        a = word2norm(a)
        i = words.index(a)
        return vecs[i]
    except:
        return np.zeros(dim)


def cosine(a: np.array, b: np.array) -> float:
    a = a / np.linalg.norm(a) if np.linalg.norm(a) != 0 else a
    b = b / np.linalg.norm(b) if np.linalg.norm(b) != 0 else b
    return a.dot(b)


#  Dot	Product Distance, Euclidean Distance, Dice Distance, Jaccard Distance.


def dot(a: np.array, b: np.array) -> float:
    return a.dot(b)


def euclid(a: np.array, b: np.array) -> float:
    return np.linalg.norm(a - b)


def dice(a: np.array, b: np.array) -> float:
    return distance.dice(a, b)


def jaccard(a: np.array, b: np.array) -> float:
    return distance.jaccard(a, b, w=None)


def sim(row, sim_f=cosine):
    vec1 = word2vec(row.iloc[0])
    vec2 = word2vec(row.iloc[1])
    return sim_f(vec1, vec2)


def topn(w: str,
         vocab: List[str] = words,
         encoder: Callable = word2vec,
         distance_by: Callable = cosine,
         n: int = 5) -> list:
    input_encode = encoder(w)
    vocab_sim = [(other, distance_by(input_encode, encoder(other)))
                 for other in tqdm(vocab, "Scanning vocab")]
    vocab_sim.sort(key=lambda x: x[1], reverse=True)
    return vocab_sim[:n] 

loading ../word2vec/W2V_150.txt to variables: 77023it [00:07, 10568.78it/s]


# Dataloader

In [2]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore',sparse=False)

simpairs = pd.read_csv(visim + '/Visim-400.txt', sep="\t")
npairs = pd.read_csv(vicon + '/400_verb_pairs.txt', sep="\t")
vpairs = pd.read_csv(vicon + '/400_verb_pairs.txt', sep="\t")
apairs = pd.read_csv(vicon + '/600_adj_pairs.txt', sep="\t")
dataset =  pd.concat([npairs, vpairs, apairs])[['Word1',	'Word2',	'Relation']]

def flatten(row):
    vec1 = word2vec(row.iloc[0])
    vec2 = word2vec(row.iloc[1])
    return np.array([vec1, vec2])

X = np.array([i for i in dataset.apply(flatten, axis=1)])
y = enc.fit_transform(dataset['Relation'].values.reshape(-1,1))
    

dataset


Unnamed: 0,Word1,Word2,Relation
0,giải_thoát,thi_hành,ANT
1,kìm_giữ,trói_buộc,SYN
2,gây_lộn,gây_sự,SYN
3,lầm,nhầm,SYN
4,chuyển_dạ,trở_dạ,SYN
...,...,...,...
595,ấm_áp,lạnh_lẽo,ANT
596,mập,ngẳng,ANT
597,chóng,lâu,ANT
598,chậm,sớm,ANT


# Task 1

In [5]:
distances = [('cosine', cosine), ('dot', dot), ('euclid', euclid),
                ('dice', dice), ('jaccard', jaccard)]
for name, function in distances:
    simpairs[f'sim-{name}'] = simpairs.apply(sim, axis=1, sim_f=function)
simpairs

  return float((ntf + nft) / np.array(2.0 * ntt + ntf + nft))


Unnamed: 0,Word1,Word2,POS,Sim1,Sim2,STD,sim-cosine,sim-dot,sim-euclid,sim-dice,sim-jaccard
0,biến,ngập,V,3.13,5.22,0.72,-0.004912,-1.493676,25.296228,0.808093,1.0
1,nhà_thi_đấu,nhà,N,3.07,5.12,1.18,0.082523,18.257401,22.118834,-3.074252,1.0
2,động,tĩnh,V,0.60,1.00,0.95,0.277086,39.547434,14.640360,-24.627852,1.0
3,khuyết,ưu,N,0.20,0.33,0.40,0.176799,40.841349,19.508880,-0.468402,1.0
4,cõi_tục,cõi_âm,N,0.60,1.00,0.95,0.000000,0.000000,12.063605,1.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...
395,lình_xình,nặng_tình,A,1.33,2.22,1.14,0.170494,38.338900,19.600983,-1.993741,1.0
396,người_làm,người_bị_hại,N,2.20,3.67,0.83,0.135008,27.805490,18.980925,-3.418888,1.0
397,cõi_tục,trần_gian,N,5.40,9.00,0.71,0.000000,0.000000,14.601425,1.000000,1.0
398,chần_chừ,lảo_đảo,V,3.20,5.33,0.98,0.112939,20.632544,18.305506,5.200466,1.0


# Task 2

In [6]:

# task 2 with word: 'tượng_đài'. Note: words in vocabulary are all NORMALIZED! 
print(topn('tượng_đài', n=5))

Scanning vocab: 100%|██████████| 77021/77021 [01:32<00:00, 828.56it/s] 

[('tượngđài', 0.9999999999999999), ('đềnthờ', 0.5623567247142175), ('thápchuông', 0.5443984164585618), ('biatưởngniệm', 0.5406205813189373), ('giáođường', 0.5329588054503885)]





# Task 3

## Train base model

In [7]:
# ! pip install tensorboard
# ! pip install tensorflow


import tensorflow as tf
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

def create_model():
  return tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=X.shape[1:]),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(512, activation='sigmoid'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(512, activation='sigmoid'),
    tf.keras.layers.Dense(2, activation='softmax')
  ])
model = create_model()
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
              metrics=['binary_accuracy'])


model.fit(x=x_train, 
          y=y_train, 
          batch_size=30,
          epochs=40, 
          validation_data=(x_test, y_test))


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x1a197010a90>

In [8]:
model.evaluate(
    x=x_test,
    y=y_test
)



[0.8776617050170898, 0.8571428656578064]

## Adapter

In [9]:
def predict(w1:str, w2:str)->str:
    w1 = word2vec(word2norm(w1))
    w2 = word2vec(word2norm(w2))
    input = np.array([[w1,w2]])
    output = model.predict(input)
    print(output)
    return enc.inverse_transform(output)[0][0]

print(predict('thanh_danh','ô_nhục'))

[[9.4943371e-04 9.9905056e-01]]
SYN
