# Define function

In [1]:
from typing import Callable, List
import numpy as np
from tqdm import tqdm
import pandas as pd
import string
from scipy.spatial import distance
from scipy import stats

In [2]:
word2vecpath = '../word2vec/W2V_150.txt'
visim = '../Datasets/ViSim-400'
vicon = '../Datasets/ViCon-400'
words = []
vecs = []
dim = None
n_vocab = None

In [3]:
def shuffle_df(df: pd.DataFrame) -> pd.DataFrame:
    return df.sample(frac=1).reset_index(drop=True)


def batchify(x: np.array, y: np.array, batch_size):
    assert x.shape[0] == y.shape[0]
    total_sample = x.shape[0]
    n_sections = total_sample // batch_size
    for bx, by in zip(np.split(x, n_sections), np.split(y, n_sections)):
        yield bx, by


def word2norm(a: str) -> str:
    table = str.maketrans(dict.fromkeys(
        string.punctuation))  # OR {key: None for key in string.punctuation}
    new_s = a.translate(table)
    return new_s

def word2vec(a: str) -> np.array:
    try:
        a = word2norm(a)
        i = words.index(a)
        return vecs[i]
    except:
        return np.zeros(dim)


def cosine(a: np.array, b: np.array) -> float:
    a = a / np.linalg.norm(a) if np.linalg.norm(a) != 0 else a
    b = b / np.linalg.norm(b) if np.linalg.norm(b) != 0 else b
    return a.dot(b)


#  Dot	Product Distance, Euclidean Distance, Dice Distance, Jaccard Distance.


def dot(a: np.array, b: np.array) -> float:
    return a.dot(b)


def euclid(a: np.array, b: np.array) -> float:
    return np.linalg.norm(a - b)


def dice(a: np.array, b: np.array) -> float:
    return distance.dice(a, b)


def jaccard(a: np.array, b: np.array) -> float:
    return distance.jaccard(a, b, w=None)


def sim(row, sim_f=cosine):
    vec1 = word2vec(row.iloc[0])
    vec2 = word2vec(row.iloc[1])
    return sim_f(vec1, vec2)


def is_oov(word) -> bool:
    a = word2norm(word)
    return a not in words


def drop_oov(df: pd.DataFrame) -> pd.DataFrame:
    not_oov_index = df.apply(lambda row: not any([is_oov(row.iloc[i]) for i in [0, 1]]), axis=1).tolist()
    return df[not_oov_index]

In [4]:
with open(word2vecpath, encoding='utf8') as f:
    for line in tqdm(f, f"loading {word2vecpath} to variables"):
        if not n_vocab:
            n_vocab = int(line)
        elif not dim:
            dim = int(line)
        else:
            line = line.replace('\n', '')
            words.append(word2norm(line.split('  ')[0]))
            vecs.append([float(i) for i in line.split('  ')[1].split()])
vecs = np.array(vecs)

loading ../word2vec/W2V_150.txt to variables: 77023it [00:04, 16298.09it/s]


# Dataloader

In [5]:
simpairs = pd.read_csv(visim + '/Visim-400.txt', sep="\t")
simpairs = drop_oov(simpairs)

npairs = pd.read_csv(vicon + '/400_verb_pairs.txt', sep="\t")
vpairs = pd.read_csv(vicon + '/400_verb_pairs.txt', sep="\t")
apairs = pd.read_csv(vicon + '/600_adj_pairs.txt', sep="\t")

testsetdf = pd.concat([npairs, vpairs, apairs])[['Word1', 'Word2', 'Relation']].drop_duplicates()
testsetdf = drop_oov(testsetdf)
testsetdf = shuffle_df(testsetdf)

train_records = [
    {
        'Word1': line.split(' ')[0].strip(),
        'Word2': line.split(' ')[1].strip(),
        'Relation': 'SYN'
    } for line in open('../antonym-synonym set/Synonym_vietnamese.txt', encoding='utf8')
]

train_records.extend([
    {
        'Word1': line.split()[0].strip(),
        'Word2': line.split()[1].strip(),
        'Relation': 'ANT'
    } for line in open('../antonym-synonym set/Antonym_vietnamese.txt', encoding='utf8')
])

trainsetdf = pd.DataFrame.from_records(train_records).drop_duplicates()
trainsetdf = drop_oov(trainsetdf)
trainsetdf = shuffle_df(trainsetdf)


def flatten(row):
    vec1 = word2vec(row.iloc[0])
    vec2 = word2vec(row.iloc[1])
    return np.array([vec1, vec2])


train_x = np.array([i for i in trainsetdf.apply(flatten, axis=1)])
train_y = trainsetdf.Relation.map({'ANT': 0, 'SYN': 1})

test_x = np.array([i for i in testsetdf.apply(flatten, axis=1)])
test_y = testsetdf.Relation.map({'ANT': 0, 'SYN': 1})

In [6]:
trainsetdf

Unnamed: 0,Word1,Word2,Relation
0,thèm,thèm_thuồng,SYN
1,chầu_chực,chờ_chực,SYN
2,vẹo,xiêu_vẹo,SYN
3,lất_phất,phất_phơ,SYN
4,lừa_đảo,tỉnh_ngộ,ANT
...,...,...,...
7345,động,đụng,SYN
7346,leo,trèo,SYN
7347,thò,thọc,SYN
7348,bất_thần,chợt,SYN


In [7]:
testsetdf

Unnamed: 0,Word1,Word2,Relation
0,chờ_mong,mong_chờ,SYN
1,đứng_tuổi,luống_tuổi,SYN
2,bán,mua,ANT
3,tàn_ác,tàn_tệ,SYN
4,bác_bỏ,chấp_nhận,ANT
...,...,...,...
841,mau,thưa,ANT
842,chóng,muộn,ANT
843,chua_cay,sâu_cay,SYN
844,ăn_năn,ân_hận,SYN


In [8]:
print(f'train_x shape = {train_x.shape}\n'
      f'train_y shape = {train_y.shape}\n'
      f'test_x shape = {test_x.shape}\n'
      f'test_y.shape = {test_y.shape}')

train_x shape = (7350, 2, 150)
train_y shape = (7350,)
test_x shape = (846, 2, 150)
test_y.shape = (846,)


In [9]:
trainsetdf_values = trainsetdf.apply(lambda row: '-'.join(row.iloc), axis=1).tolist()
actual_testdf = testsetdf[testsetdf.apply(lambda row: '-'.join(row.iloc) in trainsetdf_values, axis=1).tolist()]

atest_x = np.array([i for i in actual_testdf.apply(flatten, axis=1)])
atest_y = actual_testdf.Relation.map({'ANT': 0, 'SYN': 1})
actual_testdf.shape

(835, 3)

# Task 1

In [10]:
distances = [('cosine', cosine), ('dot', dot), ('euclid', euclid),
             ('dice', dice), ('jaccard', jaccard)]
for name, function in distances:
    simpairs[f'sim-{name}'] = simpairs.apply(sim, axis=1, sim_f=function)
simpairs

Unnamed: 0,Word1,Word2,POS,Sim1,Sim2,STD,sim-cosine,sim-dot,sim-euclid,sim-dice,sim-jaccard
0,biến,ngập,V,3.13,5.22,0.72,-0.004912,-1.493676,25.296228,0.808093,1.0
1,nhà_thi_đấu,nhà,N,3.07,5.12,1.18,0.082523,18.257401,22.118834,-3.074252,1.0
2,động,tĩnh,V,0.60,1.00,0.95,0.277086,39.547434,14.640360,-24.627852,1.0
3,khuyết,ưu,N,0.20,0.33,0.40,0.176799,40.841349,19.508880,-0.468402,1.0
5,thủ_pháp,biện_pháp,N,4.13,6.88,1.26,0.402366,106.914893,17.831482,-2.735911,1.0
...,...,...,...,...,...,...,...,...,...,...,...
393,triều_đại,cổ_đại,N,3.67,6.12,1.14,0.274376,76.308166,20.353694,-6.902056,1.0
395,lình_xình,nặng_tình,A,1.33,2.22,1.14,0.170494,38.338900,19.600983,-1.993741,1.0
396,người_làm,người_bị_hại,N,2.20,3.67,0.83,0.135008,27.805490,18.980925,-3.418888,1.0
398,chần_chừ,lảo_đảo,V,3.20,5.33,0.98,0.112939,20.632544,18.305506,5.200466,1.0


In [12]:
Sim1 = simpairs['Sim1'].tolist()
Sim2 = simpairs['Sim2'].tolist()
Cosine = simpairs['sim-cosine'].tolist()

print(" Pearson correlation coefficient: ", stats.pearsonr(Sim1, Cosine))
print(" Spearman's rank correlation coefficient: ", stats.spearmanr(Sim1, Cosine))

 Pearson correlation coefficient:  (0.4468430791767022, 2.7457782359140626e-18)
 Spearman's rank correlation coefficient:  SpearmanrResult(correlation=0.4077422929392862, pvalue=3.2726552283981985e-15)


# Task 2

In [18]:
def topn(w: str,
         vocab: List[str] = words,
         encoder: Callable = word2vec,
         distance_by: Callable = cosine,
         n: int = 5) -> list:
    input_encode = encoder(w)
    vocab_sim = [(other, distance_by(input_encode, encoder(other)))
                 for other in tqdm(vocab, "Scanning vocab")]
    vocab_sim.sort(key=lambda x: x[1], reverse=True)
    return vocab_sim[1:n+1]


# task 2 with word: 'tượng_đài'. Note: words in vocabulary are all NORMALIZED!
print(topn('tượng_đài', n=5))

Scanning vocab: 100%|██████████| 77021/77021 [01:27<00:00, 881.21it/s]  

[('đềnthờ', 0.5623567247142175), ('thápchuông', 0.5443984164585618), ('biatưởngniệm', 0.5406205813189373), ('giáođường', 0.5329588054503885), ('lăngmộ', 0.5287220564708057)]





# Task 3

## Train base model

In [14]:
# ! pip install tensorboard
# ! pip install tensorflow


import torch

batch_size = 30
import torch.nn as nn
import torch.nn.functional as F


class Discriminator(nn.Module):
    def __init__(self, embedding_dim, hidden_dim):
        super(Discriminator, self).__init__()
        self.ln1 = nn.Linear(embedding_dim, hidden_dim)
        self.ln2 = nn.Linear(hidden_dim, hidden_dim)
        self.ln3 = nn.Linear(hidden_dim, hidden_dim)
        self.ln4 = nn.Linear(hidden_dim, hidden_dim)
        self.ln5 = nn.Linear(hidden_dim+1, 2)

    def _sim(self, x):
        """
        Calculate cosine similarity between pairs of embedding vector
        :param x: input shape: [batch_size x 2 x embedding_dim]
        :return: shape: [batch_size,] similarity between input pairs
        """
        x1 = torch.squeeze(x[:, 0, ...], dim=1)  # -> [batch_size x embedding_dim]
        x2 = torch.squeeze(x[:, 1, ...], dim=1)  # -> [batch_size x embedding_dim]
        return F.cosine_similarity(x1, x2).reshape(-1,1)
    def _diff(self, x):
        """
        Calculate difference vector between pairs of embedding vector
        :param x: input shape: [batch_size x 2 x embedding_dim]
        :return: shape: [batch_size x embedding_dim]
        """
        x1 = torch.squeeze(x[:, 0, ...], dim=1)  # -> [batch_size x embedding_dim]
        x2 = torch.squeeze(x[:, 1, ...], dim=1)  # -> [batch_size x embedding_dim]
        return x1-x2

    def forward(self, x,**kwargs):
        """
        :param x: input shape: [batch_size x 2 x embedding_dim]
        :return: shape: [batch_size,] similarity between input pairs
        """
        x = F.relu(self.ln1(x))
        x = F.tanh(self.ln2(x))
        x = F.relu(self.ln3(x))
        x = F.tanh(self.ln4(x))
        sim_ = self._sim(x)
        dif_ = self._diff(x)
        x = torch.cat((dif_, sim_),1) 
        x = self.ln5(x)
        x = F.softmax(x, dim=-1)
        return x


model = Discriminator(embedding_dim=150, hidden_dim=500)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
labels_dict = {
    0: [1, 0],
    1: [0, 1],
}
loss_f = torch.nn.BCELoss()


def train_one_epoch(epoch_index):
    running_loss = 0.
    last_loss = 0.

    for i, (x, y) in enumerate(batchify(train_x, train_y, batch_size)):
        y = torch.Tensor([labels_dict[yi] for yi in y])
        optimizer.zero_grad()
        predict_y = model(torch.Tensor(x), training=True)
        loss_value = loss_f(predict_y, y)
        loss_value.backward()
        optimizer.step()
        running_loss += loss_value
        # Gather data and report
        running_loss += loss_value
        if i % 100 == 99:
            last_loss = running_loss / 1000  # loss per batch
            print('epoch {}  batch {} loss: {}'.format(epoch_index, i + 1, last_loss))
            running_loss = 0

    return last_loss

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
epochs = 20
for epoch in range(epochs):
    train_one_epoch(epoch_index=epoch)



epoch 0  batch 100 loss: 0.1175815686583519
epoch 0  batch 200 loss: 0.09047933667898178
epoch 1  batch 100 loss: 0.08473016321659088
epoch 1  batch 200 loss: 0.08031094819307327
epoch 2  batch 100 loss: 0.08283901959657669
epoch 2  batch 200 loss: 0.07886103540658951
epoch 3  batch 100 loss: 0.08199287950992584
epoch 3  batch 200 loss: 0.07765471935272217
epoch 4  batch 100 loss: 0.08068524301052094
epoch 4  batch 200 loss: 0.07579269260168076
epoch 5  batch 100 loss: 0.07835826277732849
epoch 5  batch 200 loss: 0.07256737351417542
epoch 6  batch 100 loss: 0.07402992248535156
epoch 6  batch 200 loss: 0.06711097061634064
epoch 7  batch 100 loss: 0.06729568541049957
epoch 7  batch 200 loss: 0.060301147401332855
epoch 8  batch 100 loss: 0.05982448533177376
epoch 8  batch 200 loss: 0.053859204053878784
epoch 9  batch 100 loss: 0.05326998978853226
epoch 9  batch 200 loss: 0.04836992919445038
epoch 10  batch 100 loss: 0.048045191913843155
epoch 10  batch 200 loss: 0.04367933049798012
epoch 

## Test

In [16]:
predict = np.argmax(model(torch.Tensor(test_x)).detach().numpy(), axis=1).tolist()

apredict = np.argmax(model(torch.Tensor(atest_x)).detach().numpy(), axis=1).tolist()

from sklearn.metrics import classification_report
target_names = ['ANT', 'SYN']
print('original test set (drop oov)')
print(classification_report(test_y.tolist(), predict, target_names=target_names))
print('____________')
print('d-test set (drop oov, drop same pair with train set)')
print(classification_report(atest_y.tolist(), apredict, target_names=target_names))
print('____________')

original test set (drop oov)
              precision    recall  f1-score   support

         ANT       0.99      0.91      0.95       466
         SYN       0.90      0.99      0.95       380

    accuracy                           0.95       846
   macro avg       0.95      0.95      0.95       846
weighted avg       0.95      0.95      0.95       846

____________
d-test set (drop oov, drop same pair with train set)
              precision    recall  f1-score   support

         ANT       1.00      0.91      0.95       466
         SYN       0.90      1.00      0.95       369

    accuracy                           0.95       835
   macro avg       0.95      0.96      0.95       835
weighted avg       0.96      0.95      0.95       835

____________


## Adapter

In [17]:
def predict(w1: str, w2: str) -> str:
    w1 = word2vec(word2norm(w1))
    w2 = word2vec(word2norm(w2))
    input = torch.Tensor([[w1, w2]])
    print(input.shape)
    output = model(input).detach().numpy()
    output = np.argmax(output, axis=1).tolist()[0]
    print(output)
    if output == 0:
        return 'ANT'
    else:
        return 'SYN'


print(predict('thanh_danh', 'ô_nhục'))

torch.Size([1, 2, 150])
0
ANT


  input = torch.Tensor([[w1, w2]])
