In [16]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [6]:
def load_embeddings(file_path):
    """Load word embeddings from a whitespace-separated text file.

    Every data line is expected to contain a word followed by the
    components of its vector. Blank lines are skipped, and a leading
    word2vec-style header line (``<vocab_size> <dimension>``, i.e. exactly
    two integer tokens on the first line) is ignored instead of being
    stored as a one-dimensional "embedding".

    Parameters
    ----------
    file_path : str or path-like
        Path to the UTF-8 encoded embeddings file.

    Returns
    -------
    dict
        Maps each word (``str``) to a 1-D ``numpy.ndarray`` of floats.
        If a word occurs more than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            values = line.split()
            if not values:
                # Blank / whitespace-only line: values[0] would raise IndexError.
                continue
            if line_no == 0 and len(values) == 2 and all(v.isdigit() for v in values):
                # word2vec text header "<vocab_size> <dim>", not an embedding row.
                continue
            word = values[0]
            embeddings[word] = np.array([float(v) for v in values[1:]])
    return embeddings

# Location of the pre-trained word vectors (the file name suggests
# 150-dimensional word2vec vectors — TODO confirm against the file itself).
embeddings_path = 'dataset/W2V_150.txt'
# Module-level lookup table; get_word_vector() reads this global directly.
embeddings = load_embeddings(embeddings_path)

def get_word_vector(word):
    """Return the embedding stored for ``word``.

    The vector is read from the module-level ``embeddings`` dict. A word
    that is not in the vocabulary yields a zero vector of length 150 so
    callers always receive an array.
    """
    try:
        return embeddings[word]
    except KeyError:
        # Out-of-vocabulary word: fall back to an all-zero vector.
        return np.zeros(150)

In [7]:
# Spot-check the lookup with a single word; as the cell's last expression
# the returned vector is displayed below.
get_word_vector('chó')

array([-0.2578754 , -0.9726154 , -1.724687  ,  0.1039191 ,  0.7886895 ,
       -0.9466447 ,  0.4693937 ,  1.152593  ,  0.1155706 ,  0.1692931 ,
        0.7421699 , -0.787329  , -0.6093806 ,  1.025403  ,  0.6054297 ,
       -1.236188  , -0.9221155 , -1.210821  , -0.2781016 ,  1.110141  ,
        1.728644  , -0.8907079 , -0.7430073 , -1.386758  , -0.6328576 ,
        0.4824337 , -0.4921961 , -0.5655223 , -1.927649  , -0.4070618 ,
        0.181633  ,  0.7142181 , -0.8746694 ,  0.8621774 ,  0.3763472 ,
       -0.2323172 , -1.696975  , -0.6795936 , -0.7101342 , -0.3980412 ,
        0.8449571 , -0.9750128 ,  1.558469  , -1.674035  , -0.680132  ,
       -0.97935   ,  0.07189509, -0.2478063 , -0.4585635 , -0.6954154 ,
        2.041032  ,  2.368579  ,  0.9618701 ,  1.230371  ,  2.721321  ,
        0.5995511 ,  0.8380752 ,  0.8573769 ,  1.95823   ,  1.082221  ,
        0.6791626 , -0.9185423 ,  0.3077531 ,  0.2545303 , -1.09807   ,
        0.07830691, -0.7900249 , -0.3596387 , -0.5374528 ,  0.70

In [8]:
def _load_labeled_pairs(file_path):
    """Read word pairs from ``file_path`` and turn each into a feature vector.

    The first line of the file is skipped (the original cell treated it as
    a header). Each remaining non-blank line must contain at least two
    whitespace-separated tokens; the first two are taken as the word pair
    and any further tokens are ignored. Lines with fewer than two tokens
    are reported and skipped.

    Splitting is done on any run of whitespace, matching ``load_data``:
    repeated spaces no longer produce empty-string "words" and
    tab-separated pairs are accepted.

    Returns
    -------
    list of numpy.ndarray
        One vector per pair: the first word's embedding concatenated with
        the second word's embedding.
    """
    features = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip the first line; the default avoids StopIteration on an empty file.
        next(f, None)
        for line in f:
            values = line.split()
            if not values:
                continue
            if len(values) < 2:
                print(f"Skipping line, not enough values: {values}")
                continue
            word1, word2 = values[0], values[1]
            features.append(
                np.concatenate([get_word_vector(word1), get_word_vector(word2)])
            )
    return features


pairs = []
labels = []

# Antonym pairs are labelled 0 and synonym pairs 1; the antonym file is
# processed first so the row order matches the original cell.
for pairs_path, pair_label in (
    ('./antonym-synonym/Antonym_vietnamese.txt', 0),
    ('./antonym-synonym/Synonym_vietnamese.txt', 1),
):
    file_pairs = _load_labeled_pairs(pairs_path)
    pairs.extend(file_pairs)
    labels.extend([pair_label] * len(file_pairs))

print(f"Processed pairs: {len(pairs)}, Labels: {len(labels)}")

X_train, y_train = np.array(pairs), np.array(labels)

Skipping line, not enough values: ['bằng_chứng']
Skipping line, not enough values: ['dài_ngoằng']
Skipping line, not enough values: ['dầu_nhớt']
Skipping line, not enough values: ['diễn_dịch']
Processed pairs: 13556, Labels: 13556


In [9]:
def load_data(file_path, label_map=None):
    """Load labelled word pairs from an evaluation file as feature vectors.

    Each non-blank line must contain at least three whitespace-separated
    tokens: ``word1 word2 label``. The two words are embedded with
    ``get_word_vector`` and concatenated into a single feature vector.

    Lines whose label is not a key of ``label_map`` (for example a column
    header row) are reported and skipped. Previously every label other
    than ``'ANT'`` was silently treated as a synonym, so such rows were
    counted as positive examples.

    Parameters
    ----------
    file_path : str or path-like
        Path to the pairs file.
    label_map : dict, optional
        Maps the upper-cased label token to the integer class. Defaults to
        ``{'ANT': 0, 'SYN': 1}``.
        NOTE(review): 'SYN' is assumed to be the synonym tag used by the
        ViCon files — confirm against the data; pass ``label_map`` to
        override.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The stacked feature vectors and the matching integer labels.
    """
    if label_map is None:
        label_map = {'ANT': 0, 'SYN': 1}

    pairs = []
    labels = []
    # errors='ignore' is kept from the original: undecodable bytes are
    # dropped rather than aborting the load.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            values = line.split()
            if not values:
                continue

            if len(values) < 3:
                print(f"Skipping line, not enough values: {values}")
                continue

            word1, word2, relation = values[0], values[1], values[2]

            label = label_map.get(relation.upper())
            if label is None:
                # Header rows and unknown tags must not become class-1 samples.
                print(f"Skipping line, unrecognized label: {values}")
                continue

            vector = np.concatenate([get_word_vector(word1), get_word_vector(word2)])
            pairs.append(vector)
            labels.append(label)

    return np.array(pairs), np.array(labels)

In [10]:
# Fixed seed for the MLP so that re-running the notebook from a fresh
# kernel reproduces the same fitted model (and therefore the same scores).
RANDOM_STATE = 42

# Linear baseline on the concatenated pair embeddings.
clf_logistic = LogisticRegression(max_iter=1000)
clf_logistic.fit(X_train, y_train)

# Two-hidden-layer perceptron on the same features; random_state pins the
# otherwise random weight initialisation and mini-batch sampling.
clf_mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    max_iter=500,
    activation='relu',
    random_state=RANDOM_STATE,
)
clf_mlp.fit(X_train, y_train)

def evaluate(y_true, y_pred, model_name):
    """Print precision, recall and F1 of ``y_pred`` against ``y_true``.

    The three scores are computed with scikit-learn's default settings and
    written on a single line prefixed with ``model_name``. Nothing is
    returned.
    """
    p, r, f = (
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )
    print(f"{model_name} - Precision: {p}, Recall: {r}, F1-Score: {f}")


In [11]:
# Noun pairs: featurize the evaluation file, then score both classifiers.
X_test, y_test = load_data('./dataset/ViCon-400/400_noun_pairs.txt')
len(X_test), len(y_test)  # not the cell's last expression, so nothing is displayed
y_pred_logistic, y_pred_mlp = clf_logistic.predict(X_test), clf_mlp.predict(X_test)

# Report in the same order as before: logistic regression first, then the MLP.
for model_label, predictions in (
    ('Logistic Regression', y_pred_logistic),
    ('MLP', y_pred_mlp),
):
    evaluate(y_test, predictions, model_label)

Logistic Regression - Precision: 0.5706051873198847, Recall: 0.9850746268656716, F1-Score: 0.7226277372262774
MLP - Precision: 0.9757281553398058, Recall: 1.0, F1-Score: 0.9877149877149877


In [12]:
# Featurize the verb evaluation pairs with the same loader used for the other test files.
X_verb_test, y_verb_test = load_data('./dataset/ViCon-400/400_verb_pairs.txt')

In [13]:
# Verb pairs: predict with both fitted classifiers, then print their scores
# (MLP first, then logistic regression).
ypred_mlpverb, ypred_logisticverb = (
    clf_mlp.predict(X_verb_test),
    clf_logistic.predict(X_verb_test),
)
for model_label, predictions in (
    ('MLP', ypred_mlpverb),
    ('Logistic Regression', ypred_logisticverb),
):
    evaluate(y_verb_test, predictions, model_label)

MLP - Precision: 0.9852941176470589, Recall: 1.0, F1-Score: 0.9925925925925926
Logistic Regression - Precision: 0.599388379204893, Recall: 0.9751243781094527, F1-Score: 0.7424242424242424


In [14]:
# Featurize the adjective evaluation pairs with the same loader used for the other test files.
X_adj_test, y_adj_test = load_data('./dataset/ViCon-400/600_adj_pairs.txt')

In [15]:
# Adjective pairs: predict with both fitted classifiers, then print their
# scores (logistic regression first, then the MLP).
ypred_mlpadj, ypred_logisticadj = (
    clf_mlp.predict(X_adj_test),
    clf_logistic.predict(X_adj_test),
)
for model_label, predictions in (
    ('Logistic Regression', ypred_logisticadj),
    ('MLP', ypred_mlpadj),
):
    evaluate(y_adj_test, predictions, model_label)

Logistic Regression - Precision: 0.6953125, Recall: 0.8870431893687708, F1-Score: 0.7795620437956204
MLP - Precision: 1.0, Recall: 1.0, F1-Score: 1.0
