# P1. Cosine similarity

In [1]:
import numpy as np
import sklearn
import sklearn.metrics

# Load pre-trained word embeddings from a file
def load_embeddings(file_path):
    """Read a whitespace-separated text embedding file into a dictionary.

    Every usable line has the form ``<word> <v1> <v2> ...``. Lines that split
    into fewer than two tokens (blank lines, a lone word) are ignored. When a
    word appears more than once, the last occurrence replaces earlier ones.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        dict mapping each word (str) to a 1-D ``float32`` NumPy array built
        from the remaining tokens on its line.

    Raises:
        ValueError: If a token after the word cannot be converted to a float.
    """
    word_to_vector = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            tokens = raw_line.split()
            # Need at least a word plus one component to form an entry.
            if len(tokens) < 2:
                continue
            token, *components = tokens
            word_to_vector[token] = np.array(components, dtype='float32')
    return word_to_vector

# Function to calculate cosine similarity (returns the raw cosine in [-1, 1]; no rescaling to [0, 6] is applied)
# def cosine_similarity(vec1, vec2):
#     return cosine(vec1, vec2)

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two equal-length vectors.

    Args:
        vec1: First vector (anything accepted by ``len``, ``np.dot`` and
            ``np.linalg.norm``).
        vec2: Second vector; must have the same ``len`` as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``, or the integer ``0``
        when either vector has zero norm.

    Raises:
        ValueError: If the two vectors differ in length.
    """
    if len(vec1) != len(vec2):
        raise ValueError("Vectors must be of the same length")

    numerator = np.dot(vec1, vec2)
    magnitudes = (np.linalg.norm(vec1), np.linalg.norm(vec2))

    # A zero vector has no direction: report "no similarity" instead of
    # dividing by zero.
    if any(magnitude == 0 for magnitude in magnitudes):
        return 0

    return numerator / (magnitudes[0] * magnitudes[1])


# def cosine_similarity(vec1, vec2):
#     vec1 = vec1.reshape(1, -1)
#     vec2 = vec2.reshape(1, -1)
#     return sklearn.metrics.pairwise.cosine_similarity(vec1, vec2)[0][0]

# Example usage
# Load the embedding table once at module level; later cells reuse `embeddings`.
embeddings = load_embeddings("datasets/W2V_150.txt")
# Direct indexing raises KeyError if either word is absent from the loaded table.
vec1 = embeddings['sinh_viên']
vec2 = embeddings['tân_việt']
# Raw cosine of the two vectors (negative values are possible).
similarity = cosine_similarity(vec1, vec2)
print(f'Cosine Similarity: {similarity}')


Cosine Similarity: -0.3297407031059265


# OPTION. Evaluating Cosine Similarity Using Pearson and Spearman Correlations

In [2]:
from scipy.stats import pearsonr, spearmanr

# Load ViSim-400 dataset
def load_visim400(file_path):
    """Load word pairs and one rating column from a ViSim-400 style file.

    The first line of the file is treated as a header and skipped. Every
    remaining non-blank line must hold exactly six whitespace-separated
    fields: the first two are taken as the word pair and the fourth is parsed
    as the rating. Blank or whitespace-only lines (for example a trailing
    newline at the end of the file) are ignored instead of aborting the load.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A tuple ``(word_pairs, sim_scores)`` where ``word_pairs`` is a list of
        ``(word1, word2)`` tuples and ``sim_scores`` is the parallel list of
        float ratings.

    Raises:
        ValueError: If a non-blank data line does not have exactly six fields
            or its fourth field is not a valid float.
    """
    word_pairs = []
    sim_scores = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            fields = line.split()
            if not fields:
                continue  # blank line: nothing to parse
            # Only the fourth field is used as the rating.
            # NOTE(review): presumably the fifth field is the alternative
            # "sim2" rating — confirm against the file's header.
            word1, word2, _, sim1, _, _ = fields
            word_pairs.append((word1, word2))
            sim_scores.append(float(sim1))
    return word_pairs, sim_scores

# Calculate correlations
def evaluate_correlation(embeddings, word_pairs, human_ratings):
    """Correlate embedding-based similarities with human ratings.

    For each pair, the similarity is ``cosine_similarity`` of the two word
    vectors; a pair with at least one word missing from ``embeddings`` is
    scored ``0`` so that the result stays aligned with ``human_ratings``.

    Args:
        embeddings: Mapping from word to vector.
        word_pairs: Iterable of ``(word1, word2)`` tuples.
        human_ratings: Sequence of ratings, one per pair, in the same order.

    Returns:
        A tuple ``(pearson_corr, spearman_corr)`` holding the two correlation
        coefficients (p-values are discarded).
    """
    def _score(first, second):
        # Fall back to 0 when either word has no vector.
        if first in embeddings and second in embeddings:
            return cosine_similarity(embeddings[first], embeddings[second])
        return 0

    model_scores = [_score(first, second) for first, second in word_pairs]

    pearson_corr, _pearson_p = pearsonr(model_scores, human_ratings)
    spearman_corr, _spearman_p = spearmanr(model_scores, human_ratings)
    return pearson_corr, spearman_corr

# Load ViSim-400 dataset and evaluate
# NOTE(review): this path starts with "Datasets/" while other cells read from
# "datasets/" — confirm the directory name on case-sensitive filesystems.
word_pairs, sim_scores = load_visim400("Datasets/ViSim-400/Visim-400.txt")
# Uses the module-level `embeddings` loaded in the first cell.
pearson_corr, spearman_corr = evaluate_correlation(embeddings, word_pairs, sim_scores)

print(f'Pearson Correlation: {pearson_corr}')
print(f'Spearman Correlation: {spearman_corr}')


Pearson Correlation: 0.3446108828718941
Spearman Correlation: 0.2959639633854359


# P3. K-Nearest Words

In [3]:
import heapq # priority queue

# Function to find k most similar words
def k_nearest_words(word, embeddings, k=5):
    """Return the ``k`` vocabulary entries most cosine-similar to ``word``.

    Args:
        word: Query word; it is excluded from its own result list.
        embeddings: Mapping from word to vector.
        k: Maximum number of neighbours to return.

    Returns:
        A list of ``(similarity, other_word)`` tuples ordered from most to
        least similar. If ``word`` is not in ``embeddings`` a message is
        printed and an empty list is returned.
    """
    if word not in embeddings:
        print(f'{word} not found in embeddings.')
        return []

    query_vec = embeddings[word]
    scored_candidates = [
        (cosine_similarity(query_vec, candidate_vec), candidate)
        for candidate, candidate_vec in embeddings.items()
        if candidate != word
    ]

    # Rank on the similarity alone so ties are never broken by comparing words.
    return heapq.nlargest(k, scored_candidates, key=lambda pair: pair[0])

# Example usage
# The result is a list of (similarity, word) tuples, most similar first.
nearest_words = k_nearest_words('sinh_viên', embeddings, k=5)
print(nearest_words)


[(0.7981586, 'sv'), (0.6784422, 'du_học_sinh'), (0.6702902, 'hs-sv'), (0.661668, 'học_viên'), (0.62819535, 'lưu_học_sinh')]


# P4. Synonym-Antonym Classification

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support

# Load synonym-antonym dataset
def _syn_ant_pair_features(path, embeddings):
    """Return the concatenated-embedding feature of every usable pair in ``path``."""
    features = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. a trailing newline) carries no pair; skip it
            # instead of failing the two-way unpack below.
            if not line.strip():
                continue
            # Pairs are separated by a single space; any other layout still
            # raises ValueError, as before.
            word1, word2 = line.split(' ')
            word1 = word1.strip()
            word2 = word2.strip()
            if word1 in embeddings and word2 in embeddings:
                # Feature vector is the concatenation of the two embeddings.
                features.append(np.concatenate([embeddings[word1], embeddings[word2]]))
    return features


def load_syn_ant_dataset(synpath, antpath, embeddings):
    """Build a labelled feature matrix from a synonym file and an antonym file.

    Each file holds one ``word1 word2`` pair per line, separated by a single
    space. A pair is kept only when both words are present in ``embeddings``;
    its feature is the concatenation of the two word vectors. Blank lines are
    ignored.

    Args:
        synpath: Path of the UTF-8 synonym-pair file (rows labelled ``1``).
        antpath: Path of the UTF-8 antonym-pair file (rows labelled ``0``).
        embeddings: Mapping from word to vector.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: one feature row per kept pair
        (synonym rows first, then antonym rows) and the matching labels.

    Raises:
        ValueError: If a non-blank line does not split into exactly two parts
            on a single space.
    """
    X = []
    y = []
    # Synonyms first, then antonyms, so the row order matches the file order.
    for path, label in ((synpath, 1), (antpath, 0)):
        features = _syn_ant_pair_features(path, embeddings)
        X.extend(features)
        y.extend([label] * len(features))

    return np.array(X), np.array(y)

# Train and evaluate the classifier
def train_classifier(clf, X, y):
    """Fit ``clf`` on a shuffled 80/20 split of ``(X, y)`` and print test metrics.

    The split uses ``test_size=0.2`` and ``random_state=42``. After fitting on
    the training part, accuracy (``clf.score``) and binary-averaged precision,
    recall and F1 on the held-out part are printed.

    Args:
        clf: Estimator exposing ``fit``, ``predict`` and ``score``.
        X: Feature rows.
        y: Labels, one per row of ``X``.

    Returns:
        The same ``clf`` object, now fitted.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
    X_train, X_test, y_train, y_test = split

    clf.fit(X_train, y_train)

    held_out_pred = clf.predict(X_test)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_test, held_out_pred, average='binary'
    )
    accuracy = clf.score(X_test, y_test)

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}\nRecall: {recall}\nF1: {f1}\n')
    return clf

In [5]:
def _labelled_pair_features(path, embeddings):
    """Read ``word1 word2 relation`` lines from ``path`` into (features, labels) lists."""
    features = []
    labels = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            # A blank line (e.g. a trailing newline) carries no record; skip
            # it instead of failing the three-way unpack below.
            if not fields:
                continue
            word1, word2, relation = fields
            if word1 in embeddings and word2 in embeddings:
                # Feature vector is the concatenation of the two embeddings.
                features.append(np.concatenate([embeddings[word1], embeddings[word2]]))
                labels.append(1 if relation == 'SYN' else 0)  # SYN = 1, anything else = 0
    return features, labels


def load_data(nounpath, verbpath, adjpath, embeddings):
    """Build a labelled feature matrix from three ``word1 word2 relation`` files.

    The files are read in the order noun, verb, adjective. A line is kept only
    when both words are present in ``embeddings``; its feature is the
    concatenation of the two word vectors and its label is ``1`` when the
    relation field equals ``'SYN'`` and ``0`` otherwise. Blank lines are
    ignored.

    Args:
        nounpath: Path of the UTF-8 noun-pair file.
        verbpath: Path of the UTF-8 verb-pair file.
        adjpath: Path of the UTF-8 adjective-pair file.
        embeddings: Mapping from word to vector.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: one feature row per kept pair and
        the matching 0/1 labels.

    Raises:
        ValueError: If a non-blank line does not have exactly three
            whitespace-separated fields.
    """
    X = []
    y = []
    for path in (nounpath, verbpath, adjpath):
        features, labels = _labelled_pair_features(path, embeddings)
        X.extend(features)
        y.extend(labels)

    return np.array(X), np.array(y)

In [6]:
# Predict and evaluate
def get_prediction(clf, X, y):
    """Predict labels for ``X`` with a fitted ``clf`` and print metrics against ``y``.

    Prints accuracy (``clf.score``) followed by binary-averaged precision,
    recall and F1.

    Args:
        clf: Fitted estimator exposing ``predict`` and ``score``.
        X: Feature rows to classify.
        y: True labels, one per row of ``X``.

    Returns:
        The predictions returned by ``clf.predict(X)``.
    """
    predicted = clf.predict(X)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y, predicted, average='binary'
    )
    accuracy = clf.score(X, y)

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}\nRecall: {recall}\nF1: {f1}\n')
    return predicted

In [7]:
# Load dataset and train
# X holds one concatenated-embedding row per pair; y is 1 for synonym pairs and 0 for antonym pairs.
X, y = load_syn_ant_dataset("antonym-synonym set/Synonym_vietnamese.txt", "antonym-synonym set/Antonym_vietnamese.txt", embeddings)
# Feature scaling is currently disabled:
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

# Logistic regression with default settings; train_classifier prints the held-out metrics
# and returns the fitted estimator.
clf = LogisticRegression()
clf = train_classifier(clf, X, y)

Accuracy: 0.8363522798251093
Precision: 0.8621190130624092
Recall: 0.9428571428571428
F1: 0.9006823351023503



In [8]:
# Load dataset and predict
# Second dataset (noun, verb and adjective pair files) used only for evaluation.
X_new, y_new = load_data('datasets/ViCon-400/400_noun_pairs.txt', 'datasets/ViCon-400/400_verb_pairs.txt', 'datasets/ViCon-400/600_adj_pairs.txt', embeddings)

# NOTE(review): if scaling is re-enabled, this should probably be scaler.transform(X_new)
# so the evaluation data is not used to refit the scaler — confirm.
# X_new = scaler.fit_transform(X_new)

# Prints the metrics of `clf` on the new pairs and keeps the predicted labels.
prediction = get_prediction(clf, X_new, y_new)

Accuracy: 0.6751918158567775
Precision: 0.592057761732852
Recall: 0.9213483146067416
F1: 0.7208791208791209



In [9]:
from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV

In [13]:
# Candidate SVC hyper-parameters for a grid search (the search itself is
# disabled below; a fixed configuration is trained instead).
spaces = {
    'kernel': ['linear', 'rbf'],
    'C': [10, 100, 1000],
    'gamma': [0.1, 0.01, 0.001]
}
# clf = GridSearchCV(SVC(), spaces, refit=True, verbose=2)

# Train the SVC built on the line below. Passing `clf` here instead would
# re-fit the classifier from an earlier cell and throw the new SVC away.
clf1 = SVC(kernel='rbf', C=10, gamma=0.01)
clf1 = train_classifier(clf1, X, y)

Accuracy: 0.9381636477201749
Precision: 0.9277818717759764
Recall: 0.9992063492063492
F1: 0.9621704241497898



In [16]:
# Evaluate `clf1` on the pairs held in X_new / y_new; metrics are printed and
# the predicted labels are kept in `prediction1`.
prediction1 = get_prediction(clf1, X_new, y_new)

Accuracy: 0.9624893435635123
Precision: 0.9238754325259516
Recall: 1.0
F1: 0.960431654676259



In [15]:
from sklearn.neural_network import MLPClassifier

In [19]:
# Multi-layer perceptron with two hidden layers (300 and 100 units); the fixed
# random_state seeds the estimator.
clf2 = MLPClassifier(hidden_layer_sizes=(300, 100), max_iter=1000, random_state=42)

# train_classifier fits on an 80/20 split of (X, y), prints held-out metrics
# and returns the fitted estimator.
clf2 = train_classifier(clf2, X, y)

Accuracy: 0.9594003747657714
Precision: 0.9671618451915559
Recall: 0.9817460317460317
F1: 0.974399369830642



In [21]:
# Evaluate `clf2` on the pairs held in X_new / y_new; metrics are printed and
# the predicted labels are kept in `prediction2`.
prediction2 = get_prediction(clf2, X_new, y_new)

Accuracy: 0.9846547314578005
Precision: 0.9708029197080292
Recall: 0.9962546816479401
F1: 0.9833641404805915

