In [None]:
import io
import os
import fasttext
import pandas as pd
import numpy as np

from transformers import set_seed
from sklearn.neural_network import MLPClassifier
from sentence_transformers.SentenceTransformer import SentenceTransformer

In [3]:
# The environment variable is set before seeding; ":4096:8" is presumably the
# cuBLAS workspace setting needed for deterministic CUDA kernels — TODO confirm
# against the PyTorch reproducibility notes.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
# Seed with 42 and request deterministic behaviour from `transformers`.
set_seed(42, deterministic=True)

In [4]:
def _read_word_pairs(path):
    """Read a headerless tab-separated file of (kata1, kata2, relasi) rows.

    Rows containing any missing value are dropped and the index is renumbered
    from 0, which the feature builders depend on when they look rows up with
    ``range(len(frame))``.
    """
    # NOTE(review): with read_csv's default NA handling, literal tokens such as
    # "null", "nan" or "NA" are parsed as missing, so word pairs containing
    # them are removed by dropna() — confirm that this is intended.
    pairs = pd.read_csv(path, sep="\t", header=None, names=["kata1", "kata2", "relasi"])
    pairs = pairs.dropna()
    return pairs.reset_index(drop=True)


train = _read_word_pairs("dataset/train.tsv")
test = _read_word_pairs("dataset/test.tsv")

In [5]:
def load_learned_embeddings(name: str):
    """Load word vectors from a space-separated text file into a dict.

    Every non-blank line is split on single spaces: the first token is used as
    the key and the remaining tokens are parsed as the vector components.

    Args:
        name: Path of the embeddings text file.

    Returns:
        dict mapping each line's first token to a 1-D float ``np.ndarray``.
        If a token appears on several lines, the last line wins.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a component cannot be parsed as a float.
    """
    data = {}
    # The context manager closes the handle even when parsing fails part-way.
    # errors='ignore' silently drops bytes that are not valid UTF-8.
    with open(name, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        for line in fin:
            tokens = line.rstrip().split(' ')
            if tokens == ['']:
                # Blank line: skip it instead of storing an empty vector under ''.
                continue
            data[tokens[0]] = np.array(tokens[1:], dtype=float)
    return data
# Module-level resources read by the feature-building functions below.
rwe = load_learned_embeddings('rwe_embeddings.txt')  # Self-trained
# rwe = load_learned_embeddings('reference_rwe.txt')  # From original repo
# fastText model loaded from a local binary; the file name suggests the
# Common Crawl English 300-d vectors — TODO confirm.
ft = fasttext.load_model('cc.en.300.bin')
# local_files_only=True: 'bge-base' is presumably resolved from a local copy
# rather than downloaded — verify where that directory is expected to live.
embedding_model = SentenceTransformer('bge-base', local_files_only=True)

# Training

In [6]:
def get_bge_train_data(add_rwe=False):
    """Build the training features from ``embedding_model`` encodings.

    Each row of ``train`` becomes ``vector(kata1) - vector(kata2)``. When
    ``add_rwe`` is true, each word's encoding is first extended with its entry
    in ``rwe`` (a 300-element zero vector when the word is absent).

    Side effect: overwrites ``train["representation"]`` with the per-row
    difference vectors.

    Args:
        add_rwe: Whether to append the ``rwe`` vectors before subtracting.

    Returns:
        ``(X_train, y_train)``: the row-stacked difference vectors and the
        ``relasi`` column of ``train``.
    """
    heads = train["kata1"]
    tails = train["kata2"]
    # Encode each column in one batched call instead of word by word.
    head_vectors = embedding_model.encode(
        heads.to_list(), batch_size=256, show_progress_bar=True
    )
    tail_vectors = embedding_model.encode(
        tails.to_list(), batch_size=256, show_progress_bar=True
    )

    def difference(row):
        """Return the (optionally RWE-extended) head-minus-tail vector of one row."""
        head_vec = head_vectors[row]
        tail_vec = tail_vectors[row]
        if add_rwe:
            head_vec = np.concatenate((head_vec, rwe.get(heads[row], np.zeros(300,))))
            tail_vec = np.concatenate((tail_vec, rwe.get(tails[row], np.zeros(300,))))
        return np.subtract(head_vec, tail_vec)

    train["representation"] = [difference(row) for row in range(len(train))]
    return np.vstack(train["representation"]), train["relasi"]

In [7]:
def get_fasttext_train_data(add_rwe=False):
    """Build the training features from fastText word vectors.

    Each row of ``train`` becomes ``vector(kata1) - vector(kata2)``, where the
    vectors come from ``ft.get_word_vector``. When ``add_rwe`` is true, each
    word vector is first extended with its entry in ``rwe`` (a 300-element
    zero vector when the word is absent).

    Side effect: overwrites ``train['representation']`` with the per-row
    difference vectors.

    Args:
        add_rwe: Whether to append the ``rwe`` vectors before subtracting.

    Returns:
        ``(X_train, y_train)``: the row-stacked difference vectors and the
        ``relasi`` column of ``train``.
    """
    heads = train["kata1"]
    tails = train["kata2"]

    def difference(row):
        """Return the (optionally RWE-extended) head-minus-tail vector of one row."""
        head_vec = ft.get_word_vector(heads[row])
        tail_vec = ft.get_word_vector(tails[row])
        if add_rwe:
            head_vec = np.concatenate((head_vec, rwe.get(heads[row], np.zeros(300,))))
            tail_vec = np.concatenate((tail_vec, rwe.get(tails[row], np.zeros(300,))))
        return np.subtract(head_vec, tail_vec)

    train['representation'] = [difference(row) for row in range(len(train))]
    return np.vstack(train['representation']), train['relasi']

In [None]:
# Four training sets: BGE and fastText features, each without and with the
# RWE vectors appended (the positional True is add_rwe).
# Every call also overwrites train["representation"], so after this cell that
# column holds the last variant built (fastText + RWE).
X_train_bge, y_train_bge = get_bge_train_data()
X_train_bge_rwe, y_train_bge_rwe = get_bge_train_data(True)
X_train_ft, y_train_ft = get_fasttext_train_data()
X_train_ft_rwe, y_train_ft_rwe = get_fasttext_train_data(True)

In [None]:
# One classifier per feature set. All four share the same seed and iteration
# cap, so they differ only in the features they are fitted on.
model_bge = MLPClassifier(random_state=42, max_iter=300)
model_bge_rwe = MLPClassifier(random_state=42, max_iter=300)
model_ft = MLPClassifier(random_state=42, max_iter=300)
model_ft_rwe = MLPClassifier(random_state=42, max_iter=300)

# Fit each model on its matching training matrix and labels.
model_bge.fit(X_train_bge, y_train_bge)
model_bge_rwe.fit(X_train_bge_rwe, y_train_bge_rwe)
model_ft.fit(X_train_ft, y_train_ft)
model_ft_rwe.fit(X_train_ft_rwe, y_train_ft_rwe)

# Testing

In [10]:
def get_bge_test_data(add_rwe=False):
    """Build the test features from ``embedding_model`` encodings.

    Each row of ``test`` becomes ``vector(kata1) - vector(kata2)``. When
    ``add_rwe`` is true, each word's encoding is first extended with its entry
    in ``rwe`` (a 300-element zero vector when the word is absent).

    Side effect: overwrites ``test["representation"]`` with the per-row
    difference vectors.

    Args:
        add_rwe: Whether to append the ``rwe`` vectors before subtracting.

    Returns:
        ``(X_test, y_test)``: the row-stacked difference vectors and the
        ``relasi`` column of ``test``.
    """
    heads = test["kata1"]
    tails = test["kata2"]
    # Encode each column in one batched call instead of word by word.
    head_vectors = embedding_model.encode(
        heads.to_list(), batch_size=256, show_progress_bar=True
    )
    tail_vectors = embedding_model.encode(
        tails.to_list(), batch_size=256, show_progress_bar=True
    )

    def difference(row):
        """Return the (optionally RWE-extended) head-minus-tail vector of one row."""
        head_vec = head_vectors[row]
        tail_vec = tail_vectors[row]
        if add_rwe:
            head_vec = np.concatenate((head_vec, rwe.get(heads[row], np.zeros(300,))))
            tail_vec = np.concatenate((tail_vec, rwe.get(tails[row], np.zeros(300,))))
        return np.subtract(head_vec, tail_vec)

    test["representation"] = [difference(row) for row in range(len(test))]
    return np.vstack(test["representation"]), test["relasi"]

In [11]:
def get_fasttext_test_data(add_rwe=False):
    """Build the test features from fastText word vectors.

    Each row of ``test`` becomes ``vector(kata1) - vector(kata2)``, where the
    vectors come from ``ft.get_word_vector``. When ``add_rwe`` is true, each
    word vector is first extended with its entry in ``rwe`` (a 300-element
    zero vector when the word is absent).

    Side effect: overwrites ``test['representation']`` with the per-row
    difference vectors.

    Args:
        add_rwe: Whether to append the ``rwe`` vectors before subtracting.

    Returns:
        ``(X_test, y_test)``: the row-stacked difference vectors and the
        ``relasi`` column of ``test``.
    """
    heads = test["kata1"]
    tails = test["kata2"]

    def difference(row):
        """Return the (optionally RWE-extended) head-minus-tail vector of one row."""
        head_vec = ft.get_word_vector(heads[row])
        tail_vec = ft.get_word_vector(tails[row])
        if add_rwe:
            head_vec = np.concatenate((head_vec, rwe.get(heads[row], np.zeros(300,))))
            tail_vec = np.concatenate((tail_vec, rwe.get(tails[row], np.zeros(300,))))
        return np.subtract(head_vec, tail_vec)

    test['representation'] = [difference(row) for row in range(len(test))]
    return np.vstack(test['representation']), test['relasi']

In [12]:
def get_accuracy(pred, actual):
    """Print the fraction of predictions that equal their true label.

    Args:
        pred: Sequence of predicted labels (list, ndarray, Series, ...).
        actual: Sequence of true labels with the same length as ``pred``.
            Labels are compared by position, so a pandas Series is handled
            correctly whatever its index looks like.

    Raises:
        ValueError: if the inputs differ in length or are empty.
    """
    # np.asarray discards any pandas index, so element i of `pred` is always
    # compared with element i of `actual` (a label lookup such as actual[i]
    # breaks on a Series whose index is not 0..n-1).
    predicted = np.asarray(pred)
    expected = np.asarray(actual)

    if len(predicted) != len(expected):
        raise ValueError(
            f"pred and actual differ in length: {len(predicted)} != {len(expected)}"
        )
    if len(predicted) == 0:
        raise ValueError("cannot compute the accuracy of an empty prediction set")

    # Plain int / int division keeps the printed value a Python float.
    correct = int(np.count_nonzero(predicted == expected))
    print(f"Accuracy: {correct / len(predicted)}")

In [None]:
# Test-set counterparts of the four training feature sets (True is add_rwe).
# Every call also overwrites test["representation"], so after this cell that
# column holds the last variant built (fastText + RWE).
X_test_bge, y_test_bge = get_bge_test_data()
X_test_bge_rwe, y_test_bge_rwe = get_bge_test_data(True)
X_test_ft, y_test_ft = get_fasttext_test_data()
X_test_ft_rwe, y_test_ft_rwe = get_fasttext_test_data(True)

In [14]:
# Each model predicts on the test matrix built with the same embedding source
# and add_rwe setting as the data it was fitted on.
prediction_bge = model_bge.predict(X_test_bge)
prediction_bge_rwe = model_bge_rwe.predict(X_test_bge_rwe)
prediction_ft = model_ft.predict(X_test_ft)
prediction_ft_rwe = model_ft_rwe.predict(X_test_ft_rwe)

In [None]:
# Accuracies are printed in this order: BGE, BGE + RWE, fastText, fastText + RWE.
get_accuracy(prediction_bge, y_test_bge)
get_accuracy(prediction_bge_rwe, y_test_bge_rwe)
get_accuracy(prediction_ft, y_test_ft)
get_accuracy(prediction_ft_rwe, y_test_ft_rwe)