In [None]:
# Download the fastText English embeddings (only needed on the first run; uncomment the two lines below)
# import fasttext.util
# fasttext.util.download_model('en', if_exists='ignore')

In [None]:
import io
import os
import fasttext
import pandas as pd
import numpy as np
import setproctitle

from sklearn.ensemble import GradientBoostingClassifier
from transformers import set_seed
from sentence_transformers.SentenceTransformer import SentenceTransformer

In [None]:
# Runtime configuration: GPU-related environment variables first, then seeding.
os.environ.update(
    {
        # NOTE(review): presumably the cuBLAS workspace setting that deterministic
        # mode expects -- confirm against the PyTorch reproducibility notes.
        "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
        # Expose only the device with index "2" to this process.
        "CUDA_VISIBLE_DEVICES": "2",
    }
)
set_seed(seed=42, deterministic=True)
# Rename the process as it appears in process listings.
setproctitle.setproctitle("python")

In [None]:
def _read_pairs(path):
    """Read a headerless TSV of (kata1, kata2, relasi) rows.

    Rows containing any missing value are dropped and the index is renumbered
    from zero afterwards.
    """
    frame = pd.read_csv(path, sep="\t", header=None, names=["kata1", "kata2", "relasi"])
    return frame.dropna().reset_index(drop=True)


train = _read_pairs("BLESS/train.tsv")
test = _read_pairs("BLESS/test.tsv")

In [None]:
def load_learned_embeddings(name: str):
    """Load word vectors from a space-separated text file.

    Every non-blank line is expected to hold a token followed by its vector
    components, all separated by single spaces.

    Args:
        name: Path of the embeddings text file.

    Returns:
        dict: Maps each token to a 1-D float ``np.ndarray`` of its components.

    Raises:
        ValueError: If a component cannot be parsed as a float.
    """
    # NOTE(review): a word2vec-style header line ("<count> <dim>") would be stored
    # as a one-element vector keyed by the count -- confirm the file has none.
    data = {}
    # The context manager closes the handle even when parsing raises.
    with io.open(name, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        for line in fin:
            stripped = line.rstrip()
            if not stripped:
                # Blank lines would otherwise register an empty vector under ''.
                continue
            tokens = stripped.split(' ')
            data[tokens[0]] = np.array(tokens[1:], dtype=float)
    return data
# Word vectors parsed from a local text file into a {token: vector} dict.
# NOTE(review): the variable name suggests a 5-epoch RWE training run -- confirm.
rwe_5_epochs = load_learned_embeddings('rwe_embeddings.txt')
# fastText binary model, expected one directory above the notebook.
ft = fasttext.load_model('../cc.en.300.bin')
# Sentence-embedding model; trust_remote_code=True lets code shipped with the
# model repository run locally, so only use it with a source you trust.
embedding_model = SentenceTransformer('dunzhang/stella_en_1.5B_v5', trust_remote_code=True)

# Training

In [None]:
def train_with_stella():
    """Build training features as differences of sentence-model embeddings.

    Encodes the ``kata1`` and ``kata2`` columns of the global ``train`` frame
    with ``embedding_model`` and represents each pair as
    ``embed(kata1) - embed(kata2)``.

    Side effect: stores the per-pair vectors in ``train['representation']``.

    Returns:
        tuple: ``(X_train, y_train)`` -- the stacked feature matrix and the
        ``relasi`` label column.
    """
    heads: list[str] = train["kata1"].to_list()
    tails: list[str] = train["kata2"].to_list()
    head_vectors = embedding_model.encode(heads, batch_size=256, show_progress_bar=True)
    tail_vectors = embedding_model.encode(tails, batch_size=256, show_progress_bar=True)

    # One difference vector per row, in row order.
    train['representation'] = [
        np.subtract(head_vectors[row], tail_vectors[row])
        for row in range(len(train))
    ]

    return np.vstack(train['representation']), train['relasi']

In [None]:
def train_with_rwe():
    """Build training features from fastText and RWE vectors.

    Each pair in the global ``train`` frame is represented as the concatenation
    ``[ft(kata1) - ft(kata2), rwe(kata1) + rwe(kata2), rwe(kata1) * rwe(kata2)]``.
    Words absent from ``rwe_5_epochs`` fall back to a 300-element zero vector.
    Features for the evaluation split must be built with this same layout.

    Side effect: stores the per-pair vectors in ``train['representation']``.

    Returns:
        tuple: ``(X_train, y_train)`` -- the stacked feature matrix and the
        ``relasi`` label column.
    """
    # Shared fallback for out-of-vocabulary words; never mutated below.
    missing_rwe = np.zeros(300,)
    representations = []
    # Iterate positionally so the result does not depend on the frame's index labels.
    for word1, word2 in zip(train["kata1"], train["kata2"]):
        pair_difference_ft = np.subtract(ft.get_word_vector(word1), ft.get_word_vector(word2))

        word1_rwe = rwe_5_epochs.get(word1, missing_rwe)
        word2_rwe = rwe_5_epochs.get(word2, missing_rwe)

        # The sum was previously computed with np.multiply, which duplicated the
        # product segment and left the additive feature out entirely.
        pair_addition_rwe = np.add(word1_rwe, word2_rwe)
        pair_multiplication_rwe = np.multiply(word1_rwe, word2_rwe)

        representations.append(
            np.concatenate((pair_difference_ft, pair_addition_rwe, pair_multiplication_rwe))
        )

    train['representation'] = representations
    X_train = np.vstack(train['representation'])
    y_train = train['relasi']

    return X_train, y_train

In [None]:
def train_with_ft():
    """Build training features as differences of fastText word vectors.

    Each pair in the global ``train`` frame is represented as
    ``ft(kata1) - ft(kata2)``.

    Side effect: stores the per-pair vectors in ``train['representation']``.

    Returns:
        tuple: ``(X_train, y_train)`` -- the stacked feature matrix and the
        ``relasi`` label column.
    """
    def _difference(row):
        # fastText vector of the first word minus that of the second word.
        first = ft.get_word_vector(train["kata1"][row])
        second = ft.get_word_vector(train["kata2"][row])
        return np.subtract(first, second)

    train['representation'] = [_difference(row) for row in range(len(train))]

    return np.vstack(train['representation']), train['relasi']

In [None]:
def train_with_stella_final():
    """Build training features from sentence-model embeddings plus RWE vectors.

    Each pair in the global ``train`` frame is represented as the concatenation
    ``[embed(kata1) - embed(kata2), rwe(kata1) + rwe(kata2), rwe(kata1) * rwe(kata2)]``
    where ``embed`` is ``embedding_model.encode``. Words absent from
    ``rwe_5_epochs`` fall back to a 300-element zero vector. Features for the
    evaluation split must be built with this same layout.

    Side effect: stores the per-pair vectors in ``train['representation']``.

    Returns:
        tuple: ``(X_train, y_train)`` -- the stacked feature matrix and the
        ``relasi`` label column.
    """
    word1s: list[str] = train["kata1"].to_list()
    word2s: list[str] = train["kata2"].to_list()
    word1s_embeddings = embedding_model.encode(word1s, batch_size=256, show_progress_bar=True)
    word2s_embeddings = embedding_model.encode(word2s, batch_size=256, show_progress_bar=True)

    # Shared fallback for out-of-vocabulary words; never mutated below.
    missing_rwe = np.zeros(300,)
    representations = []
    for word1, word2, word1_embed, word2_embed in zip(
        word1s, word2s, word1s_embeddings, word2s_embeddings
    ):
        word1_rwe = rwe_5_epochs.get(word1, missing_rwe)
        word2_rwe = rwe_5_epochs.get(word2, missing_rwe)

        pair_difference = np.subtract(word1_embed, word2_embed)
        # The sum was previously computed with np.multiply, which duplicated the
        # product segment and left the additive feature out entirely.
        pair_addition_rwe = np.add(word1_rwe, word2_rwe)
        pair_multiplication_rwe = np.multiply(word1_rwe, word2_rwe)

        representations.append(
            np.concatenate((pair_difference, pair_addition_rwe, pair_multiplication_rwe))
        )

    train['representation'] = representations
    X_train = np.vstack(train['representation'])
    y_train = train['relasi']

    return X_train, y_train

In [None]:
# Build the training matrix and labels from sentence-embedding differences.
# The evaluation features must come from the matching test_* builder.
X_train, y_train = train_with_stella()

In [None]:
from sklearn.svm import SVC

In [None]:
# Support-vector classifier with library defaults apart from the pinned random_state.
classifier = SVC(random_state=42)
# Fit on the pair features; as the cell's last expression this also displays the estimator.
classifier.fit(X_train, y_train)

# Testing

In [None]:
def test_with_stella():
    representations = []

    word1s: list[str] = test["kata1"].to_list()
    word2s: list[str] = test["kata2"].to_list()

    word1s_embeddings = embedding_model.encode(word1s, batch_size=256, show_progress_bar=True)
    word2s_embeddings = embedding_model.encode(word2s, batch_size=256, show_progress_bar=True)

    for i in range(len(test)):

        word1_embed = word1s_embeddings[i]
        word2_embed = word2s_embeddings[i]

        pair_difference = np.subtract(word1_embed, word2_embed)
        representations.append(pair_difference)

    test['representation'] = representations
    X_test = np.vstack(test['representation'])
    y_test = test['relasi']

    return X_test, y_test

In [None]:
def test_with_rwe():
    representations = []
    for i in range(len(test)):
        word1_embedding = ft.get_word_vector(test["kata1"][i])
        word2_embedding = ft.get_word_vector(test["kata2"][i])

        word1_rwe = rwe_5_epochs.get(test["kata1"][i], np.zeros(300,))
        word2_rwe = rwe_5_epochs.get(test["kata2"][i], np.zeros(300,))

        pair_difference_ft = np.subtract(word1_embedding, word2_embedding)
        pair_addition_rwe = np.multiply(word1_rwe, word2_rwe)
        # pair_multiplication_rwe = np.multiply(word1_rwe, word2_rwe)

        pair_representation = np.concatenate((pair_difference_ft, pair_addition_rwe))
        # pair_representation = np.concatenate((pair_representation, pair_multiplication_rwe))

        representations.append(pair_representation)

    test['representation'] = representations
    X_test = np.vstack(test['representation'])
    y_test = test['relasi']

    return X_test, y_test

In [None]:
def test_with_ft():
    representations = []
    for i in range(len(test)):
        word1_embedding = ft.get_word_vector(test["kata1"][i])
        word2_embedding = ft.get_word_vector(test["kata2"][i])
        pair_difference_ft = np.subtract(word1_embedding, word2_embedding)
        representations.append(pair_difference_ft)

    test['representation'] = representations
    X_test = np.vstack(test['representation'])
    y_test = test['relasi']

    return X_test, y_test

In [None]:
def test_with_stella_final():
    representations = []

    word1s: list[str] = test["kata1"].to_list()
    word2s: list[str] = test["kata2"].to_list()

    word1s_embeddings = embedding_model.encode(word1s, batch_size=256, show_progress_bar=True)
    word2s_embeddings = embedding_model.encode(word2s, batch_size=256, show_progress_bar=True)

    for i in range(len(test)):
        word1_embed = word1s_embeddings[i]
        word2_embed = word2s_embeddings[i]
        word1_rwe = rwe_5_epochs.get(test["kata1"][i], np.zeros(300,))
        word2_rwe = rwe_5_epochs.get(test["kata2"][i], np.zeros(300,))

        pair_difference = np.subtract(word1_embed, word2_embed)
        pair_addition_rwe = np.multiply(word1_rwe, word2_rwe)
        pair_multiplication_rwe = np.multiply(word1_rwe, word2_rwe)

        pair_representation = np.concatenate((pair_difference, pair_addition_rwe))
        pair_representation = np.concatenate((pair_representation, pair_multiplication_rwe))
        representations.append(pair_representation)

    test['representation'] = representations
    X_test = np.vstack(test['representation'])
    y_test = test['relasi']

    return X_test, y_test

In [None]:
# Build the evaluation matrix and labels from sentence-embedding differences.
# This must be the counterpart of the builder that produced the training features.
X_test, y_test = test_with_stella()

In [None]:
# Accuracy: fraction of test pairs whose predicted relation equals the gold label.
prediction = classifier.predict(X_test)
# Compare positionally against the label values in one vectorized step. The
# counter is named n_correct so the builtin `sum` is not rebound for every
# later cell in the kernel session.
n_correct = np.count_nonzero(prediction == np.asarray(y_test))
accuracy = n_correct / len(prediction)
print(accuracy)