In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression

from src.word_to_embedding import WordToEmbedding

In [2]:
w2e = WordToEmbedding()

In [3]:
def __cosine_similarity(a: np.ndarray, b: np.ndarray):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def cosine_similarity(word1: str, word2: str):
    a = w2e.get_embedding(word1)
    b = w2e.get_embedding(word2)

    longer_word_length = max(a.shape[0], b.shape[0])
    shorter_word_length = min(a.shape[0], b.shape[0])
    similarities = sum(__cosine_similarity(a[i], b[i]) for i in range(shorter_word_length))

    return similarities / shorter_word_length

In [4]:
with open('/mnt/d/Projects/masters-thesis/data/paronym.txt') as file_pointer:
    lines = file_pointer.readlines()

lines = [line.strip().split(',') for line in lines]
lines = [(word1.strip(), word2.strip(), c.strip()) for word1, word2, c in lines]
train_test_split_index = int(0.8 * len(lines))

train_lines = lines[:train_test_split_index]
test_lines = lines[train_test_split_index:]

In [22]:
print(len(train_lines), len(test_lines))

148 37


In [5]:
train_input = [(cosine_similarity(word1, word2), is_paronym) for word1, word2, is_paronym in train_lines]

X = np.array([result for result, _ in train_input]).reshape(-1, 1)
y = np.array([int(is_paronym) for _, is_paronym in train_input]).reshape(-1, 1)

reg = LogisticRegression().fit(X, y)

  y = column_or_1d(y, warn=True)


In [12]:
test_input = [(cosine_similarity(word1, word2), is_paronym) for word1, word2, is_paronym in test_lines]

X = np.array([result for result, _ in test_input]).reshape(-1, 1)
actual = np.array([int(is_paronym) for _, is_paronym in test_input]).reshape(-1, 1)
predicted = reg.predict(X)

results = list(zip(actual, predicted))

In [19]:
tp = sum(1 for a, p in results if a == p and a == 1)
tn = sum(1 for a, p in results if a == p and a == 0)
fp = sum(1 for a, p in results if a != p and a == 0)
fn = sum(1 for a, p in results if a != p and a == 1)

ap = sum(1 for a, _ in results if a == 1)
an = sum(1 for a, _ in results if a == 0)

pp = sum(1 for _, p in results if p == 1)
pn = sum(1 for _, p in results if p == 0)

f'{tp=}, {tn=}, {fp=}, {fn=}, {ap=}, {an=}, {pp=}, {pn=}'

'tp=17, tn=16, fp=3, fn=1, ap=18, an=19, pp=20, pn=17'

In [None]:
accuracy = (tp + tn) / (ap + an)
precision = tp / (tp + fp)
recall = tp / ap
f1 = 2 * (precision * recall) / (precision + recall)
f'{accuracy=:.3f}, {precision=:.3f} {recall=:.3f}, {f1=:.3f}'

'accuracy=0.892, precision=0.850 recall=0.944, f1=0.895'