# Data

### Training data
- Source: http://mattmahoney.net/dc/text8.zip
- Stored in: `data/train.txt`

### Analogies data
- Source: https://raw.githubusercontent.com/nicholas-leonard/word2vec/refs/heads/master/questions-words.txt
- Stored in: `data/analogies.txt`

# Defining model

In [47]:
from scipy.spatial import distance

def evaluate_analogy(model, word_tuple):
    try:
        w0_embedding = model.wv[word_tuple[0]]
        w1_embedding = model.wv[word_tuple[1]]
        w2_embedding = model.wv[word_tuple[2]]
        w3_embedding = model.wv[word_tuple[3]]

        return distance.cosine(
            w0_embedding - w1_embedding,
            w3_embedding - w2_embedding,
        )

    except KeyError:
        return 0.0


In [48]:
analogies_file_name = "../data/analogies.txt"

with open(analogies_file_name) as file:
    file_content = file.read().splitlines()

analogies = {}
last_key_added = None
for line in file_content:
    if line[0] == ":":
        last_key_added = line.replace(": ", "")
        analogies[last_key_added] = []

    else:
        analogies[last_key_added].append(
            line.split(" ")
        )


In [49]:
from gensim.models import Word2Vec

word_embedder = Word2Vec(
    corpus_file="../data/train.txt",
    sg=0,
    window=9,
    vector_size=100,
    epochs=10,
)

In [50]:
import numpy as np

report = {k: 0.0 for k in analogies.keys()}
for sub_category in report.keys():
    sub_category_analogies = analogies[sub_category]

    report[sub_category] = np.average(
        [evaluate_analogy(word_embedder, curr_sample) for curr_sample in sub_category_analogies]
    )

report["overall_average"] = np.average(list(report.values()))
report


{'capital-common-countries': 0.0,
 'capital-world': 0.0,
 'currency': 0.0,
 'city-in-state': 0.0,
 'family': 0.9685711833643577,
 'gram1-adjective-to-adverb': 1.0344437106710933,
 'gram2-opposite': 1.086949848286605,
 'gram3-comparative': 1.1385346374587555,
 'gram4-superlative': 1.150448216914919,
 'gram5-present-participle': 1.012189168646903,
 'gram6-nationality-adjective': 0.0,
 'gram7-past-tense': 1.0193302541489941,
 'gram8-plural': 1.0610290279681767,
 'gram9-plural-verbs': 1.1479724230953072,
 'overall_average': 0.6871048907539364}