# Data

### Training data
- Source: http://mattmahoney.net/dc/text8.zip
- Stored in: `data/train.txt`

### Analogies data
- Source: https://raw.githubusercontent.com/nicholas-leonard/word2vec/refs/heads/master/questions-words.txt
- Stored in: `data/analogies.txt`

# Defining model

In [None]:
from scipy.spatial import distance

def evaluate_analogy(model, word_tuple):
    w0_embedding = model.wv[word_tuple[0]]
    w1_embedding = model.wv[word_tuple[1]]
    w2_embedding = model.wv[word_tuple[2]]
    w3_embedding = model.wv[word_tuple[3]]

    return distance.cosine(
        w0_embedding - w1_embedding,
        w3_embedding - w2_embedding,
    )


In [2]:
analogies_file_name = "../data/analogies.txt"

with open(analogies_file_name) as file:
    file_content = file.read().splitlines()

analogies = {}
last_key_added = None
for line in file_content:
    if line[0] == ":":
        last_key_added = line.replace(": ", "")
        analogies[last_key_added] = []

    else:
        analogies[last_key_added].append(
            line.lower().split(" ")
        )


In [3]:
from gensim.models import Word2Vec

word_embedder = Word2Vec(
    corpus_file="../data/train.txt",
    sg=0,
    window=9,
    vector_size=100,
    epochs=10,
    min_count=0,
)

In [11]:
import numpy as np

report = {k: 0.0 for k in analogies.keys()}
for sub_category in report.keys():
    report[sub_category] = np.average(
        [
            evaluate_analogy(word_embedder, curr_sample) for curr_sample in analogies[sub_category]
            if all([word_embedder.wv.__contains__(sample) for sample in curr_sample])
        ]
    )

report["overall_average"] = np.average(list(report.values()))
report

{'capital-common-countries': 1.145200044020742,
 'capital-world': 1.0200542260520233,
 'currency': 1.0507040328120292,
 'city-in-state': 0.9597974135099203,
 'family': 1.1491588860389297,
 'gram1-adjective-to-adverb': 1.0229043717327924,
 'gram2-opposite': 1.1038990891437972,
 'gram3-comparative': 1.1680502487806463,
 'gram4-superlative': 1.3427902522598905,
 'gram5-present-participle': 1.0132598159665822,
 'gram6-nationality-adjective': 0.9939386219094875,
 'gram7-past-tense': 1.0164548298098048,
 'gram8-plural': 1.0871997624037575,
 'gram9-plural-verbs': 1.1542761835592423,
 'overall_average': 1.0876919841428319}