# Trabalho Prático - Word Embeddings
## Thiago Pádua de Carvalho - 2020007066

In [9]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine

import itertools

In [10]:
with open("data/text8", "r") as file:
    corpus = file.read()

# Tokenize the corpus
sentences = [simple_preprocess(line) for line in corpus.splitlines()]

In [19]:
def evaluate_model(model, analogy_file="data/questions-words.txt"):
    total_distance = 0
    num_analogies = 0

    with open(analogy_file, "r") as file:
        for line in file:
            if line.startswith(":"):  # Ignore categories
                continue

            words = line.strip().split()
            if all(word in model.wv for word in words):  # Check if all words are in the vocabulary
                vector_a = model.wv[words[1]]  # Example: France
                vector_b = model.wv[words[0]]  # Example: Paris
                vector_c = model.wv[words[2]]  # Example: Berlin
                vector_d = model.wv[words[3]]  # Example: Germany

                result_vector = vector_a - vector_b + vector_c

                # Compute the cosine distance between the resulting vector and the expected vector
                distance = cosine(result_vector, vector_d)
                total_distance += distance
                num_analogies += 1

    average_distance = total_distance / num_analogies
    return average_distance


In [20]:
def generate_hyperparams_combinations(hyperparams):
    keys = hyperparams.keys()
    values = hyperparams.values()
    for combination in itertools.product(*values):
        yield dict(zip(keys, combination))

In [23]:
hyperparams = {
    "embedding_size": [100, 200],
    "window_size": [2, 3, 4, 5, 10],
    "skip_gram": [0, 1],
    "epochs": [5, 10]
}

data_dir = "models"

results = []
for params in generate_hyperparams_combinations(hyperparams):
    model = Word2Vec(
        sentences=sentences,
        vector_size=params["embedding_size"],
        window=params["window_size"],
        sg=params["skip_gram"],
        epochs=params["epochs"]
    )

    model_name = f"word2vec_{params['embedding_size']}_{params['window_size']}_{params['skip_gram']}_{params['epochs']}.model"
    model.save(f"{data_dir}/{model_name}")

    avg_distance = evaluate_model(model)
    # print(f"Distância média: {avg_distance:.4f}")
    results.append((model_name, avg_distance, params))

In [24]:
best_model = min(results, key=lambda x: x[1])
print(f"Best model: {best_model[0]} with average distance: {best_model[1]:.4f}")

Best model: word2vec_200_10_1_10.model with average distance: 0.8760


### Introdução
Neste trabalho prático, foi proposta a implementação de modelos de word embeddings e sua posterior avaliação em tarefas de similaridade e analogia de palavras. Para tal, foram criados modelos com variações de arquitetura e hiperparâmetros.
### Hiperparâmetros
Os hiperparâmetros utilizados foram:
- **Embedding Size**: Tamanho do vetor de embedding.
- **Window Size**: Tamanho da janela de contexto.
- **Epochs**: Número de épocas de treinamento.
### Arquiteturas
Foram implementadas 2 arquiteturas de word embeddings:
- **Skip-gram**: Modelo que prevê as palavras do contexto a partir de uma palavra central.
- **CBOW**: Modelo que prevê a palavra central a partir das palavras do contexto.
