# Trabalho 1

In [1]:
import os

from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim import utils

### Preprocessing
- How to treat punctuation?
- Working with lowercase words is better
- I guess tokenizing only words, that is, split by spaces is a good start

#### CBOW
- Given an one-hot encoded input (one token), outputs k one-hot encoded tokens
- "Costuma ser bom para corpus pequeno"
#### Skip-Gram
- "Costuma ser bom para corpus maiores"

## Choices
- Context size (window size)
    - `window` arg: it's the number of words before and after the target word that the model will consider as context. The default value is 5.
- Training size
- CBOW vs Skip-Gram
    - `sg` arg: 0 for CBOW, 1 for Skip-Gram. The default value is 0.
- Embedding size
    - `vector_size` arg: it's the size of the embedding vector. The default value is 100.

## Training

In [47]:
corpus = "text8"
corpus_path = os.path.join(os.getcwd(), corpus)

class MyCorpus:
    """An iterator that yields sentences (lists of str)."""
    def __init__(self, corpus, skip_lines: None | list=None):
        self.corpus = corpus
        self.skip_lines = skip_lines

    def __iter__(self):
        corpus_path = datapath(self.corpus)
        for line in open(corpus_path):
            # Skip lines
            if self.skip_lines is not None and line.startswith(self.skip_lines):
                continue
            # assume there's one document per line, tokens separated by whitespace
            preprocessed_line = self._preprocess_text(line)
            yield utils.simple_preprocess(preprocessed_line)

    def _preprocess_text(self, text):
        text = text.lower()
        return text

    def __len__(self):
        return sum(1 for _ in self)

sentences = MyCorpus(corpus_path) # TODO: Does my train corpus has words with accents, different punctuation, etc?
# vector_size is the number of dimensions (N) of the N-dimensional space that gensim Word2Vec maps the words onto.
# The default value of vector_size is 100.
model = Word2Vec(sentences=sentences)

In [52]:
vec_king = model.wv['king']
vec_king.size

# retrieve the vocabulary
vocabulary = model.wv.key_to_index
print(f"vocabulary size: {len(vocabulary)}")

for index, word in enumerate(model.wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

vocabulary size: 70965
word #0/70965 is the
word #1/70965 is of
word #2/70965 is and
word #3/70965 is one
word #4/70965 is in
word #5/70965 is to
word #6/70965 is zero
word #7/70965 is nine
word #8/70965 is two
word #9/70965 is is


## Evaluation
#### Load evaluation corpus

In [48]:
evaluation_corpus_path = os.path.join(os.getcwd(), "questions-words.txt")
evaluation_sentences = MyCorpus(evaluation_corpus_path, skip_lines=":")

print(f"Evaluation corpus size: {evaluation_sentences.__len__()}")

Evaluation corpus size: 19544


### Downstream application: Semantic Analogy task

In [51]:
def are_words_in_vocabulary(words, vocabulary):
    """Check if all words are in the vocabulary\
    Arguments:
        words (list): list of words to check
        vocabulary (dict): vocabulary
    Returns:
        bool: True if all words are in the vocabulary, False otherwise
    """
    return all(word in vocabulary for word in words)

# Initialize total distance
total_distance = 0
skipped_words = 0

for words in evaluation_sentences:
    # skip if any of the words are not on the vocabulary
    # if not all(word in vocabulary for word in words):
    if not are_words_in_vocabulary(words, vocabulary):
        skipped_words += 1
        continue

    first_two_words = words[:2]
    third_word = words[2]
    last_word = words[3]
    # Get the most similar word
    most_similar = model.wv.most_similar(positive=first_two_words, negative=third_word)[0][0]

    # Compute the similarity between the most similar word and the correct word
    similarity = model.wv.similarity(most_similar, last_word)
    total_distance += similarity

lines_not_used = (skipped_words*100)/evaluation_sentences.__len__()
if lines_not_used > 0:
    print("Skipped some words that were not in the vocabulary")
    print(f"Skipped {skipped_words} words (lines), which is {lines_not_used:.2f}% of the corpus size")

# recalculate evaluation corpus size, removing the skipped words
evaluation_corpus_size = evaluation_sentences.__len__() - skipped_words
# Compute the average distance
average_distance = total_distance / evaluation_corpus_size

print("Average distance: ", average_distance)

Skipped some words that were not in the vocabulary
Skipped 1717 words (lines), which is 8.79% of the corpus size
Average distance:  0.00023354751909435584
