In [1]:
import logging
import sys


# Timestamp layout for log records: four-digit year, month, day of month, time.
# "%d" is the day of month; "%y" (two-digit year) in that slot would print the
# year twice and never show the day.
date_strftime_format = "%Y-%m-%d %H:%M:%S"

# Send WARNING-and-above records to stdout so they appear in the cell output.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.WARNING,
    format="%(asctime)s %(message)s",
    datefmt=date_strftime_format,
)

# Data

### Training data
- Source: http://mattmahoney.net/dc/text8.zip
- Stored in: `data/train.txt`

### Analogies data
- Source: https://raw.githubusercontent.com/nicholas-leonard/word2vec/refs/heads/master/questions-words.txt
- Stored in: `data/analogies.txt`

# Defining model

## Model wrapper

In [2]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.word2vec import Text8Corpus

In [3]:
class StoreLossCurveCallback(CallbackAny2Vec):
    """Training callback that records and prints the loss of each epoch.

    The value returned by ``model.get_latest_training_loss()`` is treated as a
    running total, so the loss of a single epoch is obtained by subtracting the
    total observed at the end of the previous epoch.

    Attributes:
        epoch: Number of epochs seen so far (used to label the printed line).
        last_logged_loss: Running total observed at the end of the last epoch.
        loss_curve: Per-epoch loss values, in the order the epochs finished.
    """

    def __init__(self):
        self.epoch = 0
        self.last_logged_loss = 0
        self.loss_curve = []

    def on_train_begin(self, model):
        """Reset the baseline when a training run starts.

        gensim restarts its running loss at the beginning of each training
        run; without this reset, reusing the same callback instance for a
        second run would subtract the previous run's final total from the
        first epoch of the new run.
        """
        self.last_logged_loss = 0

    def on_epoch_end(self, model):
        """Store and print the loss accumulated during the epoch that just ended."""
        # Read the running total once so the delta and the new baseline are
        # derived from the same value.
        cumulative_loss = model.get_latest_training_loss()
        curr_loss = cumulative_loss - self.last_logged_loss
        self.last_logged_loss = cumulative_loss

        self.loss_curve.append(curr_loss)

        print(
            f"Loss for epoch #{self.epoch}: {curr_loss}"
        )

        self.epoch += 1

In [4]:
class Word2VecModel:
    """Small wrapper around gensim's ``Word2Vec`` that also keeps the loss curve.

    Args:
        model_type: Either ``"skipgram"`` or ``"cbow"``.
        window_size: Context window passed to gensim as ``window``.
        embedding_size: Vector dimensionality passed to gensim as ``vector_size``.
        min_word_count: Passed to gensim as ``min_count``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.

    Attributes:
        model: The trained gensim model, or ``None`` before ``train()`` is called.
        loss_curve: Per-epoch losses of the most recent ``train()`` call.
    """

    _SUPPORTED_MODEL_TYPES = ("skipgram", "cbow")

    def __init__(
        self,
        model_type: str,
        window_size: int,
        embedding_size: int,
        min_word_count: int = 0,
    ):
        # Reject unknown names up front: otherwise a typo such as "skip-gram"
        # would silently fall through to CBOW and train the wrong architecture.
        if model_type not in self._SUPPORTED_MODEL_TYPES:
            raise ValueError(
                f"Unsupported model_type {model_type!r}; "
                f"expected one of {self._SUPPORTED_MODEL_TYPES}."
            )

        self.model_type = model_type
        # gensim's `sg` flag: 1 for skip-gram, 0 for CBOW.
        self._sg = 1 if model_type == "skipgram" else 0
        self.window_size = window_size
        self.embedding_size = embedding_size
        self.min_word_count = min_word_count
        self.compute_loss = True

        self._loss_container = StoreLossCurveCallback()
        self.loss_curve = []

        self.model = None

    @property
    def wv(self):
        """Word vectors of the trained model.

        Raises:
            AttributeError: If ``train()`` has not been called yet.
        """
        if self.model is None:
            raise AttributeError(
                "Word2VecModel has no word vectors yet: call train() first."
            )
        return self.model.wv

    def train(
        self,
        training_corpus_fpath: str,
        epochs: int,
        workers: int = 8,
    ):
        """Train a new gensim model on the corpus file and store its loss curve.

        Args:
            training_corpus_fpath: Path of the corpus file handed to ``Text8Corpus``.
            epochs: Number of training epochs.
            workers: Number of worker threads passed to gensim.
        """
        # Use a fresh callback for every run so a second call to train() does
        # not inherit the epoch counter, baseline and curve of the previous run.
        loss_callback = StoreLossCurveCallback()

        self.model = Word2Vec(
            sentences=Text8Corpus(fname=training_corpus_fpath),
            sg=self._sg,
            window=self.window_size,
            vector_size=self.embedding_size,
            epochs=epochs,
            min_count=self.min_word_count,
            compute_loss=self.compute_loss,
            callbacks=[loss_callback],
            workers=workers,
        )

        self._loss_container = loss_callback
        self.loss_curve = loss_callback.loss_curve



## Searching for the best hyperparameter configuration

### Preparing test data

In [5]:
analogies_file_name = "../data/analogies.txt"

# Explicit encoding so parsing does not depend on the platform's default locale.
with open(analogies_file_name, encoding="utf-8") as file:
    file_content = file.read().splitlines()

# Maps each category name (taken from the ":"-prefixed header lines) to the list
# of analogies in that category; each analogy is a list of lower-cased words.
all_test_analogies = {}
last_key_added = None
for line in file_content:
    line = line.strip()
    if not line:
        # Blank lines (e.g. a trailing newline at the end of the file) carry no
        # data; indexing line[0] on them would raise IndexError.
        continue

    if line.startswith(":"):
        last_key_added = line[1:].strip()
        all_test_analogies[last_key_added] = []

    else:
        if last_key_added is None:
            raise ValueError(
                f"{analogies_file_name}: found an analogy line before any ':' category header."
            )
        # split() without an argument ignores repeated/trailing whitespace, so
        # no empty-string "words" end up in the analogy.
        all_test_analogies[last_key_added].append(line.lower().split())

### Defining evaluation

In [6]:
import numpy as np
from scipy.spatial import distance

In [7]:
def evaluate_analogy(model, word_tuple):
    """Score one analogy as the cosine similarity of its two offset vectors.

    The first four entries of ``word_tuple`` are looked up in ``model.wv``;
    the offset from word 0 to word 1 is compared with the offset from word 2
    to word 3.

    Returns:
        ``1 - cosine_distance`` between the two offsets.
    """
    vectors = model.wv
    first, second, third, fourth = (vectors[word_tuple[i]] for i in range(4))

    left_offset = second - first
    right_offset = fourth - third
    return 1 - distance.cosine(left_offset, right_offset)

In [None]:
def build_report(model, test_analogies):
    """Average the analogy score of every category and of the whole test set.

    Args:
        model: Object exposing ``wv`` (supports ``in`` and is usable by
            ``evaluate_analogy``).
        test_analogies: Mapping of category name to a list of analogies, each
            analogy being a sequence of words.

    Returns:
        Dict mapping each category to the mean score of its evaluable
        analogies, plus ``"overall_average"``: the mean of the category means.
        A category in which no analogy could be evaluated is reported as NaN
        and left out of ``"overall_average"``; if no category has a score,
        ``"overall_average"`` is NaN as well.
    """
    total_ignored_analogies = 0

    report = {}
    for sub_category, samples in test_analogies.items():
        similarities = []
        for curr_sample in samples:
            # Only analogies whose words are all in the vocabulary can be scored.
            if all(word in model.wv for word in curr_sample):
                similarities.append(evaluate_analogy(model, curr_sample))
            else:
                total_ignored_analogies += 1

        # np.average([]) yields NaN plus a RuntimeWarning; make the "nothing to
        # average" case explicit instead.
        report[sub_category] = (
            np.average(similarities) if similarities else float("nan")
        )

    if total_ignored_analogies:
        print(
            f"[WARNING] A total of {total_ignored_analogies} samples were ignored because they contained "
            "words out of the model's vocabulary."
        )

    # A single NaN category would otherwise turn the overall score into NaN.
    scored_categories = [value for value in report.values() if not np.isnan(value)]
    report["overall_average"] = (
        np.average(scored_categories) if scored_categories else float("nan")
    )

    return report

### Grid search routine

In [9]:
from itertools import product

In [10]:
def run_grid_search(
    param_grid: dict,
    test_analogies: dict,
    param_conditions_callback: callable = None,
    return_best: bool = False,
    training_corpus_fpath: str = "../data/train.txt",
):
    """Train and score one model per accepted hyper-parameter combination.

    Args:
        param_grid: Maps each hyper-parameter name to the list of values to
            try. The keys ``"model_type"``, ``"window_size"``,
            ``"embedding_size"`` and ``"epochs"`` are read for every combination.
        test_analogies: Analogies handed to ``build_report`` for scoring.
        param_conditions_callback: Optional predicate receiving one combination
            as a dict; combinations for which it returns a falsy value are
            skipped. ``None`` accepts every combination.
        return_best: When true, return only the highest-scoring entry.
        training_corpus_fpath: Path of the training corpus file.

    Returns:
        A list of ``{"params", "score", "full_report"}`` dicts (one per trained
        model), or the single best such dict when ``return_best`` is true.

    Raises:
        ValueError: If ``return_best`` is true but no combination was trained.
    """
    param_keys = list(param_grid.keys())
    candidates = [
        dict(zip(param_keys, values)) for values in product(*param_grid.values())
    ]

    # Filter first so the progress message reports the number of models that
    # will actually be trained, and so a missing callback means "accept all"
    # instead of a TypeError from calling None.
    if param_conditions_callback is not None:
        candidates = [
            candidate for candidate in candidates if param_conditions_callback(candidate)
        ]

    results = []
    for train_number, param_dict in enumerate(candidates, start=1):
        model = Word2VecModel(
            model_type=param_dict["model_type"],
            window_size=param_dict["window_size"],
            embedding_size=param_dict["embedding_size"],
        )

        print(
            f"Starting training model {train_number} of {len(candidates)}"
        )

        model.train(
            training_corpus_fpath=training_corpus_fpath,
            epochs=param_dict["epochs"],
        )

        curr_model_report = build_report(model, test_analogies)
        score = curr_model_report["overall_average"]

        print(
            f"model_type: {model.model_type}, window_size: {model.window_size}, embedding_size: {model.embedding_size}"
        )
        print(f"Final score: {score}\n")

        results.append(
            {"params": param_dict, "score": score, "full_report": curr_model_report}
        )

    if not return_best:
        return results

    if not results:
        raise ValueError(
            "No parameter combination was trained, so there is no best result to return."
        )

    def _ranking_key(result):
        # NaN compares false with everything, which makes max() depend on the
        # order of the results; rank NaN scores below every real score.
        score = result["score"]
        return score if score == score else float("-inf")

    return max(results, key=_ranking_key)

In [11]:
def is_valid_param_combination(selected_params: dict):
    """Accept a combination only when window_size <= epochs <= embedding_size."""
    window = selected_params["window_size"]
    epochs = selected_params["epochs"]

    window_fits = window <= epochs
    if not window_fits:
        # Mirror chained-comparison semantics: stop at the first failing link
        # without reading the remaining key.
        return window_fits

    return epochs <= selected_params["embedding_size"]

In [12]:
# One value per hyper-parameter, so this grid contains a single combination.
param_grid = dict(
    model_type=["cbow"],
    embedding_size=[5],
    window_size=[3],
    epochs=[5],
)

# return_best=True: the call yields only the top-scoring entry, not a list.
grid_search_results = run_grid_search(
    return_best=True,
    param_conditions_callback=is_valid_param_combination,
    test_analogies=all_test_analogies,
    param_grid=param_grid,
)

grid_search_results

Starting training model 1 of 1
Loss for epoch #0: 3925150.75
Loss for epoch #1: 3213225.25
Loss for epoch #2: 2910120.0
Loss for epoch #3: 2694924.0
Loss for epoch #4: 2703890.0
model_type: cbow, window_size: 3, embedding_size: 5
Final score: 0.42961090919634254



{'params': {'model_type': 'cbow',
  'embedding_size': 5,
  'window_size': 3,
  'epochs': 5},
 'score': 0.42961090919634254,
 'full_report': {'capital-common-countries': 0.686720417587637,
  'capital-world': 0.7839351314017419,
  'currency': 0.7610609295174078,
  'city-in-state': 0.7584993818520714,
  'family': 0.0928185977840391,
  'gram1-adjective-to-adverb': 0.1774767649038595,
  'gram2-opposite': 0.34514809483209075,
  'gram3-comparative': 0.01765687989036821,
  'gram4-superlative': 0.44256560337404843,
  'gram5-present-participle': 0.3286936293923346,
  'gram6-nationality-adjective': 0.6962941432764763,
  'gram7-past-tense': 0.2437719318179426,
  'gram8-plural': 0.2817761093764548,
  'gram9-plural-verbs': 0.3981351137423251,
  'overall_average': 0.42961090919634254}}