In [1]:
import logging
import sys
from typing import Iterable


# Timestamp layout for log records: year-month-day hour:minute:second.
# The third field must be %d (day of month); %y is the two-digit year and
# would render e.g. 9 March 2024 as "2024-03-24".
date_strftime_format = "%Y-%m-%d %H:%M:%S"
# Route WARNING-and-above records to stdout, prefixed with the timestamp above.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING, format="%(asctime)s %(message)s", datefmt=date_strftime_format)

# Data

### Training data
- Source: http://mattmahoney.net/dc/text8.zip

### Analogies data
- Source: https://raw.githubusercontent.com/nicholas-leonard/word2vec/refs/heads/master/questions-words.txt

# Defining model

## Model wrapper

In [2]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

In [3]:
class StoreLossCurveCallback(CallbackAny2Vec):
    """Record and print the training loss of each epoch.

    ``model.get_latest_training_loss()`` is treated as a running total, so the
    loss of a single epoch is obtained as the difference between the value read
    at the end of this epoch and the value read at the end of the previous one.

    Attributes:
        epoch: Number of epochs seen so far (used as the printed epoch index).
        last_logged_loss: Running total read at the end of the previous epoch.
        loss_curve: Per-epoch losses, in the order the epochs finished.
    """

    def __init__(self):
        self.epoch = 0
        self.last_logged_loss = 0
        self.loss_curve = []

    def on_train_begin(self, model):
        """Restart the running-total baseline when a training run begins.

        gensim restarts its running loss counter at the start of every
        ``train()`` call, so a baseline left over from a previous run would
        make the first epoch of the new run come out wrong (typically
        negative). The epoch counter and the recorded curve are kept, so a
        reused callback keeps appending to the same history.
        """
        self.last_logged_loss = 0

    def on_epoch_end(self, model):
        """Store and print the loss accumulated during the epoch that just ended."""
        # Read the running total once so the stored baseline and the reported
        # difference are guaranteed to come from the same value.
        cumulative_loss = model.get_latest_training_loss()
        curr_loss = cumulative_loss - self.last_logged_loss
        self.last_logged_loss = cumulative_loss

        self.loss_curve.append(curr_loss)

        print(
            f"Loss for epoch #{self.epoch}: {curr_loss}"
        )

        self.epoch += 1

In [4]:
class Word2VecModel:
    """Small wrapper around gensim's ``Word2Vec`` that also keeps the loss curve.

    The wrapper stores the hyper-parameters, builds and trains the gensim model
    in :meth:`train`, and exposes the per-epoch losses collected by a
    ``StoreLossCurveCallback`` as ``loss_curve``.

    Attributes:
        model_type: Name given by the caller. ``"skipgram"`` selects skip-gram;
            any other value selects CBOW.
        window_size: Passed to gensim as ``window``.
        embedding_size: Passed to gensim as ``vector_size``.
        min_word_count: Passed to gensim as ``min_count``.
        workers: Passed to gensim as ``workers``.
        compute_loss: Always ``True`` so the loss callback has something to read.
        loss_curve: Per-epoch losses of the most recent :meth:`train` call
            (empty before training).
        model: The trained gensim model, or ``None`` before :meth:`train`.
    """

    def __init__(
        self,
        model_type: str,
        window_size: int,
        embedding_size: int,
        min_word_count: int = 0,
        workers: int = 8,
    ):
        """Store the hyper-parameters; no model is built until :meth:`train`."""
        self.model_type = model_type
        # gensim's `sg` flag: 1 for skip-gram, 0 for CBOW.
        self._sg = 1 if model_type == "skipgram" else 0
        self.window_size = window_size
        self.embedding_size = embedding_size
        self.min_word_count = min_word_count
        self.workers = workers
        self.compute_loss = True

        self._loss_container = StoreLossCurveCallback()
        self.loss_curve = []

        self.model = None

    @property
    def wv(self):
        """Word vectors of the trained model.

        Raises:
            AttributeError: If accessed before :meth:`train`, because
                ``self.model`` is still ``None``.
        """
        return self.model.wv

    def train(
        self,
        dataset: Iterable[Iterable[str]],
        epochs: int,
    ):
        """Build and train a new gensim model, replacing any previous one.

        Args:
            dataset: Corpus handed to gensim as ``sentences`` (an iterable of
                tokenised sentences).
            epochs: Number of passes over the corpus.
        """
        # Every call trains a brand-new model, so it gets a brand-new loss
        # recorder. Reusing the one from a previous call would append to (and
        # thereby alter) the curve already handed out for that earlier run.
        self._loss_container = StoreLossCurveCallback()

        self.model = Word2Vec(
            sentences=dataset,
            sg=self._sg,
            window=self.window_size,
            vector_size=self.embedding_size,
            min_count=self.min_word_count,
            epochs=epochs,
            compute_loss=self.compute_loss,
            callbacks=[self._loss_container],
            workers=self.workers,
        )

        self.loss_curve = self._loss_container.loss_curve


## Searching for the best hyper-parameter configuration

### Preparing test data

In [5]:
analogies_file_name = "../data/analogies.txt"

# Read with an explicit encoding so the result does not depend on the platform default.
with open(analogies_file_name, encoding="utf-8") as file:
    file_content = file.read().splitlines()

# Maps each category name (taken from a ": <name>" header line) to the list of
# samples that follow it; every sample is the lower-cased, whitespace-split line.
all_test_analogies = {}
last_key_added = None
for line in file_content:
    line = line.strip()
    if not line:
        # Blank lines carry no data; indexing line[0] on them would raise IndexError.
        continue

    if line.startswith(":"):
        # Drop only the leading marker so a ": " later in the name is preserved.
        last_key_added = line[1:].strip()
        all_test_analogies[last_key_added] = []

    elif last_key_added is None:
        raise ValueError(
            f"Found an analogy before any ': <category>' header in {analogies_file_name!r}: {line!r}"
        )

    else:
        all_test_analogies[last_key_added].append(
            line.lower().split()
        )

In [6]:
import numpy as np

def evaluate_analogy(model, analogy):
    """Score a four-word analogy with a cosine similarity.

    The vector ``analogy[0] - analogy[1] + analogy[3]`` is compared against the
    embedding of ``analogy[2]``.

    Args:
        model: Object whose ``wv`` attribute maps a word to its embedding.
        analogy: Sequence holding (at least) four words.

    Returns:
        The cosine similarity between the combined vector and the embedding of
        the third word.
    """
    # Look the four words up in order, exactly one lookup per word.
    first, second, third, fourth = (
        model.wv[analogy[position]] for position in range(4)
    )

    combined = first - second + fourth
    norm_product = np.linalg.norm(combined) * np.linalg.norm(third)

    return np.dot(combined, third) / norm_product

### Defining evaluation

In [7]:
def build_report(model, test_analogies):
    """Average the analogy similarity per category and overall.

    Samples containing a word that is not in ``model.wv`` are skipped and
    counted; if any were skipped, a single warning line is printed at the end.

    Args:
        model: Object exposing ``wv`` (membership test and word lookup).
        test_analogies: Mapping from category name to a list of samples, each
            sample being a sequence of words.

    Returns:
        A dict with one entry per category (the mean similarity of its usable
        samples, or NaN when the category has none) plus ``"overall_average"``,
        the mean of the per-category values that are not NaN (NaN when there
        are none).
    """
    total_ignored_analogies = 0

    report = {}
    for sub_category, samples in test_analogies.items():
        similarities = []
        for curr_sample in samples:
            if all(word in model.wv for word in curr_sample):
                similarities.append(evaluate_analogy(model, curr_sample))
            else:
                total_ignored_analogies += 1

        # A category with no usable sample has no meaningful mean; mark it NaN
        # explicitly instead of averaging an empty list.
        report[sub_category] = np.average(similarities) if similarities else float("nan")

    if total_ignored_analogies:
        print(
            f"[WARNING] A total of {total_ignored_analogies} samples were ignored because they contained "
            "words out of the model's vocabulary."
        )

    # Leave NaN categories out, otherwise one unscorable category would turn
    # the overall score into NaN as well.
    scored_categories = [value for value in report.values() if not np.isnan(value)]
    report["overall_average"] = (
        np.average(scored_categories) if scored_categories else float("nan")
    )

    return report

### Grid search routine

In [8]:
from itertools import product
import gensim.downloader as gensim_downloader
from gensim.test.utils import datapath

In [9]:
# Load the "text8" dataset through gensim's downloader; this object is the
# corpus handed to the grid search as `train_dataset`.
train_dataset = gensim_downloader.load("text8")

In [10]:
def run_grid_search(
    train_dataset,
    param_grid: dict,
    test_analogies: dict,
    return_best: bool = False,
    min_word_count: int = 0,
    workers: int = 12,
):
    """Train one model per hyper-parameter combination and score each on analogies.

    Args:
        train_dataset: Corpus forwarded unchanged to ``Word2VecModel.train``.
        param_grid: Mapping from parameter name to the list of values to try.
            Must contain the keys ``"model_type"``, ``"window_size"``,
            ``"embedding_size"`` and ``"epochs"``.
        test_analogies: Analogy samples forwarded to ``build_report``.
        return_best: When true, return only the best-scoring entry instead of
            the full list.
        min_word_count: Forwarded to every ``Word2VecModel``.
        workers: Forwarded to every ``Word2VecModel``.

    Returns:
        A list with one dict per combination (keys ``"params"``, ``"score"``
        and ``"full_report"``), or the single best such dict when
        ``return_best`` is true.

    Raises:
        KeyError: If ``param_grid`` lacks one of the required keys.
        ValueError: If ``return_best`` is true and the grid yields no
            combination.
    """
    param_keys = list(param_grid.keys())
    param_combinations = list(product(*param_grid.values()))

    results = []
    for curr_train, params in enumerate(param_combinations):
        param_dict = dict(zip(param_keys, params))

        model = Word2VecModel(
            model_type=param_dict["model_type"],
            window_size=param_dict["window_size"],
            embedding_size=param_dict["embedding_size"],
            min_word_count=min_word_count,
            workers=workers,
        )

        print(
            f"Starting training model {curr_train}. Will train for {param_dict['epochs']} epochs."
        )
        print(
            f"model_type: {model.model_type}, window_size: {model.window_size}, embedding_size: {model.embedding_size}"
        )

        model.train(
            dataset=train_dataset,
            epochs=param_dict["epochs"],
        )

        curr_model_report = build_report(model, test_analogies)
        score = curr_model_report["overall_average"]

        print(f"Final analogies score: {score}\n")

        results.append(
            {"params": param_dict, "score": score, "full_report": curr_model_report}
        )

    if not return_best:
        return results

    # Every comparison against NaN is false, so a NaN score would make max()
    # depend on the order of the results. Rank only the runs with a real score
    # and fall back to the full list when none has one.
    scored_results = [entry for entry in results if not np.isnan(entry["score"])]

    return max(scored_results or results, key=lambda x: x["score"])

In [11]:
%%time

# Search space: every combination of the values below is trained once,
# i.e. 2 * 3 * 3 * 3 = 54 training runs.
param_grid = {
    "model_type": ["skipgram", "cbow"],
    "embedding_size": [5, 25, 100],
    "window_size": [3, 7, 15],
    "epochs": [2, 5, 10],
}

# return_best=False keeps the result of every run, not only the best one.
grid_search_results = run_grid_search(
    train_dataset=train_dataset,
    param_grid=param_grid,
    test_analogies=all_test_analogies,
    return_best=False
)

Starting training model 0. Will train for 2 epochs.
model_type: skipgram, window_size: 3, embedding_size: 5
Loss for epoch #0: 14831020.0
Loss for epoch #1: 11385590.0
Final analogies score: 0.911324679851532

Starting training model 1. Will train for 5 epochs.
model_type: skipgram, window_size: 3, embedding_size: 5
Loss for epoch #0: 14525665.0
Loss for epoch #1: 10843795.0
Loss for epoch #2: 9537772.0
Loss for epoch #3: 5853708.0
Loss for epoch #4: 5772788.0
Final analogies score: 0.9100650548934937

Starting training model 2. Will train for 10 epochs.
model_type: skipgram, window_size: 3, embedding_size: 5
Loss for epoch #0: 14938645.0
Loss for epoch #1: 10696035.0
Loss for epoch #2: 9501100.0
Loss for epoch #3: 6105528.0
Loss for epoch #4: 5940780.0
Loss for epoch #5: 5967468.0
Loss for epoch #6: 5817036.0
Loss for epoch #7: 5854748.0
Loss for epoch #8: 2441372.0
Loss for epoch #9: 237776.0
Final analogies score: 0.9032546281814575

Starting training model 3. Will train for 2 epoch

### Results

In [14]:
# Grid-search entry (params, score, full report) with the highest overall analogy score.
best_model_params = max(grid_search_results, key=lambda x: x["score"])

In [15]:
# Grid-search entry (params, score, full report) with the lowest overall analogy score.
worst_model_params = min(grid_search_results, key=lambda x: x["score"])

In [16]:
# Display the lowest and highest scores as a (worst, best) pair.
worst_model_params["score"], best_model_params["score"]

(0.43733042, 0.91239774)