In [1]:
import logging
import sys


# Timestamp layout for log records: four-digit year, month, day, then time.
# The day field must be %d; %y would print the two-digit year a second time.
date_strftime_format = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(
    stream=sys.stdout,
    level=logging.WARNING,
    format="%(asctime)s %(message)s",
    datefmt=date_strftime_format,
)

# Data

### Training data
- Source: http://mattmahoney.net/dc/text8.zip
- Stored in: `data/train.txt`

### Analogies data
- Source: https://raw.githubusercontent.com/nicholas-leonard/word2vec/refs/heads/master/questions-words.txt
- Stored in: `data/analogies.txt`

# Defining the model

## Model wrapper

In [2]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.word2vec import Text8Corpus

In [3]:
class StoreLossCurveCallback(CallbackAny2Vec):
    """Training callback that records and prints the loss of each epoch.

    The value returned by ``model.get_latest_training_loss()`` is treated as a
    running total, so the per-epoch figure is the difference between the total
    at the end of this epoch and the total at the end of the previous one.
    """

    def __init__(self):
        # Zero-based counter used only to label the printed line.
        self.epoch = 0
        # Running total observed at the end of the previous epoch.
        self.last_logged_loss = 0
        # Per-epoch losses in the order the epochs finished.
        self.loss_curve = []

    def on_epoch_end(self, model):
        """Store and print the loss accumulated during the epoch that just ended.

        Args:
            model: The model being trained; only its
                ``get_latest_training_loss()`` method is used.
        """
        # Read the running total once so the delta and the stored baseline
        # are guaranteed to come from the same value.
        # NOTE(review): gensim is reported to keep this tally in float32, which
        # could make late-epoch deltas imprecise on large corpora — confirm
        # against the gensim version in use.
        cumulative_loss = model.get_latest_training_loss()
        curr_loss = cumulative_loss - self.last_logged_loss
        self.last_logged_loss = cumulative_loss

        self.loss_curve.append(curr_loss)

        print(
            f"Loss for epoch #{self.epoch}: {curr_loss}"
        )

        self.epoch += 1

In [4]:
class Word2VecModel:
    """Convenience wrapper around gensim's ``Word2Vec``.

    Stores the hyper-parameters, trains a model on a Text8-style corpus file
    while recording a per-epoch loss curve, and scores the trained vectors on
    a word-analogy file.
    """

    def __init__(
        self,
        model_type: str,
        window_size: int,
        embedding_size: int,
        min_word_count: int = 0,
    ):
        """Store the hyper-parameters; no training happens here.

        Args:
            model_type: ``"skipgram"`` selects ``sg=1``; any other value
                selects ``sg=0`` (CBOW in gensim).
            window_size: Passed to ``Word2Vec`` as ``window``.
            embedding_size: Passed to ``Word2Vec`` as ``vector_size``.
            min_word_count: Passed to ``Word2Vec`` as ``min_count``.
        """
        self.model_type = model_type
        self._sg = 1 if model_type == "skipgram" else 0
        self.window_size = window_size
        self.embedding_size = embedding_size
        self.min_word_count = min_word_count
        self.compute_loss = True

        # The loss callback is created per training run (see `train`), so a
        # second run never inherits state from the first.
        self._loss_container = None
        self.loss_curve = []

        self.model = None

    def train(
        self,
        training_corpus_fpath: str,
        epochs: int,
        workers: int = 8,
    ):
        """Train a new ``Word2Vec`` model and record its per-epoch losses.

        Each call replaces ``self.model`` and ``self.loss_curve``.

        Args:
            training_corpus_fpath: Path to the corpus, read with ``Text8Corpus``.
            epochs: Number of training epochs.
            workers: Number of worker threads handed to ``Word2Vec``.
        """
        # A fresh callback per run: reusing one instance would carry over the
        # epoch counter, the previous run's cumulative loss (producing a bogus
        # first-epoch delta) and the previous run's loss values.
        self._loss_container = StoreLossCurveCallback()

        self.model = Word2Vec(
            sentences=Text8Corpus(fname=training_corpus_fpath),
            sg=self._sg,
            window=self.window_size,
            vector_size=self.embedding_size,
            epochs=epochs,
            min_count=self.min_word_count,
            compute_loss=self.compute_loss,
            callbacks=[self._loss_container],
            workers=workers,
        )

        self.loss_curve = self._loss_container.loss_curve

    def score(
        self,
        test_analogies_fpath: str,
        return_test_sections: bool = True,
    ):
        """Evaluate the trained vectors on a word-analogy file.

        Args:
            test_analogies_fpath: Path handed to
                ``wv.evaluate_word_analogies``.
            return_test_sections: When true, also return the per-section
                results produced by the evaluation.

        Returns:
            ``(score, sections)`` when ``return_test_sections`` is true,
            otherwise just ``score``.

        Raises:
            RuntimeError: If ``train`` has not been called yet.
        """
        # Explicit None check: "untrained" means exactly "no model object",
        # independent of how the model object evaluates in a boolean context.
        if self.model is None:
            raise RuntimeError("Model not trained. Call `self.train` before calculating score.")

        score, sections = self.model.wv.evaluate_word_analogies(
            test_analogies_fpath,
        )

        if return_test_sections:
            return score, sections

        return score


## Searching for the best hyperparameter configuration

### Grid search routine

In [5]:
from itertools import product

In [6]:
def run_grid_search(
    param_grid: dict,
    param_conditions_callback: callable = None,
    return_best: bool = False,
    training_corpus_fpath: str = "../data/train.txt",
    test_analogies_fpath: str = "../data/analogies.txt",
):
    """Train and score one ``Word2VecModel`` per accepted parameter combination.

    Combinations are the Cartesian product of the value lists in
    ``param_grid``, visited in the order the keys appear.

    Args:
        param_grid: Maps parameter name to a list of candidate values. Must
            contain ``"model_type"``, ``"window_size"``, ``"embedding_size"``
            and ``"epochs"``.
        param_conditions_callback: Optional predicate taking one combination
            (as a dict) and returning whether it should be evaluated. When
            ``None``, every combination is evaluated.
        return_best: When true, return only the highest-scoring entry instead
            of the full list.
        training_corpus_fpath: Corpus file passed to ``Word2VecModel.train``.
        test_analogies_fpath: Analogy file passed to ``Word2VecModel.score``.

    Returns:
        A list of ``{"params": ..., "score": ...}`` dicts, or the single
        best such dict when ``return_best`` is true.

    Raises:
        ValueError: If ``return_best`` is true but no combination was
            evaluated (empty grid or every combination rejected).
    """
    param_keys = list(param_grid.keys())

    results = []
    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_keys, params))

        # The callback is optional; only filter when one was supplied.
        if param_conditions_callback is not None and not param_conditions_callback(param_dict):
            continue

        model = Word2VecModel(
            model_type=param_dict["model_type"],
            window_size=param_dict["window_size"],
            embedding_size=param_dict["embedding_size"],
        )

        print("Starting training...")

        model.train(
            training_corpus_fpath=training_corpus_fpath,
            epochs=param_dict["epochs"],
        )

        score = model.score(
            test_analogies_fpath=test_analogies_fpath,
            return_test_sections=False,
        )

        print(
            f"model_type: {model.model_type}, window_size: {model.window_size}, embedding_size: {model.embedding_size}"
        )
        print(
            f"Final score: {score}\n"
        )

        results.append({"params": param_dict, "score": score})

    if not return_best:
        return results

    # max() on an empty list raises an opaque ValueError; fail with a message
    # that explains what actually went wrong.
    if not results:
        raise ValueError(
            "No parameter combination was evaluated; cannot select the best result."
        )

    return max(results, key=lambda x: x["score"])

In [7]:
def is_valid_param_combination(selected_params: dict):
    """Tell whether a parameter combination should be evaluated.

    A combination is accepted when
    ``window_size <= epochs <= embedding_size``.

    Args:
        selected_params: One combination; must provide ``"window_size"`` and
            ``"epochs"``, and ``"embedding_size"`` whenever the first
            comparison holds.

    Returns:
        The result of the ordering check.
    """
    window = selected_params["window_size"]
    n_epochs = selected_params["epochs"]

    # Stop early when the lower bound fails; the upper bound is then never
    # looked up.
    if not window <= n_epochs:
        return False

    return n_epochs <= selected_params["embedding_size"]

In [8]:
# Candidate values per hyper-parameter. Key order matters: combinations are
# generated from the value lists in this order.
param_grid = dict(
    model_type=["skipgram", "cbow"],
    embedding_size=[5, 10, 15],
    window_size=[3, 5],
    epochs=[5, 10],
)

# return_best=True makes the search hand back only the top-scoring entry,
# so this name holds a single {"params", "score"} dict rather than a list.
grid_search_results = run_grid_search(
    param_grid,
    is_valid_param_combination,
    return_best=True,
)

grid_search_results

Starting training...
Loss for epoch #0: 13289499.0
Loss for epoch #1: 9969957.0
Loss for epoch #2: 9446364.0
Loss for epoch #3: 5579412.0
Loss for epoch #4: 5070080.0
model_type: skipgram, window_size: 3, embedding_size: 5
Final score: 0.0020935831675913327

Starting training...
Loss for epoch #0: 18343570.0
Loss for epoch #1: 13659034.0
Loss for epoch #2: 8358636.0
Loss for epoch #3: 7761824.0
Loss for epoch #4: 7601640.0
model_type: skipgram, window_size: 5, embedding_size: 5
Final score: 0.0015178477965037162

Starting training...
Loss for epoch #0: 13067163.0
Loss for epoch #1: 9703455.0
Loss for epoch #2: 9153930.0
Loss for epoch #3: 5764720.0
Loss for epoch #4: 5041528.0
model_type: skipgram, window_size: 3, embedding_size: 10
Final score: 0.01941798387940961

Starting training...
Loss for epoch #0: 13092216.0
Loss for epoch #1: 9617222.0
Loss for epoch #2: 8903884.0
Loss for epoch #3: 5966086.0
Loss for epoch #4: 5002676.0
Loss for epoch #5: 5018288.0
Loss for epoch #6: 5039028.

{'params': {'model_type': 'skipgram',
  'embedding_size': 15,
  'window_size': 5,
  'epochs': 10},
 'score': 0.05767821626714121}