In [1]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from tqdm import tqdm
import tokenizer



In [2]:
# dataset settings
dataset_path = fr"./datasets/ultra_train.txt"

chunk_size = 1024# * 8 * 8
corpus_size = 4

# model hyperparams
vector_size = 4096      # Dimensionality of the word vectors
window = 10             # Maximum distance between the current and predicted word within a sentence
min_count = 2           # Ignores all words with total frequency lower than this
workers = 1             # Number of worker threads to train the model
sg = 0                  # Training algorithm: 1 for skip-gram; 0 for CBOW
hs = 0                  # If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
negative = 28           # If > 0, negative sampling will be used. The int for negative specifies how many "noise words" should be drawn
epochs = 8              # Number of iterations (epochs) over the corpus
alpha = 0.025           # The initial learning rate
min_alpha = 0.0001      # The minimum learning rate

In [3]:
# model init
model = Word2Vec(
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    workers=workers,
    sg=sg,
    hs=hs,
    negative=negative,
    alpha=alpha,
    min_alpha=min_alpha
)

#model = Word2Vec.load(fr"./embedding_models/360vec_checkpt1.model")

In [4]:
class training_corpus:
    def __init__(self):
        self.corpus = []
        self.size = 0
        self.corpus_to_size = False
        self.all_chunks_to_size = True
    
    def add_chunk(self, chunk):
        self.corpus.append(chunk)
        
        self.size += 1
        self.corpus_to_size = self.size == corpus_size
        self.all_chunks_to_size = self.all_chunks_to_size and "".join(chunk) == chunk_size
    
    def corpus_ok(self):
        return self.corpus_to_size and self.all_chunks_to_size
    
    def __iter__(self):
        for chunk in self.corpus:
            yield chunk
    
    def __getitem__(self, idx):
        return self.corpus[idx]

In [5]:
def read_chunk(read_start: int, chunk_size: int=chunk_size, path: str=dataset_path) -> tuple[str, bool]:
    """
    Reads a certain number of characters from a file starting at the specified position.
    
    Args:
        read_start (int): The position in the file to start reading from.
        chunk_size (int, optional): The number of characters to read. Defaults to `chunk_size`.
        path (str, optional): The path to the dataset text file. Defaults to `dataset_path`.
    
    Returns:
        tuple[str, bool]: A tuple containing the loaded chunk and a boolean indicating if EOF is hit.
    """
    
    with open(path, 'r', encoding='utf-8', errors='ignore') as file:
        file.seek(read_start)
        chunk = file.read(chunk_size)
        
        # eof hit check
        if not chunk:
            return "", True
        
        return chunk, False

In [6]:
def build_corpus(read_start: int, corpus_size: int=corpus_size, chunk_size: int=chunk_size, path: str=dataset_path) -> tuple[training_corpus, bool]:
    corpus = training_corpus()
    
    for _ in range(corpus_size):
        chunk, eof = read_chunk(read_start, chunk_size, path)
        
        # check eof
        if eof:
            return corpus, True
        
        corpus.add_chunk(tokenizer.tokenize_segment(chunk))
        
        read_start += chunk_size
        
    return corpus, False

In [7]:
def train_corpus(model: Word2Vec, corpus: training_corpus, first=False):
    """
    Train a Word2Vec model on a given corpus chunk.

    Args:
        model (Word2Vec): The Word2Vec model to train.
        corpus (list): A list of tokenized sentences to train on.
        first (bool): If True, build the vocabulary from the corpus. Use this only for the first chunk.

    Returns:
        None
    """
    
    if first:
        model.build_vocab(corpus.corpus)
        total_examples = model.corpus_count
    else:
        # Update the vocabulary with new words from the corpus
        model.build_vocab(corpus.corpus, update=True)
        total_examples = model.corpus_count
    
    model.train(corpus.corpus, total_examples=total_examples, epochs=model.epochs)

In [8]:
def full_train(model, dataset_path, train_corpuses):
    read_start = 0
    corpus = training_corpus()
    
    # first train
    corpus, eof = build_corpus(read_start, corpus_size, chunk_size, dataset_path)
    train_corpus(model, corpus, first=True)
    
    for _ in tqdm(range(train_corpuses)):
        corpus, eof = build_corpus(read_start, corpus_size, chunk_size, dataset_path)
        
        if eof:
            break
        
        if _ > 85858 or True:
            train_corpus(model, corpus)
        
        read_start += corpus_size * chunk_size

In [9]:
full_train(model, dataset_path, 6000 * 16 * 2)

  0%|          | 1/192000 [00:00<6:25:24,  8.30it/s]

  1%|▏         | 2755/192000 [17:06<19:35:38,  2.68it/s]


KeyboardInterrupt: 

In [10]:
# benchmark:
# 40 min train   /   580 examples

In [18]:
model.wv.similar_by_word(" when")

[(' could', 0.9956420660018921),
 (' with', 0.9951943755149841),
 (' those', 0.9949793815612793),
 (' one', 0.9946724772453308),
 (' idea', 0.9944334030151367),
 (' fan', 0.9943879842758179),
 (' enjoy', 0.9936926960945129),
 (' make', 0.9914247989654541),
 (' most', 0.9898692965507507),
 (' broadcast', 0.9895931482315063)]

In [12]:
model.save(fr"./embedding_models/4096vec_checkpt1.model")

In [11]:
len(model.wv)

4901