In [1]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
# dataset settings
dataset_path = "./datasets/wiki_dump.txt"  # text file the words are read from
dataset_chunk_size = 256                   # words requested per chunk
chunks_per_corpus = 6                      # chunks loaded for a single corpus

# word2vec settings
vector_size = 1024  # passed to Word2Vec(vector_size=...)
window = 16         # passed to Word2Vec(window=...)
min_count = 1       # passed to Word2Vec(min_count=...)
workers = 6         # passed to Word2Vec(workers=...)
epochs = 16         # default epochs per corpus in multi_chunk_train

In [3]:
def load_dataset_chunk(path: str, num_words: int, seek_start: int, sep: str = " ") -> tuple[list, bool, int]:
    """
    function to load a chunk of the dataset where the words are separated by "sep"
    (or by any whitespace character) into a list

    parameters:
        path (str): path to the dataset txt file (read as utf-8, undecodable bytes are ignored)
        num_words (int): number of words to load, negative values are treated as 0
        seek_start (int): file position to start pulling words from, negative values are treated as 0
        sep (str, optional): separator in the dataset, defaults to space " "

    returns:
        list: list of strings (loaded words)
        bool: True if the end of the file was hit while reading
        int: file position right after the separator that ends the first word of this chunk,
             pass it back in as seek_start to move 1 word forward; if the file ends before
             the first word is terminated this is the end-of-file position, and if no word
             was requested (num_words == 0) it is seek_start itself
    """

    # some safety checks so later code looks cleaner ;)
    num_words = max(0, num_words)
    seek_start = max(0, seek_start)

    words = []
    pending = []             # characters of the word currently being read
    next_seek = seek_start   # never fall back to 0, that would rewind the caller to the file start
    first_word_done = False

    with open(path, 'r', encoding='utf-8', errors='ignore') as file:
        file.seek(seek_start)

        # read char by char until enough words are collected
        while len(words) < num_words:
            char = file.read(1)

            # end of file, return whatever has been collected
            if not char:
                # keep the last word even if the file has no trailing separator
                if pending:
                    words.append("".join(pending))

                if not first_word_done:
                    next_seek = file.tell()

                return words, True, next_seek

            # regular character, extend the current word
            if char != sep and not char.isspace():
                pending.append(char)
                continue

            # separator without a word in front of it (leading / repeated separators)
            if not pending:
                continue

            words.append("".join(pending))
            pending.clear()

            # the first word is covered, this is where the next chunk is going to be loaded from
            if not first_word_done:
                first_word_done = True
                # tell() instead of counting chars: multi-byte utf-8 characters and "\r\n"
                # make the number of decoded chars differ from the real file position
                next_seek = file.tell()

    return words, False, next_seek

In [4]:
def load_corpus(dataset_path, dataset_start, dataset_chunk_size, chunks_per_corpus):
    """
    function to load up to "chunks_per_corpus" chunks from the dataset as one corpus

    every chunk is read with load_dataset_chunk, starting at the seek position the
    previous chunk returned (the first one starts at "dataset_start"), and stored as
    a single space-joined string; loading stops early once the end of the file is reported

    parameters:
        dataset_path (str): path to the dataset txt file
        dataset_start (int): seek position of the first chunk
        dataset_chunk_size (int): number of words per chunk
        chunks_per_corpus (int): maximum number of chunks to load

    returns:
        list: list of strings, one per loaded chunk
    """
    documents = []
    position = dataset_start

    for _ in range(chunks_per_corpus):
        chunk_words, hit_eof, position = load_dataset_chunk(dataset_path, dataset_chunk_size, position)
        documents.append(" ".join(chunk_words))

        # nothing left to read after this chunk
        if hit_eof:
            break

    return documents

In [5]:
def preprocess_corpus(corpus):
    """
    function to run gensim's simple_preprocess over every document of a corpus

    parameters:
        corpus (iterable): documents (strings) to preprocess

    returns:
        list: one simple_preprocess result per document, in input order
    """
    return list(map(simple_preprocess, corpus))

In [6]:
def multi_chunk_train(model, dataset_path, dataset_chunk_size, chunks_per_corpus, train_corpuses, model_epochs=epochs):
    """
    function to train a model incrementally on consecutive corpuses loaded from the dataset

    the first corpus builds the initial vocabulary; every following corpus first extends
    the vocabulary (build_vocab(update=True)) and is then trained on; corpuses that contain
    no tokens after preprocessing (e.g. once the dataset is exhausted) are skipped

    parameters:
        model: object providing build_vocab(), train() and corpus_count
               (a gensim Word2Vec in this notebook)
        dataset_path (str): path to the dataset txt file
        dataset_chunk_size (int): number of words per chunk, passed on to load_corpus
        chunks_per_corpus (int): number of chunks per corpus, passed on to load_corpus
        train_corpuses (int): total number of corpuses to process, including the first one
        model_epochs (int, optional): epochs per corpus, defaults to the module level
                                      "epochs" (bound when this function is defined)

    returns:
        None, build_vocab() / train() are called on the passed model
    """
    # NOTE(review): chunk_start is handed to load_corpus as dataset_start (a file position),
    # yet it is advanced by dataset_chunk_size * chunks_per_corpus, a product of word / chunk
    # counts; kept as is because load_corpus does not report where it stopped - confirm the
    # intended stride
    corpus_stride = dataset_chunk_size * chunks_per_corpus

    chunk_start = 0
    initial_corpus = load_corpus(dataset_path, chunk_start, dataset_chunk_size, chunks_per_corpus)
    processed_initial_corpus = preprocess_corpus(initial_corpus)
    chunk_start += corpus_stride

    # Build vocabulary with the first corpus
    model.build_vocab(processed_initial_corpus)

    # Train with the first corpus
    model.train(processed_initial_corpus, total_examples=model.corpus_count, epochs=model_epochs)

    for _ in tqdm(range(train_corpuses - 1)):  # already trained on the first corpus
        corpus = load_corpus(dataset_path, chunk_start, dataset_chunk_size, chunks_per_corpus)
        chunk_start += corpus_stride

        processed_corpus = preprocess_corpus(corpus)

        # nothing to learn from (e.g. the start position is past the end of the file):
        # skip instead of running a vocabulary update and training epochs on an empty corpus
        if not any(processed_corpus):
            continue

        # Update vocabulary with the new corpus
        model.build_vocab(processed_corpus, update=True)

        # Train with the new corpus
        model.train(processed_corpus, total_examples=model.corpus_count, epochs=model_epochs)
In [7]:
# build the Word2Vec model from the settings above (no training data is passed here)
model = Word2Vec(
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    workers=workers,
)

In [8]:
# train on 4096 * 2 * 2 = 16384 corpuses in total (long-running cell)
multi_chunk_train(
    model,
    dataset_path,
    dataset_chunk_size,
    chunks_per_corpus,
    train_corpuses=4096 * 2 * 2,
)

  2%|▏         | 290/16383 [01:00<1:16:09,  3.52it/s]

In [None]:
# save model to file (relative path, so it is written to the current working directory)
model.save("wiki_model_7_mini_window.model")

In [None]:
# quick qualitative check: display the words the model ranks as most similar to "mission"
model.wv.similar_by_word("mission")