In [1]:
from gensim.models.word2vec import LineSentence
from gensim.models import FastText
from gensim.models.callbacks import CallbackAny2Vec
import matplotlib.pyplot as plt

In [2]:
import logging
logging.basicConfig(level=logging.INFO, force = True)
logger = logging.getLogger()
logger.info("Logging initialized")

INFO:root:Logging initialized


In [3]:
loss_list = []
class Callback(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        loss_list.append(loss)
        logger.info('Loss after epoch {}:{}'.format(self.epoch, loss))
        model.running_training_loss = 0.0
        self.epoch = self.epoch + 1

In [4]:
model = FastText(vector_size=100, window=3, min_count=10, workers=5)

INFO:gensim.utils:FastText lifecycle event {'params': 'FastText<vocab=0, vector_size=100, alpha=0.025>', 'datetime': '2023-05-20T13:18:43.498962', 'gensim': '4.3.1', 'python': '3.11.3 | packaged by conda-forge | (main, Apr  6 2023, 08:58:31) [Clang 14.0.6 ]', 'platform': 'macOS-12.6-arm64-arm-64bit', 'event': 'created'}


In [5]:
sentences = LineSentence('processed-corpus.txt')

In [6]:
model.build_vocab(sentences, progress_per=5000000)

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5000000, processed 40831628 words, keeping 101552 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000000, processed 81753225 words, keeping 114802 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #15000000, processed 122575037 words, keeping 122333 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000000, processed 163439274 words, keeping 127317 word types
INFO:gensim.models.word2vec:collected 130029 word types from a corpus of 191558143 raw words and 23453583 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:FastText lifecycle event {'msg': 'effective_min_count=10 retains 79373 unique words (61.04% of original 130029, drops 50656)', 'datetime': '2023-05-20T13:19:29.175929', 'gensim': '4.3.1', 'python':

In [7]:
# we override alpha with small values similar to word2vec models
# 100 epochs is also empirical value from word2vec, since loss value is not reported for fasttext
model.train(sentences, epochs=100, start_alpha=0.0001, end_alpha=0.00001, total_examples=model.corpus_count, total_words=model.corpus_total_words, compute_loss=True, report_delay=300)


INFO:gensim.utils:FastText lifecycle event {'msg': 'training model with 5 workers on 79373 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=3 shrink_windows=True', 'datetime': '2023-05-20T13:19:34.289790', 'gensim': '4.3.1', 'python': '3.11.3 | packaged by conda-forge | (main, Apr  6 2023, 08:58:31) [Clang 14.0.6 ]', 'platform': 'macOS-12.6-arm64-arm-64bit', 'event': 'train'}
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 0.40% examples, 732662 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0: training on 191558143 raw words (185893888 effective words) took 246.6s, 753712 effective words/s
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 0.35% examples, 639675 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1: training on 191558143 raw words (185893837 effective words) took 248.4s, 748253 effective words/s
INFO:gensim.models.word2vec:EPOCH 2 - PROGRESS: at 0.42% examples, 763216 words/s, in_qsize 9, out_qsize 0
INF

INFO:gensim.models.word2vec:EPOCH 32 - PROGRESS: at 0.39% examples, 705844 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 32: training on 191558143 raw words (185895881 effective words) took 250.4s, 742544 effective words/s
INFO:gensim.models.word2vec:EPOCH 33 - PROGRESS: at 0.38% examples, 688251 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 33: training on 191558143 raw words (185892281 effective words) took 259.9s, 715299 effective words/s
INFO:gensim.models.word2vec:EPOCH 34 - PROGRESS: at 0.40% examples, 733238 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 34: training on 191558143 raw words (185895952 effective words) took 246.7s, 753456 effective words/s
INFO:gensim.models.word2vec:EPOCH 35 - PROGRESS: at 0.41% examples, 755445 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 35: training on 191558143 raw words (185896154 effective words) took 240.5s, 772865 effective words/s
INFO:gensim.models.word2

INFO:gensim.models.word2vec:EPOCH 66 - PROGRESS: at 0.40% examples, 742358 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 66: training on 191558143 raw words (185892490 effective words) took 247.1s, 752254 effective words/s
INFO:gensim.models.word2vec:EPOCH 67 - PROGRESS: at 0.41% examples, 739470 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 67: training on 191558143 raw words (185893759 effective words) took 244.8s, 759357 effective words/s
INFO:gensim.models.word2vec:EPOCH 68 - PROGRESS: at 0.39% examples, 724124 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 68: training on 191558143 raw words (185892726 effective words) took 242.7s, 765965 effective words/s
INFO:gensim.models.word2vec:EPOCH 69 - PROGRESS: at 0.39% examples, 722224 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 69: training on 191558143 raw words (185891059 effective words) took 241.9s, 768550 effective words/s
INFO:gensim.models.word2

INFO:gensim.models.word2vec:EPOCH 99 - PROGRESS: at 0.39% examples, 716416 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 99: training on 191558143 raw words (185892329 effective words) took 245.7s, 756708 effective words/s
INFO:gensim.utils:FastText lifecycle event {'msg': 'training on 19155814300 raw words (18589328873 effective words) took 25039.5s, 742399 effective words/s', 'datetime': '2023-05-20T22:13:49.995920', 'gensim': '4.3.1', 'python': '3.11.3 | packaged by conda-forge | (main, Apr  6 2023, 08:58:31) [Clang 14.0.6 ]', 'platform': 'macOS-12.6-arm64-arm-64bit', 'event': 'train'}


(18589328873, 19155814300)

In [16]:
model.wv.most_similar('сабака', topn=20)

[('табака', 0.860010027885437),
 ('бабашка', 0.8290190100669861),
 ('бака', 0.8221210837364197),
 ('кармушка', 0.8011088967323303),
 ('мядзянка', 0.8001713752746582),
 ('лядзянка', 0.7990681529045105),
 ('мішка', 0.7979294657707214),
 ('ахранка', 0.7974076271057129),
 ('сашчэпка', 0.797387421131134),
 ('асірыйка', 0.7863711714744568),
 ('казашка', 0.786071240901947),
 ('кадушка', 0.7843467593193054),
 ('табакерка', 0.784105122089386),
 ('мігалка', 0.7837947607040405),
 ('башка', 0.7831396460533142),
 ('цыганка', 0.782363772392273),
 ('заглушка', 0.7811365127563477),
 ('фінціфлюшка', 0.7808333039283752),
 ('булавешка', 0.7807796597480774),
 ('сасёнка', 0.7807261347770691)]

What?! Looks very bad, just playing with letters :(

In [12]:
model.wv.index_to_key[:10]

['быць',
 'год',
 'беларускі',
 'беларусь',
 'чалавек',
 'магчы',
 'час',
 'усё',
 'дзень',
 'большыць']

In [14]:
model.save("fasttext-cc100-d100-w3-min10.model")

INFO:gensim.utils:FastText lifecycle event {'fname_or_handle': 'fasttext-cc100-d100-w3-min10.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-05-20T23:19:08.591239', 'gensim': '4.3.1', 'python': '3.11.3 | packaged by conda-forge | (main, Apr  6 2023, 08:58:31) [Clang 14.0.6 ]', 'platform': 'macOS-12.6-arm64-arm-64bit', 'event': 'saving'}
INFO:gensim.utils:storing np array 'vectors_ngrams' to fasttext-cc100-d100-w3-min10.model.wv.vectors_ngrams.npy
INFO:gensim.utils:not storing attribute vectors
INFO:gensim.utils:not storing attribute buckets_word
INFO:gensim.utils:not storing attribute cum_table
INFO:gensim.utils:saved fasttext-cc100-d100-w3-min10.model


In [13]:
model.wv.save_word2vec_format('fasttext-cc100-d100-w3-min10.vectors')

INFO:gensim.models.keyedvectors:storing 79373x100 projection weights into fasttext-cc100-d100-w3-min10.vectors
