Corpus (be.txt) taken from CC100 dataset: https://metatext.io/datasets/cc100-belarusian

In [16]:
import re
import gensim
from gensim.parsing.preprocessing import strip_punctuation, strip_short, strip_numeric, strip_multiple_whitespaces, remove_stopwords
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

In [17]:
import logging
logging.basicConfig(level=logging.INFO, force = True)
logger = logging.getLogger()
logger.info("Logging initialized")

INFO:root:Logging initialized


In [68]:
import xml.dom.minidom
from itertools import islice

def calculate_mapping_from_forms_to_base(filepath):
    xml_doc = xml.dom.minidom.parse(filepath)
    words = xml_doc.getElementsByTagName('Variant')
    result = {}
    collision_count = 0
    collisions = set()
    for word in words:
        base = word.getAttribute('lemma').replace("+", "")
        forms_objs = word.getElementsByTagName('Form')
        local_map = {}
        citation_count = max([forms_obj.getAttribute('slouniki').count(',') for forms_obj in forms_objs]) + 1
        for forms_obj in forms_objs:
            if (len(forms_obj.childNodes) == 0):
                logger.info(f"Empty for {base}")
            else:
                form = forms_obj.childNodes[0].data.replace("+", "")
                #if citation_count > 3:
                local_map[form] = (base, citation_count)
        for k, v in local_map.items():
            if k in result:
                if result[k][1] == v[1] and result[k][0] != v[0]:
                    collision_count += 1
                    collisions.add(v[0])
                    collisions.add(result[k][0])
                elif result[k][1] < v[1]:
                    result[k] = v
            else:
                result[k] = v
        #result.update(local_map)
    logger.info(f"Collisions (forms leading to different base word, and having same amount of citation): {collision_count}")
    logger.info(f"Examples of collisions: {list(islice(collisions, 5))}")
    for k in result:
        result[k] = result[k][0]
    return result

In [69]:
n1 = calculate_mapping_from_forms_to_base('GrammarDB-PUBLICATION_2021/N1.xml')
n2 = calculate_mapping_from_forms_to_base('GrammarDB-PUBLICATION_2021/N2.xml')
n3 = calculate_mapping_from_forms_to_base('GrammarDB-PUBLICATION_2021/N3.xml')
adj1 = calculate_mapping_from_forms_to_base('GrammarDB-PUBLICATION_2021/A1.xml')
adj2 = calculate_mapping_from_forms_to_base('GrammarDB-PUBLICATION_2021/A2.xml')

WORD_MAP = {}
WORD_MAP.update(n1)
WORD_MAP.update(n2)
WORD_MAP.update(n3)
WORD_MAP.update(adj1)
WORD_MAP.update(adj2)

INFO:root:Empty for бібікі
INFO:root:Empty for бібікі
INFO:root:Empty for бібікі
INFO:root:Empty for бібікі
INFO:root:Collisions (forms leading to different base word, and having same amount of citation): 1345
INFO:root:Examples of collisions: ['імерэцінка', 'жаўталіст', 'гераін', 'века', 'абісінец']
INFO:root:Empty for лапта
INFO:root:Empty for мальба
INFO:root:Collisions (forms leading to different base word, and having same amount of citation): 1155
INFO:root:Examples of collisions: ['мальдыўка', 'неспадзеў', 'корм', 'конь', 'падыспытная']
INFO:root:Empty for тамада
INFO:root:Empty for тамада
INFO:root:Empty for фата
INFO:root:Empty for чака
INFO:root:Collisions (forms leading to different base word, and having same amount of citation): 954
INFO:root:Examples of collisions: ['тальк', 'цэфалаподы', 'развалы', 'сума', 'страйкам']
INFO:root:Collisions (forms leading to different base word, and having same amount of citation): 142
INFO:root:Examples of collisions: ['вітавы', 'крэчатавы'

In [70]:
print(len(WORD_MAP))

1567761


In [71]:
WORD_MAP['рухам']

'рух'

In [72]:
def strip_trailing_newline(iterable):
    for i in iterable:
        yield i.rstrip()

CHARACTERS_MAP = {'ý': 'ў', 'i': 'і', 'ньн': 'нн', 'цьц': 'цц', 'сьц': 'сц', 'сьл':'сл'}
def replace_characters(iterable):
    for s in iterable:
        for k, v in CHARACTERS_MAP.items():
            s = s.replace(k, v)
        yield s

def split_sentences(iterable):
    for i in iterable:
        merged_dots = re.sub("[\.]+", ".", i)
        sentences = merged_dots.split('.')
        for s in sentences:
            yield s

STOPWORDS = ['на', 'не', 'што', 'да', 'па', 'як', 'за', 'для', 'гэта', 'ад', 'але', 'калі', 'пра', 'у', 'яго', 'якія', 'ён', 'іх', 'мы', 'каб', 'толькі', 'аб', 'таксама', 'які', 'ці', 'быў', 'было', 'яны', 'так', 'вы', 'яе', 'будзе', 'пры', 'яшчэ', 'тым', 'таму', 'вельмі', 'былі', 'можна', 'яна', 'пасля', 'пад', 'можа', 'дзе', 'якая', 'тут', 'была', 'трэба', 'тое', 'таго', 'або', 'гэтым', 'бо', 'ўсё', 'хто', 'ня', 'нас', 'гэтага', 'быць', 'гэты', 'ёсць', 'праз', 'ўжо', 'са', 'нават', 'то', 'мяне', 'ім','ва', 'той', 'усе', 'без', 'чым', 'мне', 'мае', 'сябе', 'гэтай', 'там', 'усё', 'вось', 'ды', 'каля', 'якіх', 'ты', 'якой', 'ўсе', 'жа', 'ужо', 'паводле', 'будуць', 'аднак', 'могуць', 'сваю', 'ні', 'сваёй', 'яму', 'свае', 'гэтыя', 'проста', 'ўсіх', 'якім', 'падчас', 'тады', 'свой', 'вас', 'паміж', 'нам', 'раз', 'сваіх', 'нашай', 'менавіта', 'перад', 'вам', 'тых','зь','такім', 'свайго', 'над', 'зараз', 'амаль', 'чаму', 'ёй', 'чынам', 'напрыклад', 'якога', 'якое', 'сваім', 'можаце', 'га', 'хоць', 'бы', 'тыя', 'такія', 'потым', 'адным', 'такі', 'якую', 'сабе','сам','гэтых','мая','наш','зусім','чаго','наша','зноў','дык','такіх','нашага','адразу','каго','самі','ст','ну','нашы','нашым','самы','яно','гэтае','дзеля','куды','by','гг']
def preprocess_sentences(iterable):
    for i in iterable:
        s = strip_multiple_whitespaces(strip_numeric(strip_short(strip_punctuation(i))))
        s = s.lower()
        s = re.sub("[«»“”„…—°′²]", "", s)
        s = remove_stopwords(s, stopwords=STOPWORDS)
        s = ' '.join([WORD_MAP.get(w, w) for w in s.split()])
        yield s

def remove_short_lines(iterable):
    for i in iterable:
        if not i.isspace() and len(i) >= 20:
            yield i


In [73]:
with open('be.txt', 'r') as original_file:
    with open('sentences.txt', 'w') as sentences_file:
        for s in remove_short_lines(preprocess_sentences(split_sentences(replace_characters(strip_trailing_newline(original_file))))):
            sentences_file.write(s + "\n")

In [74]:
loss_list = []
class Callback(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        loss_list.append(loss)
        print('Loss after epoch {}:{}'.format(self.epoch, loss))
        model.running_training_loss = 0.0
        self.epoch = self.epoch + 1

In [75]:
model = Word2Vec(vector_size=100, window=3, min_count=10, workers=5)

INFO:gensim.utils:Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=100, alpha=0.025>', 'datetime': '2023-04-22T18:18:42.284078', 'gensim': '4.3.1', 'python': '3.11.3 | packaged by conda-forge | (main, Apr  6 2023, 09:05:00) [Clang 14.0.6 ]', 'platform': 'macOS-12.6-x86_64-i386-64bit', 'event': 'created'}


In [76]:
sentences = LineSentence('sentences.txt')

In [77]:
model.build_vocab(sentences, progress_per=5000000)

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5000000, processed 44754492 words, keeping 895623 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000000, processed 89584467 words, keeping 1350126 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #15000000, processed 134296356 words, keeping 1699597 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000000, processed 179110867 words, keeping 1997894 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #25000000, processed 223797301 words, keeping 2271233 word types
INFO:gensim.models.word2vec:collected 2286211 word types from a corpus of 226493880 raw words and 25306752 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 328798 unique

In [78]:
model.train(sentences, epochs=50, total_examples=model.corpus_count, total_words=model.corpus_total_words, compute_loss=True, report_delay=120, callbacks=[Callback()])


INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training model with 5 workers on 328798 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=3 shrink_windows=True', 'datetime': '2023-04-22T18:20:09.631800', 'gensim': '4.3.1', 'python': '3.11.3 | packaged by conda-forge | (main, Apr  6 2023, 09:05:00) [Clang 14.0.6 ]', 'platform': 'macOS-12.6-x86_64-i386-64bit', 'event': 'train'}
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 0.58% examples, 1265770 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 47.87% examples, 871369 words/s, in_qsize 10, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 94.48% examples, 861497 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0: training on 226493880 raw words (220063745 effective words) took 250.9s, 877088 effective words/s


Loss after epoch 0:45268440.0


INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 0.57% examples, 1243133 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 52.18% examples, 949579 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1: training on 226493880 raw words (220060555 effective words) took 230.1s, 956426 effective words/s


Loss after epoch 1:43138140.0


INFO:gensim.models.word2vec:EPOCH 2 - PROGRESS: at 0.50% examples, 89227 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 2 - PROGRESS: at 48.43% examples, 806345 words/s, in_qsize 8, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 2 - PROGRESS: at 98.56% examples, 859817 words/s, in_qsize 9, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 2: training on 226493880 raw words (220059964 effective words) took 254.7s, 864026 effective words/s


Loss after epoch 2:43365704.0


INFO:gensim.models.word2vec:EPOCH 3 - PROGRESS: at 0.59% examples, 1293767 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 3 - PROGRESS: at 50.86% examples, 925457 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 3 - PROGRESS: at 94.14% examples, 860057 words/s, in_qsize 8, out_qsize 2
INFO:gensim.models.word2vec:EPOCH 3: training on 226493880 raw words (220059516 effective words) took 266.8s, 824842 effective words/s


Loss after epoch 3:43203232.0


INFO:gensim.models.word2vec:EPOCH 4 - PROGRESS: at 0.55% examples, 1212459 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 4 - PROGRESS: at 46.38% examples, 844114 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 4 - PROGRESS: at 92.76% examples, 847382 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 4: training on 226493880 raw words (220060783 effective words) took 253.2s, 869252 effective words/s


Loss after epoch 4:43267796.0


INFO:gensim.models.word2vec:EPOCH 5 - PROGRESS: at 0.59% examples, 1275981 words/s, in_qsize 9, out_qsize 3
INFO:gensim.models.word2vec:EPOCH 5 - PROGRESS: at 44.46% examples, 809298 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 5 - PROGRESS: at 87.13% examples, 795900 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 5: training on 226493880 raw words (220060660 effective words) took 275.9s, 797644 effective words/s


Loss after epoch 5:43275668.0


INFO:gensim.models.word2vec:EPOCH 6 - PROGRESS: at 0.50% examples, 1087045 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 6 - PROGRESS: at 48.90% examples, 890095 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 6 - PROGRESS: at 96.07% examples, 877607 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 6: training on 226493880 raw words (220061714 effective words) took 248.0s, 887471 effective words/s


Loss after epoch 6:43009856.0


INFO:gensim.models.word2vec:EPOCH 7 - PROGRESS: at 0.56% examples, 1221637 words/s, in_qsize 10, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 7 - PROGRESS: at 46.54% examples, 847129 words/s, in_qsize 8, out_qsize 3
INFO:gensim.models.word2vec:EPOCH 7 - PROGRESS: at 93.88% examples, 832538 words/s, in_qsize 9, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 7: training on 226493880 raw words (220060657 effective words) took 258.7s, 850617 effective words/s


Loss after epoch 7:43133328.0


INFO:gensim.models.word2vec:EPOCH 8 - PROGRESS: at 0.59% examples, 1298293 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 8 - PROGRESS: at 48.26% examples, 878538 words/s, in_qsize 9, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 8 - PROGRESS: at 97.50% examples, 890529 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 8: training on 226493880 raw words (220060562 effective words) took 245.1s, 897953 effective words/s


Loss after epoch 8:43170964.0


INFO:gensim.models.word2vec:EPOCH 9 - PROGRESS: at 0.10% examples, 13837 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 9 - PROGRESS: at 45.62% examples, 734035 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 9 - PROGRESS: at 93.24% examples, 799107 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 9: training on 226493880 raw words (220059147 effective words) took 268.0s, 820976 effective words/s


Loss after epoch 9:43238204.0


INFO:gensim.models.word2vec:EPOCH 10 - PROGRESS: at 0.60% examples, 1308078 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 10 - PROGRESS: at 49.91% examples, 908283 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 10 - PROGRESS: at 97.06% examples, 880735 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 10: training on 226493880 raw words (220059421 effective words) took 247.7s, 888543 effective words/s


Loss after epoch 10:42894520.0


INFO:gensim.models.word2vec:EPOCH 11 - PROGRESS: at 0.61% examples, 1319431 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 11 - PROGRESS: at 53.17% examples, 967389 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 11: training on 226493880 raw words (220061907 effective words) took 236.0s, 932398 effective words/s


Loss after epoch 11:43087872.0


INFO:gensim.models.word2vec:EPOCH 12 - PROGRESS: at 0.56% examples, 1232022 words/s, in_qsize 7, out_qsize 2
INFO:gensim.models.word2vec:EPOCH 12 - PROGRESS: at 46.11% examples, 839339 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 12 - PROGRESS: at 94.16% examples, 841749 words/s, in_qsize 7, out_qsize 2
INFO:gensim.models.word2vec:EPOCH 12: training on 226493880 raw words (220061540 effective words) took 256.2s, 858805 effective words/s


Loss after epoch 12:42866644.0


INFO:gensim.models.word2vec:EPOCH 13 - PROGRESS: at 0.61% examples, 1328466 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 13 - PROGRESS: at 51.30% examples, 933569 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 13 - PROGRESS: at 96.23% examples, 879017 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 13: training on 226493880 raw words (220059684 effective words) took 247.7s, 888510 effective words/s


Loss after epoch 13:42945276.0


INFO:gensim.models.word2vec:EPOCH 14 - PROGRESS: at 0.58% examples, 1266587 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 14 - PROGRESS: at 45.26% examples, 823850 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 14 - PROGRESS: at 91.75% examples, 809636 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 14: training on 226493880 raw words (220060655 effective words) took 263.6s, 834964 effective words/s


Loss after epoch 14:42836076.0


INFO:gensim.models.word2vec:EPOCH 15 - PROGRESS: at 0.56% examples, 1227227 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 15 - PROGRESS: at 50.69% examples, 922381 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 15: training on 226493880 raw words (220059199 effective words) took 234.4s, 939005 effective words/s


Loss after epoch 15:42672528.0


INFO:gensim.models.word2vec:EPOCH 16 - PROGRESS: at 0.58% examples, 1271720 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 16 - PROGRESS: at 58.09% examples, 998106 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 16: training on 226493880 raw words (220060059 effective words) took 213.4s, 1031283 effective words/s


Loss after epoch 16:42711492.0


INFO:gensim.models.word2vec:EPOCH 17 - PROGRESS: at 0.59% examples, 1273395 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 17 - PROGRESS: at 52.97% examples, 963624 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 17: training on 226493880 raw words (220061066 effective words) took 222.5s, 989220 effective words/s


Loss after epoch 17:42722508.0


INFO:gensim.models.word2vec:EPOCH 18 - PROGRESS: at 0.58% examples, 1241038 words/s, in_qsize 8, out_qsize 3
INFO:gensim.models.word2vec:EPOCH 18 - PROGRESS: at 54.73% examples, 984304 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 18: training on 226493880 raw words (220062642 effective words) took 216.1s, 1018454 effective words/s


Loss after epoch 18:42639984.0


INFO:gensim.models.word2vec:EPOCH 19 - PROGRESS: at 0.46% examples, 125420 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 19 - PROGRESS: at 49.07% examples, 843882 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 19: training on 226493880 raw words (220060398 effective words) took 240.5s, 914918 effective words/s


Loss after epoch 19:42465852.0


INFO:gensim.models.word2vec:EPOCH 20 - PROGRESS: at 0.58% examples, 1260218 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 20 - PROGRESS: at 52.28% examples, 911889 words/s, in_qsize 8, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 20: training on 226493880 raw words (220060649 effective words) took 234.0s, 940524 effective words/s


Loss after epoch 20:42260604.0


INFO:gensim.models.word2vec:EPOCH 21 - PROGRESS: at 0.58% examples, 1272070 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 21 - PROGRESS: at 55.83% examples, 1015711 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 21: training on 226493880 raw words (220061662 effective words) took 226.8s, 970491 effective words/s


Loss after epoch 21:42219876.0


INFO:gensim.models.word2vec:EPOCH 22 - PROGRESS: at 0.56% examples, 1223068 words/s, in_qsize 9, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 22 - PROGRESS: at 47.99% examples, 873623 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 22 - PROGRESS: at 96.26% examples, 879207 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 22: training on 226493880 raw words (220061192 effective words) took 247.2s, 890199 effective words/s


Loss after epoch 22:42389964.0


INFO:gensim.models.word2vec:EPOCH 23 - PROGRESS: at 0.60% examples, 1309848 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 23 - PROGRESS: at 50.56% examples, 920023 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 23 - PROGRESS: at 97.82% examples, 893390 words/s, in_qsize 9, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 23: training on 226493880 raw words (220060715 effective words) took 244.6s, 899777 effective words/s


Loss after epoch 23:42439336.0


INFO:gensim.models.word2vec:EPOCH 24 - PROGRESS: at 0.58% examples, 1275790 words/s, in_qsize 9, out_qsize 2
INFO:gensim.models.word2vec:EPOCH 24 - PROGRESS: at 46.48% examples, 813504 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 24 - PROGRESS: at 92.07% examples, 808713 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 24: training on 226493880 raw words (220062316 effective words) took 264.2s, 833034 effective words/s


Loss after epoch 24:42179860.0


INFO:gensim.models.word2vec:EPOCH 25 - PROGRESS: at 0.58% examples, 1275888 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 25 - PROGRESS: at 52.30% examples, 951735 words/s, in_qsize 10, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 25 - PROGRESS: at 99.93% examples, 912374 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 25: training on 226493880 raw words (220062232 effective words) took 241.1s, 912718 effective words/s


Loss after epoch 25:41902724.0


INFO:gensim.models.word2vec:EPOCH 26 - PROGRESS: at 0.57% examples, 1246414 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 26 - PROGRESS: at 56.90% examples, 1035495 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 26: training on 226493880 raw words (220061982 effective words) took 225.0s, 978055 effective words/s


Loss after epoch 26:41855820.0


INFO:gensim.models.word2vec:EPOCH 27 - PROGRESS: at 0.58% examples, 1261506 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 27 - PROGRESS: at 50.54% examples, 919566 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 27: training on 226493880 raw words (220060632 effective words) took 225.3s, 976904 effective words/s


Loss after epoch 27:41842668.0


INFO:gensim.models.word2vec:EPOCH 28 - PROGRESS: at 0.60% examples, 1299587 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 28 - PROGRESS: at 55.18% examples, 1002824 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 28: training on 226493880 raw words (220061776 effective words) took 211.8s, 1039046 effective words/s


Loss after epoch 28:41812468.0


INFO:gensim.models.word2vec:EPOCH 29 - PROGRESS: at 0.50% examples, 116976 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 29 - PROGRESS: at 58.96% examples, 1003941 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 29: training on 226493880 raw words (220061505 effective words) took 220.1s, 999767 effective words/s


Loss after epoch 29:41615648.0


INFO:gensim.models.word2vec:EPOCH 30 - PROGRESS: at 0.59% examples, 1302219 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 30 - PROGRESS: at 54.70% examples, 995179 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 30: training on 226493880 raw words (220062507 effective words) took 218.4s, 1007727 effective words/s


Loss after epoch 30:41400372.0


INFO:gensim.models.word2vec:EPOCH 31 - PROGRESS: at 0.59% examples, 1294298 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 31 - PROGRESS: at 57.63% examples, 1008364 words/s, in_qsize 8, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 31: training on 226493880 raw words (220059280 effective words) took 212.3s, 1036526 effective words/s


Loss after epoch 31:41358844.0


INFO:gensim.models.word2vec:EPOCH 32 - PROGRESS: at 0.58% examples, 1271148 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 32 - PROGRESS: at 52.10% examples, 948235 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 32: training on 226493880 raw words (220061779 effective words) took 227.2s, 968598 effective words/s


Loss after epoch 32:41316156.0


INFO:gensim.models.word2vec:EPOCH 33 - PROGRESS: at 0.60% examples, 1320115 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 33 - PROGRESS: at 56.86% examples, 1001525 words/s, in_qsize 0, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 33: training on 226493880 raw words (220060063 effective words) took 214.2s, 1027160 effective words/s


Loss after epoch 33:41630256.0


INFO:gensim.models.word2vec:EPOCH 34 - PROGRESS: at 0.59% examples, 1287644 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 34 - PROGRESS: at 52.80% examples, 960586 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 34: training on 226493880 raw words (220061676 effective words) took 223.0s, 986976 effective words/s


Loss after epoch 34:41005404.0


INFO:gensim.models.word2vec:EPOCH 35 - PROGRESS: at 0.56% examples, 1223993 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 35 - PROGRESS: at 53.73% examples, 975067 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 35: training on 226493880 raw words (220060685 effective words) took 223.5s, 984486 effective words/s


Loss after epoch 35:40615628.0


INFO:gensim.models.word2vec:EPOCH 36 - PROGRESS: at 0.58% examples, 1259395 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 36 - PROGRESS: at 58.32% examples, 1061194 words/s, in_qsize 9, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 36: training on 226493880 raw words (220059501 effective words) took 215.8s, 1019680 effective words/s


Loss after epoch 36:40720424.0


INFO:gensim.models.word2vec:EPOCH 37 - PROGRESS: at 0.59% examples, 1277408 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 37 - PROGRESS: at 52.89% examples, 962173 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 37: training on 226493880 raw words (220059742 effective words) took 222.2s, 990218 effective words/s


Loss after epoch 37:40630364.0


INFO:gensim.models.word2vec:EPOCH 38 - PROGRESS: at 0.58% examples, 1263078 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 38 - PROGRESS: at 53.46% examples, 972729 words/s, in_qsize 7, out_qsize 2
INFO:gensim.models.word2vec:EPOCH 38: training on 226493880 raw words (220062869 effective words) took 234.2s, 939528 effective words/s


Loss after epoch 38:40324520.0


INFO:gensim.models.word2vec:EPOCH 39 - PROGRESS: at 0.58% examples, 1262008 words/s, in_qsize 8, out_qsize 3
INFO:gensim.models.word2vec:EPOCH 39 - PROGRESS: at 47.25% examples, 857253 words/s, in_qsize 10, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 39: training on 226493880 raw words (220061227 effective words) took 233.3s, 943228 effective words/s


Loss after epoch 39:39965872.0


INFO:gensim.models.word2vec:EPOCH 40 - PROGRESS: at 0.58% examples, 1256256 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 40 - PROGRESS: at 53.27% examples, 963383 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 40: training on 226493880 raw words (220063137 effective words) took 222.1s, 990625 effective words/s


Loss after epoch 40:39781376.0


INFO:gensim.models.word2vec:EPOCH 41 - PROGRESS: at 0.51% examples, 1113402 words/s, in_qsize 9, out_qsize 2
INFO:gensim.models.word2vec:EPOCH 41 - PROGRESS: at 57.75% examples, 1051164 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 41: training on 226493880 raw words (220061256 effective words) took 214.7s, 1024903 effective words/s


Loss after epoch 41:39536580.0


INFO:gensim.models.word2vec:EPOCH 42 - PROGRESS: at 0.58% examples, 1250310 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 42 - PROGRESS: at 53.78% examples, 978446 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 42: training on 226493880 raw words (220059504 effective words) took 220.7s, 997316 effective words/s


Loss after epoch 42:39529876.0


INFO:gensim.models.word2vec:EPOCH 43 - PROGRESS: at 0.58% examples, 1269776 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 43 - PROGRESS: at 55.70% examples, 971662 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 43: training on 226493880 raw words (220060495 effective words) took 220.0s, 1000260 effective words/s


Loss after epoch 43:39245684.0


INFO:gensim.models.word2vec:EPOCH 44 - PROGRESS: at 0.58% examples, 1277479 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 44 - PROGRESS: at 47.86% examples, 871290 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 44: training on 226493880 raw words (220060363 effective words) took 235.8s, 933304 effective words/s


Loss after epoch 44:38712500.0


INFO:gensim.models.word2vec:EPOCH 45 - PROGRESS: at 0.57% examples, 1252417 words/s, in_qsize 9, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 45 - PROGRESS: at 52.25% examples, 915032 words/s, in_qsize 7, out_qsize 2
INFO:gensim.models.word2vec:EPOCH 45: training on 226493880 raw words (220058813 effective words) took 233.8s, 941074 effective words/s


Loss after epoch 45:38622580.0


INFO:gensim.models.word2vec:EPOCH 46 - PROGRESS: at 0.58% examples, 1271261 words/s, in_qsize 8, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 46 - PROGRESS: at 56.13% examples, 1021113 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 46: training on 226493880 raw words (220061384 effective words) took 223.9s, 982992 effective words/s


Loss after epoch 46:38158500.0


INFO:gensim.models.word2vec:EPOCH 47 - PROGRESS: at 0.58% examples, 1274743 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 47 - PROGRESS: at 49.69% examples, 904118 words/s, in_qsize 9, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 47: training on 226493880 raw words (220062300 effective words) took 235.1s, 936178 effective words/s


Loss after epoch 47:37891508.0


INFO:gensim.models.word2vec:EPOCH 48 - PROGRESS: at 0.60% examples, 1295986 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 48 - PROGRESS: at 51.80% examples, 942646 words/s, in_qsize 9, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 48: training on 226493880 raw words (220061289 effective words) took 229.5s, 959054 effective words/s


Loss after epoch 48:37483072.0


INFO:gensim.models.word2vec:EPOCH 49 - PROGRESS: at 0.55% examples, 116964 words/s, in_qsize 8, out_qsize 2
INFO:gensim.models.word2vec:EPOCH 49 - PROGRESS: at 53.21% examples, 899569 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 49: training on 226493880 raw words (220060630 effective words) took 242.2s, 908436 effective words/s
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training on 11324694000 raw words (11003047035 effective words) took 11771.5s, 934723 effective words/s', 'datetime': '2023-04-22T23:13:31.550670', 'gensim': '4.3.1', 'python': '3.11.3 | packaged by conda-forge | (main, Apr  6 2023, 09:05:00) [Clang 14.0.6 ]', 'platform': 'macOS-12.6-x86_64-i386-64bit', 'event': 'train'}


Loss after epoch 49:37343080.0


(11003047035, 11324694000)

In [87]:
model.save("word2vec-100-bel-cc100.model")

INFO:gensim.utils:Word2Vec lifecycle event {'fname_or_handle': 'word2vec-100-bel-cc100.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-04-23T01:10:15.447782', 'gensim': '4.3.1', 'python': '3.11.3 | packaged by conda-forge | (main, Apr  6 2023, 09:05:00) [Clang 14.0.6 ]', 'platform': 'macOS-12.6-x86_64-i386-64bit', 'event': 'saving'}
INFO:gensim.utils:storing np array 'vectors' to word2vec-100-bel-cc100.model.wv.vectors.npy
INFO:gensim.utils:storing np array 'syn1neg' to word2vec-100-bel-cc100.model.syn1neg.npy
INFO:gensim.utils:not storing attribute cum_table
INFO:gensim.utils:saved word2vec-100-bel-cc100.model


In [81]:
model.wv.most_similar('воўк', topn=20)

[('мядзведзь', 0.7393637895584106),
 ('звер', 0.7242729067802429),
 ('заяц', 0.7042849063873291),
 ('бык', 0.6875906586647034),
 ('муха', 0.6802892684936523),
 ('драпежнік', 0.6730962991714478),
 ('кабан', 0.6689190864562988),
 ('алень', 0.6683720350265503),
 ('выгаладаўся', 0.656315803527832),
 ('варона', 0.6556535363197327),
 ('змей', 0.6529685854911804),
 ('сабака', 0.6515656113624573),
 ('табун', 0.6455168724060059),
 ('драпежны', 0.6449723243713379),
 ('мураш', 0.6393277049064636),
 ('лісіца', 0.633025586605072),
 ('жаба', 0.6329237818717957),
 ('курапатка', 0.6244716644287109),
 ('вуж', 0.6240508556365967),
 ('конь', 0.6231964230537415)]

In [82]:
model.wv.index_to_key

['год',
 'беларускі',
 'чалавек',
 'беларусі',
 'час',
 'дзень',
 'больш',
 'мова',
 'новы',
 'беларусь',
 'вялікі',
 'краіна',
 'жыццё',
 'праца',
 'горад',
 'месца',
 'справа',
 'гісторыя',
 'рэспубліка',
 'слова',
 'кожны',
 'гульня',
 'дзіцё',
 'дзяржаўны',
 'сайт',
 'нацыянальны',
 'раён',
 'пытанне',
 'імя',
 'галоўны',
 'свет',
 'культура',
 'кніга',
 'старонка',
 'раз',
 'другі',
 'розны',
 'адзін',
 'арганізацыя',
 'школа',
 'апошні',
 'дом',
 'цэнтр',
 'матэрыял',
 'вынік',
 'правіць',
 'праект',
 'цяпер',
 'беларус',
 'права',
 'адукацыя',
 'вайна',
 'частка',
 'праграма',
 'дзяржава',
 'зямля',
 'шмат',
 'міжнародны',
 'сёння',
 'гад',
 'работа',
 'інфармацыя',
 'развіццё',
 'выбар',
 'няма',
 'лепшы',
 'народны',
 'наступны',
 'палітычны',
 'іншых',
 'праблема',
 'пачатак',
 'вёска',
 'народ',
 'кіраўнік',
 'мяжа',
 'стол',
 'літаратура',
 'артыкул',
 'бацька',
 'дзейнасць',
 'магчымасць',
 'выпадак',
 'дапамога',
 'бок',
 'вобласць',
 'асноўны',
 'выкарыстанне',
 'сябар',

In [37]:
model.wv.get_vecattr("чарадзей", "count")

1379