In [13]:
import nltk
import pandas as pd

In [14]:
import ahocorasick

# Load the word-frequency table and keep only the rows whose 'freq' value is
# greater than 1; the surviving 'word' values form the vocabulary.
word_freq_df = pd.read_csv('../data/processed/word_freqs/freq_1000000_oshhamaho.csv')
word_freq_df = word_freq_df[word_freq_df['freq'] > 1]
words = set(word_freq_df['word'].values)

# Index the vocabulary in an ahocorasick automaton. The words are sorted first
# so that the integer stored with each word is the same on every run.
# The counter is named `word_id` (not `id`) so the builtin `id` is not shadowed
# in the notebook's global namespace.
trie = ahocorasick.Automaton(key_type="str", value_type="int")
for word_id, word in enumerate(sorted(words)):
    trie.add_word(word, word_id)

trie.make_automaton()


def slice_word_suffix(word, min_len=3):
    """Return the trie keys matched for ``word`` that have at least ``min_len`` characters.

    The module-level ``trie`` is queried through ``trie.keys`` with ``'?'`` as
    the wildcard character and ``ahocorasick.MATCH_AT_MOST_PREFIX`` as the
    matching mode; matches shorter than ``min_len`` are discarded.

    NOTE(review): despite the "suffix" in the name, the sample output recorded
    in this notebook shows leading parts of the word (e.g. 'хэмылъу' -> 'хэм',
    'хэмылъ') - confirm against the ahocorasick documentation.

    Args:
        word: the string to look up.
        min_len: minimum length a match must have to be returned.

    Returns:
        A list of matched keys, in the order the trie yields them.
    """
    long_enough = []
    for candidate in trie.keys(word, '?', ahocorasick.MATCH_AT_MOST_PREFIX):
        if len(candidate) < min_len:
            continue
        long_enough.append(candidate)
    return long_enough


In [31]:
import math
from tqdm import tqdm


class Replacer:
    """Generate sentence variants by swapping words for their known shorter forms.

    For every vocabulary word the candidate replacements are precomputed with
    ``slice_word_suffix`` and stored in ``word_slice_map``. ``replace`` then
    produces one tokenised variant of a sentence per (word, candidate) pair.
    """

    def __init__(self, words=words):
        """Precompute the replacement candidates for every word in ``words``.

        Args:
            words: iterable of vocabulary words. The default is the
                module-level ``words`` object as bound when this class
                statement was executed.
        """
        self.word_slice_map = {word: slice_word_suffix(word) for word in words}

    def replace_by_slice(self, line):
        """Return the set of token tuples obtained by replacing one word form.

        The sentence is tokenised once with ``nltk.word_tokenize``. For each
        distinct token that has candidates in ``word_slice_map``, every
        candidate different from the token yields one variant in which all
        occurrences of that token are replaced. Tokens of length 1 are
        dropped from the resulting tuples.

        Args:
            line: the sentence as a single string.

        Returns:
            A set of tuples of token strings (empty when nothing can be
            replaced).
        """
        tokens = nltk.word_tokenize(line)
        variants = set()

        # dict.fromkeys de-duplicates the tokens while keeping their order;
        # a repeated word would only regenerate the same variants.
        for word in dict.fromkeys(tokens):
            for slice_word in self.word_slice_map.get(word, []):
                if slice_word == word:
                    continue

                # Swap whole tokens only. A plain ``line.replace(word, ...)``
                # would also rewrite every longer word that merely contains
                # ``word`` as a substring, corrupting unrelated tokens.
                # Working on the token list also avoids re-tokenising the
                # sentence for every variant.
                swapped = (slice_word if token == word else token for token in tokens)
                variants.add(tuple(token for token in swapped if len(token) > 1))

        return variants

    def replace(self, line):
        """Return all variants of ``line`` as a sorted list of token tuples.

        Args:
            line: the sentence as a single string.

        Returns:
            A list of token tuples in ascending order; empty when the sentence
            has no replaceable word.
        """
        return sorted(self.replace_by_slice(line))


# Build the replacer with its default vocabulary argument.
replacer = Replacer()

# Try it on one sentence; the cell output lists every generated variant.
replacer.replace(
    'Шэч хэмылъу , псэ мызагъэу , цIыхухэм яхуэпэжу , пщIэшхуэ къэзылэжьу дунейм тета'
)

[('Шэч',
  'хэм',
  'псэ',
  'мызагъэу',
  'цIыхухэм',
  'яхуэпэжу',
  'пщIэшхуэ',
  'къэзылэжьу',
  'дунейм',
  'тета'),
 ('Шэч',
  'хэмылъ',
  'псэ',
  'мызагъэу',
  'цIыхухэм',
  'яхуэпэжу',
  'пщIэшхуэ',
  'къэзылэжьу',
  'дунейм',
  'тета'),
 ('Шэч',
  'хэмылъу',
  'псэ',
  'мыз',
  'цIыхухэм',
  'яхуэпэжу',
  'пщIэшхуэ',
  'къэзылэжьу',
  'дунейм',
  'тета'),
 ('Шэч',
  'хэмылъу',
  'псэ',
  'мызагъэ',
  'цIыхухэм',
  'яхуэпэжу',
  'пщIэшхуэ',
  'къэзылэжьу',
  'дунейм',
  'тета'),
 ('Шэч',
  'хэмылъу',
  'псэ',
  'мызагъэу',
  'цIыху',
  'яхуэпэжу',
  'пщIэшхуэ',
  'къэзылэжьу',
  'дунейм',
  'тета'),
 ('Шэч',
  'хэмылъу',
  'псэ',
  'мызагъэу',
  'цIыхухэ',
  'яхуэпэжу',
  'пщIэшхуэ',
  'къэзылэжьу',
  'дунейм',
  'тета'),
 ('Шэч',
  'хэмылъу',
  'псэ',
  'мызагъэу',
  'цIыхухэм',
  'яху',
  'пщIэшхуэ',
  'къэзылэжьу',
  'дунейм',
  'тета'),
 ('Шэч',
  'хэмылъу',
  'псэ',
  'мызагъэу',
  'цIыхухэм',
  'яхуэпэжу',
  'пщIэ',
  'къэзылэжьу',
  'дунейм',
  'тета'),
 ('Шэч',
  'хэмы

In [32]:
class MyCorpus:
    """Re-iterable stream of augmented, tokenised sentences from a text file.

    The file is read once on construction, split into sentences with
    ``nltk.sent_tokenize``, de-duplicated and sorted. Iterating yields, for
    every source sentence, each variant returned by the module-level
    ``replacer.replace``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, corpus_path, max_iter=math.inf):
        """Load and sentence-split the corpus.

        Args:
            corpus_path: path of the text file to read.
            max_iter: maximum number of lines to yield per pass over the
                corpus (unlimited by default).
        """
        self.corpus_path = corpus_path
        # Number of lines yielded so far in the current pass.
        self.iter_count = 0
        self.max_iter_count = max_iter
        # NOTE(review): the file is opened with the platform default encoding;
        # consider passing an explicit encoding once the file's encoding is confirmed.
        with open(self.corpus_path) as f:
            text = f.read()
        self._sentences = sorted(set(nltk.sent_tokenize(text)))

        # The progress bar is created at the start of each pass (see __iter__),
        # so that a second pass gets a fresh bar instead of a spent one.
        self.pbar = None
        self.last_flushed = None

    def __iter__(self):
        """Yield augmented token tuples, at most ``max_iter`` of them per pass.

        Each call starts a new pass: the counter is reset and a new progress
        bar over the source sentences is created.
        """
        self.iter_count = 0
        self.pbar = tqdm(self._sentences)

        for line_1 in self.pbar:
            for line in replacer.replace(line_1):
                # Enforce the cap per yielded line; checking only between
                # source sentences (and with '>') let the pass overshoot it.
                if self.iter_count >= self.max_iter_count:
                    self.pbar.close()
                    return
                self.iter_count += 1
                yield line


In [33]:
# Materialise the whole augmented corpus as a list so that it can be handed
# to the model more than once (vocabulary build, then training).
corpus_path = '../data/processed/all_sentences.txt'
corpus = MyCorpus(corpus_path=corpus_path)
sentences = [*corpus]

  0%|          | 118/659164 [00:00<11:33, 950.12it/s]


In [47]:

from gensim.models.callbacks import CallbackAny2Vec


class MonitorCallback(CallbackAny2Vec):
    """Training callback that prints diagnostics at the end of every epoch.

    It prints the value of ``model.get_latest_training_loss()`` followed by
    the ``most_similar`` neighbours of each probe word.

    NOTE(review): the output recorded in this notebook shows "Model loss: 0.0";
    the loss may not be tracked for the model type used here - confirm.
    """

    def __init__(self, test_words):
        """Remember the probe words.

        Args:
            test_words: iterable of words whose neighbours are printed.
        """
        self._test_words = test_words

    def on_epoch_end(self, model):
        """Print the latest training loss and the neighbours of each probe word.

        Args:
            model: the model being trained, as passed in by the trainer.
        """
        latest_loss = model.get_latest_training_loss()
        print("Model loss:", latest_loss)
        # Printing the neighbours each epoch shows how the vectors drift.
        for probe in self._test_words:
            neighbours = model.wv.most_similar(probe)
            print(probe, neighbours)

In [48]:
import gensim.models

# Probe words whose nearest neighbours are printed after every epoch.
monitor = MonitorCallback([
    'бажэ', 'унэ', 'бадзэ', 'уафэ', 'къуэш', 'еджэн', 'шхын',
])

# Untrained FastText model; the vocabulary is built and the model trained in
# the following cells.
model = gensim.models.FastText(
    sg=1,  # skip-gram training (per the gensim documentation)
    vector_size=300,
    window=7,
    negative=5,
    workers=16,
)

In [49]:
# Build the model vocabulary from the materialised sentence list.
model.build_vocab(corpus_iterable=sentences)

In [50]:
# Train on the same sentence list the vocabulary was built from; `monitor`
# prints its diagnostics at the end of every epoch.
model.train(
    corpus_iterable=sentences,
    total_examples=len(sentences),
    epochs=5,
    compute_loss=True,
    callbacks=[monitor],
)

Model loss: 0.0
бажэ [('36', 0.43034833669662476), ('щIы', 0.37056341767311096), ('16', 0.28868475556373596), ('тхьэмадэ', 0.2575737237930298), ('зиIэ', 0.2286663055419922), ('армэм', 0.2186153680086136), ('щIыналъэм', 0.2045162320137024), ('12', 0.19825555384159088), ('нэрымылъагъу', 0.19641320407390594), ('щыщ', 0.19278453290462494)]
унэ [('36', 0.42949026823043823), ('щIы', 0.3687499761581421), ('16', 0.2886733412742615), ('тхьэмадэ', 0.2605752646923065), ('зиIэ', 0.22894659638404846), ('армэм', 0.21717408299446106), ('щIыналъэм', 0.2037258893251419), ('12', 0.19798758625984192), ('нэрымылъагъу', 0.19733895361423492), ('щыщ', 0.19463656842708588)]
бадзэ [('36', 0.43005821108818054), ('щIы', 0.3667234778404236), ('16', 0.293910413980484), ('тхьэмадэ', 0.25774285197257996), ('зиIэ', 0.23419977724552155), ('армэм', 0.20682595670223236), ('нэрымылъагъу', 0.20469065010547638), ('12', 0.20038467645645142), ('щIыналъэм', 0.1980055868625641), ('6.15', 0.1881464421749115)]
уафэ [('36', 0.429

(79572, 91185)

In [51]:
# Spot-check the trained vectors: nearest neighbours of a single word.
model.wv.similar_by_word('гупсысащ')

[('гупсысэр', 0.9730286002159119),
 ('гупсысэ', 0.9699869155883789),
 ('нэгъуэщIхэри', 0.9373893141746521),
 ('адыгэу', 0.9204935431480408),
 ('АдыгэщI', 0.9190553426742554),
 ('нэгъуэщI', 0.9183970093727112),
 ('тхьэмыщкIэм', 0.9181773066520691),
 ('нэгъуэщIу', 0.9131913781166077),
 ('Жэпуэгъуэм', 0.9109500050544739),
 ('Iэдиихур', 0.9086714386940002)]

In [52]:
import os

# Create the output directory if it does not exist yet, then persist the model.
os.makedirs('../data/processed/embeddings', exist_ok=True)
# NOTE(review): the file name says "word2vec" although `model` is created with
# gensim.models.FastText in this notebook - consider renaming the artefact.
model.save('../data/processed/embeddings/gensim_word2vec_1000000_oshhamaho.model')