In [2]:
import os

import fasttext

# Directory with the embedding artefacts, relative to the notebook location.
embeddings_dir = '../data/processed/embeddings/'

# Full fastText binary model; used later to embed arbitrary word forms.
kbd_model_path = os.path.join(embeddings_dir, 'fasttext_skipgram_kbd_300.bin')
kbd_model = fasttext.load_model(kbd_model_path)



In [3]:
from gensim.models import KeyedVectors

# Keyed vectors limited to the glossary vocabulary; later cells query it for
# nearest neighbours and read its keys via `index_to_key`.
kbd_restricted_glossary_path = os.path.join(
    embeddings_dir, 'fasttext_skipgram_kbd_300_restricted_glossary.kv')
kbd_restricted_glossary_kv = KeyedVectors.load(kbd_restricted_glossary_path)

In [16]:
from tqdm import tqdm
import pandas as pd

# Stem candidates per word (gzip-compressed CSV).
token_dist_dir = '../data/processed/token_distribution/'
stem_df = pd.read_csv(os.path.join(token_dist_dir, 'stem_candidates.csv.gz'), compression='gzip')

# Corpus word frequencies; joined to the stem candidates on the `word` column.
word_freq = pd.read_csv('../data/processed/word_freqs/freq_1000000_oshhamaho.csv')

# Words must occur strictly more often than this to be kept.
min_word_freq = 5

# Inner join: only words present in both tables survive.
df = pd.merge(stem_df, word_freq, on='word', how='inner')
df = df[df['freq'] > min_word_freq]

# Only the last expression of a cell is rendered, so the intermediate
# `.head()` previews (which were evaluated and thrown away) are gone.
df.head(n=100)

Unnamed: 0,stem_ng_len,stem,word_ng_len,word,template,tokens,stem_is_full_word,freq
2,1,-Джэрий,2,Кърым-Джэрий,Кърым???????,Кърым|-Джэрий,False,11
3,1,Кърым,2,Кърым-Джэрий,?????-Джэрий,Кърым|-Джэрий,False,11
4,1,-Джэрий,2,Хъан-Джэрий,Хъан???????,Хъан|-Джэрий,False,97
5,1,Хъан,2,Хъан-Джэрий,????-Джэрий,Хъан|-Джэрий,False,97
8,1,-Джэрий,3,Адэл-Джэрий,Адэл???????,Адэ|л|-Джэрий,False,7
...,...,...,...,...,...,...,...,...
401,2,рта,6,Муртаз-пэщэм,Му???з-пэщэм,Му|рт|а|з|-пэщэ|м,False,14
402,3,Мурта,6,Муртаз-пэщэм,?????з-пэщэм,Му|рт|а|з|-пэщэ|м,False,14
403,3,ртаз,6,Муртаз-пэщэм,Му????-пэщэм,Му|рт|а|з|-пэщэ|м,False,14
404,4,Муртаз,6,Муртаз-пэщэм,??????-пэщэм,Му|рт|а|з|-пэщэ|м,False,14


In [55]:
import ahocorasick

# `df` holds one row per (word, stem candidate), so `words` contains repeats.
words = df['word'].tolist()

# Automaton keyed by word; the stored integer is the word's position in the
# sorted (non-deduplicated) word list.
trie = ahocorasick.Automaton(key_type="str", value_type="int")
# The loop variable is `word_id`, not `id`: a module-level `id` would shadow
# the builtin `id()` for every later cell in this kernel.
for word_id, word in enumerate(sorted(words)):
    # NOTE(review): repeated words call add_word() again with a later index;
    # presumably this just overwrites the stored value -- confirm.
    trie.add_word(word, word_id)

trie.make_automaton()

In [70]:
def slice_word_suffix(word, suffix_n=1):
    """Return a shorter trie key that `word` starts with, or None.

    Queries the module-level `trie` for keys matching `word` with
    `MATCH_AT_MOST_PREFIX` ('?' is passed as the wildcard character) and
    picks the entry `suffix_n` positions before the last one.

    Args:
        word: the word form to shorten.
        suffix_n: how many entries to step back from the end of the match
            list (default 1).

    Returns:
        The selected key, or None when the match list is too short.
    """
    # NOTE(review): assumes the trie yields matches shortest-first, so the
    # last entry is the longest match -- confirm against pyahocorasick.
    known_prefixes = list(trie.keys(word, '?', ahocorasick.MATCH_AT_MOST_PREFIX))
    position = -suffix_n - 1
    if -len(known_prefixes) <= position < len(known_prefixes):
        return known_prefixes[position]
    return None


# Spot check on one sample word with the default suffix_n=1.
print(slice_word_suffix('цIыхубзхэри'))

цIыхубзхэр


In [49]:
# Unique stem candidates. Despite the name this is a set, not a list.
stem_list = set(df['stem'])


def choose_canonical_by_vector(word):
    """Find the glossary entry whose vector is closest to that of `word`.

    The vector for `word` comes from `kbd_model`; the single nearest entry is
    looked up in `kbd_restricted_glossary_kv`.

    Args:
        word: word form (or stem) to embed.

    Returns:
        `(canonical, similarity)` for the top-1 neighbour.
    """
    query_vector = kbd_model.get_word_vector(word)
    nearest = kbd_restricted_glossary_kv.similar_by_vector(query_vector, topn=1)
    best_key, best_similarity = nearest[0]
    return best_key, best_similarity


# Nearest glossary entry (by embedding) for every unique stem, computed once
# here so the per-word loop can look it up instead of recomputing.
stem_canonicals = {
    stem: choose_canonical_by_vector(stem)
    for stem in tqdm(stem_list)
}

100%|██████████| 33252/33252 [01:15<00:00, 440.87it/s]


In [50]:
from collections import defaultdict
import nltk

# Inverted index: character trigram -> glossary words containing it.
trigram_index = defaultdict(list)

restricted_glossary = set(kbd_restricted_glossary_kv.index_to_key)
for glossary_word in tqdm(restricted_glossary):
    # Every window of three consecutive characters; words shorter than three
    # characters produce no windows and are therefore not indexed.
    for start in range(len(glossary_word) - 2):
        trigram_index[glossary_word[start:start + 3]].append(glossary_word)

100%|██████████| 27268/27268 [00:00<00:00, 230719.21it/s]


In [72]:
from fuzzywuzzy import fuzz


def get_candidates_by_trigram(word):
    """Collect glossary words that share at least one character trigram with `word`.

    Args:
        word: the string to look up; it is split into consecutive
            three-character windows.

    Returns:
        A set of glossary words taken from `trigram_index`. Empty when `word`
        is shorter than three characters or none of its trigrams is indexed.
    """
    candidates = set()
    for trigram in nltk.ngrams(word, 3):
        # `trigram_index` is a defaultdict: indexing it with `[]` would insert
        # an empty list for every unseen trigram, so a pure lookup would keep
        # growing the shared index. `.get()` reads without inserting.
        candidates.update(trigram_index.get(''.join(trigram), ()))

    return candidates


def choose_canonical_by_distance(word):
    """Pick the glossary entry most similar to `word` according to `fuzz.ratio`.

    Candidates are the glossary words returned by
    `get_candidates_by_trigram(word)`.

    Args:
        word: the form to normalise, or None.

    Returns:
        `(canonical, score)` where `score` is the best ratio divided by 100.
        `(None, 0)` when `word` is None; `(None, 0.0)` when no candidate
        scores above zero.
    """
    if word is None:
        return None, 0

    canonical = None
    best_ratio = 0
    for candidate in get_candidates_by_trigram(word):
        ratio = fuzz.ratio(word, candidate)
        if ratio <= 0:
            continue
        # The candidates come as a set of strings, whose iteration order can
        # change between kernel runs (string hash randomisation). Breaking
        # ties on the lexicographically smallest candidate keeps the result
        # reproducible instead of "whichever tied candidate came first".
        if ratio > best_ratio or (ratio == best_ratio and candidate < canonical):
            best_ratio = ratio
            canonical = candidate

    score = best_ratio / 100
    return canonical, score


# Spot check: best glossary match and its score for one sample form.
print(choose_canonical_by_distance('цIыхубзхэр'))

('цIыхубэ', 0.82)


In [73]:
# Best string-similarity match for every unique stem, as (canonical, score).
stem_canonicals_by_distance = {
    stem: choose_canonical_by_distance(stem)
    for stem in tqdm(stem_list)
}

100%|██████████| 33252/33252 [00:53<00:00, 618.25it/s]


In [83]:
canonicals = []

# One stem per word. `df` can hold several rows for the same word; building a
# dict from the word-indexed series leaves a single stem for each word.
word_stem = df.set_index('word')['stem'].to_dict()

for word, stem in tqdm(word_stem.items()):
    # Embedding-based and string-similarity matches for the stem (precomputed)
    # and for the full word (computed here).
    stem_vec_canonical, stem_vec_score = stem_canonicals[stem]
    word_vec_canonical, word_vec_score = choose_canonical_by_vector(word)
    stem_dist_canonical, stem_dist_score = stem_canonicals_by_distance[stem]
    word_dist_canonical, word_dist_score = choose_canonical_by_distance(word)

    # Shorter trie prefixes of the word for depths 1..3, each with its own
    # string-similarity match; a missing prefix is None and matches (None, 0).
    sliced = {depth: slice_word_suffix(word, depth) for depth in (1, 2, 3)}
    sliced_match = {
        depth: choose_canonical_by_distance(form)
        for depth, form in sliced.items()
    }

    record = {
        'word': word,
        'canonical_by_similar_word': word_vec_canonical,
        'canonical_by_similar_word_score': word_vec_score,
        'canonical_by_word_distance': word_dist_canonical,
        'canonical_by_word_distance_score': word_dist_score,

        'stem': stem,
        'canonical_by_similar_stem': stem_vec_canonical,
        'canonical_by_similar_stem_score': stem_vec_score,
        'canonical_by_stem_distance': stem_dist_canonical,
        'canonical_by_stem_distance_score': stem_dist_score,

        'slice_suffix_1': sliced[1],
        'canonical_by_slice_suffix_1': sliced_match[1][0],
        'by_slice_suffix_1_score': sliced_match[1][1],

        'slice_suffix_2': sliced[2],
        'canonical_by_slice_suffix_2': sliced_match[2][0],
        'by_slice_suffix_2_score': sliced_match[2][1],

        'slice_suffix_3': sliced[3],
        'canonical_by_slice_suffix_3': sliced_match[3][0],
        'by_slice_suffix_3_score': sliced_match[3][1],
    }
    canonicals.append(record)

# Build the frame once from the list of records.
canonicals_df = pd.DataFrame(canonicals)

100%|██████████| 42388/42388 [05:58<00:00, 118.29it/s]


In [94]:
# Display the per-word table of canonical candidates and their scores.
canonicals_df

Unnamed: 0,word,canonical_by_similar_word,canonical_by_similar_word_score,canonical_by_word_distance,canonical_by_word_distance_score,stem,canonical_by_similar_stem,canonical_by_similar_stem_score,canonical_by_stem_distance,canonical_by_stem_distance_score,slice_suffix_1,canonical_by_slice_suffix_1,by_slice_suffix_1_score,slice_suffix_2,canonical_by_slice_suffix_2,by_slice_suffix_2_score,slice_suffix_3,canonical_by_slice_suffix_3,by_slice_suffix_3_score
0,Кърым-Джэрий,хъаний,0.593987,сымаджэрей,0.55,Кърым,хъанцэгу,0.560486,гурым,0.60,Кърым,гурым,0.60,,,0.0,,,0.0
1,Хъан-Джэрий,хъаний,0.621087,къанжэ,0.59,Хъан,хъан,0.720560,хъан,0.75,Хъан,хъан,0.75,,,0.0,,,0.0
2,Адэл-Джэрий,хьэрийкурий,0.611483,дэлэжьэн,0.53,Адэл,жонглёр,0.524843,дэлэл,0.67,,,0.00,,,0.0,,,0.0
3,Дал-Джэрий,уэлий,0.606554,парий,0.53,-Джэрий,хьэрийкурий,0.606495,жэр,0.60,,,0.00,,,0.0,,,0.0
4,Къаз-Джэрий,гербарий,0.518124,парий,0.50,Къаз,мэтэлыуан,0.497766,къаз,0.75,Къаз,къаз,0.75,,,0.0,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42383,тхьэмыщкIэри,тхьэмыщкIэ,0.844754,тхьэмыщкIэ,0.91,тхьэмыщкIэ,тхьэмыщкIэ,1.000000,тхьэмыщкIэ,1.00,тхьэмыщкIэр,тхьэмыщкIэ,0.95,тхьэмыщкIэ,тхьэмыщкIэ,1.0,,,0.0
42384,тхьэмыщкIэ…,тхьэмыщкIэ,0.955718,тхьэмыщкIэ,0.95,тхьэмыщкIэ,тхьэмыщкIэ,1.000000,тхьэмыщкIэ,1.00,тхьэмыщкIэ,тхьэмыщкIэ,1.00,,,0.0,,,0.0
42385,щIэтIысхьащ,щIэтIысхьэн,0.813814,щIэтIысхьэн,0.82,этIысхьащ,дэтIысхьэн,0.727135,дэтIысхьэн,0.74,щIэтIысхьа,щIэтIысхьэн,0.86,,,0.0,,,0.0
42386,дыщIэтIысхьащ,щIэтIысхьэн,0.735218,щIэтIысхьэн,0.75,этIысхьащ,дэтIысхьэн,0.727135,дэтIысхьэн,0.74,дыщIэт,щIэт,0.80,,,0.0,,,0.0


In [95]:
# Every column whose name contains "score".
similarity_columns = [
    column for column in canonicals_df.columns if 'score' in column
]
similarity_columns

['canonical_by_similar_word_score',
 'canonical_by_word_distance_score',
 'canonical_by_similar_stem_score',
 'canonical_by_stem_distance_score',
 'by_slice_suffix_1_score',
 'by_slice_suffix_2_score',
 'by_slice_suffix_3_score']

In [109]:
# Scores that must all clear the threshold. 'by_slice_suffix_3_score' is
# deliberately left out of this list.
score_columns = [
    'canonical_by_similar_word_score',
    'canonical_by_word_distance_score',
    'canonical_by_similar_stem_score',
    'canonical_by_stem_distance_score',
    'by_slice_suffix_1_score',
    'by_slice_suffix_2_score',
]

# Keep rows where every selected score is strictly greater than 0.7.
passes_threshold = (canonicals_df[score_columns] > 0.7).all(axis=1)
filtered_df = canonicals_df[passes_threshold]
filtered_df

Unnamed: 0,word,canonical_by_similar_word,canonical_by_similar_word_score,canonical_by_word_distance,canonical_by_word_distance_score,stem,canonical_by_similar_stem,canonical_by_similar_stem_score,canonical_by_stem_distance,canonical_by_stem_distance_score,slice_suffix_1,canonical_by_slice_suffix_1,by_slice_suffix_1_score,slice_suffix_2,canonical_by_slice_suffix_2,by_slice_suffix_2_score,slice_suffix_3,canonical_by_slice_suffix_3,by_slice_suffix_3_score,most_common
71,Iеижщ,Iеижь,0.752251,Iеижь,0.80,Iеиж,Iеижь,0.750832,Iеижь,0.89,Iеиж,Iеижь,0.89,Iеи,Iеин,0.86,,,0.0,Iеижь
84,Iувти,Iуву,0.756283,Iув,0.75,Iув,Iув,1.000000,Iув,1.00,Iувт,Iув,0.86,Iув,Iув,1.00,,,0.0,Iув
89,Iувщи,Iуб,0.718909,Iув,0.75,Iув,Iув,1.000000,Iув,1.00,Iувщ,Iув,0.86,Iув,Iув,1.00,,,0.0,Iув
113,Iущащэрт,бабыщкъурт,0.722349,Iущащэ,0.86,Iущащэ,Iущащэ,1.000000,Iущащэ,1.00,Iущащэр,Iущащэ,0.92,Iущащэ,Iущащэ,1.00,,,0.0,Iущащэ
115,Iущащэурэ,небэкъебэурэ,0.823464,Iущащэу,0.88,Iущащэ,Iущащэ,1.000000,Iущащэ,1.00,Iущащэу,Iущащэу,1.00,Iущащэ,Iущащэ,1.00,,,0.0,Iущащэ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42296,къыщIыхьэри,къыщIыхьэн,0.800958,къыщIыхьэн,0.86,къыщIыхьэ,къыщIыхьэн,0.894893,къыщIыхьэн,0.95,къыщIыхьэр,къыщIыхьэн,0.90,къыщIыхьэ,къыщIыхьэн,0.95,,,0.0,къыщIыхьэн
42299,къыщIыхьэрт,къыщIыхьэн,0.791699,къыщIыхьэн,0.86,къыщIыхьэ,къыщIыхьэн,0.894893,къыщIыхьэн,0.95,къыщIыхьэр,къыщIыхьэн,0.90,къыщIыхьэ,къыщIыхьэн,0.95,,,0.0,къыщIыхьэн
42346,илъагъуну,лъагъунлъагъу,0.785294,лъагъун,0.88,илъагъуну,лъагъунлъагъу,0.785294,лъагъун,0.88,илъагъун,лъагъун,0.93,илъа,ислъам,0.80,,,0.0,лъагъун
42373,къыщIыхьэурэ,къыщIыхьэн,0.773184,къыщIыхьэпIэ,0.83,къыщIыхьэ,къыщIыхьэн,0.894893,къыщIыхьэн,0.95,къыщIыхьэу,къыщIыхьэн,0.90,къыщIыхьэ,къыщIыхьэн,0.95,,,0.0,къыщIыхьэн


In [110]:
# View sorted by the suffix-3 canonical in descending order; the sorted copy
# is only displayed, canonicals_df itself is not reassigned.
canonicals_df.sort_values(by='canonical_by_slice_suffix_3', ascending=False)

Unnamed: 0,word,canonical_by_similar_word,canonical_by_similar_word_score,canonical_by_word_distance,canonical_by_word_distance_score,stem,canonical_by_similar_stem,canonical_by_similar_stem_score,canonical_by_stem_distance,canonical_by_stem_distance_score,slice_suffix_1,canonical_by_slice_suffix_1,by_slice_suffix_1_score,slice_suffix_2,canonical_by_slice_suffix_2,by_slice_suffix_2_score,slice_suffix_3,canonical_by_slice_suffix_3,by_slice_suffix_3_score,most_common
15153,ягъэуват,гъэувын,0.693881,гъэхуа,0.71,ягъэува,гъэувын,0.695505,гъэхуа,0.77,ягъэува,гъэхуа,0.77,ягъэув,ягъэ,0.80,ягъэу,ягъэ,0.89,гъэхуа
31339,ящыщтэкъыми,ныIэкъым,0.620658,сыткъым,0.67,щыщтэкъ,щыщтэн,0.578903,щыщтэн,0.77,ящыщтэкъым,сыткъым,0.71,ящыщт,щыщ,0.75,ящыщ,щыщ,0.86,сыткъым
31827,щыуагъэшхуэщ,щыуагъэ,0.589525,щыуагъэншэ,0.82,щыуагъэ,щыуагъэ,1.000000,щыуагъэ,1.00,щыуагъэшхуэ,щыуагъэншэ,0.86,щыуагъэ,щыуагъэ,1.00,щыуа,щыу,0.86,щыуагъэ
5723,щыгъынымрэ,щыгъыныжь,0.617490,щыгъыныкIэ,0.80,щыгъын,щыгъын,1.000000,щыгъын,1.00,щыгъыным,щыгъын,0.86,щыгъын,щыгъын,1.00,щыгъ,щыгъэ,0.89,щыгъын
5722,щыгъынымкIэ,щыгъыныкIэ,0.650964,щыгъыныкIэ,0.95,щыгъын,щыгъын,1.000000,щыгъын,1.00,щыгъыным,щыгъын,0.86,щыгъын,щыгъын,1.00,щыгъ,щыгъэ,0.89,щыгъын
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42383,тхьэмыщкIэри,тхьэмыщкIэ,0.844754,тхьэмыщкIэ,0.91,тхьэмыщкIэ,тхьэмыщкIэ,1.000000,тхьэмыщкIэ,1.00,тхьэмыщкIэр,тхьэмыщкIэ,0.95,тхьэмыщкIэ,тхьэмыщкIэ,1.00,,,0.00,тхьэмыщкIэ
42384,тхьэмыщкIэ…,тхьэмыщкIэ,0.955718,тхьэмыщкIэ,0.95,тхьэмыщкIэ,тхьэмыщкIэ,1.000000,тхьэмыщкIэ,1.00,тхьэмыщкIэ,тхьэмыщкIэ,1.00,,,0.00,,,0.00,тхьэмыщкIэ
42385,щIэтIысхьащ,щIэтIысхьэн,0.813814,щIэтIысхьэн,0.82,этIысхьащ,дэтIысхьэн,0.727135,дэтIысхьэн,0.74,щIэтIысхьа,щIэтIысхьэн,0.86,,,0.00,,,0.00,
42386,дыщIэтIысхьащ,щIэтIысхьэн,0.735218,щIэтIысхьэн,0.75,этIысхьащ,дэтIысхьэн,0.727135,дэтIысхьэн,0.74,дыщIэт,щIэт,0.80,,,0.00,,,0.00,


In [119]:
from collections import Counter


# String-valued columns that are fed row by row into the most-common vote.
text_columns = [
    # Shortened forms of the word itself.
    'slice_suffix_1',
    'slice_suffix_2',
    'slice_suffix_3',

    # Canonical candidates proposed by each matching strategy.
    'canonical_by_similar_word',
    'canonical_by_word_distance',
    'canonical_by_similar_stem',
    'canonical_by_stem_distance',
    'canonical_by_slice_suffix_1',
    'canonical_by_slice_suffix_2',
    'canonical_by_slice_suffix_3',
]

def most_common(row):
    """Return the most frequent non-missing value in `row` with its count.

    Args:
        row: iterable of hashable values (a DataFrame row when called through
            `DataFrame.apply(..., axis=1)`).

    Returns:
        A `(value, count)` tuple. Ties go to the value that appears first in
        `row`. Missing values (None, NaN, `pd.NA`) never beat a present value;
        if the row holds nothing but missing values, a missing value and its
        count are returned.

    Raises:
        ValueError: if `row` is empty.
    """
    def _is_missing(value):
        # pandas may represent an absent string as None, float NaN or pd.NA
        # depending on dtype, so a plain `is None` test is not enough.
        return (
            value is None
            or value is pd.NA
            or (isinstance(value, float) and value != value)
        )

    # Missing values are ranked with weight 0 so any real value wins.
    return max(
        Counter(row).items(),
        key=lambda item: 0 if _is_missing(item[0]) else item[1],
    )

# Row-wise vote over the text columns; each result is a (value, count) tuple.
most_common_pairs = canonicals_df[text_columns].apply(most_common, axis=1)

# Store the tuple and its two components as separate columns.
canonicals_df['most_common'] = most_common_pairs
canonicals_df['most_common_value'] = most_common_pairs.apply(lambda pair: pair[0])
canonicals_df['most_common_count'] = most_common_pairs.apply(lambda pair: pair[1])

canonicals_df.sort_values(
    by=['most_common_value', 'most_common_count'],
    ascending=False,
)

Unnamed: 0,word,canonical_by_similar_word,canonical_by_similar_word_score,canonical_by_word_distance,canonical_by_word_distance_score,stem,canonical_by_similar_stem,canonical_by_similar_stem_score,canonical_by_stem_distance,canonical_by_stem_distance_score,...,by_slice_suffix_1_score,slice_suffix_2,canonical_by_slice_suffix_2,by_slice_suffix_2_score,slice_suffix_3,canonical_by_slice_suffix_3,by_slice_suffix_3_score,most_common,most_common_value,most_common_count
30783,яшэжащ,ащ,0.702411,шэуэжа,0.67,яшэ,блашэ,0.622380,,0.0,...,0.0,,,0.0,,,0.0,"(яшэ, 1)",яшэ,1
30811,яшэрт,хьэрткъурт,0.758513,пшэр,0.67,яшэ,блашэ,0.622380,,0.0,...,0.0,,,0.0,,,0.0,"(яшэ, 1)",яшэ,1
30839,яшэурэ,дэрэурэ,0.865715,шэуэжа,0.67,яшэ,блашэ,0.622380,,0.0,...,0.0,,,0.0,,,0.0,"(яшэ, 1)",яшэ,1
30592,яшхауэ,шауэ,0.866231,шхахуэ,0.83,яшх,лыц,0.719494,,0.0,...,0.0,,,0.0,,,0.0,"(яшх, 1)",яшх,1
30602,яшхри,жюри,0.827310,дыхьэшхрилэ,0.50,яшх,лыц,0.719494,,0.0,...,0.0,,,0.0,,,0.0,"(яшх, 1)",яшх,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6146,1557,август,0.690190,,0.00,155,июнь,0.681028,,0.0,...,0.0,,,0.0,,,0.0,"(155, 1)",155,1
6147,1558,август,0.789203,,0.00,155,июнь,0.681028,,0.0,...,0.0,,,0.0,,,0.0,"(155, 1)",155,1
30,120-рэ,центнер,0.636866,,0.00,0-рэ,тонн,0.648594,,0.0,...,0.0,,,0.0,,,0.0,"(120, 1)",120,1
6138,120-м,нэблагъэ,0.700637,,0.00,120,пэ-,0.589752,,0.0,...,0.0,,,0.0,,,0.0,"(120, 1)",120,1
