In [2]:
import os

import fasttext

# Directory that holds the embedding artefacts used by this notebook.
embeddings_dir = '../data/processed/embeddings/'

# Load the fastText binary model (the file name suggests skip-gram, 100 dimensions).
kbd_model_path = os.path.join(embeddings_dir, 'fasttext_skipgram_kbd_100.bin')
kbd_model = fasttext.load_model(kbd_model_path)



In [3]:
from gensim.models import KeyedVectors

# Keyed vectors restricted to the glossary vocabulary; stored next to the fastText model.
kbd_restricted_glossary_path = os.path.join(
    embeddings_dir, 'fasttext_skipgram_kbd_100_restricted_glossary.kv')
kbd_restricted_glossary_kv = KeyedVectors.load(kbd_restricted_glossary_path)

In [4]:
from tqdm import tqdm
import pandas as pd

token_dist_dir = '../data/processed/token_distribution/'
word_freq_dir = '../data/processed/word_freqs/'

# Words must occur strictly more often than this to be kept.
MIN_WORD_FREQ = 5

# Stem candidates per word, and corpus word frequencies.
# (Intermediate `.head()` previews were dropped: only the last expression of a
# cell is rendered, so they had no visible effect.)
stem_df = pd.read_csv(os.path.join(token_dist_dir, 'stem_candidates.csv.gz'), compression='gzip')
word_freq = pd.read_csv(os.path.join(word_freq_dir, 'freq_1000000_oshhamaho.csv'))

# Keep only words present in both tables, then drop the rare ones.
df = pd.merge(stem_df, word_freq, on='word', how='inner')
df = df[df['freq'] > MIN_WORD_FREQ]
df.head(n=100)

Unnamed: 0,stem_ng_len,stem,word_ng_len,word,template,tokens,stem_is_full_word,freq
1,1,-Джэрий,2,Кърым-Джэрий,Кърым???????,Кърым|-Джэрий,False,11
2,1,Кърым,2,Кърым-Джэрий,?????-Джэрий,Кърым|-Джэрий,False,11
3,1,-Джэрий,2,Хъан-Джэрий,Хъан???????,Хъан|-Джэрий,False,97
4,1,Хъан,2,Хъан-Джэрий,????-Джэрий,Хъан|-Джэрий,False,97
7,1,-Джэрий,3,Адэл-Джэрий,Адэл???????,Адэ|л|-Джэрий,False,7
...,...,...,...,...,...,...,...,...
371,4,Муртаз,6,Муртаз-пэщэр,??????-пэщэр,Му|рт|а|з|-пэщэ|р,False,8
558,1,0-рэ,2,110-рэ,11????,11|0-рэ,False,6
559,1,0-рэ,2,120-рэ,12????,12|0-рэ,False,10
562,1,0-рэ,2,150-рэ,15????,15|0-рэ,False,34


In [5]:
import ahocorasick

# Every word of the filtered table (duplicates included: a word appears once per stem candidate).
words = df['word'].tolist()

# Index the words in an Aho-Corasick automaton so that known prefixes of a word
# can be listed later. The stored value is the word's position in the sorted list.
trie = ahocorasick.Automaton(key_type="str", value_type="int")
# `word_id` rather than `id`: a top-level loop variable named `id` would replace
# the builtin `id` for every later cell of the notebook.
for word_id, word in enumerate(sorted(words)):
    trie.add_word(word, word_id)

trie.make_automaton()

In [6]:
def slice_word_suffix(word, suffix_n=1):
    """Return a known word that is a shorter prefix of ``word``.

    Lists the keys of the module-level ``trie`` that match ``word`` under
    ``MATCH_AT_MOST_PREFIX`` (with ``'?'`` as the wildcard character) and picks
    the entry located ``suffix_n`` positions before the last one.

    NOTE(review): this relies on ``trie.keys`` yielding shorter keys first, so
    that the last entry is the longest match -- confirm against pyahocorasick.

    :param word: word whose known prefixes are looked up.
    :param suffix_n: how many entries to step back from the last match.
    :return: the selected key, or ``None`` when the listing is too short.
    """
    known_prefixes = list(trie.keys(word, '?', ahocorasick.MATCH_AT_MOST_PREFIX))
    try:
        shortened = known_prefixes[-(suffix_n + 1)]
    except IndexError:
        shortened = None
    return shortened


print(slice_word_suffix('цIыхубзхэри'))

цIыхубзхэр


In [7]:
# Unique stem candidates. Despite the name this is a set, not a list.
stem_list = set(df['stem'].tolist())


def choose_canonical_by_vector(word):
    """Return the glossary entry nearest to ``word`` in embedding space.

    The word is embedded with ``kbd_model.get_word_vector`` and the single most
    similar key of ``kbd_restricted_glossary_kv`` is requested.

    :param word: word (or stem) to look up.
    :return: ``(canonical, similarity)`` for the top match.
    """
    nearest = kbd_restricted_glossary_kv.similar_by_vector(
        kbd_model.get_word_vector(word), topn=1)
    canonical, similarity = nearest[0]
    return canonical, similarity


# stem -> (canonical, similarity), looked up through the embedding model.
stem_canonicals = {}
for stem in tqdm(stem_list):
    stem_canonicals[stem] = choose_canonical_by_vector(stem)

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
100%|██████████| 33571/33571 [00:36<00:00, 913.39it/s] 


In [8]:
from collections import defaultdict
import nltk

# Vocabulary of the restricted glossary embeddings.
restricted_glossary = set(kbd_restricted_glossary_kv.index_to_key)

# Inverted index: character trigram -> glossary words that contain it.
# A word that repeats a trigram is appended once per occurrence.
trigram_index = defaultdict(list)
for glossary_word in tqdm(restricted_glossary):
    for trigram in map(''.join, nltk.ngrams(glossary_word, 3)):
        trigram_index[trigram].append(glossary_word)

100%|██████████| 27268/27268 [00:00<00:00, 232059.95it/s]


In [9]:
from fuzzywuzzy import fuzz


def get_candidates_by_trigram(word):
    """Collect glossary words that share at least one character trigram with ``word``.

    :param word: string to look up; strings shorter than three characters
        produce no trigrams and therefore no candidates.
    :return: set of glossary words found in ``trigram_index``.
    """
    candidates = set()
    for trigram in nltk.ngrams(word, 3):
        # Use .get() rather than indexing: `trigram_index` is a defaultdict, and
        # indexing it with an unseen trigram would insert an empty list, so
        # every lookup would silently grow the index.
        candidates.update(trigram_index.get(''.join(trigram), ()))

    return candidates


def choose_canonical_by_distance(word):
    """Pick the glossary word most similar to ``word`` by ``fuzz.ratio``.

    Only glossary words sharing a trigram with ``word`` are compared (see
    ``get_candidates_by_trigram``).

    :param word: string to match, or ``None``.
    :return: ``(canonical, score)`` where ``score`` is the best ratio divided
        by 100; ``(None, 0)`` when ``word`` is ``None`` and ``(None, 0.0)``
        when no candidate reaches a ratio above zero.
    """
    if word is None:
        return None, 0

    canonical = None
    best_ratio = 0
    for candidate in get_candidates_by_trigram(word):
        ratio = fuzz.ratio(word, candidate)
        # Candidates arrive from a set of strings, whose iteration order depends
        # on string hashing and may change between kernel restarts. Equal ratios
        # are therefore resolved explicitly (smallest string wins), so the
        # result does not depend on that order.
        is_better = ratio > best_ratio
        is_tie_winner = (ratio == best_ratio
                         and canonical is not None
                         and candidate < canonical)
        if is_better or is_tie_winner:
            best_ratio = ratio
            canonical = candidate

    score = best_ratio / 100
    return canonical, score


print(choose_canonical_by_distance('цIыхубзхэр'))

('цIыхубз', 0.82)


In [10]:
# stem -> (canonical, score), chosen by string similarity over trigram candidates.
stem_canonicals_by_distance = {}

for stem in tqdm(stem_list):
    stem_canonicals_by_distance[stem] = tuple(choose_canonical_by_distance(stem))

100%|██████████| 33571/33571 [01:00<00:00, 552.05it/s]


In [11]:
# How many entries to step back in the known-prefix listing when stripping a suffix.
SUFFIX_DEPTHS = (1, 2, 3)

# word -> stem. NOTE: a word listed with several stem candidates keeps only the
# stem of its last row, because dictionary keys are unique.
word_stem = df.set_index('word')['stem'].to_dict()

canonicals = []
for word, stem in tqdm(word_stem.items()):
    # Stem-level results were precomputed; word-level ones are computed here.
    canonical_by_similar_stem, by_similar_stem_score = stem_canonicals[stem]
    canonical_by_similar_word, by_similar_word_score = choose_canonical_by_vector(word)
    canonical_by_stem_distance, by_stem_distance_score = stem_canonicals_by_distance[stem]
    canonical_by_word_distance, by_word_distance_score = choose_canonical_by_distance(word)

    record = {
        'word': word,
        'canonical_by_similar_word': canonical_by_similar_word,
        'canonical_by_similar_word_score': by_similar_word_score,
        'canonical_by_word_distance': canonical_by_word_distance,
        'canonical_by_word_distance_score': by_word_distance_score,

        'stem': stem,
        'canonical_by_similar_stem': canonical_by_similar_stem,
        'canonical_by_similar_stem_score': by_similar_stem_score,
        'canonical_by_stem_distance': canonical_by_stem_distance,
        'canonical_by_stem_distance_score': by_stem_distance_score,
    }

    # Same analysis for every suffix depth; keys are inserted in the order
    # slice / canonical / score so the resulting column order is unchanged.
    for suffix_n in SUFFIX_DEPTHS:
        sliced_word = slice_word_suffix(word, suffix_n)
        sliced_canonical, sliced_score = choose_canonical_by_distance(sliced_word)
        record[f'slice_suffix_{suffix_n}'] = sliced_word
        record[f'canonical_by_slice_suffix_{suffix_n}'] = sliced_canonical
        record[f'by_slice_suffix_{suffix_n}_score'] = sliced_score

    canonicals.append(record)

canonicals_df = pd.DataFrame(canonicals)

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
100%|██████████| 45159/45159 [06:23<00:00, 117.64it/s]


In [12]:
# Display the assembled comparison table (one row per word).
canonicals_df

Unnamed: 0,word,canonical_by_similar_word,canonical_by_similar_word_score,canonical_by_word_distance,canonical_by_word_distance_score,stem,canonical_by_similar_stem,canonical_by_similar_stem_score,canonical_by_stem_distance,canonical_by_stem_distance_score,slice_suffix_1,canonical_by_slice_suffix_1,by_slice_suffix_1_score,slice_suffix_2,canonical_by_slice_suffix_2,by_slice_suffix_2_score,slice_suffix_3,canonical_by_slice_suffix_3,by_slice_suffix_3_score
0,Кърым-Джэрий,дзэзешэ,0.684770,сымаджэрей,0.55,Кърым,хъаныгъуэ,0.757596,гурым,0.60,Кърым,гурым,0.60,,,0.0,,,0.0
1,Хъан-Джэрий,хъаний,0.612499,къанжэ,0.59,Хъан,хъаний,0.719508,къан,0.75,Хъан,къан,0.75,,,0.0,,,0.0
2,Адэл-Джэрий,хъаний,0.653761,пыджэрей,0.53,Адэл,ажокъуэ,0.622523,дэлэн,0.67,,,0.00,,,0.0,,,0.0
3,Дал-Джэрий,хъаний,0.649661,дарий,0.53,-Джэрий,дзэзешэ,0.680924,жэр,0.60,,,0.00,,,0.0,,,0.0
4,Къаз-Джэрий,очерк,0.584734,дарий,0.50,Къаз,шэмпIаул,0.628964,къаз,0.75,Къаз,къаз,0.75,,,0.0,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45154,къызэщIэващ,къызэщIэвэн,0.893544,къызэщIэвэн,0.82,къызэщIэ,къызэщIэчын,0.900754,къызэщIэчын,0.84,,,0.00,,,0.0,,,0.0
45155,Ноябрь,ноябрь,0.937793,ноябрь,0.83,ябрь,октябрь,0.899754,ноябрь,0.80,,,0.00,,,0.0,,,0.0
45156,ноябрь,ноябрь,1.000000,ноябрь,1.00,ябрь,октябрь,0.899754,ноябрь,0.80,,,0.00,,,0.0,,,0.0
45157,октябрь,октябрь,1.000000,октябрь,1.00,ябрь,октябрь,0.899754,ноябрь,0.80,,,0.00,,,0.0,,,0.0


In [13]:
# Every column whose name contains 'score', in table order.
similarity_columns = [column for column in canonicals_df.columns if 'score' in column]
similarity_columns

['canonical_by_similar_word_score',
 'canonical_by_word_distance_score',
 'canonical_by_similar_stem_score',
 'canonical_by_stem_distance_score',
 'by_slice_suffix_1_score',
 'by_slice_suffix_2_score',
 'by_slice_suffix_3_score']

In [14]:
# Scores that must all exceed the threshold for a row to be kept.
# 'by_slice_suffix_3_score' is intentionally left out -- presumably because it
# is 0.0 for most rows, which would reject nearly everything.
score_columns = [
    'canonical_by_similar_word_score',
    'canonical_by_word_distance_score',
    'canonical_by_similar_stem_score',
    'canonical_by_stem_distance_score',
    'by_slice_suffix_1_score',
    'by_slice_suffix_2_score',
]

# Row-wise mask: True where every selected score is strictly above 0.7.
passes_all = (canonicals_df[score_columns] > 0.7).all(axis=1)
filtered_df = canonicals_df[passes_all]
filtered_df

Unnamed: 0,word,canonical_by_similar_word,canonical_by_similar_word_score,canonical_by_word_distance,canonical_by_word_distance_score,stem,canonical_by_similar_stem,canonical_by_similar_stem_score,canonical_by_stem_distance,canonical_by_stem_distance_score,slice_suffix_1,canonical_by_slice_suffix_1,by_slice_suffix_1_score,slice_suffix_2,canonical_by_slice_suffix_2,by_slice_suffix_2_score,slice_suffix_3,canonical_by_slice_suffix_3,by_slice_suffix_3_score
61,къикIынIауэ,къикIиикIын,0.742450,къикIыкIа,0.80,къикIын,къикIын,1.000000,къикIын,1.00,къикIын,къикIын,1.00,къикI,къикIэн,0.83,,,0.0
103,IуащхьэкIэ,Iуащхьэ,0.887017,Iуащхьэкхъэ,0.86,хьэкIэ,хьэкIэ,1.000000,хьэкIэ,1.00,Iуащхьэ,Iуащхьэ,1.00,Iуащ,тIуащIэ,0.73,,,0.0
104,Iуащхьэм,Iуащхьэ,0.935691,Iуащхьэ,0.93,Iуащхьэм,Iуащхьэ,0.935691,Iуащхьэ,0.93,Iуащхьэ,Iуащхьэ,1.00,Iуащ,тIуащIэ,0.73,,,0.0
105,Iуащхьэр,Iуащхьэ,0.920755,Iуащхьэ,0.93,Iуащхь,Iуащхьэ,0.933379,Iуащхьэ,0.92,Iуащхьэ,Iуащхьэ,1.00,Iуащ,тIуащIэ,0.73,,,0.0
109,къэIуащ,макъ,0.747449,къэуал,0.77,эIуащ,ащ,0.763757,хэIуба,0.73,къэIуа,къэуал,0.83,къэIу,къэIун,0.91,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45061,ящIэну,пщIэну,0.731455,пщIэну,0.83,ящIэну,пщIэну,0.731455,пщIэну,0.83,ящIэн,щIэн,0.89,ящIэ,щIэ,0.86,,,0.0
45074,гъащIэрэ,гъащIэ,0.720500,гъащIэ,0.86,гъащIэрэ,гъащIэ,0.720500,гъащIэ,0.86,гъащIэр,гъащIэ,0.92,гъащ,гъащIэ,0.80,,,0.0
45134,хьэлъэу,хьэлъэу,1.000000,хьэлъэу,1.00,хьэлъэу,хьэлъэу,1.000000,хьэлъэу,1.00,хьэлъэ,хьэлъэ,1.00,хьэл,хьэл,1.00,,,0.0
45135,хьэлъэщ,хьэлъэ,0.840050,хьэлъэ,0.92,хьэлъэ,хьэлъэ,1.000000,хьэлъэ,1.00,хьэлъэ,хьэлъэ,1.00,хьэл,хьэл,1.00,,,0.0


In [18]:
# Rows where the embedding-based and the string-similarity-based word lookups agree.
# NOTE: the stored output also shows the most_common* columns, which are added
# by a cell further down, so this cell was last executed out of order.
lookups_agree = canonicals_df['canonical_by_similar_word'].eq(
    canonicals_df['canonical_by_word_distance'])
canonicals_df[lookups_agree]

Unnamed: 0,word,canonical_by_similar_word,canonical_by_similar_word_score,canonical_by_word_distance,canonical_by_word_distance_score,stem,canonical_by_similar_stem,canonical_by_similar_stem_score,canonical_by_stem_distance,canonical_by_stem_distance_score,...,by_slice_suffix_1_score,slice_suffix_2,canonical_by_slice_suffix_2,by_slice_suffix_2_score,slice_suffix_3,canonical_by_slice_suffix_3,by_slice_suffix_3_score,most_common,most_common_value,most_common_count
18,Рашид-пэщэ,зипэщэ,0.807602,зипэщэ,0.62,Рашид,мурид,0.624938,машинэ,0.55,...,0.55,,,0.00,,,0.0,"(зипэщэ, 2)",зипэщэ,2
25,Рашид-пэщэр,зипэщэ,0.770130,зипэщэ,0.59,Рашид,мурид,0.624938,машинэ,0.55,...,0.62,Рашид,машинэ,0.55,,,0.0,"(зипэщэ, 3)",зипэщэ,3
27,Муртаз-пэщэм,зипэщэ,0.771052,зипэщэ,0.56,Муртаз,Iушэнашэ,0.625758,Iустаз,0.67,...,0.67,,,0.00,,,0.0,"(зипэщэ, 2)",зипэщэ,2
28,Муртаз-пэщэр,зипэщэ,0.790668,зипэщэ,0.56,Муртаз,Iушэнашэ,0.625758,Iустаз,0.67,...,0.67,,,0.00,,,0.0,"(зипэщэ, 2)",зипэщэ,2
45,иIамэ,пижамэ,0.722205,пижамэ,0.73,Iамэ,пижамэ,0.827825,Iэрамэ,0.80,...,0.00,,,0.00,,,0.0,"(пижамэ, 3)",пижамэ,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45154,къызэщIэващ,къызэщIэвэн,0.893544,къызэщIэвэн,0.82,къызэщIэ,къызэщIэчын,0.900754,къызэщIэчын,0.84,...,0.00,,,0.00,,,0.0,"(къызэщIэвэн, 2)",къызэщIэвэн,2
45155,Ноябрь,ноябрь,0.937793,ноябрь,0.83,ябрь,октябрь,0.899754,ноябрь,0.80,...,0.00,,,0.00,,,0.0,"(ноябрь, 3)",ноябрь,3
45156,ноябрь,ноябрь,1.000000,ноябрь,1.00,ябрь,октябрь,0.899754,ноябрь,0.80,...,0.00,,,0.00,,,0.0,"(ноябрь, 3)",ноябрь,3
45157,октябрь,октябрь,1.000000,октябрь,1.00,ябрь,октябрь,0.899754,ноябрь,0.80,...,0.00,,,0.00,,,0.0,"(октябрь, 3)",октябрь,3


In [15]:
# Order rows by the depth-3 canonical *string* (descending), not by its score.
canonicals_df.sort_values(by='canonical_by_slice_suffix_3', ascending=False)

Unnamed: 0,word,canonical_by_similar_word,canonical_by_similar_word_score,canonical_by_word_distance,canonical_by_word_distance_score,stem,canonical_by_similar_stem,canonical_by_similar_stem_score,canonical_by_stem_distance,canonical_by_stem_distance_score,slice_suffix_1,canonical_by_slice_suffix_1,by_slice_suffix_1_score,slice_suffix_2,canonical_by_slice_suffix_2,by_slice_suffix_2_score,slice_suffix_3,canonical_by_slice_suffix_3,by_slice_suffix_3_score
10949,ягъэуващ,гъэувын,0.664529,ягъэмащIэ,0.71,уващ,ащ,0.729255,ващIэ,0.67,ягъэува,гъэхуа,0.77,ягъэув,ягъэ,0.80,ягъэу,ягъэ,0.89
10927,ягъэувауэ,гъэувын,0.646868,гъэхуауэ,0.82,увауэ,шауэ-вауэ,0.830079,гувауэ,0.91,ягъэува,гъэхуа,0.77,ягъэув,ягъэ,0.80,ягъэу,ягъэ,0.89
4305,ягъэуват,гъэувын,0.667262,къэуат,0.71,ягъэува,гъэувын,0.664366,гъэхуа,0.77,ягъэува,гъэхуа,0.77,ягъэув,ягъэ,0.80,ягъэу,ягъэ,0.89
37906,ящыщтэкъыми,хьэрткъурт,0.573268,сыткъым,0.67,щыщтэкъ,хьэрткъурт,0.689346,щыщтэн,0.77,ящыщтэкъым,сыткъым,0.71,ящыщт,щыщ,0.75,ящыщ,щыщ,0.86
10750,щыуагъэшхуэщ,пIэигъэщ,0.656307,щыуагъэншэ,0.82,шхуэщ,уэщ,0.799335,шхуэ,0.89,щыуагъэшхуэ,щыуагъэншэ,0.86,щыуагъэ,щыуагъэ,1.00,щыуа,щыу,0.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45154,къызэщIэващ,къызэщIэвэн,0.893544,къызэщIэвэн,0.82,къызэщIэ,къызэщIэчын,0.900754,къызэщIэчын,0.84,,,0.00,,,0.00,,,0.00
45155,Ноябрь,ноябрь,0.937793,ноябрь,0.83,ябрь,октябрь,0.899754,ноябрь,0.80,,,0.00,,,0.00,,,0.00
45156,ноябрь,ноябрь,1.000000,ноябрь,1.00,ябрь,октябрь,0.899754,ноябрь,0.80,,,0.00,,,0.00,,,0.00
45157,октябрь,октябрь,1.000000,октябрь,1.00,ябрь,октябрь,0.899754,ноябрь,0.80,,,0.00,,,0.00,,,0.00


In [16]:
from collections import Counter


# Columns whose values are tallied per row by `most_common`.
text_columns = [
    # Raw suffix-stripped forms of the word.
    'slice_suffix_1',
    'slice_suffix_2',
    'slice_suffix_3',

    # Canonical forms proposed by each lookup strategy.
    'canonical_by_similar_word',
    'canonical_by_word_distance',
    'canonical_by_similar_stem',
    'canonical_by_stem_distance',
    'canonical_by_slice_suffix_1',
    'canonical_by_slice_suffix_2',
    'canonical_by_slice_suffix_3',
]

def most_common(row):
    """Return the ``(value, count)`` pair of the most frequent value in ``row``.

    ``None`` is ranked as if it occurred zero times, so any other value beats
    it; its real count is still what gets returned if it is the only value.
    Ties go to the value encountered first.

    :param row: iterable of hashable values.
    :return: ``(value, count)`` tuple.
    """
    tally = Counter(row)

    def rank(entry):
        value, count = entry
        return 0 if value is None else count

    return max(tally.items(), key=rank)

# Per-row winner over the text columns, stored as a (value, count) tuple and
# also split into two separate columns.
row_winners = canonicals_df[text_columns].apply(most_common, axis=1)
canonicals_df['most_common'] = row_winners
canonicals_df['most_common_value'] = row_winners.map(lambda pair: pair[0])
canonicals_df['most_common_count'] = row_winners.map(lambda pair: pair[1])
canonicals_df.sort_values(by=['most_common_value', 'most_common_count'], ascending=False)

Unnamed: 0,word,canonical_by_similar_word,canonical_by_similar_word_score,canonical_by_word_distance,canonical_by_word_distance_score,stem,canonical_by_similar_stem,canonical_by_similar_stem_score,canonical_by_stem_distance,canonical_by_stem_distance_score,...,by_slice_suffix_1_score,slice_suffix_2,canonical_by_slice_suffix_2,by_slice_suffix_2_score,slice_suffix_3,canonical_by_slice_suffix_3,by_slice_suffix_3_score,most_common,most_common_value,most_common_count
39490,ящхьэщыт,щхьэщыIууду,0.742097,щхьэщытын,0.82,ящхьэ,ужьгъащхьэ,0.647958,щхьэ,0.89,...,0.91,,,0.0,,,0.0,"(ящхьэщ, 1)",ящхьэщ,1
34347,яхэуэурэ,загъуэурэ,0.785432,хьэуэрэ,0.80,яхэуэ,цыхэуэ,0.652864,хьэуэ,0.80,...,0.73,,,0.0,,,0.0,"(яхэуэу, 1)",яхэуэу,1
36724,яхэслъэгъуащ,щыпэлъагъу,0.610907,хэлъэгъуэн,0.73,яхэслъ,Iэслъы,0.676150,яслъэ,0.73,...,0.86,,,0.0,,,0.0,"(яхэс, 1)",яхэс,1
29273,яхэлъкъым,шэч,0.719612,хьэлъкъ,0.75,лъкъым,ныIэкъым,0.771846,жылъкъы,0.77,...,0.73,,,0.0,,,0.0,"(яхэлъ, 1)",яхэлъ,1
36528,яхыхьэжащ,ащ,0.641398,хыхьэжын,0.71,яхыхь,яхтэ,0.702230,тхыхь,0.80,...,0.83,,,0.0,,,0.0,"(яхыхьэ, 1)",яхыхьэ,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13584,Iейуэ,Iейуэ,1.000000,Iейуэ,1.00,Iейуэ,Iейуэ,1.000000,Iейуэ,1.00,...,0.00,,,0.0,,,0.0,"(Iейуэ, 4)",Iейуэ,4
13581,Iеижщ,Iеижь,0.702886,Iеижь,0.80,Iеиж,нэхъеиж,0.727500,Iеижь,0.89,...,0.89,,,0.0,,,0.0,"(Iеижь, 4)",Iеижь,4
40910,фIеижь,Iеижь,0.856398,Iеижь,0.91,фIеи,фIеипс,0.789010,уфIеин,0.80,...,0.00,,,0.0,,,0.0,"(Iеижь, 2)",Iеижь,2
59,яIауэ,пауэ,0.769622,Iау,0.75,Iауэ,шкIауэ,0.822226,Iау,0.86,...,0.00,,,0.0,,,0.0,"(Iау, 2)",Iау,2
