In [1]:
import os
from tokenizers import Tokenizer

# Pre-trained tokenizer loaded from its serialized JSON file.
tokenizer_uni = Tokenizer.from_file('../data/processed/tokenizer/words_unigram_5000.tokenizer.json')

# The word list contains Cyrillic text, so the encoding is given explicitly instead of
# relying on the platform default (assumed UTF-8 -- TODO confirm against the file).
with open('../data/processed/word_freqs/freq_1000000_oshhamaho.txt', encoding='utf-8') as f:
    # One entry per line; skip empty entries such as the one a trailing newline produces.
    words = [line for line in f.read().split('\n') if line]


# Output directory for the artifacts built from the word list.
word_groups_dir = '../data/processed/word_groups'
os.makedirs(word_groups_dir, exist_ok=True)

In [2]:
from collections import defaultdict

# Inverted index: token string -> set of words whose tokenization contains that token.
tokenized_index = defaultdict(set)
for word in words:
    word_tokens = tokenizer_uni.encode(word).tokens
    for token in word_tokens:
        tokenized_index[token].add(word)

In [3]:
from fuzzywuzzy import fuzz

def search_in_tokenized_index(
    word_q, min_ratio=80, token_limit=0, word_limit_by_step=1000, tokenizer=tokenizer_uni
):
    """Find indexed words that are fuzzily similar to ``word_q``.

    The query is tokenized and the candidate set is narrowed by intersecting the
    ``tokenized_index`` entries of its tokens, taken in ascending token-id order.
    As soon as fewer than ``word_limit_by_step`` candidates remain, they are
    scored against the query with ``fuzz.ratio``.

    Args:
        word_q: The query word.
        min_ratio: Minimum ``fuzz.ratio`` score a candidate needs to be returned.
        token_limit: If non-zero, only the ``token_limit`` query tokens with the
            highest ids are used; ``0`` uses every token.
        word_limit_by_step: Candidates are scored only once the candidate set is
            smaller than this.
        tokenizer: Tokenizer used to split the query (presumably the same one the
            index was built with -- verify when passing a different tokenizer).

    Returns:
        A list of ``(word, ratio)`` tuples sorted by ratio, highest first. The
        list is empty when the query yields no tokens or the candidate set never
        drops below ``word_limit_by_step``.
    """
    # Encode once; ids and tokens come from the same encoding.
    encoding = tokenizer.encode(word_q)
    sorted_tokens = sorted(
        zip(encoding.ids, encoding.tokens), key=lambda pair: pair[0]
    )
    if token_limit:
        sorted_tokens = sorted_tokens[-token_limit:]

    results = {}
    result_words = None
    for _token_id, token in sorted_tokens:
        # .get() avoids inserting an empty entry into the shared defaultdict
        # when the query contains a token that is not in the index.
        token_words = tokenized_index.get(token, set())
        if result_words is None:
            result_words = token_words
        else:
            result_words = result_words.intersection(token_words)

        if len(result_words) < word_limit_by_step:
            for word in result_words:
                ratio = fuzz.ratio(word, word_q)
                if ratio >= min_ratio:
                    results[word] = ratio
            # Further intersections can only yield subsets of the words just
            # scored, so they cannot add or change any result.
            break

    return sorted(results.items(), key=lambda item: item[1], reverse=True)

In [9]:
# Example query: look up words similar to one query word, keeping only matches
# whose similarity ratio is at least 90.
search_in_tokenized_index('нэхъыбапIэм', tokenizer=tokenizer_uni, min_ratio=90)

[('нэхъыбапIэм', 100),
 ('нэхъыбапэм', 95),
 ('нэхъыбапIэмрэ', 92),
 ('нэхъыбапIэр', 91)]

In [73]:
import json
import nltk
# Inverted index: character trigram -> list of words containing that trigram.
trigram_index = defaultdict(list)

for word in words:
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so a word
    # with a repeated trigram is listed only once under it.
    for trigram in dict.fromkeys(''.join(chars) for chars in nltk.ngrams(word, 3)):
        trigram_index[trigram].append(word)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file is opened
# as UTF-8 explicitly rather than with the platform default encoding.
with open(f'{word_groups_dir}/trigram_index.json', 'w', encoding='utf-8') as f:
    json.dump(trigram_index, f, ensure_ascii=False, indent=4)

Такие индексы получаются гигантского размера.
Нужно сделать отображение word_id -> word и хранить в индексе только word_id.
А для ещё большей компактности индекс можно хранить в префиксном дереве.