In [1]:
import json
import pickle
import pymorphy2
import pymysql
import pytils
import nltk
import numpy as np
import re
import string

from collections import defaultdict, OrderedDict
from lxml import etree
from time import time

## Constants

In [2]:
# Edit-operation budget passed to fuzzy_match when collecting correction candidates
MAX_DISTANCE = 2

# 'ё' is normalised to 'е' in dictionary word forms
RE_E = re.compile('ё', re.U)
# Raw string: '\s' inside a plain literal is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12)
RE_S = re.compile(r'\s+', re.U)

# Grammemes dropped from a tag string before it is used as an N-gram key (see get_word_tags)
REDUNDANT_TAGS = ['Geox', 'Orgn', 'Trad', 'Qual', 'perf', 'impf', 'pres', 'past', 'futr', 'incl', 'excl', 'Infr',
                  'Slng', 'Arch', 'Litr', 'Inmx', 'Vpre', 'LATN', 'NUMB', 'SYMB', 'UNKN']

# Keyboard neighbourhood map: for every lowercase Latin / Cyrillic letter, a string of characters
# treated as adjacent to it. weighted_dam_lev charges a reduced substitution cost when one
# character appears in the other's entry.
# NOTE: characters that are not keys here (e.g. '-', digits, 'ё') have no entry at all.
KEYBOARD = {'a': 'qwedcxz`', 'b': 'vfghn', 'c': 'xsdfv', 'd': 'xswerfvc', 'e': 'sw234rfd', 'f': 'cdertgbv',
            'g': 'vfrtyhnb', 'h': 'bgtyujmn', 'i': 'ju789olk', 'j': 'nhyuik,m', 'k': 'mjuiol.,', 'l': ',kiop;/.',
            'm': 'nhjk,', 'n': 'bghjm', 'o': 'ki890p;l', 'p': 'lo90-[\';', 'q': '12wsa', 'r': 'de345tgf',
            's': 'zaqwedcx', 't': 'fr456yhg', 'u': 'hy678ikj', 'v': 'cdfgb', 'w': 'aq123eds', 'x': 'zasdc',
            'y': 'gt567ujh', 'z': '`asx', 'а': 'свукепим', 'б': 'ьолдю', 'в': 'чыцукамс', 'г': 'рн678шло',
            'д': 'блшщзж/ю', 'е': 'ак456нрп', 'ж': 'юдщзхэ/', 'з': 'дщ90-хэж', 'и': 'мапрт', 'й': '12цыф',
            'к': 'ву345епа', 'л': 'ьогшщдюб', 'м': 'свапи', 'н': 'пе567гор', 'о': 'трнгшлбь', 'п': 'макенрти',
            'р': 'ипенгоьт', 'с': 'чывам', 'т': 'ипроь', 'у': 'ыц234кав', 'ф': 'йцычя', 'х': 'жз0-=ъеэ',
            'ц': 'й123увыф', 'ч': 'яфывс', 'ш': 'ог789щдл', 'щ': 'лш890зжд', 'ъ': 'эх-=е', 'ы': 'яфйцувсч',
            'ь': 'тролб', 'э': '/жзхъе', 'ю': 'блдж/', 'я': 'фыч'}

## MySQL connector and opencorpora loader

In [3]:
def get_conn():
    """
    Opens and returns a new PyMySQL connection to the local `spellcheck` database.

    All connection settings are hard-coded below; the caller owns the returned connection
    and is responsible for closing it.
    """
    connection_params = dict(
        host='127.0.0.1',
        unix_socket='/tmp/mysql.sock',
        user='root',
        passwd=None,
        db='spellcheck',
        charset='utf8',
    )
    return pymysql.connect(**connection_params)


def load_corpora(db=False):
    """
    Parses the OpenCorpora dictionary dump 'dict.opcorpora.xml' into an in-memory dict of lemmas
    and, when `db` is true, (re)creates and fills the `lemma` / `word_form` MySQL tables.

    Structure of the result:

        lemma = {
            id: {
                'text': '',
                'gram': ['', ...],
                'par': [{'text': '', 'gram': ['', ...]}, ...]
            }
        }

    Every 'ё' in lemma / word-form text is replaced with 'е'.

    :param db:  when true, drop and recreate the tables and insert every lemma and word form;
                when false, no database connection is opened at all
    :return:    defaultdict mapping lemma id (int) to its data
    """
    lemma = defaultdict(dict)

    xml_iter = etree.iterparse('dict.opcorpora.xml', events=('start', 'end'))

    # A connection is needed (and must be closed again) only when we write to the DB.
    conn = get_conn() if db else None
    try:
        c = conn.cursor() if db else None
        if db:
            # One statement per execute(): does not depend on multi-statement support
            # being enabled on the connection.
            for ddl in (
                "DROP TABLE IF EXISTS `spellcheck`.`word_form`",
                "DROP TABLE IF EXISTS `spellcheck`.`lemma`",
                """
                CREATE TABLE `spellcheck`.`lemma` (
                  `id`   INT(11)     NOT NULL,
                  `gram` VARCHAR(50) NOT NULL,
                  PRIMARY KEY (`id`)
                )
                  ENGINE InnoDB
                  CHARACTER SET utf8
                """,
                """
                CREATE TABLE `spellcheck`.`word_form` (
                  `id`       INT(11)      NOT NULL AUTO_INCREMENT,
                  `lemma_id` INT(11)      NOT NULL,
                  `text`     VARCHAR(37)  NOT NULL,
                  `length`   SMALLINT     NOT NULL DEFAULT 0,
                  `gram`     VARCHAR(50)  NOT NULL,
                  PRIMARY KEY (`id`),
                  INDEX `lemma_id_idx` (`lemma_id` ASC),
                  CONSTRAINT `lemma_id`
                  FOREIGN KEY (`lemma_id`)
                  REFERENCES `spellcheck`.`lemma` (`id`)
                    ON DELETE NO ACTION
                    ON UPDATE NO ACTION
                )
                  ENGINE InnoDB
                  CHARACTER SET utf8
                """,
            ):
                c.execute(ddl)

        while True:
            act, it = next(xml_iter)

            if act == 'start' and it.tag == 'lemma':
                _id = int(it.attrib['id'])

                if not _id % 50000:  # for debug usages (total approx 400k)
                    print(_id)

                # retrieve lemma and its paradigm data
                act, it = next(xml_iter)  # <l t="">

                entry = lemma[_id]
                entry['text'] = RE_E.sub('е', it.attrib['t'])
                entry['gram'] = _read_grams(xml_iter, 'l')
                entry['par'] = []

                # retrieve lemma paradigm: every <f t=""> up to the closing </lemma>
                while True:
                    act, it = next(xml_iter)
                    if act == 'start' and it.tag == 'f':
                        wf = {'text': RE_E.sub('е', it.attrib['t'])}
                        wf['gram'] = _read_grams(xml_iter, 'f')
                        entry['par'].append(wf)
                    elif act == 'end' and it.tag == 'lemma':
                        # everything was copied out already; drop the subtree to save memory
                        it.clear()
                        break

                if db:
                    # Parameterised queries: the driver escapes the values, so quotes or
                    # backslashes inside the text cannot break (or inject into) the statement.
                    c.execute("INSERT INTO lemma VALUES (%s, %s)", (_id, ','.join(entry['gram'])))
                    rows = [(_id, wf['text'], len(wf['text']), ','.join(wf['gram'])) for wf in entry['par']]
                    if rows:
                        c.executemany(
                            "INSERT INTO word_form (lemma_id, text, length, gram) VALUES (%s, %s, %s, %s)",
                            rows
                        )

            if act == 'end' and it.tag == 'lemmata':
                break

        if db:
            # the text index is added after the bulk load
            c.execute("""
                ALTER TABLE `spellcheck`.`word_form`
                    ADD INDEX `text` (`text` ASC)
            """)
            conn.commit()
    finally:
        if conn is not None:
            conn.close()

    return lemma


def _read_grams(xml_iter, closing_tag):
    """
    Consumes events from `xml_iter` and collects the `v` attribute of every <g> element
    until the 'end' event of `closing_tag` is seen.
    """
    grams = []
    while True:
        act, it = next(xml_iter)
        if act == 'start' and it.tag == 'g':
            grams.append(it.attrib['v'])
        elif act == 'end' and it.tag == closing_tag:
            return grams

## PrefixTree class and fuzzy_match function

In [4]:
class PrefixTree(object):
    """
    Character trie. Every node stores one character, a link to its parent, its children keyed
    by character, and a flag telling whether the path root -> node spells an inserted word.

    `len(node)` is the depth of the node (0 for the root), so the root node is falsy:
    always compare nodes with `is None`, never by truthiness.
    """

    def __init__(self, char='', parent=None):
        self.char = char
        self.parent = parent
        self.children = {}
        self.is_word = False

    def trace(self):
        """Returns the string spelled by the path from the root down to this node."""
        chars = []
        node = self
        while node is not None:
            chars.append(node.char)
            node = node.parent
        return ''.join(reversed(chars))

    def _to_list(self):
        """Yields every inserted word stored in this subtree (depth-first)."""
        if self.is_word:
            yield self.trace()
        for pt in self.children.values():
            yield from pt._to_list()

    def __iter__(self):
        return self._to_list()

    def _find(self, value):
        """Returns the node reached by following `value` from this node, or None."""
        node = self
        for ch in value:
            node = node.children.get(ch)
            if node is None:
                return None
        return node

    def has_prefix(self, value):
        """True if some inserted word (below this node) starts with `value`."""
        return self._find(value) is not None

    def __contains__(self, value):
        """
        True only if `value` itself was inserted as a word.

        Previously any prefix of a word (including the empty string) was reported as
        contained, although callers use `word in tree` as a dictionary lookup.
        Use has_prefix() for the prefix test.
        """
        node = self._find(value)
        return node is not None and node.is_word

    def __len__(self):
        """Depth of the node: number of characters on the path from the root."""
        depth = 0
        node = self.parent
        while node is not None:
            depth += 1
            node = node.parent
        return depth

    def insert(self, value):
        """Inserts the word `value` below this node, creating missing nodes on the way."""
        node = self
        for ch in value:
            child = node.children.get(ch)
            if child is None:
                child = node.children[ch] = PrefixTree(ch, node)
            node = child
        node.is_word = True


def load_ptree(from_file=True):
    """
    Loads the PrefixTree of dictionary word forms.

    :param from_file:   when true, unpickle the tree from 'pt.pkl';
                        when false, build it from the `word_form` table and write 'pt.pkl'
    :return:            PrefixTree instance
    """
    if from_file:
        # NOTE: unpickling executes code stored in the file - load only a pt.pkl you produced yourself.
        with open('pt.pkl', mode='rb') as pt_pkl:
            return pickle.load(pt_pkl)

    pt = PrefixTree()
    conn = get_conn()
    try:
        c = conn.cursor()
        c.execute("SELECT text FROM word_form")

        # progress output for debug usages (total approx 5kk rows)
        for inserted, row in enumerate(c, 1):
            pt.insert(row[0])
            if not inserted % 1000000:
                print(inserted)
    finally:
        # the connection used to be left open
        conn.close()

    with open('pt.pkl', mode='wb') as pt_pkl:
        pickle.dump(pt, pt_pkl)

    return pt

def update_visited(ptree, visited):
    """
    Marks `ptree` as completely processed (visited[node][-1] = 0) and then does the same for
    each ancestor in turn, stopping at the first ancestor whose number of children is not 1
    (that ancestor is left untouched) or when the root has been passed.
    """
    node = ptree
    while True:
        visited[node][-1] = 0
        node = node.parent
        if node is None or len(node.children) != 1:
            break


def is_visited(i, ptree, k, visited):
    """
    Tells whether exploring node `ptree` at string index `i` with remaining budget `k` is redundant.

    Returns True when the node is marked as completely processed (key -1 present) or when the
    budget recorded for index `i` is already >= `k`. Otherwise records `k` for index `i`
    and returns False.
    """
    seen = visited[ptree]

    # -1 stands for "node processed completely"
    if -1 in seen:
        return True

    # an earlier visit at the same index had at least as much budget left
    if k <= seen.get(i, -1):
        return True

    seen[i] = k
    visited[ptree] = seen
    return False


def fuzzy_match(s, ptree, k, i=0, visited=None, n=0):
    """
    Computes all strings contained in ptree with a distance <= k

    Callers pass only `s`, the root of the tree and `k`; the remaining parameters are
    used by the recursion.

    :param s:           string to match (padded internally with '\\0' when called on the root)
    :param ptree:       root of the PrefixTree (or, in recursive calls, the current node)
    :param k:           remaining number of allowed edit operations
    :param i:           current index in `s`
    :param visited:     per-node bookkeeping shared by the whole search (see is_visited / update_visited)
    :param n:           length of the original, unpadded `s`

    :return:            set of matched words; an empty set for an empty tree
    """
    res = set()

    # handles root node of a ptree
    # (an empty tree used to fall through and fail on `visited` being None)
    if ptree.parent is None:
        n = len(s)
        s += '\0' * (k + 1)  # in order to leave an opportunity to insert chars into s
        visited = defaultdict(dict)
        for child in ptree.children.values():
            # main loop, process each starting char in a prefix tree
            res.update(fuzzy_match(s, child, k, i, visited, n))
        return res

    # already tried
    if is_visited(i, ptree, k, visited):
        return res  # empty set: every exit now returns the same type

    # can't match
    if k == -1 or (k == 0 and s[i] != ptree.char):
        return res

    # the node spells a word and the rest of s fits into the remaining budget
    if ptree.is_word and (n - i <= k or (n - (i + 1) <= k and ptree.char == s[i])):
        res.add(ptree.trace())
        if not ptree.children:
            update_visited(ptree, visited)
            return res

    if ptree.char != s[i]:
        res.update(fuzzy_match(s, ptree, k - 1, i + 1, visited, n))  # insert s char

    for child in ptree.children.values():
        if n >= i + 2 and s[i + 1] == ptree.char and s[i] == child.char:  # transposition
            if child.is_word and k == 1 and n == i + 2:
                # following transition to grandchild omits the case (in current architecture)
                # when child node forms a valid trace, check it (in upper if) and append manually
                res.add(child.trace())
                if not child.children:
                    update_visited(child, visited)

            for grandchild in child.children.values():
                res.update(fuzzy_match(s, grandchild, k - 1, i + 2, visited, n))

        if ptree.char == s[i]:
            res.update(fuzzy_match(s, child, k, i + 1, visited, n))  # chars are matched, k remains the same
        else:
            res.update(fuzzy_match(s, child, k - 1, i + 1, visited, n))  # substitution

        res.update(fuzzy_match(s, child, k - 1, i, visited, n))  # delete candidate char

    return res

## Weighted Damerau-Levenshtein distance

In [5]:
def weighted_dam_lev(a, b, keyboard=None):
    """
    Weighted Damerau-Levenshtein distance between strings `a` and `b`.

    Costs: insertion / deletion 1; substitution 0.8 when either character is listed as a
    neighbour of the other in `keyboard`, else 1; transposition of adjacent characters 0.9.

    :param a:           first string
    :param b:           second string
    :param keyboard:    mapping char -> string of neighbouring chars; defaults to the
                        module-level KEYBOARD

    :return:            distance (0 for equal strings)
    """
    if keyboard is None:
        keyboard = KEYBOARD

    # d[(i, j)] - distance between a[:i + 1] and b[:j + 1]; index -1 stands for the empty prefix
    d = {}
    for i in range(-1, len(a)):
        d[(i, -1)] = i + 1
    for j in range(-1, len(b)):
        d[(-1, j)] = j + 1

    for i in range(len(a)):
        for j in range(len(b)):
            subst_cost = trans_cost = 0
            if a[i] != b[j]:
                # .get(): characters missing from the map (hyphen, digits, ...) used to raise KeyError
                adjacent = b[j] in keyboard.get(a[i], '') or a[i] in keyboard.get(b[j], '')
                subst_cost = .8 if adjacent else 1
                trans_cost = .9

            d[(i, j)] = min([
                d[(i - 1, j)] + 1,  # deletion
                d[(i, j - 1)] + 1,  # insertion
                d[(i - 1, j - 1)] + subst_cost  # substitution
            ])

            if i and j and a[i] == b[j - 1] and a[i - 1] == b[j]:
                d[(i, j)] = min([
                    d[(i, j)],
                    d[(i - 2, j - 2)] + trans_cost
                ])

    return d[(len(a) - 1, len(b) - 1)]

## Filtering candidates by context (tag N-gram counts) and Damerau-Levenshtein distance

In [6]:
def get_word_tags(word, morph):
    """
    Maps every reduced tag string of `word` to the score of its parse.

    An empty / None word, or a word found inside string.punctuation, yields {'PNCT': 1.0}.
    Otherwise every parse returned by `morph.parse(word)` contributes one entry: its tag is
    split on commas and whitespace, grammemes from REDUNDANT_TAGS are dropped, the rest is
    sorted and re-joined with ','. If several parses reduce to the same key, the score of the
    last one is kept.
    """
    if not word or word in string.punctuation:
        return {'PNCT': 1.0}

    tag_scores = {}
    for parse in morph.parse(word):
        grammemes = RE_S.sub(',', str(parse.tag)).split(',')
        kept = sorted([g for g in grammemes if g not in REDUNDANT_TAGS])
        tag_scores[','.join(kept)] = parse.score

    return tag_scores


def get_ngram_relevance_score(left_t, c_t, right_t, c_t_pool, bi_cnt, tri_cnt):
    """
    Relevance of candidate tag `c_t` between `left_t` and `right_t`.

    Three shares are computed, each normalised over all tags in `c_t_pool`:
    the left bigram 'left+c', the right bigram 'c+right' and the trigram 'left+c+right'.
    A share is 0 when its normalising total is 0. The result is
    0.25 * left share + 0.25 * right share + 0.5 * trigram share.
    """
    def left_bigram(tag):
        return bi_cnt.get('%s+%s' % (left_t, tag), 0)

    def right_bigram(tag):
        return bi_cnt.get('%s+%s' % (tag, right_t), 0)

    def trigram(tag):
        return tri_cnt.get('%s+%s+%s' % (left_t, tag, right_t), 0)

    def share(count_of, total):
        return float(count_of(c_t)) / total if total else 0

    pool = tuple(c_t_pool)
    l_total = sum(left_bigram(t) for t in pool)
    r_total = sum(right_bigram(t) for t in pool)
    s_total = sum(trigram(t) for t in pool)

    l_share = share(left_bigram, l_total)
    r_share = share(right_bigram, r_total)
    s_share = share(trigram, s_total)

    return .25 * l_share + .25 * r_share + .5 * s_share


def filter_candidates(left, candidates, right, morph, bi_cnt, tri_cnt, debug=True):
    """
    Chooses the candidate that fits best between the `left` and `right` words.

    1. Loop through each combination of left-right tags.
       On each iteration look for the most relevant tag among all candidate tags.
            2. Relevance is a weighted sum of bi/trigram shares (see get_ngram_relevance_score),
               normalised only over the tags of the candidates.
            3. Relevance is multiplied by the probability of the candidate to have that tag.
            4. Relevance is multiplied by the probabilities of the left/right word tags.
            5. All of the above is done in log space.
            6. In case of non-real-word error:
                    - multiply the log score by candidate_dam_lev_distance + 1.
               In case of real-word error (some candidate has distance 0):
                    - add to the log score:
                            - np.log(0.99)                         - for the real word
                            - np.log(0.01 / (len(candidates) - 1)) - for other words
    7. Repeat 2-6 for each combination of left-right tags.
    8. Find the maximum score; its candidate is the answer. In the real-word case ties are
       resolved in favour of the smallest distance.

    :param left:            left word or None in case of sentence beginning
    :param candidates:      dictionary of candidates {'candidate': candidate_dam_lev_distance}
    :param right:           right word or None in case of sentence ending
    :param morph:           instance of MorphAnalyzer
    :param bi_cnt:          dictionary of tags bigrams counts
    :param tri_cnt:         dictionary of tags trigrams counts
    :param debug:           whether to display debug information

    :return:                most possible candidate or None if no candidates passed
    """
    if not candidates:
        return None
    if len(candidates) == 1:
        # dict views are not subscriptable in Python 3, `candidates.keys()[0]` raised TypeError
        return next(iter(candidates))

    all_scores = {}     # printable sequence key -> log score
    seq_candidate = {}  # printable sequence key -> candidate it was built from

    real_word = min(candidates.values()) == 0
    other_word_add = np.log(.01 / (len(candidates) - 1)) if real_word else None
    if debug:
        print("Real-word: %s" % real_word)

    tags_l = get_word_tags(left, morph)
    tags_r = get_word_tags(right, morph)
    tags_c = {c: get_word_tags(c, morph) for c in candidates}

    c_t_pool = set()  # set-pool of distinct candidates tags
    for c_tags in tags_c.values():
        c_t_pool.update(c_tags)

    for lt in tags_l:
        for rt in tags_r:
            for c in candidates:
                for ct in tags_c[c]:
                    rel_score = get_ngram_relevance_score(lt, ct, rt, c_t_pool, bi_cnt, tri_cnt)
                    rel_score = np.log(rel_score) if rel_score else -np.inf
                    rel_score += np.log(tags_c[c][ct])
                    rel_score += np.log(tags_l[lt]) + np.log(tags_r[rt])

                    if real_word:
                        rel_score += np.log(.99) if candidates[c] == 0 else other_word_add
                    else:
                        # the log score is negative, so a larger distance makes it worse
                        rel_score *= (candidates[c] + 1)

                    seq = '%s:%s|%s:%s|%s:%s' % (left, lt, c, ct, right, rt)
                    all_scores[seq] = rel_score
                    # remember the candidate instead of parsing it back out of the key,
                    # which breaks as soon as a word contains '|' or ':'
                    seq_candidate[seq] = c

    if debug:
        print('\n'.join([str(x) for x in sorted(all_scores.items(), key=lambda x: x[1], reverse=True) if not np.isinf(x[1])]))

    if not all_scores:
        # no tag combination could be scored: fall back to the closest candidate
        return min(candidates, key=lambda k: candidates[k])

    if real_word:
        # in case of real-word error retrieve all max relevant sequences
        # choose the most relevant candidate from them according to min candidate_dam_lev_distance
        max_score = max(all_scores.values())
        most_relevant_candidates = [seq_candidate[seq] for seq, score in all_scores.items() if score == max_score]
        return min(most_relevant_candidates, key=lambda k: candidates[k])

    # in case of non-real-word error find max relevant sequence and retrieve candidate from it
    return seq_candidate[max(all_scores, key=lambda k: all_scores[k])]

## PrefixTree loading from pickle

In [7]:
# Load the prefix tree of dictionary word forms (load_ptree() reads 'pt.pkl' by default).
%time pt = load_ptree()

CPU times: user 26.8 s, sys: 7.36 s, total: 34.2 s
Wall time: 41.6 s


## Counts of OpenCorpora tags N-grams loading from json

In [8]:
# Tag bigram / trigram counts; filter_candidates receives them as bi_cnt / tri_cnt.
# NOTE(review): the file objects passed to json.load are never closed explicitly.
%time tags_bi = json.load(open('bigram.opcorpora.json', encoding='utf-8'))
%time tags_tri = json.load(open('trigram.opcorpora.json', encoding='utf-8'))

CPU times: user 14.4 ms, sys: 5.93 ms, total: 20.3 ms
Wall time: 31.2 ms
CPU times: user 68.8 ms, sys: 17.3 ms, total: 86.1 ms
Wall time: 105 ms


## PyMorphy2 MorphAnalyzer

In [9]:
# Shared morphological analyzer; later cells use the global `ma` directly.
%time ma = pymorphy2.MorphAnalyzer()

CPU times: user 67.7 ms, sys: 30.9 ms, total: 98.7 ms
Wall time: 153 ms


## Testing filter_candidates

In [10]:
# Smoke test: collect every dictionary word within MAX_DISTANCE of the misspelled word,
# weigh each with weighted_dam_lev and let filter_candidates pick one for the given
# left / right context words.
misspelled = 'прикраснее'
st = time()
filtered = filter_candidates(
    'повести',
    {c: weighted_dam_lev(c, misspelled) for c in fuzzy_match(misspelled, pt, MAX_DISTANCE)},
    'чем',
    ma,
    tags_bi,
    tags_tri
)
print("\nBest candidate: %s\nExecution time: %ss" % (filtered, time() - st))

Real-word: False
('повести:NOUN,femn,inan,loct,sing|прекраснее:COMP|чем:CONJ', -9.2149746844128444)
('повести:NOUN,accs,femn,inan,plur|прекраснее:COMP|чем:CONJ', -9.2149746844128444)
('повести:INFN,tran|прекрасные:ADJF,accs,inan,plur|чем:CONJ', -10.893231572956974)
('повести:INFN,tran|прекраснее:COMP|чем:CONJ', -11.685130251431023)
('повести:NOUN,femn,inan,loct,sing|прекраснее:COMP|чем:NPRO,ablt,neut,sing', -12.605239951490809)
('повести:NOUN,femn,inan,loct,sing|прекраснее:COMP|чем:NPRO,loct,neut,sing', -12.605239951490809)
('повести:NOUN,accs,femn,inan,plur|прекраснее:COMP|чем:NPRO,ablt,neut,sing', -12.605239951490809)
('повести:NOUN,accs,femn,inan,plur|прекраснее:COMP|чем:NPRO,loct,neut,sing', -12.605239951490809)
('повести:NOUN,femn,gent,inan,sing|прекраснее:COMP|чем:CONJ', -12.713374394031362)
('повести:NOUN,datv,femn,inan,sing|прекраснее:COMP|чем:CONJ', -12.713374394031362)
('повести:NOUN,femn,inan,nomn,plur|прекраснее:COMP|чем:CONJ', -12.713374394031362)
('повести:VERB,impr,sing,

## Numerical to text conversion

In [11]:
# Grammatical case -> pattern for a number written in digits followed by a case ending,
# e.g. '2-ух'. Group 1 is the number. Insertion order matters: `replace` uses the first
# pattern that matches.
# Raw strings: '\d' and '\w' inside plain literals are invalid escape sequences.
RE_CASE = OrderedDict()
RE_CASE['gent'] = re.compile(r'^(\d+)-?\w*((о?го)|((у|е|ё)?х)|и|а)$')
RE_CASE['datv'] = re.compile(r'^(\d+)-?\w*((о?му)|((у|е|ё)?м)|и|а)$')
RE_CASE['ablt'] = re.compile(r'^(\d+)-?\w*(((у|е)?мя)|[тм]?ь?ю)$')
RE_CASE['loct'] = re.compile(r'^(\d+)-?\w*(((у|е|ё)?х)|([тм]?и))$')


def case_for_numerical(text, case):
    """
    Puts every whitespace-separated word of `text` into grammatical case `case`,
    using the first parse returned by the global analyzer `ma`.

    :param text:    words to inflect (here: a numeral spelled out in words)
    :param case:    grammeme name of the target case, e.g. 'gent'

    :return:        inflected words joined with single spaces; a word that cannot be
                    inflected is kept unchanged
    """
    inflected = []
    for w in text.split():
        form = ma.parse(w)[0].inflect({case})
        # inflect() returns None when the requested form cannot be built;
        # indexing it used to raise TypeError
        inflected.append(form[0] if form is not None else w)

    return ' '.join(inflected)


def replace(text):
    """
    Spells out a number written in digits with a case ending (e.g. '2-ух').

    The patterns in RE_CASE are tried in order; for the first one that matches, the number
    captured in group 1 is converted to words with pytils and inflected to that case.
    When nothing matches, `text` is returned unchanged.
    """
    for case in RE_CASE:
        found = RE_CASE[case].search(text)
        if not found:
            continue

        in_words = pytils.numeral.in_words(int(found.group(1)))
        return case_for_numerical(in_words, case)

    return text


def numbers2letters(words):
    """
    Returns a new list in which every word starting with a digit is passed through
    `replace`; all other words are copied as they are.
    """
    converted = []
    for w in words:
        starts_with_digit = w[0] in string.digits
        converted.append(replace(w) if starts_with_digit else w)

    return converted


# Quick check: a bare number stays as it is, numbers with a case ending are spelled out.
numbers2letters(['1', '2-ух', '1997-ми'])

['1', 'двух', 'одной тысячи девятисот девяноста семи']

## Preprocessing and rule-based corrections

In [12]:
russian_vowels = ['а', 'у', 'о', 'ы', 'и', 'э', 'я', 'ю', 'ё', 'е']
russian_vowels_str = ''.join(russian_vowels)
russian_cons = ['б', 'в', 'г', 'д', 'ж', 'з', 'й', 'к', 'л', 'м', 'н', 'п', 'р', 'с', 'т', 'ф', 'х', 'ц', 'ч', 'ш', 'щ']
russian_cons_str = ''.join(russian_cons)

# All patterns are raw strings. In a plain literal '\1' is the control character \x01,
# so the backreferences in `repeated_chars` never matched anything; '\s' is an invalid escape.
extra_whitespace = re.compile(r'\s+', re.U)
# a letter repeated three or more times in a row
repeated_chars = re.compile(r'([а-яa-z])\1\1+', re.U)
# colloquial endings -цца / -ццо
tsa_ending = re.compile('(.+)(цца|ццо)$', re.U)
# misspellings of "в общем" / "потому что" written as they sound
vobsch_reg = re.compile('в(о|а){1,2}(б|п)щ{1,2}(е|и)м', re.U)
potomy_reg = re.compile(r'п(о|а)т(о|а)му\s?(ч|ш)т(о|а)', re.U)

# deliberate misspellings -> correct spelling (may expand to several words)
frequent_intentional_mistakes = {'собстно': 'собственно', 'собсна': 'собственно', "многабуков": "много букв",
                                 "седня": "сегодня", "естесно": "естественно", "ессно": "естественно",
                                 "естессно": "естественно", "ничо": "ничего", "неоч": "не очень", "щаз": "сейчас",
                                 "какбы": "как бы", "какбе": "как бы", "скока": "сколько", "нащщот": "насчет",
                                 "ваще": "вообще", "ващще": "вообще"}

# words written together that need a hyphen or a space
frequent_hyphen_space_mistakes = {"изза": "из-za".replace("za", "за"), "еслиб": "если б", "тоесть": "то есть", "всмысле": "в смысле",
                                  "такчто": "так что"}

# word parts used by correct_hyphens_spaces
hyphen_endings = ['либо', 'нибудь', "то"]
subj_particles = ["кто", "как", "если", "когда", "вот", "хоть", "пусть"]
hyphen_beg = ["вице", "камер", "контр", "лейб", "обер", "статс", "унтер", "флигель", "штаб", "штабс", "экс"]


def tokenize(text, punct_include=False):
    """
    Splits `text` into tokens with nltk.word_tokenize.

    :param text:            text to split
    :param punct_include:   when false (default), tokens found inside string.punctuation are dropped

    :return:                list of tokens
    """
    tokens = nltk.word_tokenize(text)
    if not punct_include:
        tokens = [tok for tok in tokens if tok not in string.punctuation]

    return tokens


def clean_text(text):
    """
    Normalises raw text before tokenisation: lowercases it, rewrites the most popular
    misspellings of "в общем" / "потому что", collapses whitespace runs, and replaces
    '_' with a space, '...' with '.' and 'ё' with 'е'.

    :param text:    raw input text
    :return:        cleaned, lowercased text
    """
    # Lowercase first: the patterns below contain lowercase letters only, so a capitalised
    # misspelling (e.g. at the start of a sentence) used to slip through.
    text = text.lower()

    # replace most popular mistakes
    text = vobsch_reg.sub(" в общем ", text)
    text = potomy_reg.sub(" потому что ", text)

    # also removes the doubled spaces introduced by the replacements above
    text = extra_whitespace.sub(' ', text)
    text = text.replace('_', ' ')
    text = text.replace('...', '.')
    text = text.replace('ё', 'е')

    return text


def neighborhood(iterable):
    """
    Iterator wrapper, which gives access to prev and next elements

    Yields one (prev, curr, next) tuple per element; `prev` is None for the first element and
    `next` is None for the last one. An empty iterable yields nothing.
    """
    iterator = iter(iterable)
    prev = None
    try:
        curr = next(iterator)
    except StopIteration:
        # a StopIteration escaping from a generator body is turned into RuntimeError (PEP 479)
        return

    for nxt in iterator:
        yield (prev, curr, nxt)
        prev = curr
        curr = nxt

    yield (prev, curr, None)


def correct_hyphens_spaces(words):
    """
    Rule-based repair of missing spaces and hyphens.

    Exactly one rule is applied per input word (the first that fires):
        1. "не" glued to a verb is split off;
        2. a word from frequent_hyphen_space_mistakes is replaced (possibly by several words);
        3. a subjunctive particle glued to "бы" is split;
        4. a hyphen is inserted (see _insert_hyphen).
    A word matched by no rule is copied unchanged.

    :param words:   iterable of tokens
    :return:        new list of tokens
    """
    corrected = []
    words = list(words)
    if not words:
        return corrected

    skip_next = False

    for prev, word, nxt in neighborhood(words):
        if skip_next:
            # this word was already glued to the previous one ("кое" + next word)
            skip_next = False
            continue

        # check "не" written together with a verb
        if word.startswith('не') and len(word) > 2:
            if ma.parse(word[2:])[0].tag.POS == 'VERB':
                corrected.extend(['не', word[2:]])
                continue

        if word in frequent_hyphen_space_mistakes:
            word = frequent_hyphen_space_mistakes[word]
            if ' ' in word:
                # one word was replaced by several
                corrected.extend(word.split(' '))
                continue

        if word.endswith("бы") and word[:-2] in subj_particles:
            corrected.extend([word[:-2], "бы"])
            continue

        # hyphens
        if '-' not in word:
            hyphenated, skip_next = _insert_hyphen(word, nxt)
            if hyphenated is not None:
                word = hyphenated

        corrected.append(word)

    return corrected


def _insert_hyphen(word, nxt):
    """
    Tries the hyphenation rules on `word` and returns (new_word, next_word_consumed).
    Returns (None, False) when no rule applies. Only the first matching rule is used,
    so a word can no longer be emitted several times.
    """
    # кое-, кой-
    if word.startswith(('кое', 'кой')):
        if len(word) == 3:
            # the prefix was written as a separate word - join it with the next one
            # (there is no next word at the end of the sentence)
            if nxt is not None and len(nxt) > 1:
                return word + "-" + nxt, True
        elif word[3:] in pt:
            # the part after the prefix is in the dictionary - insert a hyphen
            return word[:3] + "-" + word[3:], False

    # вице-, камер-, контр-, лейб-, обер-, статс-, унтер-, флигель-, штаб-, штабс-, экс-
    # NOTE(review): any word starting with one of these strings is hyphenated, the remainder
    # is not checked against the dictionary - confirm this is intended.
    prefixes = [beg for beg in hyphen_beg if word.startswith(beg) and len(word) > len(beg)]
    if prefixes:
        beg = max(prefixes, key=len)  # "штабс" wins over "штаб"
        return "{0}-{1}".format(beg, word[len(beg):]), False

    # пол- before "л"
    if word.startswith('полл'):
        return "пол-{0}".format(word[3:]), False

    # -либо, -нибудь, -то
    for ending in hyphen_endings:
        if word.endswith(ending):
            # insert a hyphen if the (non-empty) part before the ending is in the dictionary
            first_part = word[:-len(ending)]
            if first_part and first_part in pt:
                return "{0}-{1}".format(first_part, ending), False

    return None, False


def correct_intentional_misspelling(words):
    """
    Repairs deliberate / colloquial misspellings in words that are not in the dictionary `pt`:
        - a letter repeated three or more times is reduced to one;
        - words listed in frequent_intentional_mistakes are replaced (possibly by several words);
        - endings -цца / -ццо are rewritten as -ться.

    :param words:   iterable of tokens
    :return:        new list of tokens
    """
    corrected = []
    for word in words:
        if word not in pt:
            # every run of 3 or more identical characters is replaced by a single one
            word = repeated_chars.sub('\\1', word)

            if word in frequent_intentional_mistakes:
                word = frequent_intentional_mistakes[word]
                if ' ' in word:
                    # one word was replaced by several
                    corrected.extend(word.split(' '))
                    continue

            # Replace endings like -цца and -ццо. This used to sit inside the branch above,
            # where it only ever saw words that had just been replaced by a correct spelling.
            word = tsa_ending.sub('\\1ться', word)

        corrected.append(word)

    return corrected


def preprocess_text(text, punct_include=False):
    """
    Full preprocessing pipeline: cleans the text, tokenises it and then runs the rule-based
    correctors in a fixed order (numbers to words, intentional misspellings, hyphens/spaces).

    :param text:            raw input text
    :param punct_include:   forwarded to tokenize
    :return:                list of preprocessed tokens
    """
    tokens = tokenize(clean_text(text), punct_include)

    for stage in (numbers2letters, correct_intentional_misspelling, correct_hyphens_spaces):
        tokens = stage(tokens)

    return tokens

## Sentence corrector

In [13]:
# def correct_sentence(sent, pt, morph, bi_cnt, tri_cnt, real_word=False):
#     tokens = preprocess_text(sent)

#     for i in range(len(tokens)):
#         if real_word or tokens[i] not in pt:
#             corrected = filter_candidates(
#                 tokens[i - 1] if i > 0 else '',
#                 {c: weighted_dam_lev(c, tokens[i]) for c in fuzzy_match(tokens[i], pt, MAX_DISTANCE)},
#                 tokens[i + 1] if i + 1 < len(tokens) else '',
#                 morph,
#                 bi_cnt,
#                 tri_cnt
#             )
#             if corrected:
#                 tokens[i] = corrected
    
#     return tokens