In [12]:
import os, re
from string import punctuation
import numpy as np
from collections import Counter
punct = set(punctuation)
from sklearn.metrics import classification_report

In [3]:
import textdistance

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

In [5]:
corpus = [sent.split() for sent in open('corpus_ng.txt', encoding='utf8').read().splitlines()]
WORDS = Counter()
for sent in corpus:
    WORDS.update(sent)

In [6]:
vocab = list(WORDS.keys())
id2word = {i:word for i, word in enumerate(vocab)}

vec = TfidfVectorizer(analyzer='char', ngram_range=(1,1))
X = vec.fit_transform(vocab)

In [16]:
def get_closest_match_vec(text, X, vec, TOPN=3):
    v = vec.transform([text])
    similarities = cosine_distances(v, X)
    topn = similarities.argsort()[0][:TOPN]
    
    return [id2word[top] for top in topn]

In [17]:
def get_closest_hybrid_match(text, X, vec, metric=textdistance.levenshtein):
    arr = get_closest_match_vec(text, X, vec, TOPN=10)
    similarities = Counter()
    for word in arr:
        similarities[word] = metric.normalized_similarity(text, word) 
    
    closest =  similarities.most_common(1)[0]
  
    return closest

In [22]:
get_closest_hybrid_match('прииветт', X, vec)[0]

'привет'

In [9]:
bad = open('sents_with_mistakes.txt', encoding='utf8').read().splitlines()
true = open('correct_sents.txt', encoding='utf8').read().splitlines()

In [10]:
def align_words(sent_1, sent_2):
    tokens_1 = sent_1.lower().split()
    tokens_2 = sent_2.lower().split()
    
    tokens_1 = [re.sub('(^\W+|\W+$)', '', token) for token in tokens_1 if (set(token)-punct)]
    tokens_2 = [re.sub('(^\W+|\W+$)', '', token) for token in tokens_2 if (set(token)-punct)]
    
    return list(zip(tokens_1, tokens_2))

In [13]:
mistakes = []
total = 0

for correct, wrong in zip(true, bad):
    pairs = align_words(correct, wrong)
    for correct_word, wrong_word in pairs:
        wrong1 = get_closest_hybrid_match(wrong_word, X, vec)[0]
        if wrong1 != correct_word:
            mistakes.append([correct_word, wrong1])
        
        total += 1

In [24]:
mistakes[:15]

[['симпатичнейшее', 'пластичнейшими'],
 ['шпионское', 'шпионские'],
 ['гламурный', 'лагерный'],
 ['бонда', 'банда'],
 ['superheadz', 'super'],
 ['clap', 'place'],
 ['camera', 'caterham'],
 ['получатся', 'ополчатся'],
 ['язычки', 'язычка'],
 ['очень', 'очерчен'],
 ['милые', 'милы'],
 ['насчет', 'защищает'],
 ['чавеса', 'чавес'],
 ['попавшим', 'пропавшим'],
 ['аварийно-спасательных', 'аварийно-восстановительных']]

In [15]:
print('Доля ошибок - ', len(mistakes)/total )

Доля ошибок -  0.1669996004794247
