In [55]:
import nltk

from src.logic.text_cleaner import clean_text

with open('../data/processed/word_freqs/freq_1000000_oshhamaho.txt') as f:
    text = f.read()

clean_text = clean_text(text)

book_words = nltk.word_tokenize(text)

In [56]:
fdist = nltk.FreqDist(book_words)

In [57]:
filtered_words = [word for word, freq in fdist.items() if freq < 2]

In [58]:
import random


def generate_typo(word):
    typo_type = random.choice(['swap', 'delete', 'insert'])
    if len(word) < 2:
        typo_type = 'insert'

    if typo_type == 'swap':
        idx = random.randint(0, len(word) - 2)
        word = word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:]

    elif typo_type == 'delete':
        idx = random.randint(0, len(word) - 1)
        word = word[:idx] + word[idx + 1:]

    elif typo_type == 'insert':
        idx = random.randint(0, len(word))
        alphabet = (
            'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯяIi1'
            '-.,:; -!?–…«»1234567890)(№*×><'
            'IIIьььъъъi111'
        )
        random_letter = random.choice(alphabet)
        word = word[:idx] + random_letter + word[idx:]

    return word


def generate_similar_char_error(word):
    similar_letters = {
        'п': 'II',
        'пI': 'тШ',
        'гы': 'гЫ',
        'жы': 'жь',
        'шы': 'шь',
        'П': 'ТТ',
        'Ш': 'III',
        'ш': 'III',
        'ПI': 'ПIГ',
        'жэ': 'жо',
        'пэ': 'пы',
        'жь': 'жъ',
        'ий': 'нй',
        'пс': 'лс',
        'эм': 'эи',
        'щ': 'шщ',
        'къ': 'кь',
        'Къ': 'Жъ',
        'пл': 'нл',
        'им': 'нм',
        'ти': 'тн',
        'гъщ': 'гъц',
        'хуи': 'хун',
        'щх': 'шх',
    }
    for key, value in similar_letters.items():
        if key in word:
            word = word.replace(key, value)
            break
    return word


def generate_grammatical_suffix_error(word):
    if word.endswith('къым'):
        return word[:-4] + 'кым'
    elif word.endswith('мкIэ'):
        return word[:-4] + 'мкэ'
    elif word.endswith('ым'):
        return word[:-2] + 'ып'
    return word


def generate_grammatical_prefix_error(word):
    if word.startswith('зэры'):
        return 'зари' + word[4:]
    elif word.startswith('къых'):
        return 'кыху' + word[4:]
    return word


def gen_incorr_word(word):
    incorrect_word = word

    for func in [
        generate_grammatical_prefix_error, 
        generate_grammatical_suffix_error, 
        generate_similar_char_error,
        generate_typo
    ]:
        incorrect_word = func(incorrect_word)
        if incorrect_word != word:
            return incorrect_word

    return incorrect_word


synthetic_spelling_errors = set()
for word in filtered_words:
    incorrect_word = gen_incorr_word(word)
    if incorrect_word != word:
        synthetic_spelling_errors.add(incorrect_word)

In [59]:
synthetic_spelling_errors

{'хъуIIIэкIэ',
 'зэIIлъыжурэ',
 'ХуоIIсэури',
 'НетэI',
 'зэшэлIэжынып',
 'зариIукIыжхэр',
 'фыдэзгъэхьэнукым',
 'къызэрыдрашщIей',
 'семыIIсыхыжу',
 'Тетхрэ',
 'ЖъызэрымыкIуар',
 'Къалмыкьыбзэ',
 'автомаIIIинэхэм',
 'тахътэбаынр',
 'зарихуекIуэр',
 'фыкъэдаIуэркым',
 'шщхьэусыгъуэ-',
 'зыкьызэридзэкIыжу',
 'IуэхущIафэирэ',
 'къышщыкIэлъыдэжи',
 'зытриуху',
 'къышщIагъэшщашщ',
 'риилъэгъуат',
 'шщIэгъэкъуа',
 'укьытемыухь',
 'уэриа',
 'къышщIэувэ',
 'гъуэIIлъ-гъуэжьыгъэм',
 'рудр',
 'Хъуашщ',
 'Жъосыжри',
 'къышщIыхьати',
 'зыгъэкIэрхаъуэу',
 'ущыIIсэуныр',
 'хэкIуэтаэ',
 'кьыддеIэу',
 'къэдгъуэтыншщ',
 'зытемыхуъэ',
 'УемыIIлъыт',
 'кьазри',
 'къигъэщIакым',
 'сфыочауэ',
 'сыкьикIыжмэ',
 'лъагаIIIэри',
 'Усыгъэхэирэ',
 'гъэкIуауэ',
 'кыхуигъэщат',
 'къышщрикIуташщ',
 'дыэритрэ…',
 'IIхуагъэхъей',
 'рзиIэтыкIауэ',
 'къышщихьэжам',
 'къэзгъэшщIэнур',
 'кьытхэкIауэ',
 'шщабэти',
 'къызэгъэпэщынымкэ',
 'уегуакIуэи',
 'фшхыжьнурэ',
 'зэрагъэтыншьнрэ',
 'зарихъуэнар',
 'чубу;к',
 'Комеат',


In [60]:
with open('../data/processed/spell_checker/synthetic_spelling_errors.txt', 'w') as f:
    f.write('\n'.join(synthetic_spelling_errors))