In [2]:
import re
import string
from collections import Counter
import numpy as np

In [8]:
def read_corpus(filename):
    with open(filename, "r") as file:
        lines = file.readlines()
        words = []
        
        for line in lines:
            words += re.findall(r'\w+', line.lower())
            
        return words

In [9]:
words = read_corpus("./big.txt")
print(f"There are {len(words)} total words in the corpus")

There are 1115760 total words in the corpus


In [10]:
vocabs = set(words)
print(f"There are {len(vocabs)} unique words in the vocabulary")

There are 32208 unique words in the vocabulary


In [11]:
word_counts = Counter(words)
print(word_counts["love"])

484


In [12]:
word_counts = Counter(words)
print(word_counts["the"])

79814


In [13]:
word_counts = Counter(words)
print(word_counts["sherlock"])

101


In [14]:
total_word_counts = float(sum(word_counts.values()))
word_probas = {word: word_counts[word] / total_word_counts for word in word_counts.keys()}

In [15]:
print(word_probas["sherlock"])

9.052125905212591e-05


In [16]:
print(word_probas["the"])

0.07153330465333047


In [19]:
def split(word):
    return [(word[:i], word[i:]) for i in range(len(word) + 1)]

In [20]:
print(split("trash"))

[('', 'trash'), ('t', 'rash'), ('tr', 'ash'), ('tra', 'sh'), ('tras', 'h'), ('trash', '')]


In [24]:
def delete(word):
    return [l + r[1:] for l,r in split(word) if r]

In [25]:
print(delete("trash"))

['rash', 'tash', 'trsh', 'trah', 'tras']


In [26]:
def swap(word):
    return [l + r[1] + r[0] + r[2:] for l,r in split(word) if len(r)>1]

In [27]:
print(swap("trash"))

['rtash', 'tarsh', 'trsah', 'trahs']


In [28]:
def replace(word):
    letters = string.ascii_lowercase
    return [l + c + r[1:] for l,r in split(word) if r for c in letters]

In [29]:
print(replace("trash"))

['arash', 'brash', 'crash', 'drash', 'erash', 'frash', 'grash', 'hrash', 'irash', 'jrash', 'krash', 'lrash', 'mrash', 'nrash', 'orash', 'prash', 'qrash', 'rrash', 'srash', 'trash', 'urash', 'vrash', 'wrash', 'xrash', 'yrash', 'zrash', 'taash', 'tbash', 'tcash', 'tdash', 'teash', 'tfash', 'tgash', 'thash', 'tiash', 'tjash', 'tkash', 'tlash', 'tmash', 'tnash', 'toash', 'tpash', 'tqash', 'trash', 'tsash', 'ttash', 'tuash', 'tvash', 'twash', 'txash', 'tyash', 'tzash', 'trash', 'trbsh', 'trcsh', 'trdsh', 'tresh', 'trfsh', 'trgsh', 'trhsh', 'trish', 'trjsh', 'trksh', 'trlsh', 'trmsh', 'trnsh', 'trosh', 'trpsh', 'trqsh', 'trrsh', 'trssh', 'trtsh', 'trush', 'trvsh', 'trwsh', 'trxsh', 'trysh', 'trzsh', 'traah', 'trabh', 'trach', 'tradh', 'traeh', 'trafh', 'tragh', 'trahh', 'traih', 'trajh', 'trakh', 'tralh', 'tramh', 'tranh', 'traoh', 'traph', 'traqh', 'trarh', 'trash', 'trath', 'trauh', 'travh', 'trawh', 'traxh', 'trayh', 'trazh', 'trasa', 'trasb', 'trasc', 'trasd', 'trase', 'trasf', 'trasg', 

In [30]:
def insert(word):
    letters = string.ascii_lowercase
    return [l + c + r for l,r in split(word) for c in letters]

In [31]:
print(insert("trash"))

['atrash', 'btrash', 'ctrash', 'dtrash', 'etrash', 'ftrash', 'gtrash', 'htrash', 'itrash', 'jtrash', 'ktrash', 'ltrash', 'mtrash', 'ntrash', 'otrash', 'ptrash', 'qtrash', 'rtrash', 'strash', 'ttrash', 'utrash', 'vtrash', 'wtrash', 'xtrash', 'ytrash', 'ztrash', 'tarash', 'tbrash', 'tcrash', 'tdrash', 'terash', 'tfrash', 'tgrash', 'thrash', 'tirash', 'tjrash', 'tkrash', 'tlrash', 'tmrash', 'tnrash', 'torash', 'tprash', 'tqrash', 'trrash', 'tsrash', 'ttrash', 'turash', 'tvrash', 'twrash', 'txrash', 'tyrash', 'tzrash', 'traash', 'trbash', 'trcash', 'trdash', 'treash', 'trfash', 'trgash', 'trhash', 'triash', 'trjash', 'trkash', 'trlash', 'trmash', 'trnash', 'troash', 'trpash', 'trqash', 'trrash', 'trsash', 'trtash', 'truash', 'trvash', 'trwash', 'trxash', 'tryash', 'trzash', 'traash', 'trabsh', 'tracsh', 'tradsh', 'traesh', 'trafsh', 'tragsh', 'trahsh', 'traish', 'trajsh', 'traksh', 'tralsh', 'tramsh', 'transh', 'traosh', 'trapsh', 'traqsh', 'trarsh', 'trassh', 'tratsh', 'traush', 'travsh',

In [32]:
def level_one_edits(word):
    return set(delete(word) + swap(word) + replace(word) + insert(word))

In [33]:
print(level_one_edits("trash"))

{'trayh', 'traash', 'ptrash', 'trqash', 'srash', 'arash', 'btrash', 'tnrash', 'travh', 'trasf', 'truash', 'otrash', 'rtrash', 'ttrash', 'turash', 'trsash', 'trzash', 'wtrash', 'trasq', 'trasph', 'trazh', 'tirash', 'thash', 'teash', 'trach', 'tranh', 'qtrash', 'tfrash', 'trpsh', 'trafsh', 'traskh', 'traszh', 'trashh', 'trasch', 'txash', 'transh', 'tratsh', 'trase', 'tresh', 'trasz', 'tcrash', 'tmrash', 'strash', 'trarsh', 'trasih', 'traqsh', 'traswh', 'krash', 'yrash', 'frash', 'trjsh', 'trrsh', 'trafh', 'tralh', 'utrash', 'traah', 'trmsh', 'trtsh', 'trabsh', 'tryash', 'trasuh', 'trass', 'urash', 'trbsh', 'trasb', 'trasn', 'tsash', 'trosh', 'tralsh', 'tlash', 'trahs', 'trakh', 'tfash', 'trcsh', 'tragsh', 'zrash', 'trauh', 'traih', 'tash', 'tnash', 'traso', 'trath', 'brash', 'trasp', 'traxh', 'hrash', 'ktrash', 'twash', 'trmash', 'trasc', 'trash', 'trashz', 'rtash', 'trasj', 'tjrash', 'tuash', 'crash', 'trasht', 'jtrash', 'trabh', 'trast', 'tarsh', 'txrash', 'traysh', 'trashi', 'tbash', 

In [34]:
def level_two_edits(word):
    return set(e2 for e1 in level_one_edits(word) for e2 in level_one_edits(e1))

In [35]:
print(level_two_edits("trash"))

{'trxashk', 'trvsj', 'tlrasq', 'tsrashi', 'turasah', 'tuasah', 'wtfrash', 'graso', 'traphz', 'trvasu', 'mtrasth', 'trbasrh', 'ltbash', 'trscsh', 'twrasqh', 'tryass', 'trbach', 'ptsash', 'trqaeh', 'trasahc', 'trcasnh', 'patrash', 'ctrasb', 'tsaah', 'tramshh', 'vteash', 'crqsh', 'trgshq', 'ttzash', 'tqamsh', 'tratnsh', 'taasa', 'utraah', 'tkrashx', 'atrpsh', 'tarasth', 'tarasxh', 'trabsq', 'trdvsh', 'zfrash', 'otrdsh', 'traae', 'hwash', 'txrzash', 'troamsh', 'trwarh', 'traspm', 'thxsh', 'mrashc', 'strasha', 'jrashh', 'traslf', 'sgrash', 'eriash', 'lrasl', 'brashp', 'trwashp', 'etrashi', 'wrpsh', 'hrasc', 'tmraseh', 'traxjsh', 'urush', 'trosm', 'trazshu', 'traphn', 'trkagh', 'sitrash', 'trmagh', 'tralsw', 'tranhq', 'wtrasnh', 'traxsa', 'xrasz', 'trarc', 'wbrash', 'hraxsh', 'turasph', 'tradhj', 'eyash', 'trzjh', 'rtqsh', 'trasinh', 'trqcash', 'fras', 'trrarh', 'tractsh', 'tpsah', 'grasx', 'wtdash', 'trlkh', 'mktrash', 'terasc', 'gpash', 'trjash', 'trashnf', 'qyash', 'tsasnh', 'tcasn', 'qrr

In [42]:
def correct_spelling(word, vocabulary, word_probabilities):
    if word in vocabulary:
        print(f"{word} is already correctly spelt")
        return
    suggestions = level_one_edits(word) or level_two_edits(word) or [word]
    best_guesses = [w for w in suggestions if w in vocabulary]
    return [(w, word_probabilities[w]) for w in best_guesses]

In [43]:
word = "famile"
guesses = correct_spelling(word, vocabs, word_probas)

print(guesses)

[('famine', 2.688750268875027e-06), ('family', 0.00018821251882125188)]


In [44]:
word = "jsus"
guesses = correct_spelling(word, vocabs, word_probas)

print(guesses)

[('jesus', 9.8587509858751e-06)]


In [45]:
word = "aflica"
guesses = correct_spelling(word, vocabs, word_probas)

print(guesses)

[('africa', 2.1510002151000216e-05)]


In [46]:
word = "foad"
guesses = correct_spelling(word, vocabs, word_probas)

print(guesses)

[('ford', 6.2737506273750625e-06), ('foal', 8.96250089625009e-07), ('foam', 3.585000358500036e-06), ('road', 0.00023123252312325231), ('fold', 2.0613752061375205e-05), ('fad', 8.96250089625009e-07), ('fond', 5.467125546712555e-05), ('load', 7.170000717000072e-06), ('food', 6.721875672187568e-05)]


In [47]:
jovian.commit()

<IPython.core.display.Javascript object>

[jovian] Committed successfully! https://jovian.ai/martins3569/spell-checker


'https://jovian.ai/martins3569/spell-checker'

In [61]:
class SpellChecker(object):
    
    def __init__(self, corpus_file_path):
        with open(corpus_file_path, "r") as file:
            lines = file.readlines()
            words = []
            for line in lines:
                words += re.findall(r'\w+', line.lower())
            
        self.vocabs = set(words)
        self.word_counts = Counter(words)
        total_words = float(sum(self.word_counts.values()))
        self.word_probas = {word: self.word_counts[word] / total_words for word in self.vocabs}
        
    def _level_one_edits(self, word):
        letters = string.ascii_lowercase
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [l + r[1:] for l,r in splits if r]
        swaps = [l + r[1] + r[0] + r[2:] for l,r in splits if len(r)>1]
        replaces = [l + c + r[1:] for l,r in splits if r for c in letters]
        inserts = [l + c + r for l,r in splits for c in letters]
        
        return set(deletes + swaps + replaces + inserts)
    
    def _level_two_edits(self, word):
        return set(e2 for e1 in self._level_one_edits(word) for e2 in self._level_one_edit(e1))
    
    def check(self, word):
        candidate = self._level_one_edits(word) or self._level_two_edits(word) or [word]
        valid_candidates = [w for w in candidate if w in self.vocabs]
        return sorted([(c, self.word_probas[c]) for c in valid_candidates], key=lambda tup: tup[1], reverse=True)
    

In [62]:
checker = SpellChecker("./big.txt")

In [63]:
mispelled_word = "sentense"
print(checker.check(mispelled_word))

[('sentence', 2.3302502330250233e-05)]


In [64]:
mispelled_word = "sentenee"
print(checker.check(mispelled_word))

[('sentence', 2.3302502330250233e-05)]


In [66]:
mispelled_word = "artiicial"
print(checker.check(mispelled_word))

[('artificial', 3.316125331612533e-05)]


In [67]:
jovian.commit()

<IPython.core.display.Javascript object>

[jovian] Updating notebook "martins3569/spell-checker" on https://jovian.ai/
[jovian] Committed successfully! https://jovian.ai/martins3569/spell-checker


'https://jovian.ai/martins3569/spell-checker'