# Imports

In [1]:
import re
from collections import Counter

## Tokenize the document

In [2]:
# function to tokenise words
def words(document):
    """Lower-case `document` and split it into a list of word tokens.

    A token is a maximal run of word characters (letters, digits and
    underscore); everything else acts as a separator.
    """
    lowered = document.lower()
    return re.findall(r'\w+', lowered)

In [3]:
# Build a frequency table (word -> number of occurrences) of all the words of the document.
# The `with` block closes the file handle as soon as the text has been read,
# instead of leaving it open until the garbage collector reclaims it.
with open('big.txt') as corpus_file:
    all_words = Counter(words(corpus_file.read()))

In [4]:
# check frequency of a random word, say, 'chair'
# (all_words is a Counter, so a word that never occurs yields 0 rather than a KeyError)
all_words['chair']

135

In [5]:
# look at the 10 most frequent words, as (word, count) pairs ordered from most to least common
all_words.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38311),
 ('to', 28765),
 ('in', 22020),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [17]:
def edits_one(word, alphabet='abcdefghijklmnopqrstuvwxyz'):
    """Create all edits that are one edit away from `word`.

    An edit is a single-character deletion, insertion or replacement, or a
    transposition of two adjacent characters.

    Parameters
    ----------
    word : str
        The string to edit.
    alphabet : str, optional
        Characters used for insertions and replacements. Defaults to the
        lower-case ASCII letters; pass another string to work with a
        different character set.

    Returns
    -------
    set of str
        Every distinct string produced. `word` itself can be part of the
        result, e.g. when a character is "replaced" by the same character.
    """
    # Every way of cutting `word` into (left, right), including the empty ends.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    # Drop the first character of `right`.
    deletes = [left + right[1:] for left, right in splits if right]
    # Put one character from `alphabet` at the cut point.
    inserts = [left + c + right for left, right in splits for c in alphabet]
    # Swap the first character of `right` for one from `alphabet`.
    replaces = [left + c + right[1:] for left, right in splits if right for c in alphabet]
    # Exchange the first two characters of `right`.
    transposes = [left + right[1] + right[0] + right[2:] for left, right in splits if len(right) > 1]
    return set(deletes + inserts + replaces + transposes)

In [18]:
def edits_two(word):
    """Lazily produce every string obtained by applying two successive single edits to `word`.

    The result is a generator and is not de-duplicated; wrap it in `set()`
    to get the distinct strings.
    """
    first_round = edits_one(word)
    return (second for first in first_round for second in edits_one(first))

## This function filters the generated candidates, keeping only the known (correctly spelled) words

In [25]:
def known(words):
    """Return the set of entries from `words` that are present in `all_words`."""
    return {candidate for candidate in words if candidate in all_words}

In [26]:
def possible_corrections(word):
    """Generate possible spelling corrections for `word`.

    Tiers are tried in order and the first non-empty one is returned:
    the word itself if it is known, then known words one edit away, then
    known words two edits away. If every tier is empty, the single-element
    list `[word]` is returned.
    """
    exact = known([word])
    if exact:
        return exact

    one_away = known(edits_one(word))
    if one_away:
        return one_away

    two_away = known(edits_two(word))
    if two_away:
        return two_away

    return [word]

In [44]:
# Example: candidate corrections for the misspelling "monny".
possible_corrections("monny")

{'bonny', 'money'}

In [28]:
def prob(word, N=sum(all_words.values())):
    """Relative frequency of `word`: its count in `all_words` divided by `N`.

    `N` defaults to the total number of tokens. The default is computed once,
    when this function is defined, so it does not follow later changes to
    `all_words`.
    """
    occurrences = all_words[word]
    return occurrences / N

In [45]:
# Count, then list, every string that is one edit away from "monney".
# NOTE(review): edits_one already returns a set, so the set(...) wrapper below is redundant.
print(len(set(edits_one("monney"))))
print(edits_one("monney"))

336
{'monfney', 'monneyz', 'monnely', 'mmnney', 'monxney', 'monnehy', 'monnemy', 'mosnney', 'monnyy', 'monnex', 'monnjy', 'monnek', 'monneyk', 'monnes', 'monneu', 'qmonney', 'monnkey', 'moneny', 'mongney', 'monneyc', 'monneyd', 'mpnney', 'monneiy', 'monxey', 'monneyi', 'mtnney', 'moneey', 'monncy', 'mynney', 'monny', 'monnery', 'zonney', 'mgnney', 'monney', 'monngey', 'monneyh', 'monpey', 'monnpy', 'monoey', 'monuney', 'motnney', 'monnxey', 'monnewy', 'mvnney', 'munney', 'monneyr', 'mkonney', 'msonney', 'money', 'morney', 'monrey', 'mwnney', 'monneyy', 'monneyg', 'mjnney', 'monnecy', 'monhey', 'monsey', 'moenney', 'monnry', 'bonney', 'wonney', 'mbnney', 'mwonney', 'monnpey', 'monvey', 'moniey', 'monnwy', 'momnney', 'mfnney', 'nonney', 'mobney', 'rmonney', 'eonney', 'mknney', 'monneyo', 'monnay', 'monnny', 'mondney', 'moiney', 'monnep', 'monnzey', 'jmonney', 'mobnney', 'moeney', 'mnney', 'monnvey', 'monkney', 'myonney', 'monnsey', 'moznney', 'monnejy', 'mounney', 'mooney', 'xonney', 'mo

In [46]:
# Keep only the one-edit candidates that actually occur in all_words.
print(known(edits_one("monney")))

{'money', 'monkey'}


In [47]:
# Let's look at words that are two edits away
# (edits_two is not de-duplicated, hence the set() before counting)
print(len(set(edits_two("monney"))))
# NOTE(review): this repeats the one-edit lookup from the previous cell;
# known(edits_two("monney")) may have been intended here — confirm.
print(known(edits_one("monney")))

51013
{'money', 'monkey'}


In [48]:
# Let's look at possible corrections of a word
# (the first non-empty tier wins: exact match, then one edit, then two edits)
print(possible_corrections("monney"))

{'money', 'monkey'}


In [49]:
# Let's look at the probability (relative frequency in all_words) of each candidate word
print(prob("money"))
print(prob("monkey"))

0.0002922385563056744
5.378623735687258e-06


In [50]:
def spell_check(word):
    """Return a message with the most probable spelling correction for `word`.

    The suggestion is the entry of `possible_corrections(word)` with the
    highest `prob`.

    Returns
    -------
    str
        * "Did you mean <candidate>?" when the best candidate differs from `word`;
        * "Correct spelling." when `word` itself is in `all_words`;
        * "No suggestion found." when `word` is unknown and no known word
          lies within two edits of it.
    """
    correct_word = max(possible_corrections(word), key=prob)
    if correct_word != word:
        return "Did you mean " + correct_word + "?"
    # possible_corrections falls back to [word] when nothing is known, so the
    # best candidate being equal to `word` does not prove it is spelled correctly.
    if word in all_words:
        return "Correct spelling."
    return "No suggestion found."

In [51]:
# test spell check on a misspelled word; spell_check returns the message, print displays it
print(spell_check("sherlok"))

Did you mean sherlock?


### Number of words one edit away from "emfasize"

In [57]:
# edits_one returns a set, so len() counts distinct strings.
len(edits_one("emfasize"))

442

### Number of unique words two edits away from "emfasize"

In [58]:
# edits_two returns a generator that is not de-duplicated, so collapse it with set() before counting.
len(set(edits_two("emfasize")))

90902

### Number of possible corrections for the word "emfasize"

In [59]:
# Only the first non-empty tier (exact match, one edit, two edits) is returned, so this counts that tier alone.
len(possible_corrections("emfasize"))

1

In [60]:
# Show the candidate correction(s) counted in the previous cell.
possible_corrections("emfasize")

{'emphasize'}