In [7]:
import re
from collections import Counter
import string
import csv

def tokenize(text):
    # separate text into words, normalize to lower case
    return re.findall('[a-z]+', text.lower()) 

## create dictionary of words from training corpus
WORDS = Counter(tokenize(open('big.txt').read()))
##  count of number of words in corpus
N=sum(WORDS.values())
## 
COUNTS = Counter(WORDS)

# find prob of word
def prob(word): 
    return WORDS[word] / N
            
#     return set(w for w in words if w in WORDS)

def oneedit(word):
    ## all strings 1 edit away from word 
    # one character edit f(x) from Peter Norvig @ https://nbviewer.jupyter.org/url/norvig.com/ipython/How%20to%20Do%20Things%20with%20Words.ipynb
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def twoedit(word): 
    # two edits away from word
    w = []
    for e1 in oneedit(word):
        for e2 in oneedit(e1):
            w.append(e2)
    return w

def threeedit(word): 
    # three edits away from word
    w = []
    for e1 in oneedit(word):
        for e2 in oneedit(e1):
            for e3 in oneedit(e2):
                w.append(e3)
    return w

def realwords(words): 
    k = []
    for w in words:
        if w in WORDS:
            k.append(w)
    return k

def correct(word):
    # create set, starting with one edit. if not, go to two, if not, go to three
    #  |
    # \/ too expensive for larger input texts, good for smaller obscurely spelled texts
    #candidates = (known(oneedit(word)) or known(twoedit(word)) or known(threeedit(word)) or [word])

    # create set with only 1 or 2 edits, three is too slow
    candidates = (realwords(oneedit(word)) or realwords(twoedit(word)) or [word])

    
    # return sorted list of corrections, based on probability. max 5 words
    return sorted(candidates, key=prob, reverse=True)[:3]


def commonmis(l, r):
    with open("spell-errors.txt") as f:
        reader = csv.reader(f, delimiter='\n')
        for row in reader:
            i = str(row).split(":")
            l.append(tokenize(i[0]))
            r.append(tokenize(i[1]))
                     
l = []
r = []
commonmis(l, r)

def spellcheck(text):
    print("---------------")
    print("incorrect words")
    print("---------------")
    
    IWORDS = Counter(tokenize(open(text).read()))
    
    for i in IWORDS:
        if i in r:
            print('in r')
        if i not in WORDS:
            print(i,':', correct(i))

    txt = open('big.txt').read()

In [6]:
spellcheck("input.txt")

---------------
incorrect words
---------------
rainning : ['raining']
raning : ['ranging', 'raging', 'racing']
writtings : ['writings']
disparagingly : ['disparagingly']
disparingly : ['despairingly', 'despairingly', 'sparingly']
yello : ['yellow', 'yell', 'yells']
forer : ['former', 'forcer', 'fore']
fuore : ['fore']
woodes : ['wooden', 'woods', 'wooded']
haing : ['having', 'hang', 'hating']
agression : ['aggression']
loking : ['looking', 'losing', 'loving']
begining : ['beginning']
luing : ['lying', 'lung', 'luring']
lucking : ['lacking', 'locking', 'lurking']
louk : ['look', 'loud', 'lock']
looing : ['looking', 'losing', 'loving']
lookin : ['looking']
eligble : ['eligible']
elegable : ['eligible', 'eligible', 'elegance']
eligable : ['eligible']
electrisity : ['electricity']
electricty : ['electricity']
electrizity : ['electricity']
schold : ['scold']
skold : ['sold', 'scold']
adaptable : ['adaptable']
adabtable : ['adabtable']
caned : ['cared', 'cane', 'caged']
cained : ['gained', 