In [29]:
import csv
import os
import re
import string
import timeit
from collections import Counter

### preprocessing dictionary ###

def tokenize(text):
    """Lower-case ``text`` and return its maximal runs of the letters a-z, in order.

    Anything that is not an ASCII letter (digits, punctuation, whitespace)
    acts as a separator and is dropped.
    """
    lowered = text.lower()
    word_pattern = re.compile('[a-z]+')
    return word_pattern.findall(lowered)

## Build the word-frequency dictionary from the training corpus.
# The corpus is read inside a context manager so the file handle is closed
# as soon as the text has been loaded instead of being left open.
with open('big.txt') as corpus_file:
    WORDS = Counter(tokenize(corpus_file.read()))
## Total number of word occurrences in the corpus (the denominator used by prob()).
N = sum(WORDS.values())

### end preprocess ###

def oneedit(word):
    """Return the set of all strings one simple edit away from ``word``.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a replacement of one character by a letter a-z, or an
    insertion of a letter a-z at any position. For a non-empty ``word`` the
    result contains ``word`` itself, because replacing a letter with the same
    letter counts as a replacement.
    """
    # One-character edit model from Peter Norvig @
    # https://nbviewer.jupyter.org/url/norvig.com/ipython/How%20to%20Do%20Things%20with%20Words.ipynb
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    deletes, transposes, replaces, inserts = [], [], [], []

    # Walk every cut position, including the one after the last character.
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        if tail:
            deletes.append(head + tail[1:])
            if len(tail) > 1:
                transposes.append(head + tail[1] + tail[0] + tail[2:])
            replaces.extend(head + ch + tail[1:] for ch in alphabet)
        inserts.extend(head + ch + tail for ch in alphabet)

    return set(deletes + transposes + replaces + inserts)

def twoedit(word):
    """Return a list of every string reached by applying oneedit() twice to ``word``.

    The list is not de-duplicated: a string reachable through several edit
    paths appears once per path.
    """
    return [second for first in oneedit(word) for second in oneedit(first)]

def threeedit(word):
    """Return a list of every string reached by applying oneedit() three times to ``word``.

    The list is not de-duplicated: a string reachable through several edit
    paths appears once per path.
    """
    return [
        third
        for first in oneedit(word)
        for second in oneedit(first)
        for third in oneedit(second)
    ]

def realwords(words):
    """Return a list of the entries of ``words`` that are keys of WORDS.

    Input order and any duplicates are preserved.
    """
    return [candidate for candidate in words if candidate in WORDS]

def prob(word):
    """Return the relative frequency of ``word``: its count in WORDS divided by N."""
    occurrences = WORDS[word]
    return occurrences / N

def correct(word, max_results=3):
    """Return the most probable known spellings for ``word``.

    Candidates are the known words one edit away; if there are none, the
    known words two edits away; if there are still none, ``word`` itself.
    A three-edit search was tried and judged too slow for larger inputs, so
    it is deliberately not part of the fallback chain.

    Parameters:
        word: the (lower-case) word to correct.
        max_results: maximum number of suggestions to return (default 3).

    Returns:
        A list of at most ``max_results`` distinct words, ordered from most
        to least probable according to prob(); ties are broken
        alphabetically so the result is reproducible between runs.
    """
    candidates = realwords(oneedit(word)) or realwords(twoedit(word)) or [word]

    # twoedit() reaches the same string through many edit paths, so the
    # candidate list can contain the same word many times. De-duplicate
    # before ranking; otherwise the top slice can be one word repeated.
    distinct = set(candidates)

    ranked = sorted(distinct, key=lambda w: (-prob(w), w))
    return ranked[:max_results]

def commonmis(l, r):
    """Load ``spell-errors.txt`` into the caller-supplied lists ``l`` and ``r``.

    For every line that contains a ':' separator, the tokens of the text
    before the first ':' are appended to ``l`` as one list, and the tokens of
    the text between the first and second ':' are appended to ``r`` as one
    list. Lines without a ':' (for example blank lines) are skipped.

    NOTE(review): presumably each line has the form
    ``word: variant1, variant2`` -- confirm against the data file.

    Parameters:
        l: list that receives one token list per line (left of the ':').
        r: list that receives one token list per line (right of the ':').
    """
    # First-pass idea: look up common misspellings from a dictionary.
    # SCRAPPED: speed does not decrease by much.
    with open("spell-errors.txt") as f:
        for line in f:
            parts = line.split(":")
            if len(parts) < 2:
                # No separator on this line: nothing to record. Indexing
                # parts[1] here would raise IndexError.
                continue
            # Tokenize the raw text rather than the repr of a row object, so
            # escape sequences in a repr cannot leak in as bogus tokens.
            l.append(tokenize(parts[0]))
            r.append(tokenize(parts[1]))
                     
# Module-level tables filled in place by commonmis(): one token list per
# line of spell-errors.txt for each side of the ':' separator.
l, r = [], []
commonmis(l, r)

def spellcheck(text):
    """Print, and return, suggested corrections for every unknown word in ``text``.

    Parameters:
        text: either the path of an existing file whose contents should be
            checked, or the raw text to check. A string that does not name
            an existing file is treated as the text itself.

    Returns:
        A dict mapping each distinct word that is not in WORDS to the list
        of suggestions produced by correct(), in order of first appearance.
    """
    print("---------------")
    print("incorrect words")
    print("---------------")

    # Accept both a file path and raw text. Opening the argument
    # unconditionally raised FileNotFoundError whenever a sentence was
    # passed in. The context manager also closes the input file promptly.
    if os.path.isfile(text):
        with open(text) as source:
            content = source.read()
    else:
        content = text

    # The earlier `i in r` debug check was removed: r holds lists of tokens,
    # so a plain string could never be a member and the branch was dead.
    suggestions = {}
    for unknown in Counter(tokenize(content)):
        if unknown not in WORDS:
            suggestions[unknown] = correct(unknown)
            print(unknown, ':', suggestions[unknown])

    return suggestions

In [30]:
# spellcheck("input.txt")

# Wall-clock timing of a single spellcheck() call on a sample sentence.
start = timeit.default_timer()
result = spellcheck(
    "This is the test of how fast my code is and proof i do leetcode cause i am the TechLead best in Silicon Valley"
)
stop = timeit.default_timer()
print("The time difference is :", stop - start)


---------------
incorrect words
---------------
leetcode : ['leetcode']
techlead : ['techlead']
silicon : ['silicon']
The time difference is : 0.4027951190000181
