# Implementing a basic spell-checker 

2019-12-10 

Nayef Ahmad 

Reference: https://nbviewer.jupyter.org/url/norvig.com/ipython/How%20to%20Do%20Things%20with%20Words.ipynb 

In [1]:
import re
import math
import string
from collections import Counter
from __future__ import division

## Setting up the data in bag-of-words format 

In [2]:
# Read the whole text file into one string.  The with-block closes the file
# handle as soon as the read finishes instead of leaving it to the garbage
# collector.
with open("test-text-doc.txt", "r") as corpus_file:
    corpus = corpus_file.read()
corpus

'Vancouver\nVancouver\nRichmond\nfree\nfree\nFred\n'

In [3]:
def tokenize(text):
    """Split text into a list of lowercase word tokens.

    The text is lowercased first; each maximal run of the letters a-z then
    becomes one token.  Everything else (digits, punctuation, spaces, tabs,
    newlines) only separates tokens and never appears in the result.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer('[a-z]+', lowered)]

In [4]:
# Tokenize the corpus string into a list of lowercase words (duplicates kept).
tokens = tokenize(corpus)
tokens

['vancouver', 'vancouver', 'richmond', 'free', 'free', 'fred']

In [5]:
# Another bag-of-words representation is a Counter: a dictionary of
# {'word': count} pairs, where count is how many times the word occurred.
# Example:
Counter(tokenize('Is this a test? It is a test!'))

Counter({'is': 2, 'this': 1, 'a': 2, 'test': 2, 'it': 1})

In [6]:
# Applying to our actual data: counts maps each corpus word to its frequency.
# known() and correct() below read this module-level variable directly.
counts = Counter(tokens)
counts

Counter({'vancouver': 2, 'richmond': 1, 'free': 2, 'fred': 1})

In [7]:
# dir(counts)  # returns list of the attributes and methods of any object

## Intermediate functions 

In [8]:
def known(words):
    '''Keep only the words that occur in the corpus vocabulary.

    correct() uses this to cut the generated candidate strings down to the
    ones that are keys of the global ``counts`` Counter.  The result is a
    set, so repeated words in the input collapse to one entry.
    '''
    in_corpus = set()
    for candidate in words:
        if candidate in counts:
            in_corpus.add(candidate)
    return in_corpus


def edits0(word):
    "Return the set of strings zero edits away from word, i.e. a set holding only word."
    unchanged = set()
    unchanged.add(word)
    return unchanged


def splits(word):
    '''Return every way of cutting word into a (first, rest) pair, as a list.

    The cut position runs from 0 (first is empty) through len(word) (rest is
    empty), so a word of n characters yields n + 1 pairs.  edits1() builds
    all of its edits from these pairs.
    '''
    pairs = []
    for cut in range(len(word) + 1):
        first, rest = word[:cut], word[cut:]
        pairs.append((first, rest))
    return pairs


def edits1(word):
    """Return the set of all strings that are one edit away from word.

    For every (head, tail) split of word, four kinds of edit are generated:
    deleting the first character of tail, swapping the first two characters
    of tail, replacing the first character of tail with each letter a-z, and
    inserting each letter a-z between head and tail.  Collecting into a set
    removes strings that can be reached by more than one edit.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    one_away = set()

    for head, tail in splits(word):
        # An insertion is possible at every split point, including the end.
        one_away.update(head + ch + tail for ch in letters)

        if not tail:
            # An empty tail has no character to delete or replace; skipping
            # it keeps the unchanged word from being counted as a deletion.
            continue

        one_away.add(head + tail[1:])                            # deletion
        one_away.update(head + ch + tail[1:] for ch in letters)  # replacement

        # A transposition needs two characters after the split point.
        if len(tail) > 1:
            one_away.add(head + tail[1] + tail[0] + tail[2:])

    return one_away


def edits2(word):
    """Return the set of all strings reachable by two successive edits of word.

    Each string produced by edits1(word) is itself passed through edits1(),
    and all of those results are merged into a single set.
    """
    two_away = set()
    for once_edited in edits1(word):
        two_away.update(edits1(once_edited))
    return two_away

In [9]:
# 'coastal' is not a key of counts, so only 'vancouver' survives the filter.
known(['vancouver', 'coastal'])

{'vancouver'}

In [10]:
# Check that known() hands back a set even when given a list.
type(known(['vancouver', 'coastal']))

set

In [11]:
# A 9-letter word has 10 split points, from before the first letter to after the last.
splits('vancouver')

[('', 'vancouver'),
 ('v', 'ancouver'),
 ('va', 'ncouver'),
 ('van', 'couver'),
 ('vanc', 'ouver'),
 ('vanco', 'uver'),
 ('vancou', 'ver'),
 ('vancouv', 'er'),
 ('vancouve', 'r'),
 ('vancouver', '')]

In [12]:
# Number of distinct strings generated by a single edit of 'vancouver'.
len(edits1('vancouver'))

494

In [13]:
# How many of those single-edit strings are actual corpus words.
len(known(edits1('vancouver')))

1

In [14]:
# Number of distinct strings generated by two successive edits; far larger than for one edit.
len(edits2('vancouver'))

114324

## Final function to correct an input word 

In [15]:
def correct(word):
    """Find the best spelling correction for this word.

    Candidates are the corpus words at the smallest edit distance that has
    any: distance 0 (word itself is known), then 1, then 2.  The ``or``
    chain short-circuits, so the expensive edits2() expansion only runs when
    the closer distances produce nothing.  If no corpus word is found within
    two edits, word is returned unchanged.

    Among the candidates, the one with the highest corpus count wins.  Ties
    on count go to the alphabetically first candidate, so the answer is the
    same on every run.
    """
    candidates = (known(edits0(word))
                  or known(edits1(word))
                  or known(edits2(word))
                  or [word])
    # max() returns the first maximal item it sees.  Iterating a set directly
    # would make tie-breaking depend on set order, which varies between runs
    # for strings, so fix the order by sorting first.
    return max(sorted(candidates), key=counts.get)

In [16]:
# 'mancouver' is one replacement (m -> v) away from the corpus word 'vancouver'.
correct('mancouver')

'vancouver'

In [17]:
# Run correct() over a mix of inputs: a word with no corpus word within two
# edits, a capitalised corpus word, and several misspellings of corpus words.
for i in map(correct, ['washington', 'Vancouver', 'Vacouver', 'vacomver', 'rickmond', 'freb']):
    print(i)

washington
vancouver
vancouver
vancouver
richmond
free
