In [1]:
import pandas as pd
import numpy as np
import re

corpus = pd.read_csv("/Users/piyushmundhra/Desktop/cs173/words/en-basic", sep='\n', header=None)
corpus[1] = 0
corpus.columns = ['words', 'distance']
print(list(corpus))

['words', 'distance']


In [2]:
def tokenize(string):
    regex = r'\b[a-zA-Z]*\b'
    words = pd.Series(re.findall(regex, string))
    words = words[words != '']
    return words

In [3]:
print(tokenize("I, am a human;fsas,/e"))

0         I
2        am
4         a
6     human
8      fsas
10        e
dtype: object


In [4]:
# Included a helper function to ease debugging
def editDistanceHelper(word1, word2):

    word1 = " " + word1
    word2 = " " + word2

    # Setting up edit distance matrix and inputting given values (constructing from null string)
    rows = []
    rows[:] = word1
    cols = []
    cols[:] = word2

    matrix = pd.DataFrame(np.zeros((len(word1), len(word2))), columns=cols, index=rows)
    for i in range (0, len(word1)): 
        matrix.iloc[i,0] = i
    for i in range (0, len(word2)):
        matrix.iloc[0,i] = i


    # Computing the rest of the values based on the assumption that editing, deleting, and inserting a character all have the same weight/cost
    for i in range (1, len(word1)):
        for j in range (1, len(word2)):
            matrix.iloc[i,j] = min(matrix.iloc[i-1,j-1], matrix.iloc[i-1,j], matrix.iloc[i,j-1])
            if(word1[i] != word2[j]):
                matrix.iloc[i,j] = matrix.iloc[i,j] + 1
    return matrix

# Will return integer edit distance value
def editDistance(word1, word2):
    return editDistanceHelper(word1, word2).iloc[len(word1), len(word2)]

In [5]:
def closestWords(word, corpus):
    a = len(word) + 1
    b = len(word) - 1
    temp = corpus.loc[(corpus.words.str.len() <= a) & (corpus.words.str.len() >= b)]
    for c in range (0, temp.shape[0]):
        temp.iloc[c,1] = editDistance(temp.iloc[c,0], word)
    return temp

In [6]:
def suggestWords(word, corpus, num):
    temp = closestWords(word, corpus)
    temp = temp.sort_values(by='distance', ascending=True)
    return temp.iloc[0:num,:]

In [7]:
def spellcheck(sentence, corpus):
    for w in tokenize(sentence):
        if(not(w in corpus['words'].unique())): 
            print('Suggestions for ', w, ': ', end=' ')
            close = suggestWords(w, corpus, 3)
            for cw in close['words']:
                print(cw, end= ', ')
            print()

In [8]:
spellcheck('I table all, ;amoyt ablr smooth criminal drink water', corpus)

Suggestions for  amoyt :  about, almost, among, 
Suggestions for  ablr :  able, all, air, 
Suggestions for  criminal :  chemical, writing, driving, 
