In [1]:
import numpy as np
import pandas as pd
import re
from collections import Counter

In [2]:
file_name = '/content/shakespeare.txt'

## Part 1: Data preprocessing

- Read in a corpus
- Change everything to lowercase
- Return a list of words

In [7]:
def process_data(file_name):
  with open(file_name) as f:
    text = f.read()

  text = text.lower()
  words = re.findall(r'\w+', text)
  return words

In [8]:
words = process_data(file_name)
vocab = set(words)
print(f'The first ten words in the text are: \n', words[:10])
print(f'There are {len(vocab)} unique words in vocabulary.')

The first ten words in the text are: 
 ['o', 'for', 'a', 'muse', 'of', 'fire', 'that', 'would', 'ascend', 'the']
There are 6116 unique words in vocabulary.


### Building a dictionary of word frequencies

In [10]:
def get_count(words):
  """
  Input:
    words: a list of words representing the corpus
  Output:
    word_count_dict: a dictionary where key is the word and value is the frequency
  """
  word_count_dict = Counter(words)
  return word_count_dict

In [11]:
word_count_dict = get_count(words)
print(f'There are {len(word_count_dict)} key-value pairs')

There are 6116 key-value pairs


### Building a dictionary of probabilities of words.

That is compute the probability that each word will appear if randomly selected from the corpus of words.

$P(w) = \frac {freq(w)}{M}$
where M = total number of words in the corpus

In [12]:
def get_probs(word_count_dict):
  """
  Input:
    word_count_dict: The wordcount dictionary where key is word and value is its frequency
  Output:
    probs: A dictionary where key are the words and values are the probabilitiy that a word will occur
  """
  probs = {}
  M = len(words)
  for word, freq in word_count_dict.items():
    probs[word] = freq / M
  
  return probs

In [13]:
probs = get_probs(word_count_dict)
print(f'Length of probs is {len(probs)}')
print(f'P("the") is {probs["the"]:.4f}')

Length of probs is 6116
P("the") is 0.0284


## Part2: String manipulations

In [14]:
# function to delete a character in a string
def delete_letter(word, verbose=False):
  # returns a list of all possible words after deleting a letter 
  split_l = [(word[:i], word[i:]) for i in range(len(word) + 1)]
  delete_l = [L+R[1:] for L, R in split_l if R]
  return delete_l

delete_word_l = delete_letter('cans')
print(delete_word_l)

['ans', 'cns', 'cas', 'can']


In [15]:
# function to switch two adjacent letters. Returns all such possible words
def switch_letter(word):
  switch_l = []
  temp = word
  
  for i in range(len(word)-1):
      a = temp[i]
      b = temp[i+1]
      w = temp[:i] + b + a + temp[i+2:]
      switch_l.append(w)

  return switch_l

switch_word_l = switch_letter(word="abc")
print(switch_word_l) 

['bac', 'acb']


In [16]:
# funtion to replace a letter 
def replace_letter(word, verbose=False):
  '''
  Input:
      word: the input string/word 
  Output:
      replaces: a list of all possible strings where we replaced one letter from the original word. 
  ''' 
  
  letters = 'abcdefghijklmnopqrstuvwxyz'

  split_l = [(word[:i], word[i:]) for i in range(len(word))]
  
  replace_l = [L + c + R[1:] for L, R in split_l for c in letters if R and R[0] != c]
  
  replace_set = set(replace_l)

  replace_l = sorted(list(replace_set))
  
  return replace_l

replace_l = replace_letter(word='can')
print(replace_l)

['aan', 'ban', 'caa', 'cab', 'cac', 'cad', 'cae', 'caf', 'cag', 'cah', 'cai', 'caj', 'cak', 'cal', 'cam', 'cao', 'cap', 'caq', 'car', 'cas', 'cat', 'cau', 'cav', 'caw', 'cax', 'cay', 'caz', 'cbn', 'ccn', 'cdn', 'cen', 'cfn', 'cgn', 'chn', 'cin', 'cjn', 'ckn', 'cln', 'cmn', 'cnn', 'con', 'cpn', 'cqn', 'crn', 'csn', 'ctn', 'cun', 'cvn', 'cwn', 'cxn', 'cyn', 'czn', 'dan', 'ean', 'fan', 'gan', 'han', 'ian', 'jan', 'kan', 'lan', 'man', 'nan', 'oan', 'pan', 'qan', 'ran', 'san', 'tan', 'uan', 'van', 'wan', 'xan', 'yan', 'zan']


In [17]:
# function to insert a letter
def insert_letter(word, verbose=False):
  '''
  Input:
      word: the input string/word 
  Output:
      inserts: a set of all possible strings with one new letter inserted at every offset
  ''' 
  letters = 'abcdefghijklmnopqrstuvwxyz'
  insert_l = []
  split_l = []
  
  split_l = [(word[:i], word[i:]) for i in range(len(word) + 1)]
  
  insert_l = [L + c + R for L,R in split_l for c in letters]
  
  return insert_l

insert_l = insert_letter('at', True)
print(f"Number of strings output by insert_letter('at') is {len(insert_l)}")

Number of strings output by insert_letter('at') is 78


## Part3: Combining the edits

### Edit one letter

In [18]:
# function to get all possible edits that are one edit away from a word
def edit_one_letter(word, allow_switches = True):
  """
  Input:
      word: the string/word for which we will generate all possible wordsthat are one edit away.
  Output:
      edit_one_set: a set of words with one possible edit. Please return a set. and not a list.
  """
  
  edit_one_set = set()
  
  l1 = delete_letter(word)
  l2 = switch_letter(word)
  l3 = replace_letter(word)
  l4 = insert_letter(word)
  
  for ele in l1:
      edit_one_set.add(ele)
  for ele in l2:
      edit_one_set.add(ele)
  for ele in l3:
      edit_one_set.add(ele)
  for ele in l4:
      edit_one_set.add(ele)
  
  return edit_one_set

tmp_word = "at"
tmp_edit_one_set = edit_one_letter(tmp_word)
tmp_edit_one_l = sorted(list(tmp_edit_one_set))

print(f"input word {tmp_word} \nedit_one_l \n{tmp_edit_one_l}\n")
print(f"The type of the returned object should be a set {type(tmp_edit_one_set)}")
print(f"Number of outputs from edit_one_letter('at') is {len(edit_one_letter('at'))}")

input word at 
edit_one_l 
['a', 'aa', 'aat', 'ab', 'abt', 'ac', 'act', 'ad', 'adt', 'ae', 'aet', 'af', 'aft', 'ag', 'agt', 'ah', 'aht', 'ai', 'ait', 'aj', 'ajt', 'ak', 'akt', 'al', 'alt', 'am', 'amt', 'an', 'ant', 'ao', 'aot', 'ap', 'apt', 'aq', 'aqt', 'ar', 'art', 'as', 'ast', 'ata', 'atb', 'atc', 'atd', 'ate', 'atf', 'atg', 'ath', 'ati', 'atj', 'atk', 'atl', 'atm', 'atn', 'ato', 'atp', 'atq', 'atr', 'ats', 'att', 'atu', 'atv', 'atw', 'atx', 'aty', 'atz', 'au', 'aut', 'av', 'avt', 'aw', 'awt', 'ax', 'axt', 'ay', 'ayt', 'az', 'azt', 'bat', 'bt', 'cat', 'ct', 'dat', 'dt', 'eat', 'et', 'fat', 'ft', 'gat', 'gt', 'hat', 'ht', 'iat', 'it', 'jat', 'jt', 'kat', 'kt', 'lat', 'lt', 'mat', 'mt', 'nat', 'nt', 'oat', 'ot', 'pat', 'pt', 'qat', 'qt', 'rat', 'rt', 'sat', 'st', 't', 'ta', 'tat', 'tt', 'uat', 'ut', 'vat', 'vt', 'wat', 'wt', 'xat', 'xt', 'yat', 'yt', 'zat', 'zt']

The type of the returned object should be a set <class 'set'>
Number of outputs from edit_one_letter('at') is 129


In [19]:
def edit_two_letters(word, allow_switches = True):
  '''
  Input:
      word: the input string/word 
  Output:
      edit_two_set: a set of strings with all possible two edits
  '''
  
  edit_two_set = set()
    
  e1 = list(edit_one_letter(word))
  
  for w in e1:
      w_l = list(edit_one_letter(w))
      for ww in w_l:
          edit_two_set.add(ww)
  
  return edit_two_set

tmp_edit_two_set = edit_two_letters("abc")
tmp_edit_two_l = sorted(list(tmp_edit_two_set))
print(f"Number of strings with edit distance of two: {len(tmp_edit_two_l)}")
print(f"First 10 strings {tmp_edit_two_l[:10]}")
print(f"Last 10 strings {tmp_edit_two_l[-10:]}")
print(f"The data type of the returned object should be a set {type(tmp_edit_two_set)}")
print(f"Number of strings that are 2 edit distances from 'at' is {len(edit_two_letters('abc'))}")

Number of strings with edit distance of two: 14352
First 10 strings ['a', 'aa', 'aaa', 'aaabc', 'aaac', 'aab', 'aaba', 'aabac', 'aabb', 'aabbc']
Last 10 strings ['zwc', 'zxabc', 'zxbc', 'zxc', 'zyabc', 'zybc', 'zyc', 'zzabc', 'zzbc', 'zzc']
The data type of the returned object should be a set <class 'set'>
Number of strings that are 2 edit distances from 'at' is 14352


## Part4: Suggest spelling suggestions

In [20]:
def get_corrections(word, probs, vocab, n=2, verbose = False):
  '''
  Input: 
      word: a user entered string to check for suggestions
      probs: a dictionary that maps each word to its probability in the corpus
      vocab: a set containing all the vocabulary
      n: number of possible word corrections you want returned in the dictionary
  Output: 
      n_best: a list of tuples with the most probable n corrected words and their probabilities.
  '''

  suggestions = []
  n_best = []
  
  if word in vocab:
      n_best.append((word, probs[word]))
      return n_best
  
  e1 = edit_one_letter(word)
  for w in e1:
      if w in vocab:
          suggestions.append(w)
  
  if len(e1) == 0:
      e2 = edit_two_letters(word)
      for w in e1:
          if w in vocab:
              suggestions.append(w)

  
  for w in suggestions:
      n_best.append((w, probs[w]))
  
  n_best.sort(key = lambda x : x[1], reverse=True)

  if verbose: print("entered word = ", word, "\nsuggestions = ", suggestions)

  return n_best

In [22]:
my_word = 'hors' 
tmp_corrections = get_corrections(my_word, probs, vocab, 2, verbose=True)
for i, word_prob in enumerate(tmp_corrections):
    print(f"word {i}: {word_prob[0]}, probability {word_prob[1]:.6f}")

print(f"data type of corrections {type(tmp_corrections)}")

entered word =  hors 
suggestions =  ['horns', 'horn', 'hers', 'hours', 'horse']
word 0: hours, probability 0.000317
word 1: horse, probability 0.000224
word 2: hers, probability 0.000168
word 3: horns, probability 0.000037
word 4: horn, probability 0.000019
data type of corrections <class 'list'>
