# Part 0: Imports

In [139]:
import re
from collections import Counter
import numpy as np
import pandas as pd

import w1_unittest

In [140]:
# Load the entire corpus file into `data` as one string.
words = []  # initialised empty; nothing in this cell adds to it

# The context manager closes the file handle once the read finishes.
with open('./data/shakespeare.txt',"r") as file:
    data = file.read()

# Part 1: Data Preprocessing

In [141]:
def process_data(file_name):
    """Read a text file and return its lower-cased word tokens.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list of strings, in file order, one per maximal run of word
        characters (``\\w``: letters, digits and underscore) found in the
        lower-cased file content. Duplicates are kept.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used, exactly as before; confirm this matches the corpus encoding.
    with open(file_name, "r") as file:
        data = file.read()

    # Raw string: '\w' in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    return re.findall(r'\w+', data.lower())

In [142]:
# Tokenise the corpus, then keep the distinct tokens as the vocabulary.
word_l = process_data('./data/shakespeare.txt')
vocab = set(word_l)  # set() drops repeated tokens

In [143]:
def get_count(word_l):
    """Count how many times each word occurs.

    Args:
        word_l: Iterable of hashable items (words).

    Returns:
        A ``collections.Counter`` mapping each distinct word to its number
        of occurrences in ``word_l``.
    """
    tally = Counter()
    for token in word_l:
        tally[token] += 1
    return tally

In [144]:
# Occurrence count of every token in the corpus.
word_count_dict = get_count(word_l)

In [145]:
def get_probs(word_count_dict):
    """Convert word counts into relative frequencies.

    Args:
        word_count_dict: Mapping of word -> occurrence count.

    Returns:
        A dict mapping each word to its count divided by the sum of all
        counts. An empty mapping yields an empty dict.
    """
    total = sum(word_count_dict.values())
    return {token: count / total for token, count in word_count_dict.items()}

In [146]:
# Relative frequency of every token, derived from the counts.
probs = get_probs(word_count_dict)

# Part 2: String Manipulations

In [147]:
def delete_letter(word):
    """Return every string obtained by removing one character from ``word``.

    Args:
        word: Input string.

    Returns:
        A list with ``len(word)`` entries; entry ``k`` is ``word`` without
        its character at index ``k``. Duplicates are kept.
    """
    return [word[:pos] + word[pos + 1:] for pos in range(len(word))]


In [148]:
def switch_letter(word):
    """Return every string obtained by swapping two adjacent characters.

    Args:
        word: Input string.

    Returns:
        A list with one entry per adjacent pair, ordered left to right;
        entry ``k`` is ``word`` with characters ``k`` and ``k + 1``
        exchanged. Words shorter than two characters yield an empty list.
    """
    swapped = []
    for pos in range(len(word) - 1):
        head, tail = word[:pos], word[pos + 2:]
        swapped.append(head + word[pos + 1] + word[pos] + tail)
    return swapped

In [149]:
def replace_letter(word):
    """Return every string obtained by replacing one character of ``word``.

    Each position is replaced in turn by each lowercase letter a-z.

    Args:
        word: Input string.

    Returns:
        A sorted list of the distinct resulting strings, excluding ``word``
        itself. An empty ``word`` yields an empty list.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'

    replace_set = {
        word[:pos] + letter + word[pos + 1:]
        for pos in range(len(word))
        for letter in letters
    }

    # discard(), not remove(): the original word is only regenerated when it
    # contains at least one lowercase a-z character, so for '' or e.g. '12'
    # it is absent from the set and remove() would raise KeyError.
    replace_set.discard(word)

    return sorted(replace_set)

In [150]:
def insert_letter(word, verbose=False):
    """Return every string obtained by inserting one letter into ``word``.

    Args:
        word: Input string.
        verbose: When true, print the input word, the list of split points
            and the resulting list.

    Returns:
        A list of ``26 * (len(word) + 1)`` strings: for each insertion point
        from left to right, each lowercase letter a-z inserted there.
        Duplicates are kept.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'

    # Every (prefix, suffix) pair, including the empty prefix and empty suffix.
    split_l = [(word[:cut], word[cut:]) for cut in range(len(word) + 1)]
    insert_l = [head + ch + tail for head, tail in split_l for ch in letters]

    if verbose:
        print(f"Input word {word} \nsplit_l = {split_l} \ninsert_l = {insert_l}")

    return insert_l


# Part 3: Combining the edits

## 3.1. Edit one letter

In [151]:
def edit_one_letter(word, allow_switches = True):
    """Collect all strings that are one edit away from ``word``.

    The edits are those produced by ``delete_letter``, ``replace_letter``,
    ``insert_letter`` and, optionally, ``switch_letter``.

    Args:
        word: Input string.
        allow_switches: When true, results of ``switch_letter`` are included.

    Returns:
        A new set containing the union of the individual edit results.
    """
    candidates = set(delete_letter(word))
    if allow_switches:
        candidates |= set(switch_letter(word))
    candidates |= set(replace_letter(word))
    candidates |= set(insert_letter(word))
    return candidates

## 3.2. Edit two letters

In [152]:
def edit_two_letters(word, allow_switches = True):
    """Collect all strings that are two edits away from ``word``.

    Applies ``edit_one_letter`` to ``word`` and then again to every
    non-empty result of that first pass.

    Args:
        word: Input string.
        allow_switches: Forwarded to both ``edit_one_letter`` passes.

    Returns:
        A new set with the union of all second-pass results.
    """
    first_pass = edit_one_letter(word, allow_switches=allow_switches)
    # Empty intermediates (a one-letter word with its letter deleted) are skipped.
    return set().union(*(
        edit_one_letter(intermediate, allow_switches=allow_switches)
        for intermediate in first_pass
        if intermediate
    ))

## 3.3. Suggest spelling corrections

In [183]:
def get_corrections(word, probs, vocab, n=2):
    """Suggest the ``n`` most probable corrections for ``word``.

    Candidates are chosen by the first rule that yields anything:
      1. ``word`` itself, if it is in ``vocab``;
      2. vocabulary words one edit away (``edit_one_letter``);
      3. vocabulary words two edits away (``edit_two_letters``).

    Args:
        word: The string to correct.
        probs: Mapping of word -> probability; words missing from it are
            scored 0.
        vocab: Collection of known words.
        n: Maximum number of suggestions to return.

    Returns:
        A list of at most ``n`` ``(word, probability)`` tuples, ordered by
        probability from highest to lowest; ties are broken alphabetically
        so the result is deterministic. Empty if no candidate is found.
    """
    if word in vocab:
        # Wrap the word in a set: list(word) would split it into characters.
        suggestions = {word}
    else:
        suggestions = (edit_one_letter(word).intersection(vocab)
                       or edit_two_letters(word).intersection(vocab))

    scored = [(candidate, probs.get(candidate, 0)) for candidate in suggestions]

    # Rank by probability (descending), not alphabetically by word.
    scored.sort(key=lambda pair: (-pair[1], pair[0]))

    # max() keeps a non-positive n returning an empty list.
    return scored[:max(n, 0)]


# Part 4: Minimum Edit distance

In [188]:
def min_edit_distance(source, target, ins_cost = 1, del_cost = 1, rep_cost = 2):
    """Compute the minimum edit distance between two strings.

    Args:
        source: Starting string.
        target: String to transform ``source`` into.
        ins_cost: Cost of inserting one character.
        del_cost: Cost of deleting one character.
        rep_cost: Cost of replacing one character with a different one
            (replacing a character with itself costs 0).

    Returns:
        A tuple ``(D, med)`` where ``D`` is an integer NumPy array of shape
        ``(len(source) + 1, len(target) + 1)`` whose entry ``[i, j]`` is the
        cost of turning ``source[:i]`` into ``target[:j]``, and ``med`` is
        the bottom-right entry of ``D``.
    """
    n_rows = len(source) + 1
    n_cols = len(target) + 1
    D = np.zeros((n_rows, n_cols), dtype=int)

    # First column: reduce a source prefix to the empty string by deletions.
    for i in range(1, n_rows):
        D[i, 0] = D[i - 1, 0] + del_cost

    # First row: build a target prefix from the empty string by insertions.
    for j in range(1, n_cols):
        D[0, j] = D[0, j - 1] + ins_cost

    # Each remaining cell takes the cheapest of delete / insert / replace.
    for i, src_char in enumerate(source, start=1):
        for j, tgt_char in enumerate(target, start=1):
            substitution = 0 if src_char == tgt_char else rep_cost
            D[i, j] = min(
                D[i - 1, j] + del_cost,
                D[i, j - 1] + ins_cost,
                D[i - 1, j - 1] + substitution,
            )

    return D, D[n_rows - 1, n_cols - 1]
