# Advanced Canonicalisation Algorithms

## Soundex Algorithm

In [1]:
def get_soundex(token):
    """Return the 4-character American Soundex code for ``token``.

    The code is the first letter of ``token`` (upper-cased) followed by up to
    three digits encoding the consonants after it, right-padded with "0".

    Rules applied:
      * Consonants map to digits: BFPV=1, CGJKQSXZ=2, DT=3, L=4, MN=5, R=6.
      * Adjacent letters sharing a digit are encoded once. This includes the
        first letter, so "Pfister" is P236, not P123.
      * 'H' and 'W' are skipped and do NOT separate equal digits, so
        "Ashcraft" is A261.
      * Vowels and 'Y' are not encoded but DO separate equal digits, so
        "Tymczak" is T522.
      * Any other character (space, digit, punctuation) is ignored.

    Args:
        token: The word or name to encode.

    Returns:
        The Soundex code, e.g. "R163", or "" if ``token`` contains no letters.
    """
    groups = {"BFPV": "1", "CGJKQSXZ": "2", "DT": "3", "L": "4", "MN": "5", "R": "6"}
    codes = {letter: digit for letters, digit in groups.items() for letter in letters}
    separators = "AEIOUY"

    token = token.upper()

    # The code always starts with the first letter; skip leading non-letters
    # and bail out early when there is nothing to encode.
    start = next((i for i, char in enumerate(token) if char.isalpha()), None)
    if start is None:
        return ""
    first = token[start]

    digits = []
    # Seed with the first letter's own code so an immediately following letter
    # from the same group is not encoded a second time.
    previous = codes.get(first)
    for char in token[start + 1:]:
        if char in separators:
            # A vowel between two same-coded consonants lets both be encoded.
            previous = None
            continue
        code = codes.get(char)
        if code is None:
            # 'H', 'W' and non-letters are transparent: they neither add a
            # digit nor reset the duplicate check.
            continue
        if code != previous:
            digits.append(code)
            if len(digits) == 3:
                break
        previous = code

    # pad to make soundex a 4-character code
    return (first + "".join(digits)).ljust(4, "0")

In [6]:
# Encode each spelling variant and print one Soundex code per line.
for spelling in ("Bangalore", "Bengaluru", "benglore"):
    print(get_soundex(spelling))

B524
B524
B524


In [5]:
# Encode each spelling variant and print one Soundex code per line.
for spelling in ("Rachet", "Rachit", "Racheet"):
    print(get_soundex(spelling))

R230
R230
R230


In [14]:
# Two-word inputs: each full string is passed to get_soundex as a single token.
for spelling in ("Kaushik Ghatak", "kousik gatak", "kousik ghatak", "kouski ghatak"):
    print(get_soundex(spelling))

K223
K223
K223
K223


In [10]:
# Encode each spelling variant and print one Soundex code per line.
for spelling in ("Sandeep", "Sandip", "Sandep", "sanedep"):
    print(get_soundex(spelling))

S531
S531
S531
S531


## Levenshtein distance

In [15]:
from nltk.metrics.distance import edit_distance

In [16]:
# Called with default options; per the section heading this is the plain
# Levenshtein distance between the two strings.
edit_distance("apple", "appel")

2

## Damerau-Levenshtein Distance (will consider transpositions)

In [17]:
# transpositions=True makes NLTK count a swap of two adjacent characters
# ("le" -> "el") as a single edit, which is what distinguishes
# Damerau-Levenshtein from plain Levenshtein distance.
edit_distance("apple", "appel", transpositions=True)

1

In [18]:
# transpositions=True selects the Damerau-Levenshtein variant. These two words
# share only the letter 'e', so no transposition applies and the result equals
# the plain Levenshtein distance.
edit_distance("Damerau", "Levenshtein", transpositions=True)

10