In [89]:
import jellyfish
import pandas as pd

In [93]:
nombre_correcto = 'MAYRA moRAtaYA'
nombres_incorrectos = ['maira morataya', 'MAira moratalla', 'mayra morataya', 'mayra moratall', 'mayr moataya', 'mayramorataya', 'ma morataya', 'mayar moarat']

# String comparison
## Algoritmo: Jaro Distance
Compute the Jaro distance between s1 and s2.

Jaro distance is a string-edit distance that gives a floating point response in [0,1] where 0 represents two completely dissimilar strings and 1 represents identical strings.

In [55]:
print(f'Nombre a comparar: {nombre_correcto.upper()}')
print('Resultados:')
for nombre_incorrecto in nombres_incorrectos:
    results = jellyfish.jaro_distance(nombre_correcto.upper(), nombre_incorrecto.upper())
    print(f'{nombre_incorrecto.upper()}: {round(results * 100, 2)}%')

Nombre a comparar: MAYRA MORATAYA
Resultados:
MAIRA MORATALLA: 88.57%
MAYRA MORATAYA: 100.0%
MAYRA MORATALL: 90.48%
MAYR MOATAYA: 84.13%
MAYRAMORATAYA: 97.62%
MA MORATAYA: 80.74%
MAYAR MOARAT: 86.9%


## Jaro-Winkler Distance
Compute the Jaro-Winkler distance between s1 and s2.

Jaro-Winkler is a modification/improvement to Jaro distance, like Jaro it gives a floating point response in [0,1] where 0 represents two completely dissimilar strings and 1 represents identical strings.

See the Jaro-Winkler distance article at Wikipedia for more details.

In [87]:
print(f'Nombre a comparar: {nombre_correcto.upper()}')
print('Resultados:')
for nombre_incorrecto in nombres_incorrectos:
    results = jellyfish.jaro_winkler(nombre_correcto.upper(), nombre_incorrecto.upper())
    print(f'{nombre_incorrecto.upper()}: {round(results * 100, 2)}%')

Nombre a comparar: MAYRA MORATAYA
Resultados:
MAIRA MORATAYA: 96.19%
MAIRA MORATALLA: 90.86%
MAYRA MORATAYA: 100.0%
MAYRA MORATALL: 94.29%
MAYR MOATAYA: 90.48%
MAYRAMORATAYA: 98.57%
MA MORATAYA: 84.59%
MAYAR MOARAT: 90.83%


## Levenshtein Distance
Compute the Levenshtein distance between s1 and s2.

Levenshtein distance represents the number of insertions, deletions, and substitutions required to change one word to another.

For example: levenshtein_distance('berne', 'born') == 2 representing the transformation of the first e to o and the deletion of the second e.

See the Levenshtein distance article at Wikipedia for more details.

In [63]:
print(f'Nombre a comparar: {nombre_correcto.upper()}')
print('Resultados:')
for nombre_incorrecto in nombres_incorrectos:
    results = jellyfish.levenshtein_distance(nombre_correcto.upper(), nombre_incorrecto.upper())
    print(f'{nombre_incorrecto.upper()}: {round(results * 1, 2)} Letras incorrectas')

Nombre a comparar: MAYRA MORATAYA
Resultados:
MAIRA MORATAYA: 1 Letras incorrectas
MAIRA MORATALLA: 3 Letras incorrectas
MAYRA MORATAYA: 0 Letras incorrectas
MAYRA MORATALL: 2 Letras incorrectas
MAYR MOATAYA: 2 Letras incorrectas
MAYRAMORATAYA: 1 Letras incorrectas
MA MORATAYA: 3 Letras incorrectas
MAYAR MOARAT: 6 Letras incorrectas


## Damerau-Levenshtein Distance
Compute the Damerau-Levenshtein distance between s1 and s2.

A modification of Levenshtein distance, Damerau-Levenshtein distance counts transpositions (such as ifhs for fish) as a single edit.

Where levenshtein_distance('fish', 'ifsh') == 2 as it would require a deletion and an insertion, though damerau_levenshtein_distance('fish', 'ifsh') == 1 as this counts as a transposition.

See the Damerau-Levenshtein distance article at Wikipedia for more details.

In [64]:
print(f'Nombre a comparar: {nombre_correcto.upper()}')
print('Resultados:')
for nombre_incorrecto in nombres_incorrectos:
    results = jellyfish.damerau_levenshtein_distance(nombre_correcto.upper(), nombre_incorrecto.upper())
    print(f'{nombre_incorrecto.upper()}: {round(results * 1, 2)} Letras incorrectas')

Nombre a comparar: MAYRA MORATAYA
Resultados:
MAIRA MORATAYA: 1 Letras incorrectas
MAIRA MORATALLA: 3 Letras incorrectas
MAYRA MORATAYA: 0 Letras incorrectas
MAYRA MORATALL: 2 Letras incorrectas
MAYR MOATAYA: 2 Letras incorrectas
MAYRAMORATAYA: 1 Letras incorrectas
MA MORATAYA: 3 Letras incorrectas
MAYAR MOARAT: 5 Letras incorrectas


## Match Rating Approach Comparison
Compare s1 and s2 using the match rating approach algorithm, returns True if strings are considered equivalent or False if not. Can also return None if s1 and s2 are not comparable (length differs by more than 3).

The Match rating approach algorithm is an algorithm for determining whether or not two names are pronounced similarly. Strings are first encoded using match_rating_codex() then compared according to the MRA algorithm.

See the Match Rating Approach article at Wikipedia for more details.

In [78]:
print(f'Nombre a comparar: {nombre_correcto.upper()}')
print('Resultados:')
for nombre_incorrecto in nombres_incorrectos:
    results = jellyfish.match_rating_comparison(nombre_correcto.upper(), nombre_incorrecto.upper())
    print(f'{nombre_incorrecto.upper()}: {round(results * 1, 2)} Letras incorrectas')

Nombre a comparar: MAYRA MORATAYA
Resultados:
MAIRA MORATAYA: 1 Letras incorrectas
MAIRA MORATALLA: 1 Letras incorrectas
MAYRA MORATAYA: 1 Letras incorrectas
MAYRA MORATALL: 1 Letras incorrectas
MAYR MOATAYA: 1 Letras incorrectas
MAYRAMORATAYA: 1 Letras incorrectas
MA MORATAYA: 1 Letras incorrectas
MAYAR MOARAT: 1 Letras incorrectas


## Hamming Distance
Compute the Hamming distance between s1 and s2.

Hamming distance is the measure of the number of characters that differ between two strings.

Typically Hamming distance is undefined when strings are of different length, but this implementation considers extra characters as differing. For example hamming_distance('abc', 'abcd') == 1.

See the Hamming distance article at Wikipedia for more details.

In [72]:
print(f'Nombre a comparar: {nombre_correcto.upper()}')
print('Resultados:')
for nombre_incorrecto in nombres_incorrectos:
    results = jellyfish.hamming_distance(nombre_correcto.upper(), nombre_incorrecto.upper())
    print(f'{nombre_incorrecto.upper()}: {round(results * 1, 2)} Letras incorrectas')

Nombre a comparar: MAYRA MORATAYA
Resultados:
MAIRA MORATAYA: 1 Letras incorrectas
MAIRA MORATALLA: 4 Letras incorrectas
MAYRA MORATAYA: 0 Letras incorrectas
MAYRA MORATALL: 2 Letras incorrectas
MAYR MOATAYA: 8 Letras incorrectas
MAYRAMORATAYA: 9 Letras incorrectas
MA MORATAYA: 12 Letras incorrectas
MAYAR MOARAT: 8 Letras incorrectas


# Phonetic encoding:
## American Soundex
Calculate the American Soundex of the string s.

Soundex is an algorithm to convert a word (typically a name) to a four digit code in the form 'A123' where 'A' is the first letter of the name and the digits represent similar sounds.

For example soundex('Ann') == soundex('Anne') == 'A500' and soundex('Rupert') == soundex('Robert') == 'R163'.

See the Soundex article at Wikipedia for more details.

In [82]:
print(f'Nombre a comparar: {nombre_correcto.upper()}')
print('Resultados:')
for nombre_incorrecto in nombres_incorrectos:
    results = jellyfish.soundex(nombre_correcto)
    results2 = jellyfish.soundex(nombre_incorrecto)
    es_igual = results == results2
    print(f'{nombre_incorrecto.upper()}: {results} = {results2} -> {es_igual}')

Nombre a comparar: MAYRA MORATAYA
Resultados:
MAIRA MORATAYA: M656 = M656 -> True
MAIRA MORATALLA: M656 = M656 -> True
MAYRA MORATAYA: M656 = M656 -> True
MAYRA MORATALL: M656 = M656 -> True
MAYR MOATAYA: M656 = M653 -> False
MAYRAMORATAYA: M656 = M656 -> True
MA MORATAYA: M656 = M563 -> False
MAYAR MOARAT: M656 = M656 -> True


## Metaphone
Calculate the metaphone code for the string s.

The metaphone algorithm was designed as an improvement on Soundex. It transforms a word into a string consisting of '0BFHJKLMNPRSTWXY' where '0' is pronounced 'th' and 'X' is a '[sc]h' sound.

For example metaphone('Klumpz') == metaphone('Clumps') == 'KLMPS'.

See the Metaphone article at Wikipedia for more details.

In [81]:
print(f'Nombre a comparar: {nombre_correcto.upper()}')
print('Resultados:')
for nombre_incorrecto in nombres_incorrectos:
    results = jellyfish.metaphone(nombre_correcto)
    results2 = jellyfish.metaphone(nombre_incorrecto)
    es_igual = results == results2
    print(f'{nombre_incorrecto.upper()}: {results} = {results2} -> {es_igual}')

Nombre a comparar: MAYRA MORATAYA
Resultados:
MAIRA MORATAYA: MR MRTY = MR MRTY -> True
MAIRA MORATALLA: MR MRTY = MR MRTL -> False
MAYRA MORATAYA: MR MRTY = MR MRTY -> True
MAYRA MORATALL: MR MRTY = MR MRTL -> False
MAYR MOATAYA: MR MRTY = MR MTY -> False
MAYRAMORATAYA: MR MRTY = MRMRTY -> False
MA MORATAYA: MR MRTY = M MRTY -> False
MAYAR MOARAT: MR MRTY = MYR MRT -> False


## NYSIIS (New York State Identification and Intelligence System)
Calculate the NYSIIS code for the string s.

The NYSIIS algorithm is an algorithm developed by the New York State Identification and Intelligence System. It transforms a word into a phonetic code. Like soundex and metaphone it is primarily intended for use on names (as they would be pronounced in English).

For example nysiis('John') == nysiis('Jan') == JAN.

See the NYSIIS article at Wikipedia for more details.

In [80]:
print(f'Nombre a comparar: {nombre_correcto.upper()}')
print('Resultados:')
for nombre_incorrecto in nombres_incorrectos:
    results = jellyfish.nysiis(nombre_correcto)
    results2 = jellyfish.nysiis(nombre_incorrecto)
    es_igual = results == results2
    print(f'{nombre_incorrecto.upper()}: {results} = {results2} -> {es_igual}')

Nombre a comparar: MAYRA MORATAYA
Resultados:
MAIRA MORATAYA: MAYRA NARATAY = MARA NARATAY -> False
MAIRA MORATALLA: MAYRA NARATAY = MARA NARATAL -> False
MAYRA MORATAYA: MAYRA NARATAY = MAYRA NARATAY -> True
MAYRA MORATALL: MAYRA NARATAY = MAYRA NARATAL -> False
MAYR MOATAYA: MAYRA NARATAY = MAYR NATAY -> False
MAYRAMORATAYA: MAYRA NARATAY = MAYRANARATAY -> False
MA MORATAYA: MAYRA NARATAY = MA NARATAY -> False
MAYAR MOARAT: MAYRA NARATAY = MAYAR NARAT -> False


## Match Rating Codex
Calculate the match rating approach value (also called PNI) for the string s.

The Match rating approach algorithm is an algorithm for determining whether or not two names are pronounced similarly. The algorithm consists of an encoding function (similar to soundex or nysiis) which is implemented here as well as match_rating_comparison() which does the actual comparison.

See the Match Rating Approach article at Wikipedia for more details.

In [79]:
print(f'Nombre a comparar: {nombre_correcto.upper()}')
print('Resultados:')
for nombre_incorrecto in nombres_incorrectos:
    results = jellyfish.match_rating_codex(nombre_correcto)
    results2 = jellyfish.match_rating_codex(nombre_incorrecto)
    es_igual = results == results2
    print(f'{nombre_incorrecto.upper()}: {results} = {results2} -> {es_igual}')

Nombre a comparar: MAYRA MORATAYA
Resultados:
MAIRA MORATAYA: MYRRTY = MR RTY -> False
MAIRA MORATALLA: MYRRTY = MR RTL -> False
MAYRA MORATAYA: MYRRTY = MYRRTY -> True
MAYRA MORATALL: MYRRTY = MYRRTL -> False
MAYR MOATAYA: MYRRTY = MYRMTY -> False
MAYRAMORATAYA: MYRRTY = MYRRTY -> True
MA MORATAYA: MYRRTY = M MRTY -> False
MAYAR MOARAT: MYRRTY = MYRMRT -> False


## Porter Stemmer
Reduce the string s to its stem using the common Porter stemmer.

Stemming is the process of reducing a word to its root form, for example 'stemmed' to 'stem'.

Martin Porter's algorithm is a common algorithm used for stemming that works for many purposes.

See the official homepage for the Porter Stemming Algorithm for more details.

In [84]:
print(f'Nombre a comparar: {nombre_correcto.upper()}')
print('Resultados:')
for nombre_incorrecto in nombres_incorrectos:
    results = jellyfish.porter_stem(nombre_correcto.upper())
    results2 = jellyfish.porter_stem(nombre_incorrecto.upper())
    es_igual = results == results2
    print(f'{nombre_incorrecto.upper()}: {results} = {results2} -> {es_igual}')

Nombre a comparar: MAYRA MORATAYA
Resultados:
MAIRA MORATAYA: MAYRA MORATAYA = MAIRA MORATAYA -> False
MAIRA MORATALLA: MAYRA MORATAYA = MAIRA MORATALLA -> False
MAYRA MORATAYA: MAYRA MORATAYA = MAYRA MORATAYA -> True
MAYRA MORATALL: MAYRA MORATAYA = MAYRA MORATALL -> False
MAYR MOATAYA: MAYRA MORATAYA = MAYR MOATAYA -> False
MAYRAMORATAYA: MAYRA MORATAYA = MAYRAMORATAYA -> False
MA MORATAYA: MAYRA MORATAYA = MA MORATAYA -> False
MAYAR MOARAT: MAYRA MORATAYA = MAYAR MOARAT -> False


# COMPARACION CON EXCEL
## Cargar Excel de Input y genera Excel de Output

In [134]:
def validar_algoritmos(nombre_correcto, nombre_incorrecto):
    nombre_correcto = nombre_correcto.upper()
    nombre_incorrecto = nombre_incorrecto.upper()
    
    jaro_distance = str(round(jellyfish.jaro_distance(nombre_correcto, nombre_incorrecto) * 100, 2)) + '%'
    jaro_winkler = str(round(jellyfish.jaro_winkler(nombre_correcto, nombre_incorrecto) * 100, 2)) + '%'
    levenshtein_distance = str(jellyfish.levenshtein_distance(nombre_correcto, nombre_incorrecto)) + ' Caracteres Distintos'
    damerau_levenshtein_distance = str(jellyfish.damerau_levenshtein_distance(nombre_correcto, nombre_incorrecto)) + ' Caracteres Distintos'
    match_rating_comparison = jellyfish.match_rating_comparison(nombre_correcto, nombre_incorrecto)
    hamming_distance = str(jellyfish.hamming_distance(nombre_correcto, nombre_incorrecto)) + ' Caracteres Distintos'

    soundex = jellyfish.soundex(nombre_correcto) == jellyfish.soundex(nombre_incorrecto)
    metaphone = jellyfish.metaphone(nombre_correcto) == jellyfish.metaphone(nombre_incorrecto)
    match_rating_codex = jellyfish.match_rating_codex(nombre_correcto) == jellyfish.match_rating_codex(nombre_incorrecto)
    nysiis = jellyfish.nysiis(nombre_correcto) == jellyfish.nysiis(nombre_incorrecto)
    porter_stem = jellyfish.porter_stem(nombre_correcto.upper()) == jellyfish.porter_stem(nombre_incorrecto.upper())

    datos = {'NombreCorrecto': nombre_correcto, 'NombreComparacion': nombre_incorrecto, 'jaro_distance': jaro_distance, 'jaro_winkler': jaro_winkler, 'levenshtein_distance': levenshtein_distance, 'damerau_levenshtein_distance': damerau_levenshtein_distance, 'match_rating_comparison': match_rating_comparison, 'hamming_distance': hamming_distance, 'soundex': soundex, 'metaphone': metaphone, 'nysiis': nysiis, 'match_rating_codex': match_rating_codex, 'porter_stem': porter_stem}
   
    return datos

In [138]:
def ejecuta_comparacion(datos_comparacion):
    columnas = ['NombreCorrecto', 'NombreComparacion', 'jaro_distance', 'jaro_winkler', 'levenshtein_distance', 'damerau_levenshtein_distance', 'match_rating_comparison', 'hamming_distance', 'soundex', 'metaphone', 'nysiis', 'match_rating_codex', 'porter_stem']
    df_resultado = pd.DataFrame(columns=columnas)

    for index, row in datos_comparacion.iterrows():
        nombre1 = row[0]
        nombre2 = row[1]
        resultado = validar_algoritmos(nombre1, nombre2)
        df_resultado = df_resultado.append(resultado, ignore_index = True)
    return df_resultado

In [140]:
datos_comparacion = pd.read_excel('Input_comparacion.xlsx', sheet_name = 0, usecols = "A:B", index=False)
df_resultado = ejecuta_comparacion(datos_comparacion)
df_resultado.to_excel("Output_comparacion.xlsx") 
df_resultado.head(3)

Unnamed: 0,NombreCorrecto,NombreComparacion,jaro_distance,jaro_winkler,levenshtein_distance,damerau_levenshtein_distance,match_rating_comparison,hamming_distance,soundex,metaphone,nysiis,match_rating_codex,porter_stem
0,MAYRA MORATAYA,MAIRA MORATAYA,95.24%,96.19%,1 Caracteres Distintos,1 Caracteres Distintos,True,1 Caracteres Distintos,True,True,False,False,False
1,MAYRA MORATAYA,MAIRA MORATALLA,88.57%,90.86%,3 Caracteres Distintos,3 Caracteres Distintos,True,4 Caracteres Distintos,True,False,False,False,False
2,MAYRA MORATAYA,MAYRA MORATAYA,100.0%,100.0%,0 Caracteres Distintos,0 Caracteres Distintos,True,0 Caracteres Distintos,True,True,True,True,True
