# Fuzzy Name Matching
## Feature Engineering

### This notebook engineers the following features for modeling.
1. Partial Ratio
2. Token Sort Ratio
3. Token Set Ratio
4. Sum IPA (International Phonetic Alphabet) features
5. PSHP Soundex First
6. Iterative Substring
7. BI-SIM similarity
8. Discounted Levenshtein
9. Prefix Distance
10. Longest Common Substring (LCSstr)
11. Modified Language-Independent Product Name Search Distance (MLIPNS)
12. Strcmp95 Distance
13. Match Rating Algorithm (MRA) Comparison
14. Editex
15. Syllable Alignment Pattern Searching (SAPS) Similarity
16. FlexMetric Distance
17. Jaro-Winkler Distance
18. Higuera-Mico contextual normalized edit distance
19. Sift4 Distance
20. Eudex Distance
21. ALINE Distance
22. Covington Distance
23. Phonetic Edit Distance
24. Hello My Name Is (HMNI) Similarity

### Each feature is used to compare the first names (list vs. transaction) and last names (list vs. transaction)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import unidecode
import re
import hmni
from datetime import datetime
from fuzzywuzzy import fuzz
from abydos.distance import (IterativeSubString, BISIM, DiscountedLevenshtein, Prefix, LCSstr, MLIPNS, Strcmp95,
MRA, Editex, SAPS, FlexMetric, JaroWinkler, HigueraMico, Sift4, Eudex, ALINE, Covington, PhoneticEditDistance)
from abydos.phonetic import PSHPSoundexFirst, Ainsworth
from abydos.phones import *

2022-09-24 20:24:55.142608: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-24 20:24:59.281501: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-24 20:24:59.281557: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-24 20:24:59.673521: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-24 20:25:06.237306: W tensorflow/stream_executor/platform/de

In [2]:
# Interim Data File Locations
interim_data = '../data/interim/'

In [4]:
df = pd.read_csv(interim_data + 'eda_results.csv')
df.shape

(97108, 7)

In [5]:
df[df.confidential==0].sample(10)

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential
74988,12898,ONKO,BELOTE,ONKOLINO,BELOTE,1,0
93069,30982,POLLE,LEVER,RADIVOJE,LO,0,0
75938,13849,REBUS,BOLTZ,JACOB,BOLTZ,1,0
93749,31662,ROBIN,WANN,NATAKHA,HEADEN,0,0
77680,15591,TANCHIK,BARREIRO,TANUSHKA,BARREIRO,1,0
81705,19616,BRADE,HANDY,KIT,HOLDERMAN,0,0
77198,15109,SLOBA,MUMMA,BOBA,MUMMA,1,0
65450,3359,DENNIS,STARLING,DENNE,STARLING,1,0
82241,20152,CHECO,COLBY,ANKA,MCGINLEY,0,0
87285,25197,JUQUINHA,CONSTANTINO,JOHNNE,TURNEY,0,0


In [6]:
def syllables(word):
    # single syllable word
    if len(re.findall('[aeiouy]', word)) <= 1:
        return [word]

    # sonority hierarchy: vowels, nasals, fricatives, stops
    hierarchy = {
        'a': 4, 'e': 4, 'i': 4, 'o': 4, 'u': 4, 'y': 4,
        'l': 3, 'm': 3, 'n': 3, 'r': 3, 'w': 3,
        'f': 2, 's': 2, 'v': 2, 'z': 2,
        'b': 1, 'c': 1, 'd': 1, 'g': 1, 'h': 1, 'j': 1, 'k': 1, 'p': 1, 'q': 1, 't': 1, 'x': 1,
    }
    syllables_values = [(c, hierarchy[c]) for c in word]

    syllables = []
    syll = syllables_values[0][0]
    for trigram in zip(*[syllables_values[i:] for i in range(3)]):
        (phonemes, values) = zip(*trigram)
        (previous, val, following) = values
        phoneme = phonemes[1]

        if previous > val < following:
            syllables.append(syll)
            syll = phoneme
        elif previous >= val == following:
            syll += phoneme
            syllables.append(syll)
            syll = ''
        else:
            syll += phoneme
    syll += syllables_values[-1][0]
    syllables.append(syll)

    final_syllables = []
    front = ''
    for (i, syllable) in enumerate(syllables):
        if not re.search('[aeiouy]', syllable):
            if len(final_syllables) == 0:
                front += syllable
            else:
                final_syllables = final_syllables[:-1] \
                                  + [final_syllables[-1] + syllable]
        else:
            if len(final_syllables) == 0:
                final_syllables.append(front + syllable)
            else:
                final_syllables.append(syllable)
    return final_syllables

In [7]:
df['list_first_name'] = df.apply(lambda row: re.sub(
    '[^a-zA-Z]+', '', unidecode.unidecode(row['list_first_name']).lower().strip()), axis=1)
df['txn_first_name'] = df.apply(lambda row: re.sub(
    '[^a-zA-Z]+', '', unidecode.unidecode(row['txn_first_name']).lower().strip()), axis=1)
df['list_last_name'] = df.apply(lambda row: re.sub(
    '[^a-zA-Z]+', '', unidecode.unidecode(row['list_last_name']).lower().strip()), axis=1)
df['txn_last_name'] = df.apply(lambda row: re.sub(
    '[^a-zA-Z]+', '', unidecode.unidecode(row['txn_last_name']).lower().strip()), axis=1)
df[df.confidential==0].sample(5)

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential
77479,15390,sui,ofarrell,zig,ofarrell,1,0
86106,24018,jagusia,zahn,nata,terpstra,0,0
89778,27690,margo,ostler,janet,hoey,0,0
76066,13977,riche,mire,rich,mire,1,0
65047,2956,conchi,ridgeway,concha,ridgeway,1,0


In [8]:
df['syll_list_first'] = df.apply(lambda row: syllables(row.list_first_name), axis=1)
df['syll_txn_first'] = df.apply(lambda row: syllables(row.txn_first_name), axis=1)
df['syll_list_last'] = df.apply(lambda row: syllables(row.list_last_name), axis=1)
df['syll_txn_last'] = df.apply(lambda row: syllables(row.txn_last_name), axis=1)
df[df.confidential==0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,syll_txn_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],[smith]
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]","[john, son]"
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]","[wil, liams]"
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]","[jo, nes]"
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],[brown]


In [9]:
df['partial_first'] = df.apply(lambda row: fuzz.partial_ratio(row.syll_list_first,row.syll_txn_first), axis=1)
df['partial_last'] = df.apply(lambda row: fuzz.partial_ratio(row.syll_list_last,row.syll_txn_last), axis=1)
df[df.confidential==0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,syll_txn_last,partial_first,partial_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],[smith],57,100
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]","[john, son]",83,100
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]","[wil, liams]",75,100
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]","[jo, nes]",75,100
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],[brown],88,100


In [10]:
df['tkn_sort_first'] = df.apply(lambda row: fuzz.token_sort_ratio(row.syll_list_first,row.syll_txn_first), axis=1)
df['tkn_sort_last'] = df.apply(lambda row: fuzz.token_sort_ratio(row.syll_list_last,row.syll_txn_last), axis=1)
df[df.confidential==0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,syll_txn_last,partial_first,partial_last,tkn_sort_first,tkn_sort_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],[smith],57,100,50,100
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]","[john, son]",83,100,73,100
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]","[wil, liams]",75,100,55,100
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]","[jo, nes]",75,100,55,100
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],[brown],88,100,67,100


In [11]:
df['tkn_set_first'] = df.apply(lambda row: fuzz.token_set_ratio(row.syll_list_first,row.syll_txn_first), axis=1)
df['tkn_set_last'] = df.apply(lambda row: fuzz.token_set_ratio(row.syll_list_last,row.syll_txn_last), axis=1)
df[df.confidential==0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,syll_txn_last,partial_first,partial_last,tkn_sort_first,tkn_sort_last,tkn_set_first,tkn_set_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],[smith],57,100,50,100,50,100
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]","[john, son]",83,100,73,100,73,100
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]","[wil, liams]",75,100,55,100,55,100
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]","[jo, nes]",75,100,55,100,55,100
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],[brown],88,100,67,100,100,100


In [12]:
pe = Ainsworth()
def sum_ipa(name_a, name_b):
    feat1 = ipa_to_features(pe.encode(name_a))
    feat2 = ipa_to_features(pe.encode(name_b))
    if len(feat1)==0:
        score = 0
    else:
        score = sum(cmp_features(f1, f2) for f1, f2 in zip(feat1, feat2))/len(feat1)
    return score
df['sum_ipa_first'] = df.apply(lambda row: sum_ipa(row.list_first_name, row.txn_first_name), axis=1)
df['sum_ipa_last'] = df.apply(lambda row: sum_ipa(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,syll_txn_last,partial_first,partial_last,tkn_sort_first,tkn_sort_last,tkn_set_first,tkn_set_last,sum_ipa_first,sum_ipa_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],[smith],57,100,50,100,50,100,0.822581,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]","[john, son]",83,100,73,100,73,100,0.876344,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]","[wil, liams]",75,100,55,100,55,100,0.876344,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]","[jo, nes]",75,100,55,100,55,100,0.790323,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],[brown],88,100,67,100,100,100,0.255376,1.0


In [None]:
pshp_soundex_first = PSHPSoundexFirst()
df['pshp_soundex_first_first'] = df.apply(
    lambda row: 1 if pshp_soundex_first.encode(row.list_first_name)==pshp_soundex_first.encode(row.txn_first_name) else 0, axis=1)
df['pshp_soundex_first_last'] = df.apply(
    lambda row: 1 if pshp_soundex_first.encode(row.list_last_name)==pshp_soundex_first.encode(row.txn_last_name) else 0, axis=1)
df[df.confidential==0].head()

In [None]:
iss = IterativeSubString()
df['iterativesubstring_first'] = df.apply(lambda row: iss.sim(row.list_first_name, row.txn_first_name), axis=1)
df['iterativesubstring_last'] = df.apply(lambda row: iss.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
bisim = BISIM()
df['bisim_first'] = df.apply(lambda row: bisim.sim(row.list_first_name, row.txn_first_name), axis=1)
df['bisim_last'] = df.apply(lambda row: bisim.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
dlev = DiscountedLevenshtein()
df['discountedlevenshtein_first'] = df.apply(lambda row: dlev.sim(row.list_first_name, row.txn_first_name), axis=1)
df['discountedlevenshtein_last'] = df.apply(lambda row: dlev.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
prefix = Prefix()
df['prefix_first'] = df.apply(lambda row: prefix.sim(row.list_first_name, row.txn_first_name), axis=1)
df['prefix_last'] = df.apply(lambda row: prefix.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
lcs = LCSstr()
df['lcsstr_first'] = df.apply(lambda row: lcs.sim(row.list_first_name, row.txn_first_name), axis=1)
df['lcsstr_last'] = df.apply(lambda row: lcs.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
mlipns = MLIPNS()
df['mlipns_first'] = df.apply(lambda row: mlipns.sim(row.list_first_name, row.txn_first_name), axis=1)
df['mlipns_last'] = df.apply(lambda row: mlipns.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
strcmp95 = Strcmp95()
df['strcmp95_first'] = df.apply(lambda row: strcmp95.sim(row.list_first_name, row.txn_first_name), axis=1)
df['strcmp95_last'] = df.apply(lambda row: strcmp95.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
mra = MRA()
df['mra_first'] = df.apply(lambda row: mra.sim(row.list_first_name, row.txn_first_name), axis=1)
df['mra_last'] = df.apply(lambda row: mra.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
editex = Editex()
df['editex_first'] = df.apply(lambda row: editex.sim(row.list_first_name, row.txn_first_name), axis=1)
df['editex_last'] = df.apply(lambda row: editex.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
saps = SAPS()
df['saps_first'] = df.apply(lambda row: saps.sim(row.list_first_name, row.txn_first_name), axis=1)
df['saps_last'] = df.apply(lambda row: saps.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
flexmetric = FlexMetric()
df['flexmetric_first'] = df.apply(lambda row: flexmetric.sim(row.list_first_name, row.txn_first_name), axis=1)
df['flexmetric_last'] = df.apply(lambda row: flexmetric.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
jaro = JaroWinkler(mode='Jaro')
df['jaro_first'] = df.apply(lambda row: jaro.sim(row.list_first_name, row.txn_first_name), axis=1)
df['jaro_last'] = df.apply(lambda row: jaro.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
higuera_mico = HigueraMico()
df['higueramico_first'] = df.apply(lambda row: higuera_mico.sim(row.list_first_name, row.txn_first_name), axis=1)
df['higueramico_last'] = df.apply(lambda row: higuera_mico.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
sift4 = Sift4()
df['sift4_first'] = df.apply(lambda row: sift4.sim(row.list_first_name, row.txn_first_name), axis=1)
df['sift4_last'] = df.apply(lambda row: sift4.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
eudex = Eudex()
df['eudex_first'] = df.apply(lambda row: eudex.sim(row.list_first_name, row.txn_first_name), axis=1)
df['eudex_last'] = df.apply(lambda row: eudex.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
aline = ALINE()
df['aline_first'] = df.apply(lambda row: aline.sim(row.list_first_name, row.txn_first_name), axis=1)
df['aline_last'] = df.apply(lambda row: aline.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
covington = Covington()
df['covington_first'] = df.apply(lambda row: covington.sim(row.list_first_name, row.txn_first_name), axis=1)
df['covington_last'] = df.apply(lambda row: covington.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
phonetic_edit = PhoneticEditDistance()
df['phoneticeditdistance_first'] = df.apply(lambda row: phonetic_edit.sim(row.list_first_name, row.txn_first_name), axis=1)
df['phoneticeditdistance_last'] = df.apply(lambda row: phonetic_edit.sim(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
hmni = hmni.Matcher(model='latin')
df['hmni_first'] = df.apply(lambda row: hmni.similarity(row.list_first_name, row.txn_first_name), axis=1)
df['hmni_last'] = df.apply(lambda row: hmni.similarity(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

In [None]:
iss = IterativeSubString()
bisim = BISIM()
dlev = DiscountedLevenshtein()
prefix = Prefix()
lcs = LCSstr()
mlipns = MLIPNS()
strcmp95 = Strcmp95()
mra = MRA()
editex = Editex()
saps = SAPS()
flexmetric = FlexMetric()
jaro = JaroWinkler(mode='Jaro')
higuera_mico = HigueraMico()
sift4 = Sift4()
eudex = Eudex()
aline = ALINE()
covington = Covington()
phonetic_edit = PhoneticEditDistance()

In [None]:
algos = [iss, bisim, dlev, prefix, lcs, mlipns, strcmp95, mra, editex, saps, flexmetric, jaro, higuera_mico, sift4, eudex,
         aline, covington, phonetic_edit]

algo_names = ['iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix', 'lcsstr', 'mlipns', 'strcmp95', 'mra',
              'editex', 'saps', 'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline', 'covington',
              'phoneticeditdistance']
for i, algo in enumerate(algos):
    df[algo_names[i]] = df.apply(lambda row: algo.sim(row.name_a, row.name_b), axis=1)
    print(datetime.now(), " ", algo_names[i], " completed...")
df.head()

In [None]:
df.drop(['syll_a', 'syll_b'], axis=1, inplace=True)
df.sample(5)

In [None]:
# Save output to interim data folder as csv file
df.to_csv(interim_data + 'feature_engineering_results.csv', index=False)