# Fuzzy Name Matching
## Feature Engineering

### This notebook engineers the following features for modeling.
1. Partial Ratio
2. Token Sort Ratio
3. Token Set Ratio
4. Sum IPA (International Phonetic Alphabet) features
5. PSHP Soundex First
6. Iterative Substring
7. BI-SIM similarity
8. Discounted Levenshtein
9. Prefix Distance
10. Longest Common Substring (LCSstr)
11. Modified Language-Independent Product Name Search Distance (MLIPNS)
12. Strcmp95 Distance
13. Match Rating Algorithm (MRA) Comparison
14. Editex
15. Syllable Alignment Pattern Searching (SAPS) Similarity
16. FlexMetric Distance
17. Jaro Distance (abydos `JaroWinkler` run with `mode='Jaro'`)
18. Higuera-Mico contextual normalized edit distance
19. Sift4 Distance
20. Eudex Distance
21. ALINE Distance
22. Covington Distance
23. Phonetic Edit Distance
24. Hello My Name Is (HMNI) Similarity

### Each feature is used to compare the first names (list vs. transaction) and last names (list vs. transaction)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import unidecode
import re
import hmni
from datetime import datetime
from fuzzywuzzy import fuzz
from abydos.distance import (IterativeSubString, BISIM, DiscountedLevenshtein, Prefix, LCSstr, MLIPNS, Strcmp95,
MRA, Editex, SAPS, FlexMetric, JaroWinkler, HigueraMico, Sift4, Eudex, ALINE, Covington, PhoneticEditDistance)
from abydos.phonetic import PSHPSoundexFirst, Ainsworth
from abydos.phones import *

In [2]:
# Directory holding the interim data files, relative to this notebook.
# The trailing slash is required: file names are appended to it by plain
# string concatenation in the load and save cells.
interim_data = '../data/interim/'

In [3]:
# Read the interim file (presumably written by the EDA notebook) into `df`,
# the working frame that every later cell adds feature columns to.
df = pd.read_csv(interim_data + 'eda_results.csv')
# Display (rows, columns) as a quick sanity check of the load.
df.shape

(97108, 7)

In [4]:
# Preview a random handful of rows, restricted to records flagged
# confidential == 0 (presumably so confidential names never land in the
# saved notebook output). The fixed random_state makes the displayed sample
# identical on every Restart & Run All.
df[df.confidential==0].sample(10, random_state=42)

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential
79886,17797,ALBERT,MILES,HANNA,FITZHUGH,0,0
89960,27872,MARIJANA,EPP,SYLVANUS,MAGEE,0,0
69144,7054,JINCE,WALDMAN,JOANNA,WALDMAN,1,0
64585,2494,CATRIN,LEVESQUE,CARA,LEVESQUE,1,0
78660,16571,VELJO,DELEE,VELJA,DELEE,1,0
94948,32861,STIPO,SELLNER,ANTOINE,RIZO,0,0
81636,19547,BON,METZGER,MELINDA,AHO,0,0
90731,28643,MICHAL,UTTERBACK,SLY,KEENAN,0,0
64073,1982,BOCA,SALDANA,BOSA,SALDANA,1,0
81118,19029,BEKS,BRUNSON,LENA,GUERTIN,0,0


In [5]:
def syllables(word):
    """Split a word into syllable-like chunks using a sonority hierarchy.

    The word is scanned one character at a time. A new chunk starts at a
    local sonority minimum (a character less sonorous than both neighbours),
    and a chunk is closed after a character whose sonority equals the next
    character's while not exceeding the previous one. Chunks that contain no
    vowel are then glued onto a neighbouring chunk so that every returned
    syllable has at least one of ``aeiouy``.

    Parameters
    ----------
    word : str
        The word to split. Expected to be lowercase ASCII letters; any other
        character (uppercase, digits, punctuation) is treated as having the
        lowest sonority instead of raising ``KeyError``.

    Returns
    -------
    list of str
        The syllables in order; concatenating them reproduces ``word``.
        Words with at most one lowercase vowel are returned as ``[word]``.
    """
    # Zero or one vowel: the whole word is a single syllable.
    if len(re.findall('[aeiouy]', word)) <= 1:
        return [word]

    # Sonority hierarchy: vowels (4), liquids/nasals/glides (3),
    # fricatives (2), everything else in the alphabet (1).
    hierarchy = {
        'a': 4, 'e': 4, 'i': 4, 'o': 4, 'u': 4, 'y': 4,
        'l': 3, 'm': 3, 'n': 3, 'r': 3, 'w': 3,
        'f': 2, 's': 2, 'v': 2, 'z': 2,
        'b': 1, 'c': 1, 'd': 1, 'g': 1, 'h': 1, 'j': 1, 'k': 1, 'p': 1, 'q': 1, 't': 1, 'x': 1,
    }
    # Unknown characters get sonority 0 so unexpected input cannot crash a
    # whole DataFrame.apply run with a KeyError.
    sonority = [hierarchy.get(char, 0) for char in word]

    # First pass: cut the word into raw chunks by looking at each interior
    # character together with its left and right neighbours.
    chunks = []
    current = word[0]
    for idx in range(1, len(word) - 1):
        previous, val, following = sonority[idx - 1], sonority[idx], sonority[idx + 1]
        phoneme = word[idx]

        if previous > val < following:
            # Local sonority minimum: this character opens a new chunk.
            chunks.append(current)
            current = phoneme
        elif previous >= val == following:
            # Plateau after a non-rising step: close the chunk after this character.
            current += phoneme
            chunks.append(current)
            current = ''
        else:
            current += phoneme
    # The last character always joins whatever chunk is still open.
    current += word[-1]
    chunks.append(current)

    # Second pass: merge vowel-less chunks into a neighbour. Leading
    # vowel-less chunks are buffered in `front` and prepended to the first
    # chunk that has a vowel; later ones are appended to the previous syllable.
    final_syllables = []
    front = ''
    for chunk in chunks:
        if re.search('[aeiouy]', chunk):
            if final_syllables:
                final_syllables.append(chunk)
            else:
                final_syllables.append(front + chunk)
        elif final_syllables:
            final_syllables[-1] += chunk
        else:
            front += chunk
    return final_syllables

In [6]:
def clean_name(name):
    """Normalise one name: transliterate to ASCII, lowercase, keep letters only.

    Parameters
    ----------
    name : str
        Raw name value from the input file.

    Returns
    -------
    str
        The name with every non a-z character removed (may be empty).
    """
    return re.sub('[^a-zA-Z]+', '', unidecode.unidecode(name).lower().strip())


# Apply the same normalisation to all four name columns. A column-wise map
# replaces four copy-pasted row-wise applies and produces the same values.
for name_col in ('list_first_name', 'txn_first_name', 'list_last_name', 'txn_last_name'):
    df[name_col] = df[name_col].map(clean_name)

# Seeded so the preview of cleaned, non-confidential rows is reproducible.
df[df.confidential==0].sample(5, random_state=42)

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential
92258,30170,nob,claunch,jasiulek,crampton,0,0
93867,31780,ronni,knickerbocker,via,prendergast,0,0
79705,17616,ad,wallace,krasimir,caston,0,0
93039,30952,pisti,outland,becke,mcfarren,0,0
90806,28718,midge,hutter,char,muldoon,0,0


In [7]:
# Each syll_* column holds the syllable list of the matching cleaned name column.
syllable_sources = (
    ('syll_list_first', 'list_first_name'),
    ('syll_txn_first', 'txn_first_name'),
    ('syll_list_last', 'list_last_name'),
    ('syll_txn_last', 'txn_last_name'),
)
for syll_col, name_col in syllable_sources:
    df[syll_col] = df.apply(lambda row: syllables(row[name_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,syll_txn_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],[smith]
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]","[john, son]"
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]","[wil, liams]"
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]","[jo, nes]"
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],[brown]


In [8]:
# fuzzywuzzy partial ratio between the list and transaction syllable columns.
# NOTE(review): the arguments are syllable lists, not strings; presumably
# fuzzywuzzy coerces them to text before scoring — confirm that is the
# comparison intended here before changing it, since the saved feature
# values depend on it.
for part in ('first', 'last'):
    list_syll = 'syll_list_' + part
    txn_syll = 'syll_txn_' + part
    df['partial_' + part] = df.apply(
        lambda row: fuzz.partial_ratio(row[list_syll], row[txn_syll]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,syll_txn_last,partial_first,partial_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],[smith],57,100
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]","[john, son]",83,100
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]","[wil, liams]",75,100
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]","[jo, nes]",75,100
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],[brown],88,100


In [9]:
# fuzzywuzzy token sort ratio on the syllable columns, first and last names.
for part in ('first', 'last'):
    list_syll = 'syll_list_' + part
    txn_syll = 'syll_txn_' + part
    df['tkn_sort_' + part] = df.apply(
        lambda row: fuzz.token_sort_ratio(row[list_syll], row[txn_syll]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,syll_txn_last,partial_first,partial_last,tkn_sort_first,tkn_sort_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],[smith],57,100,50,100
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]","[john, son]",83,100,73,100
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]","[wil, liams]",75,100,55,100
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]","[jo, nes]",75,100,55,100
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],[brown],88,100,67,100


In [10]:
# fuzzywuzzy token set ratio on the syllable columns, first and last names.
for part in ('first', 'last'):
    list_syll = 'syll_list_' + part
    txn_syll = 'syll_txn_' + part
    df['tkn_set_' + part] = df.apply(
        lambda row: fuzz.token_set_ratio(row[list_syll], row[txn_syll]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,syll_txn_last,partial_first,partial_last,tkn_sort_first,tkn_sort_last,tkn_set_first,tkn_set_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],[smith],57,100,50,100,50,100
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]","[john, son]",83,100,73,100,73,100
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]","[wil, liams]",75,100,55,100,55,100
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]","[jo, nes]",75,100,55,100,55,100
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],[brown],88,100,67,100,100,100


In [11]:
# Shared Ainsworth encoder used by sum_ipa to turn a name into the string
# that ipa_to_features consumes.
pe = Ainsworth()
def sum_ipa(name_a, name_b):
    """Average the pairwise feature comparison of two encoded names.

    Both names are encoded with the module-level ``pe`` encoder and converted
    with ``ipa_to_features``. The feature entries are compared position by
    position with ``cmp_features`` and the total is divided by the number of
    entries of ``name_a``.

    NOTE(review): pairing stops at the shorter sequence while the divisor is
    always the length for ``name_a``, so the score is not symmetric in its
    arguments — confirm this is intended.

    Returns 0 when ``name_a`` yields no features.
    """
    features_a = ipa_to_features(pe.encode(name_a))
    features_b = ipa_to_features(pe.encode(name_b))
    if not features_a:
        return 0
    total = 0
    for feature_a, feature_b in zip(features_a, features_b):
        total += cmp_features(feature_a, feature_b)
    return total / len(features_a)
# Sum-IPA score for the first-name pair and the last-name pair of every row.
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['sum_ipa_' + part] = df.apply(
        lambda row: sum_ipa(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,syll_txn_last,partial_first,partial_last,tkn_sort_first,tkn_sort_last,tkn_set_first,tkn_set_last,sum_ipa_first,sum_ipa_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],[smith],57,100,50,100,50,100,0.822581,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]","[john, son]",83,100,73,100,73,100,0.876344,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]","[wil, liams]",75,100,55,100,55,100,0.876344,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]","[jo, nes]",75,100,55,100,55,100,0.790323,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],[brown],88,100,67,100,100,100,0.255376,1.0


In [12]:
# Binary feature: 1 when both names get the same PSHP Soundex First code, else 0.
pshp_soundex_first = PSHPSoundexFirst()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['pshp_soundex_first_' + part] = df.apply(
        lambda row: int(pshp_soundex_first.encode(row[list_col])
                        == pshp_soundex_first.encode(row[txn_col])),
        axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,partial_first,partial_last,tkn_sort_first,tkn_sort_last,tkn_set_first,tkn_set_last,sum_ipa_first,sum_ipa_last,pshp_soundex_first_first,pshp_soundex_first_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,57,100,50,100,50,100,0.822581,1.0,0,1
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,83,100,73,100,73,100,0.876344,1.0,0,1
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,75,100,55,100,55,100,0.876344,1.0,0,1
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,75,100,55,100,55,100,0.790323,1.0,0,1
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,88,100,67,100,100,100,0.255376,1.0,0,1


In [13]:
# Iterative Substring similarity for the first-name and last-name pairs.
iss = IterativeSubString()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['iterativesubstring_' + part] = df.apply(
        lambda row: iss.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,tkn_sort_first,tkn_sort_last,tkn_set_first,tkn_set_last,sum_ipa_first,sum_ipa_last,pshp_soundex_first_first,pshp_soundex_first_last,iterativesubstring_first,iterativesubstring_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,50,100,50,100,0.822581,1.0,0,1,0.05,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,73,100,73,100,0.876344,1.0,0,1,0.1,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,55,100,55,100,0.876344,1.0,0,1,0.1,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,55,100,55,100,0.790323,1.0,0,1,0.05,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,67,100,100,100,0.255376,1.0,0,1,0.863636,1.0


In [14]:
# BI-SIM similarity for the first-name and last-name pairs.
bisim = BISIM()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['bisim_' + part] = df.apply(
        lambda row: bisim.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,tkn_set_first,tkn_set_last,sum_ipa_first,sum_ipa_last,pshp_soundex_first_first,pshp_soundex_first_last,iterativesubstring_first,iterativesubstring_last,bisim_first,bisim_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,50,100,0.822581,1.0,0,1,0.05,1.0,0.357143,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,73,100,0.876344,1.0,0,1,0.1,1.0,0.6,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,55,100,0.876344,1.0,0,1,0.1,1.0,0.5,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,55,100,0.790323,1.0,0,1,0.05,1.0,0.5,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,100,100,0.255376,1.0,0,1,0.863636,1.0,0.5,1.0


In [15]:
# Discounted Levenshtein similarity for the first-name and last-name pairs.
dlev = DiscountedLevenshtein()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['discountedlevenshtein_' + part] = df.apply(
        lambda row: dlev.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,sum_ipa_first,sum_ipa_last,pshp_soundex_first_first,pshp_soundex_first_last,iterativesubstring_first,iterativesubstring_last,bisim_first,bisim_last,discountedlevenshtein_first,discountedlevenshtein_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.822581,1.0,0,1,0.05,1.0,0.357143,1.0,0.305797,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.876344,1.0,0,1,0.1,1.0,0.6,1.0,0.627066,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.876344,1.0,0,1,0.1,1.0,0.5,1.0,0.467913,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.790323,1.0,0,1,0.05,1.0,0.5,1.0,0.409011,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.255376,1.0,0,1,0.863636,1.0,0.5,1.0,0.527905,1.0


In [16]:
# Prefix similarity for the first-name and last-name pairs.
prefix = Prefix()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['prefix_' + part] = df.apply(
        lambda row: prefix.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,pshp_soundex_first_first,pshp_soundex_first_last,iterativesubstring_first,iterativesubstring_last,bisim_first,bisim_last,discountedlevenshtein_first,discountedlevenshtein_last,prefix_first,prefix_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0,1,0.05,1.0,0.357143,1.0,0.305797,1.0,0.333333,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0,1,0.1,1.0,0.6,1.0,0.627066,1.0,0.5,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0,1,0.1,1.0,0.5,1.0,0.467913,1.0,0.5,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0,1,0.05,1.0,0.5,1.0,0.409011,1.0,0.25,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0,1,0.863636,1.0,0.5,1.0,0.527905,1.0,0.0,1.0


In [17]:
# Longest Common Substring similarity for the first-name and last-name pairs.
lcs = LCSstr()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['lcsstr_' + part] = df.apply(
        lambda row: lcs.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,iterativesubstring_first,iterativesubstring_last,bisim_first,bisim_last,discountedlevenshtein_first,discountedlevenshtein_last,prefix_first,prefix_last,lcsstr_first,lcsstr_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.05,1.0,0.357143,1.0,0.305797,1.0,0.333333,1.0,0.285714,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.1,1.0,0.6,1.0,0.627066,1.0,0.5,1.0,0.4,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.1,1.0,0.5,1.0,0.467913,1.0,0.5,1.0,0.4,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.05,1.0,0.5,1.0,0.409011,1.0,0.25,1.0,0.2,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.863636,1.0,0.5,1.0,0.527905,1.0,0.0,1.0,0.571429,1.0


In [18]:
# MLIPNS similarity for the first-name and last-name pairs.
mlipns = MLIPNS()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['mlipns_' + part] = df.apply(
        lambda row: mlipns.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,bisim_first,bisim_last,discountedlevenshtein_first,discountedlevenshtein_last,prefix_first,prefix_last,lcsstr_first,lcsstr_last,mlipns_first,mlipns_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.357143,1.0,0.305797,1.0,0.333333,1.0,0.285714,1.0,0.0,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.6,1.0,0.627066,1.0,0.5,1.0,0.4,1.0,0.0,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.5,1.0,0.467913,1.0,0.5,1.0,0.4,1.0,0.0,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.5,1.0,0.409011,1.0,0.25,1.0,0.2,1.0,0.0,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.5,1.0,0.527905,1.0,0.0,1.0,0.571429,1.0,0.0,1.0


In [19]:
# Strcmp95 similarity for the first-name and last-name pairs.
strcmp95 = Strcmp95()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['strcmp95_' + part] = df.apply(
        lambda row: strcmp95.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,discountedlevenshtein_first,discountedlevenshtein_last,prefix_first,prefix_last,lcsstr_first,lcsstr_last,mlipns_first,mlipns_last,strcmp95_first,strcmp95_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.305797,1.0,0.333333,1.0,0.285714,1.0,0.0,1.0,0.698413,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.627066,1.0,0.5,1.0,0.4,1.0,0.0,1.0,0.826667,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.467913,1.0,0.5,1.0,0.4,1.0,0.0,1.0,0.678333,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.409011,1.0,0.25,1.0,0.2,1.0,0.0,1.0,0.678333,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.527905,1.0,0.0,1.0,0.571429,1.0,0.0,1.0,0.503571,1.0


In [20]:
# Match Rating Algorithm comparison for the first-name and last-name pairs.
mra = MRA()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['mra_' + part] = df.apply(
        lambda row: mra.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,prefix_first,prefix_last,lcsstr_first,lcsstr_last,mlipns_first,mlipns_last,strcmp95_first,strcmp95_last,mra_first,mra_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.333333,1.0,0.285714,1.0,0.0,1.0,0.698413,1.0,0.666667,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.5,1.0,0.4,1.0,0.0,1.0,0.826667,1.0,0.666667,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.5,1.0,0.4,1.0,0.0,1.0,0.678333,1.0,0.666667,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.25,1.0,0.2,1.0,0.0,1.0,0.678333,1.0,0.666667,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.0,1.0,0.571429,1.0,0.0,1.0,0.503571,1.0,0.666667,1.0


In [21]:
# Editex similarity for the first-name and last-name pairs.
editex = Editex()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['editex_' + part] = df.apply(
        lambda row: editex.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,lcsstr_first,lcsstr_last,mlipns_first,mlipns_last,strcmp95_first,strcmp95_last,mra_first,mra_last,editex_first,editex_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.285714,1.0,0.0,1.0,0.698413,1.0,0.666667,1.0,0.5,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.4,1.0,0.0,1.0,0.826667,1.0,0.666667,1.0,0.6,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.4,1.0,0.0,1.0,0.678333,1.0,0.666667,1.0,0.5,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.2,1.0,0.0,1.0,0.678333,1.0,0.666667,1.0,0.5,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.571429,1.0,0.0,1.0,0.503571,1.0,0.666667,1.0,0.714286,1.0


In [22]:
# SAPS (Syllable Alignment Pattern Searching) similarity for both name pairs.
saps = SAPS()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['saps_' + part] = df.apply(
        lambda row: saps.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,mlipns_first,mlipns_last,strcmp95_first,strcmp95_last,mra_first,mra_last,editex_first,editex_last,saps_first,saps_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.0,1.0,0.698413,1.0,0.666667,1.0,0.5,1.0,0.0,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.0,1.0,0.826667,1.0,0.666667,1.0,0.6,1.0,0.333333,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.0,1.0,0.678333,1.0,0.666667,1.0,0.5,1.0,0.2,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.0,1.0,0.678333,1.0,0.666667,1.0,0.5,1.0,0.35,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.0,1.0,0.503571,1.0,0.666667,1.0,0.714286,1.0,0.409091,1.0


In [23]:
# FlexMetric similarity for the first-name and last-name pairs.
flexmetric = FlexMetric()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['flexmetric_' + part] = df.apply(
        lambda row: flexmetric.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,strcmp95_first,strcmp95_last,mra_first,mra_last,editex_first,editex_last,saps_first,saps_last,flexmetric_first,flexmetric_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.698413,1.0,0.666667,1.0,0.5,1.0,0.0,1.0,0.521429,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.826667,1.0,0.666667,1.0,0.6,1.0,0.333333,1.0,0.61,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.678333,1.0,0.666667,1.0,0.5,1.0,0.2,1.0,0.57,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.678333,1.0,0.666667,1.0,0.5,1.0,0.35,1.0,0.43,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.503571,1.0,0.666667,1.0,0.714286,1.0,0.409091,1.0,0.721429,1.0


In [24]:
# Jaro similarity (JaroWinkler configured with mode='Jaro') for both name pairs.
jaro = JaroWinkler(mode='Jaro')
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['jaro_' + part] = df.apply(
        lambda row: jaro.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,mra_first,mra_last,editex_first,editex_last,saps_first,saps_last,flexmetric_first,flexmetric_last,jaro_first,jaro_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.666667,1.0,0.5,1.0,0.0,1.0,0.521429,1.0,0.650794,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.666667,1.0,0.6,1.0,0.333333,1.0,0.61,1.0,0.783333,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.666667,1.0,0.5,1.0,0.2,1.0,0.57,1.0,0.633333,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.666667,1.0,0.5,1.0,0.35,1.0,0.43,1.0,0.633333,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.666667,1.0,0.714286,1.0,0.409091,1.0,0.721429,1.0,0.464286,1.0


In [25]:
# Higuera-Mico similarity for the first-name and last-name pairs.
higuera_mico = HigueraMico()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['higueramico_' + part] = df.apply(
        lambda row: higuera_mico.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,editex_first,editex_last,saps_first,saps_last,flexmetric_first,flexmetric_last,jaro_first,jaro_last,higueramico_first,higueramico_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.5,1.0,0.0,1.0,0.521429,1.0,0.650794,1.0,0.097619,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.6,1.0,0.333333,1.0,0.61,1.0,0.783333,1.0,0.6,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.5,1.0,0.2,1.0,0.57,1.0,0.633333,1.0,0.4,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.5,1.0,0.35,1.0,0.43,1.0,0.633333,1.0,0.4,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.714286,1.0,0.409091,1.0,0.721429,1.0,0.464286,1.0,0.490476,1.0


In [26]:
# Sift4 similarity for the first-name and last-name pairs.
sift4 = Sift4()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['sift4_' + part] = df.apply(
        lambda row: sift4.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,saps_first,saps_last,flexmetric_first,flexmetric_last,jaro_first,jaro_last,higueramico_first,higueramico_last,sift4_first,sift4_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.0,1.0,0.521429,1.0,0.650794,1.0,0.097619,1.0,0.285714,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.333333,1.0,0.61,1.0,0.783333,1.0,0.6,1.0,0.6,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.2,1.0,0.57,1.0,0.633333,1.0,0.4,1.0,0.4,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.35,1.0,0.43,1.0,0.633333,1.0,0.4,1.0,0.4,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.409091,1.0,0.721429,1.0,0.464286,1.0,0.490476,1.0,0.571429,1.0


In [27]:
# Eudex similarity for the first-name and last-name pairs.
eudex = Eudex()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['eudex_' + part] = df.apply(
        lambda row: eudex.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,flexmetric_first,flexmetric_last,jaro_first,jaro_last,higueramico_first,higueramico_last,sift4_first,sift4_last,eudex_first,eudex_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.521429,1.0,0.650794,1.0,0.097619,1.0,0.285714,1.0,0.984314,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.61,1.0,0.783333,1.0,0.6,1.0,0.6,1.0,0.990196,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.57,1.0,0.633333,1.0,0.4,1.0,0.4,1.0,0.990196,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.43,1.0,0.633333,1.0,0.4,1.0,0.4,1.0,0.984314,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.721429,1.0,0.464286,1.0,0.490476,1.0,0.571429,1.0,0.862745,1.0


In [28]:
# ALINE similarity for the first-name and last-name pairs.
aline = ALINE()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['aline_' + part] = df.apply(
        lambda row: aline.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,jaro_first,jaro_last,higueramico_first,higueramico_last,sift4_first,sift4_last,eudex_first,eudex_last,aline_first,aline_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.650794,1.0,0.097619,1.0,0.285714,1.0,0.984314,1.0,0.363636,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.783333,1.0,0.6,1.0,0.6,1.0,0.990196,1.0,0.347826,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.633333,1.0,0.4,1.0,0.4,1.0,0.990196,1.0,0.26087,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.633333,1.0,0.4,1.0,0.4,1.0,0.984314,1.0,0.37037,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.464286,1.0,0.490476,1.0,0.571429,1.0,0.862745,1.0,0.606061,1.0


In [None]:
# Covington similarity between the list and transaction first names.
# NOTE(review): this cell carries no execution count in the saved notebook —
# confirm it was actually run before relying on the covington_first column.
covington = Covington()


def _covington_first(row):
    """Score one row's first-name pair with the shared Covington instance."""
    return covington.sim(row['list_first_name'], row['txn_first_name'])


df['covington_first'] = df.apply(_covington_first, axis=1)

In [None]:
# Covington similarity between the list and transaction last names, using
# the `covington` instance created in the preceding cell.
# NOTE(review): this cell carries no execution count in the saved notebook —
# confirm it was actually run before relying on the covington_last column.
def _covington_last(row):
    """Score one row's last-name pair with the shared Covington instance."""
    return covington.sim(row['list_last_name'], row['txn_last_name'])


df['covington_last'] = df.apply(_covington_last, axis=1)
df.loc[df['confidential'] == 0].head()

In [29]:
# Phonetic Edit Distance similarity for the first-name and last-name pairs.
phonetic_edit = PhoneticEditDistance()
for part in ('first', 'last'):
    list_col = 'list_' + part + '_name'
    txn_col = 'txn_' + part + '_name'
    df['phoneticeditdistance_' + part] = df.apply(
        lambda row: phonetic_edit.sim(row[list_col], row[txn_col]), axis=1)
df.loc[df['confidential'] == 0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,higueramico_first,higueramico_last,sift4_first,sift4_last,eudex_first,eudex_last,aline_first,aline_last,phoneticeditdistance_first,phoneticeditdistance_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.097619,1.0,0.285714,1.0,0.984314,1.0,0.363636,1.0,0.419355,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.6,1.0,0.6,1.0,0.990196,1.0,0.347826,1.0,0.751613,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.4,1.0,0.4,1.0,0.990196,1.0,0.26087,1.0,0.732258,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.4,1.0,0.4,1.0,0.984314,1.0,0.37037,1.0,0.687097,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.490476,1.0,0.571429,1.0,0.862745,1.0,0.606061,1.0,0.571429,1.0


In [31]:
# HMNI (Hello My Name Is) similarity for the first-name and last-name pairs.
# The matcher gets its own name: the original rebinding of `hmni` replaced
# the imported module with the Matcher instance, so running this cell a
# second time failed on `hmni.Matcher`.
hmni_matcher = hmni.Matcher(model='latin')
df['hmni_first'] = df.apply(
    lambda row: hmni_matcher.similarity(row.list_first_name, row.txn_first_name), axis=1)
df['hmni_last'] = df.apply(
    lambda row: hmni_matcher.similarity(row.list_last_name, row.txn_last_name), axis=1)
df[df.confidential==0].head()

Unnamed: 0,key,list_first_name,list_last_name,txn_first_name,txn_last_name,is_match,confidential,syll_list_first,syll_txn_first,syll_list_last,...,sift4_first,sift4_last,eudex_first,eudex_last,aline_first,aline_last,phoneticeditdistance_first,phoneticeditdistance_last,hmni_first,hmni_last
62092,1,aad,smith,adriaan,smith,1,0,[aad],"[a, dria, an]",[smith],...,0.285714,1.0,0.984314,1.0,0.363636,1.0,0.419355,1.0,0.810772,1.0
62093,2,aake,johnson,aarne,johnson,1,0,"[aa, ke]","[aar, ne]","[john, son]",...,0.6,1.0,0.990196,1.0,0.347826,1.0,0.751613,1.0,0.91509,1.0
62094,3,aake,williams,aarno,williams,1,0,"[aa, ke]","[aar, no]","[wil, liams]",...,0.4,1.0,0.990196,1.0,0.26087,1.0,0.732258,1.0,0.8512,1.0
62095,4,aake,jones,arska,jones,1,0,"[aa, ke]","[ars, ka]","[jo, nes]",...,0.4,1.0,0.984314,1.0,0.37037,1.0,0.687097,1.0,0.811576,1.0
62096,5,aandrea,brown,drea,brown,1,0,"[aan, drea]",[drea],[brown],...,0.571429,1.0,0.862745,1.0,0.606061,1.0,0.571429,1.0,0.973328,1.0


In [32]:
# Write the frame with all engineered feature columns to the interim data
# folder as CSV; index=False leaves the DataFrame index out of the file.
# NOTE(review): the syll_* columns hold Python lists, which a CSV presumably
# stores as their text form — confirm downstream readers expect that.
df.to_csv(interim_data + 'feature_engineering_results.csv', index=False)