In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import unidecode
import re
from datetime import datetime
from fuzzywuzzy import fuzz
from abydos.distance import (IterativeSubString, BISIM, DiscountedLevenshtein, Prefix, LCSstr, MLIPNS, Strcmp95,
MRA, Editex, SAPS, FlexMetric, JaroWinkler, HigueraMico, Sift4, Eudex, ALINE, Covington, PhoneticEditDistance)
from abydos.phonetic import PSHPSoundexFirst, Ainsworth
from abydos.phones import *



In [2]:
# Directory holding the interim data files, relative to this notebook.
# Keep the trailing slash: file names are appended with plain string concatenation.
interim_data = '../data/interim/'

In [3]:
# Load the name-pair table saved as 'eda_results.csv' in the interim folder.
df = pd.read_csv(interim_data + 'eda_results.csv')
# Display (rows, columns) as a quick check of what was read.
df.shape

(87550, 3)

In [4]:
# Eyeball 10 random rows. No random_state is passed, so the rows shown change on every run.
df.sample(10)

Unnamed: 0,name_a,name_b,target
30330,Abdullah,Lolle,0
13282,Peter,Petike,1
23297,Darus,Robinson,0
33436,Ronnie,Luisa,0
4310,Ellie,Elizabeth,1
71115,Yavor,Flora,0
9034,Lisa,Lilibet,1
25280,Michelle,Neele,0
78749,Le,Kaloyan,0
73671,Newton,Jen,0


In [5]:
def syllables(word):
    """Split a lowercase word into syllable-like chunks using letter sonority.

    Each letter gets a sonority rank (vowels 4, l/m/n/r/w 3, f/s/v/z 2, every
    other consonant 1). Walking the interior letters, a chunk boundary is placed
    before a letter that is a strict sonority dip, and after a letter whose rank
    equals the next one while not exceeding the previous one. Chunks that end up
    without a vowel are then glued onto a neighbouring chunk.

    Parameters
    ----------
    word : str
        Word made of the lowercase letters a-z only.

    Returns
    -------
    list of str
        The chunks in order; concatenated they reproduce ``word``. A word with
        at most one vowel (``y`` counts as a vowel) is returned as ``[word]``.

    Raises
    ------
    KeyError
        If a word with two or more vowels contains a character that is not a
        lowercase a-z letter.
    """
    vowel_pattern = '[aeiouy]'

    # Zero or one vowel: nothing to split.
    if len(re.findall(vowel_pattern, word)) <= 1:
        return [word]

    # Sonority ranks, grouped by class: vowels, sonorants, fricatives, the rest.
    sonority = {}
    for rank, letters in ((4, 'aeiouy'), (3, 'lmnrw'), (2, 'fsvz'), (1, 'bcdghjkpqtx')):
        for letter in letters:
            sonority[letter] = rank
    ranks = [sonority[ch] for ch in word]

    # First pass: cut the word by looking at every interior letter together
    # with its left and right neighbours.
    chunks = []
    current = word[0]
    for pos in range(1, len(word) - 1):
        before, here, after = ranks[pos - 1], ranks[pos], ranks[pos + 1]
        letter = word[pos]
        if before > here and here < after:
            # Strict dip: the letter opens a new chunk.
            chunks.append(current)
            current = letter
        elif before >= here and here == after:
            # Plateau with the next letter: the letter closes the current chunk.
            chunks.append(current + letter)
            current = ''
        else:
            current += letter
    # The last letter always joins whatever chunk is still open.
    chunks.append(current + word[-1])

    # Second pass: every returned chunk must contain a vowel. Vowel-less chunks
    # are prefixed to the first vowel chunk (if they come before it) or appended
    # to the most recent chunk otherwise.
    merged = []
    leading = ''
    for chunk in chunks:
        if re.search(vowel_pattern, chunk):
            if merged:
                merged.append(chunk)
            else:
                merged.append(leading + chunk)
        elif merged:
            merged[-1] = merged[-1] + chunk
        else:
            leading += chunk
    return merged

In [6]:
def featurize(df):
    """Compute the pairwise name-similarity features for a frame of name pairs.

    The first two columns are taken as the raw names and exposed as ``a`` and
    ``b``. From them the function derives cleaned names, fuzzywuzzy scores on
    the syllable lists, the IPA feature score, the PSHP Soundex match flag and
    one column per measure in ``algos``.

    Relies on notebook-level objects defined in other cells: ``syllables``,
    ``sum_ipa``, ``pshp_soundex_first``, ``algos`` and ``algo_names``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first two columns hold the names to compare. With exactly
        three columns the third is named ``target``. NOTE: with exactly two or
        three columns the caller's frame is renamed and extended in place; with
        more columns a renamed copy is used and the caller's frame is untouched.

    Returns
    -------
    pandas.DataFrame
        The frame with ``name_a``, ``name_b`` and all feature columns added.
        The temporary ``syll_a`` / ``syll_b`` columns are removed before return.

    Raises
    ------
    IndexError
        If ``df`` has fewer than two columns.
    """
    if len(df.columns)==3:
        df.columns=['a', 'b', 'target']
    elif len(df.columns)==2:
        df.columns=['a', 'b']
    else:
        df = df.rename(columns={df.columns[0]: 'a', df.columns[1]: 'b' })

    def clean(raw):
        # Transliterate to ASCII, lowercase, trim, then keep letters only.
        return re.sub('[^a-zA-Z]+', '', unidecode.unidecode(raw).lower().strip())

    df['name_a'] = df.apply(lambda row: clean(row['a']), axis=1)
    df['name_b'] = df.apply(lambda row: clean(row['b']), axis=1)

    # Use the notebook's own syllables() splitter. The previous code called
    # syllable_tokenizer.tokenize, a name that is never defined here.
    df['syll_a'] = df.apply(lambda row: syllables(row.name_a), axis=1)
    df['syll_b'] = df.apply(lambda row: syllables(row.name_b), axis=1)

    # fuzzywuzzy scores, computed on the syllable lists.
    df['partial'] = df.apply(lambda row: fuzz.partial_ratio(row.syll_a,row.syll_b), axis=1)
    df['tkn_sort'] = df.apply(lambda row: fuzz.token_sort_ratio(row.syll_a,row.syll_b), axis=1)
    df['tkn_set'] = df.apply(lambda row: fuzz.token_set_ratio(row.syll_a,row.syll_b), axis=1)

    df['sum_ipa'] = df.apply(lambda row: sum_ipa(row.name_a, row.name_b), axis=1)

    # 1 when both names share the same PSHP Soundex code, else 0.
    df['pshp_soundex_first'] = df.apply(
        lambda row: 1 if pshp_soundex_first.encode(row.name_a)==pshp_soundex_first.encode(row.name_b) else 0, axis=1)

    # algos and algo_names are parallel lists: measure i fills column algo_names[i].
    for column, algo in zip(algo_names, algos):
        df[column] = df.apply(lambda row: algo.sim(row.name_a, row.name_b), axis=1)

    # The syllable lists were only needed as input to the fuzzywuzzy scores.
    df.drop(['syll_a', 'syll_b'], axis=1, inplace=True)
    return df

In [7]:
# Shorten the column names to a / b / target (requires the frame to have exactly three columns).
df.columns=['a', 'b', 'target']
df.head()

Unnamed: 0,a,b,target
0,Aad,Adriaan,1
1,Aake,Aarne,1
2,Aake,Aarno,1
3,Aake,Arska,1
4,Aandrea,Drea,1


In [8]:
# Build the cleaned name columns: transliterate to ASCII, lowercase, trim,
# then strip every character that is not a letter.
for raw_col, clean_col in (('a', 'name_a'), ('b', 'name_b')):
    df[clean_col] = df.apply(
        lambda row, col=raw_col: re.sub(
            '[^a-zA-Z]+', '', unidecode.unidecode(row[col]).lower().strip()),
        axis=1)
df.head()

Unnamed: 0,a,b,target,name_a,name_b
0,Aad,Adriaan,1,aad,adriaan
1,Aake,Aarne,1,aake,aarne
2,Aake,Aarno,1,aake,aarno
3,Aake,Arska,1,aake,arska
4,Aandrea,Drea,1,aandrea,drea


In [9]:
# Split each cleaned name into a list of syllable chunks with the syllables() helper.
df['syll_a'] = df.apply(lambda row: syllables(row.name_a), axis=1)
df['syll_b'] = df.apply(lambda row: syllables(row.name_b), axis=1)
df.head()

Unnamed: 0,a,b,target,name_a,name_b,syll_a,syll_b
0,Aad,Adriaan,1,aad,adriaan,[aad],"[a, dria, an]"
1,Aake,Aarne,1,aake,aarne,"[aa, ke]","[aar, ne]"
2,Aake,Aarno,1,aake,aarno,"[aa, ke]","[aar, no]"
3,Aake,Arska,1,aake,arska,"[aa, ke]","[ars, ka]"
4,Aandrea,Drea,1,aandrea,drea,"[aan, drea]",[drea]


In [10]:
# fuzzywuzzy partial_ratio per row; the syllable lists are passed in directly.
df['partial'] = df.apply(lambda row: fuzz.partial_ratio(row.syll_a,row.syll_b), axis=1)
df.head()

Unnamed: 0,a,b,target,name_a,name_b,syll_a,syll_b,partial
0,Aad,Adriaan,1,aad,adriaan,[aad],"[a, dria, an]",57
1,Aake,Aarne,1,aake,aarne,"[aa, ke]","[aar, ne]",83
2,Aake,Aarno,1,aake,aarno,"[aa, ke]","[aar, no]",75
3,Aake,Arska,1,aake,arska,"[aa, ke]","[ars, ka]",75
4,Aandrea,Drea,1,aandrea,drea,"[aan, drea]",[drea],88


In [11]:
# fuzzywuzzy token_sort_ratio per row, again on the syllable lists.
df['tkn_sort'] = df.apply(lambda row: fuzz.token_sort_ratio(row.syll_a,row.syll_b), axis=1)
df.head()

Unnamed: 0,a,b,target,name_a,name_b,syll_a,syll_b,partial,tkn_sort
0,Aad,Adriaan,1,aad,adriaan,[aad],"[a, dria, an]",57,50
1,Aake,Aarne,1,aake,aarne,"[aa, ke]","[aar, ne]",83,73
2,Aake,Aarno,1,aake,aarno,"[aa, ke]","[aar, no]",75,55
3,Aake,Arska,1,aake,arska,"[aa, ke]","[ars, ka]",75,55
4,Aandrea,Drea,1,aandrea,drea,"[aan, drea]",[drea],88,67


In [12]:
# fuzzywuzzy token_set_ratio per row, again on the syllable lists.
df['tkn_set'] = df.apply(lambda row: fuzz.token_set_ratio(row.syll_a,row.syll_b), axis=1)
df.head()

Unnamed: 0,a,b,target,name_a,name_b,syll_a,syll_b,partial,tkn_sort,tkn_set
0,Aad,Adriaan,1,aad,adriaan,[aad],"[a, dria, an]",57,50,50
1,Aake,Aarne,1,aake,aarne,"[aa, ke]","[aar, ne]",83,73,73
2,Aake,Aarno,1,aake,aarno,"[aa, ke]","[aar, no]",75,55,55
3,Aake,Arska,1,aake,arska,"[aa, ke]","[ars, ka]",75,55,55
4,Aandrea,Drea,1,aandrea,drea,"[aan, drea]",[drea],88,67,100


In [13]:
# Phonetic encoder used by sum_ipa() to turn a name into a string for ipa_to_features().
pe = Ainsworth()
def sum_ipa(name_a, name_b):
    """Score two names by comparing their phonetic feature sequences position by position.

    Both names are encoded with the module-level Ainsworth encoder ``pe`` and
    converted with ``ipa_to_features``. Features are paired in order, each pair
    is scored with ``cmp_features``, and the scores are summed and divided by
    the number of features of the FIRST name.

    The score is not symmetric: pairing stops at the shorter sequence, but the
    divisor is always the length of ``name_a``'s sequence, so
    ``sum_ipa(x, y)`` and ``sum_ipa(y, x)`` can differ.

    Parameters
    ----------
    name_a, name_b : str
        Cleaned names to compare.

    Returns
    -------
    float
        Summed pairwise feature score normalised by the first name's feature
        count, or ``0.0`` when the first name yields no features.
    """
    feat1 = ipa_to_features(pe.encode(name_a))
    feat2 = ipa_to_features(pe.encode(name_b))
    if not feat1:
        # Nothing to compare (e.g. a name that is empty after cleaning).
        # Return 0.0 instead of dividing by zero.
        return 0.0
    score = sum(cmp_features(f1, f2) for f1, f2 in zip(feat1, feat2))/len(feat1)
    return score
# Score every row with sum_ipa() on the cleaned names.
df['sum_ipa'] = df.apply(lambda row: sum_ipa(row.name_a, row.name_b), axis=1)
df.head()

Unnamed: 0,a,b,target,name_a,name_b,syll_a,syll_b,partial,tkn_sort,tkn_set,sum_ipa
0,Aad,Adriaan,1,aad,adriaan,[aad],"[a, dria, an]",57,50,50,0.822581
1,Aake,Aarne,1,aake,aarne,"[aa, ke]","[aar, ne]",83,73,73,0.876344
2,Aake,Aarno,1,aake,aarno,"[aa, ke]","[aar, no]",75,55,55,0.876344
3,Aake,Arska,1,aake,arska,"[aa, ke]","[ars, ka]",75,55,55,0.790323
4,Aandrea,Drea,1,aandrea,drea,"[aan, drea]",[drea],88,67,100,0.255376


In [14]:
# Encoder shared by this cell and by featurize().
pshp_soundex_first = PSHPSoundexFirst()
# Binary feature: 1 when both cleaned names get the same PSHP Soundex code, 0 otherwise.
df['pshp_soundex_first'] = df.apply(
    lambda row: int(
        pshp_soundex_first.encode(row.name_a) == pshp_soundex_first.encode(row.name_b)),
    axis=1)
# Show a few of the pairs whose codes matched.
df.loc[df['pshp_soundex_first'] != 0].head()

Unnamed: 0,a,b,target,name_a,name_b,syll_a,syll_b,partial,tkn_sort,tkn_set,sum_ipa,pshp_soundex_first
6,Aarne,Aarno,1,aarne,aarno,"[aar, ne]","[aar, no]",92,83,83,0.979839,1
12,Ab,Abbe,1,ab,abbe,[ab],"[ab, be]",83,57,100,1.0,1
17,Ab,Appie,1,ab,appie,[ab],"[ap, pie]",67,25,25,0.983871,1
22,Abbe,Ab,1,abbe,ab,"[ab, be]",[ab],83,57,100,0.5,1
27,Abbey,Abbie,1,abbey,abbie,"[ab, bey]","[ab, bie]",92,83,83,0.967742,1


In [15]:
# One instance of each abydos distance measure used as a feature.
# All use default settings except JaroWinkler, which is created with mode='Jaro'.
iss = IterativeSubString()
bisim = BISIM()
dlev = DiscountedLevenshtein()
prefix = Prefix()
lcs = LCSstr()
mlipns = MLIPNS()
strcmp95 = Strcmp95()
mra = MRA()
editex = Editex()
saps = SAPS()
flexmetric = FlexMetric()
jaro = JaroWinkler(mode='Jaro')
higuera_mico = HigueraMico()
sift4 = Sift4()
eudex = Eudex()
aline = ALINE()
covington = Covington()
phonetic_edit = PhoneticEditDistance()

In [16]:
# Measures and the column each one fills. The two lists are parallel:
# algos[k] writes to the column named algo_names[k].
algos = [
    iss, bisim, dlev, prefix, lcs, mlipns, strcmp95, mra, editex,
    saps, flexmetric, jaro, higuera_mico, sift4, eudex, aline, covington,
    phonetic_edit,
]

algo_names = [
    'iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix', 'lcsstr',
    'mlipns', 'strcmp95', 'mra', 'editex', 'saps', 'flexmetric', 'jaro',
    'higueramico', 'sift4', 'eudex', 'aline', 'covington',
    'phoneticeditdistance',
]

# Compute one similarity column per measure and log a timestamp after each,
# since some measures take minutes on the full frame.
for column, measure in zip(algo_names, algos):
    df[column] = df.apply(lambda row: measure.sim(row.name_a, row.name_b), axis=1)
    print(datetime.now(), " ", column, " completed...")
df.head()

2022-09-23 15:16:12.652254   iterativesubstring  completed...
2022-09-23 15:16:20.047310   bisim  completed...
2022-09-23 15:16:31.828205   discountedlevenshtein  completed...
2022-09-23 15:16:33.532339   prefix  completed...
2022-09-23 15:16:36.434578   lcsstr  completed...
2022-09-23 15:16:38.282929   mlipns  completed...
2022-09-23 15:16:42.375278   strcmp95  completed...
2022-09-23 15:16:47.420327   mra  completed...
2022-09-23 15:17:04.090674   editex  completed...
2022-09-23 15:17:15.978920   saps  completed...
2022-09-23 15:17:29.937813   flexmetric  completed...
2022-09-23 15:17:34.793178   jaro  completed...
2022-09-23 15:18:34.507025   higueramico  completed...
2022-09-23 15:18:37.286190   sift4  completed...
2022-09-23 15:18:43.194593   eudex  completed...
2022-09-23 15:21:43.610622   aline  completed...
2022-09-23 15:28:09.464906   covington  completed...
2022-09-23 15:28:46.773351   phoneticeditdistance  completed...


Unnamed: 0,a,b,target,name_a,name_b,syll_a,syll_b,partial,tkn_sort,tkn_set,...,editex,saps,flexmetric,jaro,higueramico,sift4,eudex,aline,covington,phoneticeditdistance
0,Aad,Adriaan,1,aad,adriaan,[aad],"[a, dria, an]",57,50,50,...,0.5,0.0,0.521429,0.650794,0.097619,0.285714,0.984314,0.363636,0.489362,0.419355
1,Aake,Aarne,1,aake,aarne,"[aa, ke]","[aar, ne]",83,73,73,...,0.6,0.333333,0.61,0.783333,0.6,0.6,0.990196,0.347826,0.722222,0.751613
2,Aake,Aarno,1,aake,aarno,"[aa, ke]","[aar, no]",75,55,55,...,0.5,0.2,0.57,0.633333,0.4,0.4,0.990196,0.26087,0.666667,0.732258
3,Aake,Arska,1,aake,arska,"[aa, ke]","[ars, ka]",75,55,55,...,0.5,0.35,0.43,0.633333,0.4,0.4,0.984314,0.37037,0.611111,0.687097
4,Aandrea,Drea,1,aandrea,drea,"[aan, drea]",[drea],88,67,100,...,0.714286,0.409091,0.721429,0.464286,0.490476,0.571429,0.862745,0.606061,0.735849,0.571429


In [17]:
# Remove the intermediate syllable lists; they were only inputs to the fuzzywuzzy scores.
# Non-mutating drop with errors='ignore' keeps this cell safe to re-run
# (an in-place drop raises KeyError once the columns are already gone).
df = df.drop(columns=['syll_a', 'syll_b'], errors='ignore')
# Random spot check; no random_state is set, so the rows differ between runs.
df.sample(5)

Unnamed: 0,a,b,target,name_a,name_b,partial,tkn_sort,tkn_set,sum_ipa,pshp_soundex_first,...,editex,saps,flexmetric,jaro,higueramico,sift4,eudex,aline,covington,phoneticeditdistance
21370,Wole,Masa,0,wole,masa,67,20,20,0.645161,0,...,0.25,0.0,0.05,0.0,0.0,0.0,0.933333,0.2,0.5,0.818548
24733,Carolyn,Teodor,0,carolyn,teodor,57,25,25,0.448925,0,...,0.285714,0.0,0.242857,0.436508,0.142857,0.142857,0.902451,0.383784,0.453846,0.781106
39901,Kole,Tola,0,kole,tola,83,20,20,0.733871,0,...,0.625,0.285714,0.525,0.666667,0.5,0.5,0.811765,0.85,0.784091,0.931452
29479,Polde,Kym,0,polde,kym,43,0,0,0.384409,0,...,0.1,0.0,0.24,0.0,0.0,0.0,0.929902,0.225926,0.205128,0.493548
27316,Melvin,Miroslava,0,melvin,miroslava,67,42,42,0.723118,0,...,0.333333,0.0,0.333333,0.5,0.221032,0.222222,0.881373,0.319149,0.60274,0.584229


In [18]:
# Save the feature table to the interim data folder as a CSV file.
# index=False keeps the row index out of the file.
df.to_csv(interim_data + 'feature_engineering_results.csv', index=False)