# Chapter 3 - Text Matching

## Text Matching Techniques

In [6]:
# Install the jellyfish string-comparison package into the kernel's environment,
# then import it under the short alias `jf` used by the cells below.
# NOTE(review): the install is unpinned; the saved output shows version 1.0.0
# was resolved — consider pinning so a re-run reproduces the same scores.
%pip install jellyfish
import jellyfish as jf

Collecting jellyfish
  Using cached jellyfish-1.0.0-cp311-none-win_amd64.whl (206 kB)
Installing collected packages: jellyfish
Successfully installed jellyfish-1.0.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# Edit distance between 'Michael' and a variant with two adjacent letters swapped
# (the saved result is 2).
jf.levenshtein_distance('Michael','Micheal')

2

In [8]:
# Smallest Levenshtein distance among three short fragment pairs.
min(
    jf.levenshtein_distance(left, right)
    for left, right in (('el', 'ael'), ('ael', 'al'), ('el', 'al'))
)

1

In [9]:
# Jaro similarity for the same pair of spellings (a score, not an edit count).
jf.jaro_similarity('Michael','Micheal')

0.9523809523809524

In [10]:
# Jaro-Winkler similarity for the same pair of spellings.
jf.jaro_winkler_similarity('Michael','Micheal')

0.9714285714285714

In [11]:
# Same name in different letter case: the saved result is 0.0, i.e. the
# comparison is case-sensitive.
jf.jaro_winkler_similarity('michael','MICHAEL')

0.0

In [12]:
# Lower-casing both inputs before comparing makes the two spellings identical
# (the saved result is 1.0).
jf.jaro_winkler_similarity('michael'.lower(),'MICHAEL'.lower())

1.0

In [13]:
# Metaphone phonetic code for 'michael'.
jf.metaphone('michael')

'MXL'

In [14]:
# Metaphone phonetic code for the swapped-letter spelling 'micheal'
# (the saved result is 'MXL').
jf.metaphone('micheal')

'MXL'

In [15]:
import pandas as pd

In [16]:
# Sample names compared pairwise by the cells that iterate over `mylist`.
mylist = ['Michael','Micheal','Michel','Mike','Mick']

In [17]:
import itertools

# One row per unordered pair of names in `mylist`: the two names followed by
# their Jaro, Jaro-Winkler, Levenshtein and match-rating results, and whether
# the two Soundex codes are equal.
combs = [
    [
        first,
        second,
        jf.jaro_similarity(first, second),
        jf.jaro_winkler_similarity(first, second),
        jf.levenshtein_distance(first, second),
        jf.match_rating_comparison(first, second),
        jf.soundex(first) == jf.soundex(second),
    ]
    for first, second in itertools.combinations(mylist, 2)
]
# Bare last expression so the notebook renders the comparison table.
pd.DataFrame(
    combs,
    columns=['Name1','Name2','Jaro','JaroW','Levenshtein','MRA','Soundex'],
)

Unnamed: 0,Name1,Name2,Jaro,JaroW,Levenshtein,MRA,Soundex
0,Michael,Micheal,0.952381,0.971429,2,True,True
1,Michael,Michel,0.952381,0.971429,1,True,True
2,Michael,Mike,0.72619,0.780952,4,False,False
3,Michael,Mick,0.72619,0.808333,4,True,False
4,Micheal,Michel,0.952381,0.971429,1,True,True
5,Micheal,Mike,0.72619,0.780952,4,False,False
6,Micheal,Mick,0.72619,0.808333,4,True,False
7,Michel,Mike,0.75,0.8,3,False,False
8,Michel,Mick,0.75,0.825,3,True,False
9,Mike,Mick,0.833333,0.866667,2,True,True


## Sample Problem

In [18]:
# Load the unmatched records; per the displayed frame, each row carries one
# first/last name pair in the *_w columns and another in the *_t columns.
df_w_un = pd.read_csv('mps_unmatched.csv')
# Bare last expression so the notebook renders the frame.
df_w_un

Unnamed: 0,Constituency,Fullname,Notes,Firstname_w,Lastname_w,Firstname_t,Lastname_t,Flink
0,Burton,Kate Griffiths,"Previous incumbent, Andrew Griffiths, did not ...",Kate,Griffiths,Kate,Kniveton,
1,Central Suffolk and North Ipswich,Dan Poulter,Seat held\n,Dan,Poulter,Daniel,Poulter,
2,Newton Abbot,Anne Marie Morris,Seat held\n,Anne,Marie Morris,Anne,Morris,https://facebook.com/annemarie.morris.NA
3,North Antrim,Ian Paisley,Seat held\n,Ian,Paisley,Ian,Paisley Jnr,
4,Slough,Tanmanjeet Dhesi,Seat held\n,Tanmanjeet,Dhesi,Tan,Dhesi,https://facebook.com/tandhesi
5,South Down,Chris Hazzard,Seat held\n,Chris,Hazzard,Christopher,Hazzard,https://facebook.com/chris.hazzard.77
6,South West Norfolk,Liz Truss,Seat held\n,Liz,Truss,Elizabeth,Truss,https://facebook.com/ElizabethTrussSWNorfolk
7,Wealden,Nus Ghani,Seat held\n,Nus,Ghani,Nusrat,Ghani,https://facebook.com/NusGhaniofficial
8,West Dunbartonshire,Martin Docherty-Hughes,Seat held\n,Martin,Docherty-Hughes,Martin,Docherty,https://facebook.com/MartinDochertySNP


In [19]:
# Jaro-Winkler similarity between the `_w` and `_t` spelling of each row's
# first name and last name, stored as two new score columns.
df_w_un['Firstname_jaro'] = df_w_un.apply(
    lambda row: jf.jaro_winkler_similarity(row['Firstname_w'], row['Firstname_t']),
    axis=1,
)
df_w_un['Lastname_jaro'] = df_w_un.apply(
    lambda row: jf.jaro_winkler_similarity(row['Lastname_w'], row['Lastname_t']),
    axis=1,
)

In [20]:
# Keep only the rows where both the first-name and the last-name score
# exceed 0.8.
df_w_un.loc[
    df_w_un['Firstname_jaro'].gt(0.8) & df_w_un['Lastname_jaro'].gt(0.8)
]

Unnamed: 0,Constituency,Fullname,Notes,Firstname_w,Lastname_w,Firstname_t,Lastname_t,Flink,Firstname_jaro,Lastname_jaro
1,Central Suffolk and North Ipswich,Dan Poulter,Seat held\n,Dan,Poulter,Daniel,Poulter,,0.883333,1.0
3,North Antrim,Ian Paisley,Seat held\n,Ian,Paisley,Ian,Paisley Jnr,,1.0,0.927273
4,Slough,Tanmanjeet Dhesi,Seat held\n,Tanmanjeet,Dhesi,Tan,Dhesi,https://facebook.com/tandhesi,0.836667,1.0
5,South Down,Chris Hazzard,Seat held\n,Chris,Hazzard,Christopher,Hazzard,https://facebook.com/chris.hazzard.77,0.890909,1.0
7,Wealden,Nus Ghani,Seat held\n,Nus,Ghani,Nusrat,Ghani,https://facebook.com/NusGhaniofficial,0.883333,1.0
8,West Dunbartonshire,Martin Docherty-Hughes,Seat held\n,Martin,Docherty-Hughes,Martin,Docherty,https://facebook.com/MartinDochertySNP,1.0,0.906667


In [21]:
import pandas as pd

# Load the two cleaned source tables.
df_w, df_t = (
    pd.read_csv(csv_name)
    for csv_name in ('mps_wiki_clean.csv', 'mps_they_clean.csv')
)

# Cartesian product: every row of `df_w` paired with every row of `df_t`.
# Column names present in both tables get the `_w` / `_t` suffix.
cross = pd.merge(df_w, df_t, how='cross', suffixes=('_w', '_t'))

In [22]:
# Preview the first five rows of the cross-joined frame.
cross.head(n=5)

Unnamed: 0,Constituency_w,Fullname,Notes,Firstname_w,Lastname_w,Constituency_t,Firstname_t,Lastname_t,Flink
0,Aberavon,Stephen Kinnock,Seat held\n,Stephen,Kinnock,Hackney North and Stoke Newington,Diane,Abbott,https://facebook.com/Dianeabbott
1,Aberavon,Stephen Kinnock,Seat held\n,Stephen,Kinnock,Oldham East and Saddleworth,Debbie,Abrahams,
2,Aberavon,Stephen Kinnock,Seat held\n,Stephen,Kinnock,Selby and Ainsty,Nigel,Adams,https://facebook.com/nigel.adamsmp
3,Aberavon,Stephen Kinnock,Seat held\n,Stephen,Kinnock,Hitchin and Harpenden,Bim,Afolami,
4,Aberavon,Stephen Kinnock,Seat held\n,Stephen,Kinnock,Windsor,Adam,Afriyie,https://facebook.com/adamafriyieofficial


In [23]:
# Flag each candidate pair in `cross` whose first names (and, separately, last
# names) have a Jaro-Winkler similarity above 0.8.
#
# The two name columns are zipped and compared directly instead of going
# through `DataFrame.apply(axis=1)`, which builds a Series for every row of the
# cross product. The `>` comparison already yields a bool, so no
# `True if ... else False` wrapper is needed. The resulting columns hold the
# same boolean values, in the same row order.
cross['Firstname_jaro'] = [
    jf.jaro_winkler_similarity(name_w, name_t) > 0.8
    for name_w, name_t in zip(cross['Firstname_w'], cross['Firstname_t'])
]
cross['Lastname_jaro'] = [
    jf.jaro_winkler_similarity(name_w, name_t) > 0.8
    for name_w, name_t in zip(cross['Lastname_w'], cross['Lastname_t'])
]

In [24]:
# Preview the first five rows again, now including the two boolean
# `Firstname_jaro` / `Lastname_jaro` flag columns.
cross.head(n=5)

Unnamed: 0,Constituency_w,Fullname,Notes,Firstname_w,Lastname_w,Constituency_t,Firstname_t,Lastname_t,Flink,Firstname_jaro,Lastname_jaro
0,Aberavon,Stephen Kinnock,Seat held\n,Stephen,Kinnock,Hackney North and Stoke Newington,Diane,Abbott,https://facebook.com/Dianeabbott,False,False
1,Aberavon,Stephen Kinnock,Seat held\n,Stephen,Kinnock,Oldham East and Saddleworth,Debbie,Abrahams,,False,False
2,Aberavon,Stephen Kinnock,Seat held\n,Stephen,Kinnock,Selby and Ainsty,Nigel,Adams,https://facebook.com/nigel.adamsmp,False,False
3,Aberavon,Stephen Kinnock,Seat held\n,Stephen,Kinnock,Hitchin and Harpenden,Bim,Afolami,,False,False
4,Aberavon,Stephen Kinnock,Seat held\n,Stephen,Kinnock,Windsor,Adam,Afriyie,https://facebook.com/adamafriyieofficial,False,False


In [25]:
# Pairs whose first and last names are both flagged as similar AND whose two
# constituency columns agree (`tp` is presumably "true positives").
tp = cross.loc[
    cross['Firstname_jaro']
    & cross['Lastname_jaro']
    & cross['Constituency_w'].eq(cross['Constituency_t'])
]
# Bare last expression: number of such pairs.
len(tp)

634

In [26]:
# Pairs whose first and last names are both flagged as similar but whose
# constituency columns differ (`fp` is presumably "false positives").
fp = cross.loc[
    cross['Firstname_jaro']
    & cross['Lastname_jaro']
    & cross['Constituency_w'].ne(cross['Constituency_t'])
]
# Bare last expression: number of such pairs.
len(fp)

19

In [40]:
# Preview the first five name-similar pairs from different constituencies.
# NOTE(review): the saved output beneath this cell lists more than five rows,
# and the cell's execution count (40) is out of sequence with its neighbours,
# so the stored output looks stale — re-run the notebook top to bottom.
fp.head(n=5)

Unnamed: 0,Constituency_w,Fullname,Notes,Firstname_w,Lastname_w,Constituency_t,Firstname_t,Lastname_t,Flink,Firstname_jaro,Lastname_jaro
40659,Blyth Valley,Ian Levy,"Previous incumbent, Ronnie Campbell, did not s...",Ian,Levy,Wansbeck,Ian,Lavery,https://facebook.com/IanLaveryMP,True,True
72718,Cardiff Central,Jo Stevens,Seat held\n,Jo,Stevens,Carlisle,John,Stevenson,https://facebook.com/JohnStevensonCarlisle,True,True
73245,Cardiff North,Anna McMorrin,Seat held\n,Anna,McMorrin,Newton Abbot,Anne,Morris,https://facebook.com/annemarie.morris.NA,True,True
75316,Carlisle,John Stevenson,Seat held\n,John,Stevenson,Cardiff Central,Jo,Stevens,https://facebook.com/JoStevensLabour,True,True
76340,Carmarthen West and South Pembrokeshire,Simon Hart,Seat held\n,Simon,Hart,North Dorset,Simon,Hoare,https://facebook.com/simonhoarenorthdorset,True,True
93928,Clwyd West,David Jones,Seat held\n,David,Jones,Wantage,David,Johnston,,True,True
103760,Cumbernauld Kilsyth and Kirkintilloch East,Stuart McDonald,Seat held\n,Stuart,McDonald,Glasgow South,Stewart,McDonald,https://facebook.com/Stewart-McDonald-for-Glas...,True,True
157711,Glasgow South,Stewart McDonald,Seat held\n,Stewart,McDonald,Cumbernauld Kilsyth and Kirkintilloch East,Stuart,McDonald,https://facebook.com/Stuart-McDonald-For-Cumbe...,True,True
161993,Grantham and Stamford,Gareth Davies,"Previous incumbent, Nick Boles, did not stand\n",Gareth,Davies,Swansea West,Geraint,Davies,https://facebook.com/GeraintDaviesMP,True,True
168491,Haltemprice and Howden,David Davis,Seat held\n,David,Davis,Monmouth,David,Davies,https://facebook.com/davidtcdavies,True,True


In [28]:
# Same-constituency pairs where at least one of the two name flags is False,
# i.e. NOT (first-name match AND last-name match).
fntn = cross.loc[
    ~(cross['Firstname_jaro'] & cross['Lastname_jaro'])
    & cross['Constituency_w'].eq(cross['Constituency_t'])
]
# Bare last expression: number of such pairs.
len(fntn)

16

In [29]:
# Display the same-constituency pairs that failed the name-similarity test.
fntn

Unnamed: 0,Constituency_w,Fullname,Notes,Firstname_w,Lastname_w,Constituency_t,Firstname_t,Lastname_t,Flink,Firstname_jaro,Lastname_jaro
3100,Airdrie and Shotts,Neil Gray,Seat held\n,Neil,Gray,Airdrie and Shotts,Anum,Qaisar,,False,False
19211,Batley and Spen,Tracy Brabin,Seat held\n,Tracy,Brabin,Batley and Spen,Kim,Leadbeater,,False,False
30812,Birmingham Erdington,Jack Dromey,Seat held\n,Jack,Dromey,Birmingham Erdington,Paulette,Hamilton,,False,False
64699,Burton,Kate Griffiths,"Previous incumbent, Andrew Griffiths, did not ...",Kate,Griffiths,Burton,Kate,Kniveton,,True,False
84750,Chesham and Amersham,Cheryl Gillan,Seat held\n,Cheryl,Gillan,Chesham and Amersham,Sarah,Green,,False,False
90506,City of Chester,Chris Matheson,Seat held\n,Chris,Matheson,City of Chester,Samantha,Dixon,,False,False
174650,Hartlepool,Mike Hill,Seat held\n,Mike,Hill,Hartlepool,Jill,Mortimer,,False,False
255245,Newton Abbot,Anne Marie Morris,Seat held\n,Anne,Marie Morris,Newton Abbot,Anne,Morris,https://facebook.com/annemarie.morris.NA,True,False
266943,North Shropshire,Owen Paterson,Seat held\n,Owen,Paterson,North Shropshire,Helen,Morgan,,False,False
281023,Old Bexley and Sidcup,James Brokenshire,Seat held\n,James,Brokenshire,Old Bexley and Sidcup,Louie,French,,False,False


In [30]:
# Average number of rows in `df_w` sharing each distinct first name
# (mean of the per-value counts).
df_w['Firstname'].value_counts().mean()

1.8950437317784257

In [31]:
# Average number of rows in `df_w` sharing each distinct last name
# (mean of the per-value counts).
df_w['Lastname'].value_counts().mean()

1.1545293072824157

In [32]:
# Average number of rows in `df_w` per distinct constituency; the saved result
# of 1.0 means each constituency appears exactly once.
df_w['Constituency'].value_counts().mean()

1.0

In [33]:
# Metaphone phonetic code of each `Firstname_w` value, stored as a new column.
# `Series.map` applies the encoder element-wise to the one column it needs,
# rather than passing every full row through `DataFrame.apply(axis=1)`.
df_w_un['Firstname_w_meta'] = df_w_un['Firstname_w'].map(jf.metaphone)

In [34]:
# Display the frame, now including the `Firstname_w_meta` column.
df_w_un

Unnamed: 0,Constituency,Fullname,Notes,Firstname_w,Lastname_w,Firstname_t,Lastname_t,Flink,Firstname_jaro,Lastname_jaro,Firstname_w_meta
0,Burton,Kate Griffiths,"Previous incumbent, Andrew Griffiths, did not ...",Kate,Griffiths,Kate,Kniveton,,1.0,0.490741,KT
1,Central Suffolk and North Ipswich,Dan Poulter,Seat held\n,Dan,Poulter,Daniel,Poulter,,0.883333,1.0,TN
2,Newton Abbot,Anne Marie Morris,Seat held\n,Anne,Marie Morris,Anne,Morris,https://facebook.com/annemarie.morris.NA,1.0,0.583333,AN
3,North Antrim,Ian Paisley,Seat held\n,Ian,Paisley,Ian,Paisley Jnr,,1.0,0.927273,IN
4,Slough,Tanmanjeet Dhesi,Seat held\n,Tanmanjeet,Dhesi,Tan,Dhesi,https://facebook.com/tandhesi,0.836667,1.0,TNMNJT
5,South Down,Chris Hazzard,Seat held\n,Chris,Hazzard,Christopher,Hazzard,https://facebook.com/chris.hazzard.77,0.890909,1.0,XRS
6,South West Norfolk,Liz Truss,Seat held\n,Liz,Truss,Elizabeth,Truss,https://facebook.com/ElizabethTrussSWNorfolk,0.62963,1.0,LS
7,Wealden,Nus Ghani,Seat held\n,Nus,Ghani,Nusrat,Ghani,https://facebook.com/NusGhaniofficial,0.883333,1.0,NS
8,West Dunbartonshire,Martin Docherty-Hughes,Seat held\n,Martin,Docherty-Hughes,Martin,Docherty,https://facebook.com/MartinDochertySNP,1.0,0.906667,MRTN


In [35]:
# Metaphone phonetic code of each `Firstname_t` value, stored as a new column.
# `Series.map` applies the encoder element-wise to the one column it needs,
# rather than passing every full row through `DataFrame.apply(axis=1)`.
df_w_un['Firstname_t_meta'] = df_w_un['Firstname_t'].map(jf.metaphone)

In [36]:
# Display the frame, now including the `Firstname_t_meta` column.
df_w_un

Unnamed: 0,Constituency,Fullname,Notes,Firstname_w,Lastname_w,Firstname_t,Lastname_t,Flink,Firstname_jaro,Lastname_jaro,Firstname_w_meta,Firstname_t_meta
0,Burton,Kate Griffiths,"Previous incumbent, Andrew Griffiths, did not ...",Kate,Griffiths,Kate,Kniveton,,1.0,0.490741,KT,KT
1,Central Suffolk and North Ipswich,Dan Poulter,Seat held\n,Dan,Poulter,Daniel,Poulter,,0.883333,1.0,TN,TNL
2,Newton Abbot,Anne Marie Morris,Seat held\n,Anne,Marie Morris,Anne,Morris,https://facebook.com/annemarie.morris.NA,1.0,0.583333,AN,AN
3,North Antrim,Ian Paisley,Seat held\n,Ian,Paisley,Ian,Paisley Jnr,,1.0,0.927273,IN,IN
4,Slough,Tanmanjeet Dhesi,Seat held\n,Tanmanjeet,Dhesi,Tan,Dhesi,https://facebook.com/tandhesi,0.836667,1.0,TNMNJT,TN
5,South Down,Chris Hazzard,Seat held\n,Chris,Hazzard,Christopher,Hazzard,https://facebook.com/chris.hazzard.77,0.890909,1.0,XRS,XRSTFR
6,South West Norfolk,Liz Truss,Seat held\n,Liz,Truss,Elizabeth,Truss,https://facebook.com/ElizabethTrussSWNorfolk,0.62963,1.0,LS,ELSB0
7,Wealden,Nus Ghani,Seat held\n,Nus,Ghani,Nusrat,Ghani,https://facebook.com/NusGhaniofficial,0.883333,1.0,NS,NSRT
8,West Dunbartonshire,Martin Docherty-Hughes,Seat held\n,Martin,Docherty-Hughes,Martin,Docherty,https://facebook.com/MartinDochertySNP,1.0,0.906667,MRTN,MRTN


In [37]:
# Metaphone phonetic codes of the two last-name columns, stored as new columns.
# `Series.map` applies the encoder element-wise to the one column it needs,
# rather than passing every full row through `DataFrame.apply(axis=1)`.
df_w_un['Lastname_w_meta'] = df_w_un['Lastname_w'].map(jf.metaphone)
df_w_un['Lastname_t_meta'] = df_w_un['Lastname_t'].map(jf.metaphone)

In [38]:
# Display the frame, now including the two last-name Metaphone columns.
df_w_un

Unnamed: 0,Constituency,Fullname,Notes,Firstname_w,Lastname_w,Firstname_t,Lastname_t,Flink,Firstname_jaro,Lastname_jaro,Firstname_w_meta,Firstname_t_meta,Lastname_w_meta,Lastname_t_meta
0,Burton,Kate Griffiths,"Previous incumbent, Andrew Griffiths, did not ...",Kate,Griffiths,Kate,Kniveton,,1.0,0.490741,KT,KT,KRF0S,NFTN
1,Central Suffolk and North Ipswich,Dan Poulter,Seat held\n,Dan,Poulter,Daniel,Poulter,,0.883333,1.0,TN,TNL,PLTR,PLTR
2,Newton Abbot,Anne Marie Morris,Seat held\n,Anne,Marie Morris,Anne,Morris,https://facebook.com/annemarie.morris.NA,1.0,0.583333,AN,AN,MR MRS,MRS
3,North Antrim,Ian Paisley,Seat held\n,Ian,Paisley,Ian,Paisley Jnr,,1.0,0.927273,IN,IN,PSL,PSL JNR
4,Slough,Tanmanjeet Dhesi,Seat held\n,Tanmanjeet,Dhesi,Tan,Dhesi,https://facebook.com/tandhesi,0.836667,1.0,TNMNJT,TN,THS,THS
5,South Down,Chris Hazzard,Seat held\n,Chris,Hazzard,Christopher,Hazzard,https://facebook.com/chris.hazzard.77,0.890909,1.0,XRS,XRSTFR,HSRT,HSRT
6,South West Norfolk,Liz Truss,Seat held\n,Liz,Truss,Elizabeth,Truss,https://facebook.com/ElizabethTrussSWNorfolk,0.62963,1.0,LS,ELSB0,TRS,TRS
7,Wealden,Nus Ghani,Seat held\n,Nus,Ghani,Nusrat,Ghani,https://facebook.com/NusGhaniofficial,0.883333,1.0,NS,NSRT,KHN,KHN
8,West Dunbartonshire,Martin Docherty-Hughes,Seat held\n,Martin,Docherty-Hughes,Martin,Docherty,https://facebook.com/MartinDochertySNP,1.0,0.906667,MRTN,MRTN,TXRTHKHS,TXRT


In [39]:
import itertools

# One row per unordered pair of names in `mylist`: the two names, their Jaro,
# Jaro-Winkler, Levenshtein and match-rating results, whether the two Soundex
# codes are equal, and each name's Metaphone code.
combs = [
    [
        first,
        second,
        jf.jaro_similarity(first, second),
        jf.jaro_winkler_similarity(first, second),
        jf.levenshtein_distance(first, second),
        jf.match_rating_comparison(first, second),
        jf.soundex(first) == jf.soundex(second),
        jf.metaphone(first),
        jf.metaphone(second),
    ]
    for first, second in itertools.combinations(mylist, 2)
]
# Bare last expression so the notebook renders the comparison table.
pd.DataFrame(
    combs,
    columns=['Name1','Name2','Jaro','JaroW','Levenshtein','MRA','Soundex','Name1Meta','Name2Meta'],
)

Unnamed: 0,Name1,Name2,Jaro,JaroW,Levenshtein,MRA,Soundex,Name1Meta,Name2Meta
0,Michael,Micheal,0.952381,0.971429,2,True,True,MXL,MXL
1,Michael,Michel,0.952381,0.971429,1,True,True,MXL,MXL
2,Michael,Mike,0.72619,0.780952,4,False,False,MXL,MK
3,Michael,Mick,0.72619,0.808333,4,True,False,MXL,MK
4,Micheal,Michel,0.952381,0.971429,1,True,True,MXL,MXL
5,Micheal,Mike,0.72619,0.780952,4,False,False,MXL,MK
6,Micheal,Mick,0.72619,0.808333,4,True,False,MXL,MK
7,Michel,Mike,0.75,0.8,3,False,False,MXL,MK
8,Michel,Mick,0.75,0.825,3,True,False,MXL,MK
9,Mike,Mick,0.833333,0.866667,2,True,True,MK,MK
