In [1]:
import pandas as pd

df_w = pd.read_csv('mps_wiki_clean.csv')
df_t = pd.read_csv('mps_they_clean.csv')

# Extract Match and NotMatch Population

In [2]:
# Total population of candidate pairs computed by cross product: 650 x 650 = 422500

cross = df_w.merge(df_t, how='cross',suffixes=('_w', '_t'))
len(cross)

422500

In [3]:
# Generate exact matches as feature columns

# Vectorised column comparisons produce the same boolean flags as a row-wise
# apply (NaN never equals NaN in either form) without invoking a Python lambda
# for every row of the cross product.
cross['Fmatch'] = cross['Firstname_w'] == cross['Firstname_t']
cross['Lmatch'] = cross['Lastname_w'] == cross['Lastname_t']
cross['Cmatch'] = cross['Constituency_w'] == cross['Constituency_t']

# Tmatch counts how many of the three fields agree for each candidate pair (0-3)
cross['Tmatch'] = cross[['Fmatch', 'Lmatch', 'Cmatch']].sum(axis=1)

In [4]:
# Extract the match population as Constituency and either Firstname or Lastname matches

match = cross[cross['Cmatch'] & (cross['Fmatch'] | cross['Lmatch'])]
len(match)

637

In [5]:
# Extract notmatch population as either Constituency doesn't match or neither Firstname nor Lastname matches (by-election)

# Negation of the match filter: a pair is a non-match unless Constituency
# agrees and at least one of Firstname / Lastname agrees.
notmatch = cross[~(cross['Cmatch'] & (cross['Fmatch'] | cross['Lmatch']))]
len(notmatch)

421863

 # Firstname Counts

In [6]:
# Within match population how many Firstnames match?

first_match = match[match['Fmatch']]
len(first_match)

632

In [7]:
# Within match population how many Firstnames don't match?

notfirst_match = match[~match['Fmatch']]
len(notfirst_match)

5

In [8]:
# Within the non match population how many Firstnames match?

first_notmatch = notmatch[notmatch['Fmatch']]
len(first_notmatch)

2052

In [9]:
# Within the non match population how many Firstnames don't match?

notfirst_notmatch = notmatch[~notmatch['Fmatch']]
len(notfirst_notmatch)

419811

## Firstname Match Probabilities

In [10]:
# If Firstname matches how likely is it we have a match?
# Calculate as number of Firstname matches within match population divided by total number of Firstname matches

prob_match_first = len(first_match) / (len(first_match) + len(first_notmatch))
prob_match_first

0.23546944858420268

In [11]:
# Probability of a match in full population

prob_match = len(match) / len(cross)
prob_match

0.0015076923076923078

In [12]:
# Probability of Firstname match within match population

prob_first_match = len(first_match) / len(match)
prob_first_match

0.9921507064364207

In [13]:
# Probability of Firstname match within full population

prob_first = len(cross[cross['Fmatch']]) / len(cross)
prob_first

0.006352662721893491

In [14]:
# Probability of match within population of Firstname matches

prob_match_first = prob_first_match * prob_match / prob_first
prob_match_first

0.23546944858420268

In [15]:
# Probability that Firstname matches within not match population

prob_first_notmatch = len(first_notmatch) / len(notmatch)
prob_first_notmatch

0.004864138357713286

In [16]:
mf = prob_first_match
uf = prob_first_notmatch
lmbda = prob_match
(lmbda * mf) / (lmbda * mf + (1-lmbda) * uf)

0.23546944858420268

# Lastname Counts

In [17]:
# Within match population how many Lastnames match?

last_match = match[match['Lmatch']]
len(last_match)

633

In [18]:
# Within match population how many Lastnames don't match?

notlast_match = match[~match['Lmatch']]
len(notlast_match)

4

In [19]:
# Within the non match population how many Lastnames match?

last_notmatch = notmatch[notmatch['Lmatch']]
len(last_notmatch)

349

In [20]:
# Within the non match population how many Lastnames don't match?

notlast_notmatch = notmatch[~notmatch['Lmatch']]
len(notlast_notmatch)

421514

## Lastname Match Probabilities

In [21]:
last = cross[cross['Lmatch']]
len(last)

982

In [22]:
# If Lastname matches how likely is it we have a match?
# Calculate as number of Lastname matches within match population divided by total number of Lastname matches

prob_match_last = len(last_match) / (len(last_match) + len(last_notmatch))
prob_match_last

0.6446028513238289

In [23]:
# Probability of Lastname match within full population

prob_last = len(last)/len(cross)
prob_last

0.002324260355029586

In [24]:
# Probability of Lastname match within match population

prob_last_match = len(last_match) / len (match)
prob_last_match

0.9937205651491365

In [25]:
# Probability that Lastname matches within not match population

prob_last_notmatch = len (last_notmatch) / len(notmatch)
prob_last_notmatch

0.0008272827908586437

In [26]:
ml = prob_last_match
ul = prob_last_notmatch
lmbda = prob_match
(lmbda * ml) / (lmbda * ml + (1-lmbda) * ul)

0.6446028513238289

# Firstname and Lastname Counts

In [27]:
last_first_match = first_match[first_match['Lmatch']]
len(last_first_match)

628

In [28]:
ml1 = prob_last_match
ul1 = prob_last_notmatch
(lmbda * ml1) / (lmbda * ml1 + (1-lmbda) * ul1)

0.6446028513238289

In [29]:
notlast_first_match = first_match[~first_match['Lmatch']]
len(notlast_first_match)

4

In [30]:
last_first_notmatch = first_notmatch[first_notmatch['Lmatch']]
len(last_first_notmatch)

0

In [31]:
notlast_first_notmatch = first_notmatch[~first_notmatch['Lmatch']]
len(notlast_first_notmatch)

2052

In [32]:
last_notfirst_match = notfirst_match[notfirst_match['Lmatch']]
len(last_notfirst_match)

5

In [33]:
notlast_notfirst_match = notfirst_match[~notfirst_match['Lmatch']]
len(notlast_notfirst_match)

0

In [34]:
last_notfirst_notmatch = notfirst_notmatch[notfirst_notmatch['Lmatch']]
len(last_notfirst_notmatch)

349

In [35]:
notlast_notfirst_notmatch = notfirst_notmatch[~notfirst_notmatch['Lmatch']]
len(notlast_notfirst_notmatch)

419462

# Firstname and Lastname Match Probabilities

In [36]:
# If Firstname matches but Lastname doesn't what is the probability that it's a match?

prob_match_first_notlast = len(notlast_first_match) / (len(notlast_first_match) + len(notlast_first_notmatch))
# 4 / (4 + 2052 )
prob_match_first_notlast

0.0019455252918287938

In [37]:
# If Lastname matches but Firstname doesn't what is the probability that it's a match?

prob_match_notfirst_last = len(last_notfirst_match) / (len(last_notfirst_match) + len(last_notfirst_notmatch))
# 5 / (5 + 349) 
prob_match_notfirst_last

0.014124293785310734

# Firstname and Lastname probabilities with m and u values

In [38]:
# Probability of match if Firstname and Lastname matches

(mf * ml * lmbda) / (mf * ml * lmbda + uf * ul * (1-lmbda))

0.9973042620923954

In [39]:
# Probability of match if Firstname matches but Lastname doesn't

(mf * (1-ml) * lmbda) / ((mf * (1-ml) * lmbda) + (uf * (1-ul) * (1-lmbda)))

0.0019318788952045723

In [40]:
# Probability of match if Firstname doesn't match but Lastname does

((1-mf) * ml * lmbda) / (((1-mf) * ml * lmbda) + ((1-uf) * ul * (1-lmbda)))

0.0141044893399228

In [41]:
# Probability of match if neither Firstname nor Lastname matches

((1-mf) * (1-ml) * lmbda) / (((1-mf) * (1-ml) * lmbda) + ((1-uf) * (1-ul) * (1-lmbda)))

7.485074890874575e-08

# Expectation Maximisation

In [42]:
# Helper function to calculate probability based on match features

def match_prb(Fmatch,Lmatch,Cmatch,mf1,ml1,mc1,uf1,ul1,uc1, lmbda):
    """Return the match probability for one agreement pattern.

    Parameters
    ----------
    Fmatch, Lmatch, Cmatch
        Agreement flags for Firstname, Lastname and Constituency. A field is
        treated as agreeing only when its flag compares equal to 1.
    mf1, ml1, mc1
        Probability that each field agrees given the pair is a match.
    uf1, ul1, uc1
        Probability that each field agrees given the pair is not a match.
    lmbda
        Prior probability that a pair is a match.

    Returns
    -------
    lmbda * m / (lmbda * m + (1 - lmbda) * u), where m and u are the products
    of the per-field probabilities selected by the flags. No guard is applied
    if that denominator is zero.
    """
    def _level(agrees, m_agree, u_agree):
        # Use the agreement probabilities when the field agrees, otherwise
        # their complements.
        if agrees == 1:
            return m_agree, u_agree
        return (1 - m_agree), (1 - u_agree)

    m_first, u_first = _level(Fmatch, mf1, uf1)
    m_last, u_last = _level(Lmatch, ml1, ul1)
    m_const, u_const = _level(Cmatch, mc1, uc1)

    # Weighted likelihood of the pattern under the match / non-match models
    match_term = lmbda * m_last * m_first * m_const
    nonmatch_term = (1 - lmbda) * u_last * u_first * u_const
    return match_term / (match_term + nonmatch_term)

# EM Iteration 1

In [43]:
# Select population where majority of columns match

it1_match = cross[cross['Tmatch']>=2]
it1_notmatch = cross[cross['Tmatch']<2]

len(it1_match)

637

In [44]:
it1_match[~it1_match['Fmatch'] | ~it1_match['Lmatch']]

Unnamed: 0,Unnamed: 0_w,Constituency_w,Fullname,Notes,Firstname_w,Lastname_w,Unnamed: 0_t,Constituency_t,Firstname_t,Lastname_t,Flink,Fmatch,Lmatch,Cmatch,Tmatch
64699,100,Burton,Kate Griffiths,"Previous incumbent, Andrew Griffiths, did not ...",Kate,Griffiths,349,Burton,Kate,Kniveton,,True,False,True,2
79794,123,Central Suffolk and North Ipswich,Dan Poulter,Seat held\n,Dan,Poulter,494,Central Suffolk and North Ipswich,Daniel,Poulter,,False,True,True,2
255245,393,Newton Abbot,Anne Marie Morris,Seat held\n,Anne,Marie Morris,445,Newton Abbot,Anne,Morris,https://facebook.com/annemarie.morris.NA,True,False,True,2
256580,395,North Antrim,Ian Paisley,Seat held\n,Ian,Paisley,480,North Antrim,Ian,Paisley Jnr,,True,False,True,2
326453,503,Slough,Tanmanjeet Dhesi,Seat held\n,Tanmanjeet,Dhesi,153,Slough,Tan,Dhesi,https://facebook.com/tandhesi,False,True,True,2
331778,511,South Down,Chris Hazzard,Seat held\n,Chris,Hazzard,278,South Down,Christopher,Hazzard,https://facebook.com/chris.hazzard.77,False,True,True,2
342502,527,South West Norfolk,Liz Truss,Seat held\n,Liz,Truss,602,South West Norfolk,Elizabeth,Truss,https://facebook.com/ElizabethTrussSWNorfolk,False,True,True,2
393480,606,Wealden,Nus Ghani,Seat held\n,Nus,Ghani,230,Wealden,Nusrat,Ghani,https://facebook.com/NusGhaniofficial,False,True,True,2
399909,616,West Dunbartonshire,Martin Docherty-Hughes,Seat held\n,Martin,Docherty-Hughes,159,West Dunbartonshire,Martin,Docherty,https://facebook.com/MartinDochertySNP,True,False,True,2


In [45]:
# Calculate m values

mfi1 = it1_match['Fmatch'].sum()/len(it1_match)
mli1 = it1_match['Lmatch'].sum()/len(it1_match)
mci1 = it1_match['Cmatch'].sum()/len(it1_match)

In [46]:
mfi1

0.9921507064364207

In [47]:
mli1

0.9937205651491365

In [48]:
mci1

1.0

In [49]:
# Calculate u values

ufi1 = it1_notmatch['Fmatch'].sum() / len(it1_notmatch)
uli1 = it1_notmatch['Lmatch'].sum() / len(it1_notmatch)
uci1 = it1_notmatch['Cmatch'].sum() / len(it1_notmatch)

In [50]:
ufi1

0.004864138357713286

In [51]:
uli1

0.0008272827908586437

In [52]:
uci1

3.0815691350035436e-05

In [53]:
lmbda = len(it1_match)/len(cross)
lmbda

0.0015076923076923078

# EM Iteration 2

In [54]:
# Calculate the match probability for each combination

cross['prob'] = cross.apply(lambda x: match_prb(x.Fmatch,x.Lmatch,x.Cmatch,
                                       mfi1,mli1,mci1,  
                                        ufi1,uli1,uci1,
                                        lmbda), axis=1)

# Set match threshold as > 0.99 probability

it2_match = cross[cross['prob']>0.99]
it2_notmatch = cross[cross['prob']<=0.99]
len(it2_match)

633

In [55]:
# Select records just below the match threshold

it2_notmatch[it2_notmatch['prob']>0.8]

Unnamed: 0,Unnamed: 0_w,Constituency_w,Fullname,Notes,Firstname_w,Lastname_w,Unnamed: 0_t,Constituency_t,Firstname_t,Lastname_t,Flink,Fmatch,Lmatch,Cmatch,Tmatch,prob
64699,100,Burton,Kate Griffiths,"Previous incumbent, Andrew Griffiths, did not ...",Kate,Griffiths,349,Burton,Kate,Kniveton,,True,False,True,2,0.984329
255245,393,Newton Abbot,Anne Marie Morris,Seat held\n,Anne,Marie Morris,445,Newton Abbot,Anne,Morris,https://facebook.com/annemarie.morris.NA,True,False,True,2,0.984329
256580,395,North Antrim,Ian Paisley,Seat held\n,Ian,Paisley,480,North Antrim,Ian,Paisley Jnr,,True,False,True,2,0.984329
399909,616,West Dunbartonshire,Martin Docherty-Hughes,Seat held\n,Martin,Docherty-Hughes,159,West Dunbartonshire,Martin,Docherty,https://facebook.com/MartinDochertySNP,True,False,True,2,0.984329


In [56]:
# Recalculate m values

mfi2 = it2_match['Fmatch'].sum()/len(it2_match)
mli2 = it2_match['Lmatch'].sum()/len(it2_match)
mci2 = it2_match['Cmatch'].sum()/len(it2_match)

In [57]:
mfi2

0.9921011058451816

In [58]:
mli2

1.0

In [59]:
mci2

1.0

In [60]:
# Recalculate u values

ufi2 = it2_notmatch['Fmatch'].sum() / len(it2_notmatch)
uli2 = it2_notmatch['Lmatch'].sum() / len(it2_notmatch)
uci2 = it2_notmatch['Cmatch'].sum() / len(it2_notmatch)

In [61]:
ufi2

0.004873573898882823

In [62]:
uli2

0.0008272749468434365

In [63]:
uci2

4.029706044796109e-05

In [64]:
lmbda = len(it2_match)/len(cross)
lmbda

0.001498224852071006

# EM Iteration 3

In [65]:
# Calculate the match probability for each combination

cross['prob'] = cross.apply(lambda x: match_prb(x.Fmatch,x.Lmatch,x.Cmatch,
                                        mfi2,mli2,mci2,  
                                        ufi2,uli2,uci2,
                                        lmbda), axis=1)

# Set match threshold as > 0.99 probability

it3_match = cross[cross['prob']>0.99]
len(it3_match)

633

In [66]:
# How many records are above match threshold when either Firstname or Lastname don't match

it3_match[~it3_match['Fmatch'] | ~it3_match['Lmatch']]

Unnamed: 0,Unnamed: 0_w,Constituency_w,Fullname,Notes,Firstname_w,Lastname_w,Unnamed: 0_t,Constituency_t,Firstname_t,Lastname_t,Flink,Fmatch,Lmatch,Cmatch,Tmatch,prob
79794,123,Central Suffolk and North Ipswich,Dan Poulter,Seat held\n,Dan,Poulter,494,Central Suffolk and North Ipswich,Daniel,Poulter,,False,True,True,2,0.997209
326453,503,Slough,Tanmanjeet Dhesi,Seat held\n,Tanmanjeet,Dhesi,153,Slough,Tan,Dhesi,https://facebook.com/tandhesi,False,True,True,2,0.997209
331778,511,South Down,Chris Hazzard,Seat held\n,Chris,Hazzard,278,South Down,Christopher,Hazzard,https://facebook.com/chris.hazzard.77,False,True,True,2,0.997209
342502,527,South West Norfolk,Liz Truss,Seat held\n,Liz,Truss,602,South West Norfolk,Elizabeth,Truss,https://facebook.com/ElizabethTrussSWNorfolk,False,True,True,2,0.997209
393480,606,Wealden,Nus Ghani,Seat held\n,Nus,Ghani,230,Wealden,Nusrat,Ghani,https://facebook.com/NusGhaniofficial,False,True,True,2,0.997209


# Splink

In [67]:
#%pip install splink
# After install you may need to restart the kernel and reload datasets in cell 1.

import splink
import numpy as np

In [68]:
# Add a unique_id column needed by Splink to both datasets

df_w['unique_id'] = df_w.index
df_t['unique_id'] = df_t.index

In [69]:
df_w['Flink'] = np.nan
df_t['Notes'] = np.nan

In [70]:
df_w = df_w[['Firstname','Lastname','Constituency','Flink','Notes','unique_id']]
df_t = df_t[['Firstname','Lastname','Constituency','Flink','Notes','unique_id']]

In [71]:
# Splink settings to match on Firstname, Lastname and Constituency

from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl

settings = {
    "link_type": "link_only",
    "comparisons": [
        cl.exact_match("Firstname"),
        cl.exact_match("Lastname"),
        cl.exact_match("Constituency"),
    ],
}
linker = DuckDBLinker([df_w, df_t], settings)

In [72]:
# Examine distribution of columns to be matched

linker.profile_columns(['Firstname','Lastname','Constituency'])

In [73]:
em_session = linker.estimate_parameters_using_expectation_maximisation('True', fix_u_probabilities=False, fix_probability_two_random_records_match=False)
em_session.match_weights_interactive_history_chart()


----- Starting EM training session -----

Estimating the m and u probabilities of the model by blocking on:
True

Parameter estimates will be made for the following comparison(s):
    - Firstname
    - Lastname
    - Constituency

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Iteration 1: Largest change in params was -0.605 in the u_probability of Firstname, level `All other comparisons`
Iteration 2: Largest change in params was 0.00176 in the m_probability of Firstname, level `All other comparisons`
Iteration 3: Largest change in params was -5.69e-05 in the m_probability of Lastname, level `Exact match`

EM converged after 3 iterations

Your model is fully trained. All comparisons have at least one estimate for their m and u values


In [74]:
linker.save_settings_to_json()

{'link_type': 'link_only',
 'comparisons': [{'output_column_name': 'Firstname',
   'comparison_levels': [{'sql_condition': '"Firstname_l" IS NULL OR "Firstname_r" IS NULL',
     'label_for_charts': 'Null',
     'is_null_level': True},
    {'sql_condition': '"Firstname_l" = "Firstname_r"',
     'label_for_charts': 'Exact match',
     'm_probability': 0.992118804074688,
     'u_probability': 0.004864290128404288},
    {'sql_condition': 'ELSE',
     'label_for_charts': 'All other comparisons',
     'm_probability': 0.007881195925311958,
     'u_probability': 0.9951357098715956}],
   'comparison_description': 'Exact match vs. anything else'},
  {'output_column_name': 'Lastname',
   'comparison_levels': [{'sql_condition': '"Lastname_l" IS NULL OR "Lastname_r" IS NULL',
     'label_for_charts': 'Null',
     'is_null_level': True},
    {'sql_condition': '"Lastname_l" = "Lastname_r"',
     'label_for_charts': 'Exact match',
     'm_probability': 0.9937726043638647,
     'u_probability': 0.0008

In [75]:
# Calculate predictions, apply probability threshold of 0.9 and convert to dataframe

pres = linker.predict(threshold_match_probability = 0.9).as_pandas_dataframe()
len(pres)

633

In [76]:
pres[pres['match_probability']<0.99]

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,Firstname_l,Firstname_r,gamma_Firstname,Lastname_l,Lastname_r,gamma_Lastname,Constituency_l,Constituency_r,gamma_Constituency
152,4.943439,0.968523,_a,_b,502,153,Tanmanjeet,Tan,0,Dhesi,Dhesi,1,Slough,Slough,1
224,4.943439,0.968523,_a,_b,605,230,Nus,Nusrat,0,Ghani,Ghani,1,Wealden,Wealden,1
270,4.943439,0.968523,_a,_b,510,278,Chris,Christopher,0,Hazzard,Hazzard,1,South Down,South Down,1
479,4.943439,0.968523,_a,_b,122,494,Dan,Daniel,0,Poulter,Poulter,1,Central Suffolk and North Ipswich,Central Suffolk and North Ipswich,1
586,4.943439,0.968523,_a,_b,526,602,Liz,Elizabeth,0,Truss,Truss,1,South West Norfolk,South West Norfolk,1


# Real Time Match

In [77]:
# Real time match example
record = dict(
    unique_id=999,
    Firstname="Dan",
    Lastname="Poulter",
    Constituency='Central Suffolk and North Ipswich',
)

df_w_inc = linker.find_matches_to_new_records(
    [record],
    blocking_rules=[],
    match_weight_threshold=-5,
).as_pandas_dataframe()
df_w_inc.sort_values("match_weight", ascending=False)

Unnamed: 0,match_weight,match_probability,unique_id_l,unique_id_r,Firstname_l,Firstname_r,gamma_Firstname,Lastname_l,Lastname_r,gamma_Lastname,Constituency_l,Constituency_r,gamma_Constituency
0,19.595913,0.999999,122,999,Dan,Dan,1,Poulter,Poulter,1,Central Suffolk and North Ipswich,Central Suffolk and North Ipswich,1
1,4.943439,0.968523,494,999,Daniel,Dan,0,Poulter,Poulter,1,Central Suffolk and North Ipswich,Central Suffolk and North Ipswich,1
