In [1]:
import pandas as pd

df = pd.read_csv('mps_wiki_clean.csv')
df_mp = pd.read_csv('mps_they_clean.csv')

# Extract Match and NotMatch Population

In [2]:
# Total population of candidate pairs computed by cross product: 650 x 649 = 421850

cross = df.merge(df_mp, how='cross',suffixes=('_w', '_t'))
len(cross)

421850

In [3]:
# Generate exact matches as boolean feature columns.
# Vectorised column comparisons produce the same True/False values as a row-wise
# apply, without building a Series object for every candidate pair.

cross['Fmatch'] = cross['Firstname_w'] == cross['Firstname_t']
cross['Lmatch'] = cross['Lastname_w'] == cross['Lastname_t']
cross['Cmatch'] = cross['Constituency_w'] == cross['Constituency_t']

# Tmatch counts how many of the three features agree for each pair (0 to 3)
cross['Tmatch'] = cross[['Fmatch', 'Lmatch', 'Cmatch']].sum(axis=1)

In [4]:
# Extract the match population as Constituency and either Firstname or Lastname matches

match = cross[cross['Cmatch'] & (cross['Fmatch'] | cross['Lmatch'])]
len(match)

637

In [5]:
# Extract the non-match population: every pair outside the match population, i.e. Constituency doesn't match, or neither Firstname nor Lastname matches (by-election)

notmatch = cross[~(cross['Cmatch'] & (cross['Fmatch'] | cross['Lmatch']))]
len(notmatch)

421213

 # Firstname Counts

In [6]:
# Within match population how many Firstnames match?

first_match = match[match['Fmatch']]
len(first_match)

632

In [7]:
# Within match population how many Firstnames don't match?

notfirst_match = match[~match['Fmatch']]
len(notfirst_match)

5

In [8]:
# Within the non match population how many Firstnames match?

first_notmatch = notmatch[notmatch['Fmatch']]
len(first_notmatch)

2052

In [9]:
# Within the non match population how many Firstnames don't match?

notfirst_notmatch = notmatch[~notmatch['Fmatch']]
len(notfirst_notmatch)

419161

# Firstname Probabilities

In [10]:
# If Firstname matches how likely is it we have a match?
# Calculate as number of Firstname matches within match population divided by total number of Firstname matches

prob_match_first = len(first_match) / (len(first_match) + len(first_notmatch))
prob_match_first

0.23546944858420268

In [11]:
# Probability of a match in full population

prob_match = len(match) / len(cross)
prob_match

0.001510015408320493

In [12]:
# Probability of Firstname match within match population

prob_first_match = len(first_match) / len(match)
prob_first_match

0.9921507064364207

In [13]:
# Probability of Firstname match within full population

prob_first = len(cross[cross['Fmatch']]) / len(cross)
prob_first

0.00636245110821382

In [14]:
# Probability of match within population of Firstname matches

prob_match_first = prob_first_match * prob_match / prob_first
prob_match_first

0.23546944858420268

In [15]:
# Probability that Firstname matches within the non-match population (u value)

prob_first_notmatch = len(first_notmatch) / len(notmatch)
prob_first_notmatch

0.004871644512396341

In [16]:
# m and u values for a Firstname match, plus the prior probability of a match (lambda)
mf1, uf1, lmbda = prob_first_match, prob_first_notmatch, prob_match

# Bayes' rule: share of all Firstname agreements that belong to the match population
(lmbda * mf1) / (lmbda * mf1 + (1-lmbda) * uf1)

0.23546944858420266

# Lastname Counts

In [17]:
# Within match population how many Lastnames match?

last_match = match[match['Lmatch']]
len(last_match)

633

In [18]:
# Within match population how many Lastnames don't match?

notlast_match = match[~match['Lmatch']]
len(notlast_match)

4

In [19]:
# Within the non match population how many Lastnames match?

last_notmatch = notmatch[notmatch['Lmatch']]
len(last_notmatch)

349

In [20]:
# Within the non match population how many Lastnames don't match?

notlast_notmatch = notmatch[~notmatch['Lmatch']]
len(notlast_notmatch)

420864

# Lastname Probabilities

In [21]:
last = cross[cross['Lmatch']]
len(last)

982

In [22]:
# If Lastname matches how likely is it we have a match?
# Calculate as number of Lastname matches within match population divided by total number of Lastname matches

prob_match_last = len(last_match) / (len(last_match) + len(last_notmatch))
prob_match_last

0.6446028513238289

In [23]:
# Probability of Lastname match within full population

prob_last = len(last)/len(cross)
prob_last

0.002327841649875548

In [24]:
# Probability of Lastname match within match population

prob_last_match = len(last_match) / len (match)
prob_last_match

0.9937205651491365

In [25]:
# Probability that Lastname matches within the non-match population (u value)

prob_last_notmatch = len (last_notmatch) / len(notmatch)
prob_last_notmatch

0.000828559422429982

In [26]:
ml1 = prob_last_match
ul1 = prob_last_notmatch
lmbda = prob_match
(lmbda * ml1) / (lmbda * ml1 + (1-lmbda) * ul1)

0.6446028513238289

# Firstname and Lastname Counts

In [27]:
last_first_match = first_match[first_match['Lmatch']]
len(last_first_match)

628

In [28]:
# NOTE(review): this recomputes the Lastname-only posterior already evaluated in the
# Lastname Probabilities section (same inputs, same result); it looks like a leftover
# duplicate under the Firstname and Lastname Counts heading.
ml1 = prob_last_match
ul1 = prob_last_notmatch
(lmbda * ml1) / (lmbda * ml1 + (1-lmbda) * ul1)

0.6446028513238289

In [29]:
notlast_first_match = first_match[~first_match['Lmatch']]
len(notlast_first_match)

4

In [30]:
last_first_notmatch = first_notmatch[first_notmatch['Lmatch']]
len(last_first_notmatch)

0

In [31]:
notlast_first_notmatch = first_notmatch[~first_notmatch['Lmatch']]
len(notlast_first_notmatch)

2052

In [32]:
last_notfirst_match = notfirst_match[notfirst_match['Lmatch']]
len(last_notfirst_match)

5

In [33]:
# Within firstname population

In [34]:
notlast_notfirst_match = notfirst_match[~notfirst_match['Lmatch']]
len(notlast_notfirst_match)

0

In [35]:
last_notfirst_notmatch = notfirst_notmatch[notfirst_notmatch['Lmatch']]
len(last_notfirst_notmatch)

349

In [36]:
notlast_notfirst_notmatch = notfirst_notmatch[~notfirst_notmatch['Lmatch']]
len(notlast_notfirst_notmatch)

418812

# Firstname and Lastname Probabilities

In [37]:
# If Firstname matches but Lastname doesn't what is the probability that it's a match?

prob_match_first_notlast = len(notlast_first_match) / (len(notlast_first_match) + len(notlast_first_notmatch))
# 4 / (4 + 2052 )
prob_match_first_notlast

0.0019455252918287938

In [38]:
# If Lastname matches but Firstname doesn't what is the probability that it's a match?

prob_match_notfirst_last = len(last_notfirst_match) / (len(last_notfirst_match) + len(last_notfirst_notmatch))
# 5 / (5 + 349) 
prob_match_notfirst_last

0.014124293785310734

# Firstname and Lastname probabilities with m and u values

In [39]:
# Probability of match if Firstname and Lastname matches

(lmbda * ml1 * mf1) / (lmbda * ml1 * mf1 + (1-lmbda) * ul1 * uf1)

0.9973001133628486

In [40]:
# Probability of match if Firstname matches but Lastname doesn't

(lmbda * (1-ml1) * mf1) / ((lmbda * (1-ml1) * mf1) + ((1-lmbda) * (1-ul1) * uf1))

0.001931881358778785

In [41]:
# Probability of match if Firstname doesn't match but Lastname does

(lmbda * (1-mf1) * ml1) / ((lmbda * (1-mf1) * ml1) + ((1-lmbda) * (1-uf1) * ul1))

0.014104594228119374

In [42]:
# Probability of match if neither Firstname nor Lastname matches

(lmbda * (1-mf1) * (1-ml1)) / (lmbda * (1-mf1) * (1-ml1) + (1-lmbda) * (1-uf1) * (1-ul1))

7.496691699741389e-08

# Expectation Maximisation

In [43]:
# Function to calculate probability based on match features

def em(Fmatch,Lmatch,Cmatch,mf1,ml1,mc1,uf1,ul1,uc1, lmbda):
    """Return the posterior probability of a match for one agreement pattern.

    Parameters
    ----------
    Fmatch, Lmatch, Cmatch
        Agreement flags for Firstname, Lastname and Constituency. A flag is
        treated as "agrees" when it compares equal to 1 (so True also counts).
    mf1, ml1, mc1
        Probability that each feature agrees within the match population
        (m values).
    uf1, ul1, uc1
        Probability that each feature agrees within the non-match population
        (u values).
    lmbda
        Prior probability that a pair is a match.

    Returns
    -------
    lmbda * m / (lmbda * m + (1 - lmbda) * u), where m and u are the products
    of the per-feature probabilities of the observed outcome. No guard is
    applied when the denominator is zero.
    """
    def _observed(agrees, p_agree):
        # Probability of the observed outcome: p_agree on agreement, its complement otherwise
        return p_agree if agrees == 1 else (1-p_agree)

    mf, uf = _observed(Fmatch, mf1), _observed(Fmatch, uf1)
    ml, ul = _observed(Lmatch, ml1), _observed(Lmatch, ul1)
    mc, uc = _observed(Cmatch, mc1), _observed(Cmatch, uc1)

    # Keep the original multiplication order so results are bit-identical
    match_term = lmbda * ml * mf * mc
    nonmatch_term = (1-lmbda) * ul * uf * uc
    return match_term / (match_term + nonmatch_term)

# EM Iteration 1

In [44]:
# Select population where majority of columns match

it1_match = cross[cross['Tmatch']>=2]
it1_notmatch = cross[cross['Tmatch']<2]
len(it1_match)

637

In [45]:
mf1i1 = it1_match['Fmatch'].sum()/len(it1_match)
ml1i1 = it1_match['Lmatch'].sum()/len(it1_match)
mc1i1 = it1_match['Cmatch'].sum()/len(it1_match)

In [46]:
mf1i1

0.9921507064364207

In [47]:
ml1i1

0.9937205651491365

In [48]:
mc1i1

1.0

In [49]:
uf1i1 = it1_notmatch['Fmatch'].sum() / len(it1_notmatch)
ul1i1 = it1_notmatch['Lmatch'].sum() / len(it1_notmatch)
uc1i1 = it1_notmatch['Cmatch'].sum() / len(it1_notmatch)

In [50]:
uf1i1

0.004871644512396341

In [51]:
ul1i1

0.000828559422429982

In [52]:
uc1i1

2.848914919530024e-05

# EM Iteration 2

In [53]:
# Calculate the match probability for each candidate pair using the iteration 1 m and u estimates.
# Zipping the three boolean columns avoids the per-row Series that DataFrame.apply(axis=1)
# builds; em() receives the same arguments, so the probabilities are unchanged.
# NOTE(review): lmbda is still the value assigned in the earlier probability cells;
# it is not re-estimated from it1_match here.

cross['prob'] = [
    em(f_agree, l_agree, c_agree,
       mf1i1, ml1i1, mc1i1,
       uf1i1, ul1i1, uc1i1,
       lmbda)
    for f_agree, l_agree, c_agree in zip(cross['Fmatch'], cross['Lmatch'], cross['Cmatch'])
]

# Set match threshold as > 0.99 probability

it2_match = cross[cross['prob']>0.99]
it2_notmatch = cross[cross['prob']<=0.99]
len(it2_match)

633

In [54]:
# How many records are above match threshold when either Firstname or Lastname don't match

it2_match[~it2_match['Fmatch'] | ~it2_match['Lmatch']]

Unnamed: 0,Constituency_w,Firstname_w,Lastname_w,Constituency_t,Firstname_t,Lastname_t,Fmatch,Lmatch,Cmatch,Tmatch,prob
79671,Central Suffolk and North Ipswich,Dan,Poulter,Central Suffolk and North Ipswich,Daniel,Poulter,False,True,True,2,0.998013
325950,Slough,Tanmanjeet,Dhesi,Slough,Tan,Dhesi,False,True,True,2,0.998013
331267,South Down,Chris,Hazzard,South Down,Christopher,Hazzard,False,True,True,2,0.998013
341975,South West Norfolk,Liz,Truss,South West Norfolk,Elizabeth,Truss,False,True,True,2,0.998013
392874,Wealden,Nus,Ghani,Wealden,Nusrat,Ghani,False,True,True,2,0.998013


In [55]:
it2_notmatch[it2_notmatch['prob']>0.9]

Unnamed: 0,Constituency_w,Firstname_w,Lastname_w,Constituency_t,Firstname_t,Lastname_t,Fmatch,Lmatch,Cmatch,Tmatch,prob
64599,Burton,Kate,Griffiths,Burton,Kate,Kniveton,True,False,True,2,0.985495
254852,Newton Abbot,Anne,Marie Morris,Newton Abbot,Anne,Morris,True,False,True,2,0.985495
256185,North Antrim,Ian,Paisley,North Antrim,Ian,Paisley Jnr,True,False,True,2,0.985495
399293,West Dunbartonshire,Martin,Docherty-Hughes,West Dunbartonshire,Martin,Docherty,True,False,True,2,0.985495


In [56]:
mf1i2 = it2_match['Fmatch'].sum()/len(it2_match)
ml1i2 = it2_match['Lmatch'].sum()/len(it2_match)
mc1i2 = it2_match['Cmatch'].sum()/len(it2_match)

In [57]:
mf1i2

0.9921011058451816

In [58]:
ml1i2

1.0

In [59]:
mc1i2

1.0

In [60]:
uf1i2 = it2_notmatch['Fmatch'].sum() / len(it2_notmatch)
ul1i2 = it2_notmatch['Lmatch'].sum() / len(it2_notmatch)
uc1i2 = it2_notmatch['Cmatch'].sum() / len(it2_notmatch)

In [61]:
uf1i2

0.004881094542717886

In [62]:
ul1i2

0.0008285515541870342

In [63]:
uc1i2

3.79851715386606e-05

# EM Iteration 3

In [64]:
# Calculate the match probability for each candidate pair using the iteration 2 m and u estimates.
# Zipping the three boolean columns avoids the per-row Series that DataFrame.apply(axis=1)
# builds; em() receives the same arguments, so the probabilities are unchanged.
# NOTE(review): lmbda is still the value assigned in the earlier probability cells;
# it is not re-estimated from it2_match here.

cross['prob'] = [
    em(f_agree, l_agree, c_agree,
       mf1i2, ml1i2, mc1i2,
       uf1i2, ul1i2, uc1i2,
       lmbda)
    for f_agree, l_agree, c_agree in zip(cross['Fmatch'], cross['Lmatch'], cross['Cmatch'])
]

# Set match threshold as > 0.99 probability

it3_match = cross[cross['prob']>0.99]
len(it3_match)

633

In [65]:
# How many records are above match threshold when either Firstname or Lastname don't match

it3_match[~it3_match['Fmatch'] | ~it3_match['Lmatch']]

Unnamed: 0,Constituency_w,Firstname_w,Lastname_w,Constituency_t,Firstname_t,Lastname_t,Fmatch,Lmatch,Cmatch,Tmatch,prob
79671,Central Suffolk and North Ipswich,Dan,Poulter,Central Suffolk and North Ipswich,Daniel,Poulter,False,True,True,2,0.997385
325950,Slough,Tanmanjeet,Dhesi,Slough,Tan,Dhesi,False,True,True,2,0.997385
331267,South Down,Chris,Hazzard,South Down,Christopher,Hazzard,False,True,True,2,0.997385
341975,South West Norfolk,Liz,Truss,South West Norfolk,Elizabeth,Truss,False,True,True,2,0.997385
392874,Wealden,Nus,Ghani,Wealden,Nusrat,Ghani,False,True,True,2,0.997385


# Splink

In [66]:
# %pip install splink
# After install you may need to restart the kernel

import splink

In [67]:
# Add a unique_id column needed by Splink to both datasets

df['unique_id'] = df.index
df_mp['unique_id'] = df_mp.index

In [68]:
# Splink settings to match on Firstname, Lastname and Constituency

from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl

settings = {
    "link_type": "link_only",
    "comparisons": [
        cl.exact_match("Firstname"),
        cl.exact_match("Lastname"),
        cl.exact_match("Constituency")
    ],
}
linker = DuckDBLinker([df, df_mp], settings)

In [69]:
# Examine distribution of columns to be matched

linker.profile_columns(['Firstname','Lastname','Constituency'])

In [70]:
# Estimate u values using random sampling

linker.estimate_u_using_random_sampling(target_rows=5e5)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - Firstname (no m values are trained).
    - Lastname (no m values are trained).
    - Constituency (no m values are trained).


In [71]:
# Estimate m values using EM method
# Two training sessions are run because, per the training log, a comparison used in the
# blocking rule cannot be estimated in that session: blocking on Lastname estimates
# Firstname and Constituency, while blocking on Firstname deactivates the Firstname comparison.

linker.estimate_parameters_using_expectation_maximisation("r.Lastname == l.Lastname")
linker.estimate_parameters_using_expectation_maximisation("r.Firstname == l.Firstname")


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
r.Lastname == l.Lastname

Parameter estimates will be made for the following comparison(s):
    - Firstname
    - Constituency

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - Lastname

Iteration 1: Largest change in params was 0.577 in probability_two_random_records_match
Iteration 2: Largest change in params was 0.0627 in probability_two_random_records_match
Iteration 3: Largest change in params was -0.00125 in the m_probability of Firstname, level `Exact match`
Iteration 4: Largest change in params was 0.00352 in the m_probability of Firstname, level `All other comparisons`
Iteration 5: Largest change in params was -0.00179 in the m_probability of Firstname, level `Exact match`
Iteration 6: Largest change in params was 0.000274 in the m_probability of Firstname, level `All other comparisons`
Iteration 7: Largest c

<EMTrainingSession, blocking on r.Firstname == l.Firstname, deactivating comparisons Firstname>

In [72]:
# Get numerical values of m and u probabilities.

linker.save_settings_to_json()

{'link_type': 'link_only',
 'comparisons': [{'output_column_name': 'Firstname',
   'comparison_levels': [{'sql_condition': '"Firstname_l" IS NULL OR "Firstname_r" IS NULL',
     'label_for_charts': 'Null',
     'is_null_level': True},
    {'sql_condition': '"Firstname_l" = "Firstname_r"',
     'label_for_charts': 'Exact match',
     'm_probability': 0.9929430070307547,
     'u_probability': 0.00636245110821382},
    {'sql_condition': 'ELSE',
     'label_for_charts': 'All other comparisons',
     'm_probability': 0.007056992969245251,
     'u_probability': 0.9936375488917861}],
   'comparison_description': 'Exact match vs. anything else'},
  {'output_column_name': 'Lastname',
   'comparison_levels': [{'sql_condition': '"Lastname_l" IS NULL OR "Lastname_r" IS NULL',
     'label_for_charts': 'Null',
     'is_null_level': True},
    {'sql_condition': '"Lastname_l" = "Lastname_r"',
     'label_for_charts': 'Exact match',
     'm_probability': 0.9999674340185474,
     'u_probability': 0.0023

In [73]:
# Calculate predictions, apply probability threshold of 0.1 and convert to dataframe

pres = linker.predict(threshold_match_probability = 0.1).as_pandas_dataframe()
len(pres)

633

In [74]:
pres[pres['match_probability']<0.2].head()

Unnamed: 0,match_weight,match_probability,source_dataset_l,unique_id_l,source_dataset_r,unique_id_r,Firstname_l,Firstname_r,gamma_Firstname,Lastname_l,Lastname_r,gamma_Lastname,Constituency_l,Constituency_r,gamma_Constituency
152,-2.33405,0.165503,_a,502,_b,152,Tanmanjeet,Tan,0,Dhesi,Dhesi,1,Slough,Slough,1
224,-2.33405,0.165503,_a,605,_b,229,Nus,Nusrat,0,Ghani,Ghani,1,Wealden,Wealden,1
270,-2.33405,0.165503,_a,510,_b,277,Chris,Christopher,0,Hazzard,Hazzard,1,South Down,South Down,1
479,-2.33405,0.165503,_a,122,_b,493,Dan,Daniel,0,Poulter,Poulter,1,Central Suffolk and North Ipswich,Central Suffolk and North Ipswich,1
586,-2.33405,0.165503,_a,526,_b,601,Liz,Elizabeth,0,Truss,Truss,1,South West Norfolk,South West Norfolk,1


In [75]:
#    {'sql_condition': '"Firstname_l" = "Firstname_r"',
#     'm_probability': 0.9929424850650888,
#     'u_probability': 0.00636245110821382},
#    
#    {'sql_condition': '"Lastname_l" = "Lastname_r"',
#     'm_probability': 0.9999837062652734,
#     'u_probability': 0.002327841649875548},
#    
#    {'sql_condition': '"Constituency_l" = "Constituency_r"',
#     'm_probability': 1.0,
#     'u_probability': 0.0015384615384615385},

# Splink Approximate Match

In [76]:
# Splink settings for approximate matching: Jaro-Winkler thresholds on Firstname and Lastname, exact match on Constituency

from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl
settings = {
    "link_type": "link_only",
    "comparisons": [
        cl.jaro_winkler_at_thresholds("Firstname", [0.8]),
        cl.jaro_winkler_at_thresholds("Lastname", [0.9,0.8]),
        cl.exact_match("Constituency")
    ],
}
linker = DuckDBLinker([df, df_mp], settings)

In [77]:
# Estimate u values using random sampling

linker.estimate_u_using_random_sampling(target_rows=5e5)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - Firstname (no m values are trained).
    - Lastname (no m values are trained).
    - Constituency (no m values are trained).


In [78]:
# Estimate m values using EM method
# Two training sessions are run because, per the training log, a comparison used in the
# blocking rule cannot be estimated in that session: blocking on Lastname leaves Lastname
# untrained, so a second session blocking on Firstname estimates it.

linker.estimate_parameters_using_expectation_maximisation("r.Lastname == l.Lastname")
linker.estimate_parameters_using_expectation_maximisation("r.Firstname == l.Firstname")


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
r.Lastname == l.Lastname

Parameter estimates will be made for the following comparison(s):
    - Firstname
    - Constituency

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - Lastname

Iteration 1: Largest change in params was 0.578 in probability_two_random_records_match
Iteration 2: Largest change in params was 0.0659 in probability_two_random_records_match
Iteration 3: Largest change in params was -4.63e-05 in the m_probability of Firstname, level `Exact match`

EM converged after 3 iterations

Your model is not yet fully trained. Missing estimates for:
    - Lastname (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
r.Firstname == l.Firstname

Parameter estimates will be made for the following comparison(s):
    - Lastname
    - Cons

<EMTrainingSession, blocking on r.Firstname == l.Firstname, deactivating comparisons Firstname>

In [79]:
# Calculate predictions and convert to Pandas dataframe

pres = linker.predict(threshold_match_probability = 0.1).as_pandas_dataframe()
len(pres)

634

In [80]:
pres[pres['match_probability']<0.99].head()

Unnamed: 0,match_weight,match_probability,source_dataset_l,unique_id_l,source_dataset_r,unique_id_r,Firstname_l,Firstname_r,gamma_Firstname,Lastname_l,Lastname_r,gamma_Lastname,Constituency_l,Constituency_r,gamma_Constituency
152,4.452889,0.956333,_a,502,_b,152,Tanmanjeet,Tan,1,Dhesi,Dhesi,3,Slough,Slough,1
157,5.865504,0.983138,_a,615,_b,158,Martin,Martin,2,Docherty-Hughes,Docherty,2,West Dunbartonshire,West Dunbartonshire,1
225,4.452889,0.956333,_a,605,_b,229,Nus,Nusrat,1,Ghani,Ghani,3,Wealden,Wealden,1
271,4.452889,0.956333,_a,510,_b,277,Chris,Christopher,1,Hazzard,Hazzard,3,South Down,South Down,1
467,5.865504,0.983138,_a,394,_b,479,Ian,Ian,2,Paisley,Paisley Jnr,2,North Antrim,North Antrim,1


In [81]:
linker.save_settings_to_json()

{'link_type': 'link_only',
 'comparisons': [{'output_column_name': 'Firstname',
   'comparison_levels': [{'sql_condition': '"Firstname_l" IS NULL OR "Firstname_r" IS NULL',
     'label_for_charts': 'Null',
     'is_null_level': True},
    {'sql_condition': '"Firstname_l" = "Firstname_r"',
     'label_for_charts': 'Exact match',
     'm_probability': 0.9936701863484996,
     'u_probability': 0.00636245110821382},
    {'sql_condition': 'jaro_winkler_similarity("Firstname_l", "Firstname_r") >= 0.8',
     'label_for_charts': 'jaro_winkler_similarity >= 0.8',
     'm_probability': 0.006322249703249883,
     'u_probability': 0.008036031764845325},
    {'sql_condition': 'ELSE',
     'label_for_charts': 'All other comparisons',
     'm_probability': 7.563948250584628e-06,
     'u_probability': 0.9856015171269409}],
   'comparison_description': 'Exact match vs. jaro_winkler_similarity at threshold 0.8 vs. anything else'},
  {'output_column_name': 'Lastname',
   'comparison_levels': [{'sql_condi

In [82]:
linker.match_weights_chart()

# Real Time Match

In [83]:
# Real time match example: score one new record against the trained linker
record = {
    'unique_id': 999,
    'Firstname': "Dan",
    'Lastname': "Poulter",
    'Constituency': 'Central Suffolk and North Ipswich',
}

# No blocking rules are supplied, and pairs are kept down to a match weight of -5
df_inc = linker.find_matches_to_new_records(
    [record],
    blocking_rules=[],
    match_weight_threshold=-5,
).as_pandas_dataframe()
df_inc.sort_values("match_weight", ascending=False)

Unnamed: 0,match_weight,match_probability,unique_id_l,unique_id_r,Firstname_l,Firstname_r,gamma_Firstname,Lastname_l,Lastname_r,gamma_Lastname,Constituency_l,Constituency_r,gamma_Constituency
0,12.085974,0.99977,122,999,Dan,Dan,2,Poulter,Poulter,3,Central Suffolk and North Ipswich,Central Suffolk and North Ipswich,1
1,4.452889,0.956333,493,999,Daniel,Dan,1,Poulter,Poulter,3,Central Suffolk and North Ipswich,Central Suffolk and North Ipswich,1
