In [None]:
import pandas as pd

# Load the two cleaned input datasets that the rest of the notebook links together.
df_w = pd.read_csv('mps_wiki_clean.csv')
df_t = pd.read_csv('mps_they_clean.csv')

# Extract Match and NotMatch Population

In [None]:
# Full candidate-pair population: every df_w row paired with every df_t row
# (cross product: 650 x 649 = 421850).
# Columns that exist in both frames get the _w / _t suffixes.

cross = pd.merge(df_w, df_t, how='cross', suffixes=('_w', '_t'))
len(cross)

In [None]:
# Generate exact matches as feature columns.
# Whole columns are compared at once instead of row-by-row via DataFrame.apply:
# each pair gets the same True/False flag, without a Python-level loop over
# every row of the cross product (and it also works when `cross` is empty).

cross['Fmatch'] = cross['Firstname_w'] == cross['Firstname_t']
cross['Lmatch'] = cross['Lastname_w'] == cross['Lastname_t']
cross['Cmatch'] = cross['Constituency_w'] == cross['Constituency_t']

# Tmatch = how many of the three attributes agree for the pair (0 to 3).
cross['Tmatch'] = cross[['Fmatch', 'Lmatch', 'Cmatch']].sum(axis=1)

In [None]:
# Match population: pairs where at least one of Firstname / Lastname agrees
# and the Constituency agrees as well.

match = cross.loc[(cross['Fmatch'] | cross['Lmatch']) & cross['Cmatch']]
len(match)

In [None]:
# Not-match population: the complement of the match rule, i.e. either the
# Constituency doesn't match, or neither Firstname nor Lastname matches (by-election).

notmatch = cross.loc[~(cross['Cmatch'] & (cross['Fmatch'] | cross['Lmatch']))]
len(notmatch)

 # Firstname Counts

In [None]:
# Within match population how many Firstnames match?

first_match = match[match['Fmatch']]
len(first_match)

In [None]:
# Within the match population, how many Firstnames don't match?

notfirst_match = match[~match['Fmatch']]
len(notfirst_match)

In [None]:
# Within the not-match population, how many Firstnames match?

first_notmatch = notmatch[notmatch['Fmatch']]
len(first_notmatch)

In [None]:
# Within the not-match population, how many Firstnames don't match?

notfirst_notmatch = notmatch[~notmatch['Fmatch']]
len(notfirst_notmatch)

## Firstname Match Probabilities

In [None]:
# If Firstname matches, how likely is it that we have a match?
# Count-based estimate: Firstname matches inside the match population divided by
# all Firstname matches (match population + not-match population).

prob_match_first = len(first_match) / (len(first_match) + len(first_notmatch))
prob_match_first

In [None]:
# Probability of a match in the full population: share of all candidate pairs
# in `cross` that fall in the match population.

prob_match = len(match) / len(cross)
prob_match

In [None]:
# Probability of a Firstname match within the match population
# (later reused as the Firstname m value, `mf`).

prob_first_match = len(first_match) / len(match)
prob_first_match

In [None]:
# Probability of a Firstname match within the full population of candidate pairs

prob_first = len(cross[cross['Fmatch']]) / len(cross)
prob_first

In [None]:
# Probability of a match within the population of Firstname matches, via Bayes' rule:
#   P(match | Firstname agrees) = P(Firstname agrees | match) * P(match) / P(Firstname agrees)
# Note: this reassigns prob_match_first, which was first computed directly from counts.

prob_match_first = prob_first_match * prob_match / prob_first
prob_match_first

In [None]:
# Probability that Firstname matches within the not-match population
# (later reused as the Firstname u value, `uf`).

prob_first_notmatch = len(first_notmatch) / len(notmatch)
prob_first_notmatch

In [None]:
# Same posterior expressed with m / u / lambda values:
#   mf    = P(Firstname agrees | match)
#   uf    = P(Firstname agrees | not match)
#   lmbda = prior P(match)
mf = prob_first_match
uf = prob_first_notmatch
lmbda = prob_match
# P(match | Firstname agrees)
(lmbda * mf) / (lmbda * mf + (1-lmbda) * uf)

# Lastname Counts

In [None]:
# Within the match population, how many Lastnames match?

last_match = match[match['Lmatch']]
len(last_match)

In [None]:
# Within the match population, how many Lastnames don't match?

notlast_match = match[~match['Lmatch']]
len(notlast_match)

In [None]:
# Within the not-match population, how many Lastnames match?

last_notmatch = notmatch[notmatch['Lmatch']]
len(last_notmatch)

In [None]:
# Within the not-match population, how many Lastnames don't match?

notlast_notmatch = notmatch[~notmatch['Lmatch']]
len(notlast_notmatch)

## Lastname Match Probabilities

In [None]:
# All candidate pairs in the full population whose Lastnames match
last = cross[cross['Lmatch']]
len(last)

In [None]:
# If Lastname matches, how likely is it that we have a match?
# Count-based estimate: Lastname matches inside the match population divided by
# all Lastname matches (match population + not-match population).

prob_match_last = len(last_match) / (len(last_match) + len(last_notmatch))
prob_match_last

In [None]:
# Probability of a Lastname match within the full population of candidate pairs

prob_last = len(last)/len(cross)
prob_last

In [None]:
# Probability of a Lastname match within the match population
# (later reused as the Lastname m value, `ml`).

prob_last_match = len(last_match) / len (match)
prob_last_match

In [None]:
# Probability that Lastname matches within the not-match population
# (later reused as the Lastname u value, `ul`).

prob_last_notmatch = len (last_notmatch) / len(notmatch)
prob_last_notmatch

In [None]:
# Posterior for Lastname expressed with m / u / lambda values:
#   ml    = P(Lastname agrees | match)
#   ul    = P(Lastname agrees | not match)
#   lmbda = prior P(match)
ml = prob_last_match
ul = prob_last_notmatch
lmbda = prob_match
# P(match | Lastname agrees)
(lmbda * ml) / (lmbda * ml + (1-lmbda) * ul)

# Firstname and Lastname Counts

In [None]:
# Match population: Firstname matches AND Lastname matches
last_first_match = first_match[first_match['Lmatch']]
len(last_first_match)

In [None]:
# Recomputes P(match | Lastname agrees) from the same inputs as the Lastname
# m/u cell, under the names ml1 / ul1. Relies on `lmbda` already being set.
ml1 = prob_last_match
ul1 = prob_last_notmatch
(lmbda * ml1) / (lmbda * ml1 + (1-lmbda) * ul1)

In [None]:
# Match population: Firstname matches but Lastname doesn't
notlast_first_match = first_match[~first_match['Lmatch']]
len(notlast_first_match)

In [None]:
# Not-match population: Firstname matches AND Lastname matches
last_first_notmatch = first_notmatch[first_notmatch['Lmatch']]
len(last_first_notmatch)

In [None]:
# Not-match population: Firstname matches but Lastname doesn't
notlast_first_notmatch = first_notmatch[~first_notmatch['Lmatch']]
len(notlast_first_notmatch)

In [None]:
# Match population: Lastname matches but Firstname doesn't
last_notfirst_match = notfirst_match[notfirst_match['Lmatch']]
len(last_notfirst_match)

In [None]:
# Match population: neither Firstname nor Lastname matches
notlast_notfirst_match = notfirst_match[~notfirst_match['Lmatch']]
len(notlast_notfirst_match)

In [None]:
# Not-match population: Lastname matches but Firstname doesn't
last_notfirst_notmatch = notfirst_notmatch[notfirst_notmatch['Lmatch']]
len(last_notfirst_notmatch)

In [None]:
# Not-match population: neither Firstname nor Lastname matches
notlast_notfirst_notmatch = notfirst_notmatch[~notfirst_notmatch['Lmatch']]
len(notlast_notfirst_notmatch)

# Firstname and Lastname Match Probabilities

In [None]:
# If Firstname matches but Lastname doesn't, what is the probability that it's a match?
# Count-based: such pairs in the match population over such pairs in both populations.

prob_match_first_notlast = len(notlast_first_match) / (len(notlast_first_match) + len(notlast_first_notmatch))
# Counts recorded by the author for this dataset: 4 / (4 + 2052)
prob_match_first_notlast

In [None]:
# If Lastname matches but Firstname doesn't, what is the probability that it's a match?
# Count-based: such pairs in the match population over such pairs in both populations.

prob_match_notfirst_last = len(last_notfirst_match) / (len(last_notfirst_match) + len(last_notfirst_notmatch))
# Counts recorded by the author for this dataset: 5 / (5 + 349)
prob_match_notfirst_last

# Firstname and Lastname probabilities with m and u values

In [None]:
# Probability of match if Firstname and Lastname both match.
# The per-attribute m and u values are multiplied, i.e. the two attributes are
# treated as independent given match / not-match status.

(mf * ml * lmbda) / (mf * ml * lmbda + uf * ul * (1-lmbda))

In [None]:
# Probability of match if Firstname matches but Lastname doesn't.
# A disagreeing attribute contributes the complement (1-m, 1-u) of its m / u value.

(mf * (1-ml) * lmbda) / ((mf * (1-ml) * lmbda) + (uf * (1-ul) * (1-lmbda)))

In [None]:
# Probability of match if Firstname doesn't match but Lastname does.
# A disagreeing attribute contributes the complement (1-m, 1-u) of its m / u value.

((1-mf) * ml * lmbda) / (((1-mf) * ml * lmbda) + ((1-uf) * ul * (1-lmbda)))

In [None]:
# Probability of match if neither Firstname nor Lastname matches.
# Both attributes contribute the complement (1-m, 1-u) of their m / u values.

((1-mf) * (1-ml) * lmbda) / (((1-mf) * (1-ml) * lmbda) + ((1-uf) * (1-ul) * (1-lmbda)))

# Expectation Maximisation

In [None]:
# Helper function to calculate probability based on match features

def match_prb(Fmatch,Lmatch,Cmatch,mf1,ml1,mc1,uf1,ul1,uc1, lmbda):
    """Return the probability that a record pair is a match, given its agreement flags.

    Parameters
    ----------
    Fmatch, Lmatch, Cmatch
        Agreement flags for Firstname, Lastname and Constituency. A flag equal
        to 1 (or True) means the attribute agrees; anything else is disagreement.
    mf1, ml1, mc1
        Probability that each attribute agrees given the pair is a match (m values).
    uf1, ul1, uc1
        Probability that each attribute agrees given the pair is not a match (u values).
    lmbda
        Prior probability that a pair is a match.

    Returns
    -------
    The posterior match probability
    lmbda * m-terms / (lmbda * m-terms + (1 - lmbda) * u-terms).
    """
    # An agreeing attribute contributes its m / u value as-is; a disagreeing one
    # contributes the complement.
    mf, uf = (mf1, uf1) if Fmatch == 1 else (1 - mf1, 1 - uf1)
    ml, ul = (ml1, ul1) if Lmatch == 1 else (1 - ml1, 1 - ul1)
    mc, uc = (mc1, uc1) if Cmatch == 1 else (1 - mc1, 1 - uc1)

    weight_if_match = lmbda * ml * mf * mc
    weight_if_notmatch = (1-lmbda) * ul * uf * uc
    return weight_if_match / (weight_if_match + weight_if_notmatch)

# EM Iteration 1

In [None]:
# Iteration-1 starting split: pairs where a majority of the three columns agree
# (Tmatch of 2 or 3) are treated as matches, everything else as not-matches.

it1_match = cross.loc[cross['Tmatch'].ge(2)]
it1_notmatch = cross.loc[cross['Tmatch'].lt(2)]

len(it1_match)

In [None]:
# Iteration-1 matches where Firstname or Lastname (or both) disagree
it1_match[~it1_match['Fmatch'] | ~it1_match['Lmatch']]

In [None]:
# Calculate m values: fraction of iteration-1 matches in which each attribute agrees

mfi1 = it1_match['Fmatch'].sum()/len(it1_match)
mli1 = it1_match['Lmatch'].sum()/len(it1_match)
mci1 = it1_match['Cmatch'].sum()/len(it1_match)

In [None]:
# Display the iteration-1 m value for Firstname
mfi1

In [None]:
# Display the iteration-1 m value for Lastname
mli1

In [None]:
# Display the iteration-1 m value for Constituency
mci1

In [None]:
# Calculate u values: fraction of iteration-1 not-matches in which each attribute agrees

ufi1 = it1_notmatch['Fmatch'].sum() / len(it1_notmatch)
uli1 = it1_notmatch['Lmatch'].sum() / len(it1_notmatch)
uci1 = it1_notmatch['Cmatch'].sum() / len(it1_notmatch)

In [None]:
# Display the iteration-1 u value for Firstname
ufi1

In [None]:
# Display the iteration-1 u value for Lastname
uli1

In [None]:
# Display the iteration-1 u value for Constituency
uci1

In [None]:
# Iteration-1 prior: share of all candidate pairs assigned to the match set.
# Overwrites the `lmbda` set earlier from prob_match.
lmbda = len(it1_match)/len(cross)
lmbda

# EM Iteration 2

In [None]:
# Calculate the match probability for each combination, using the iteration-1
# m / u values and prior.
# The three flag columns are iterated directly instead of DataFrame.apply(axis=1):
# match_prb receives the same arguments for every pair, but pandas no longer has
# to build a Series object for each row of the cross product.

cross['prob'] = [
    match_prb(fmatch, lmatch, cmatch,
              mfi1, mli1, mci1,
              ufi1, uli1, uci1,
              lmbda)
    for fmatch, lmatch, cmatch in zip(cross['Fmatch'], cross['Lmatch'], cross['Cmatch'])
]

# Set match threshold as > 0.99 probability

it2_match = cross[cross['prob']>0.99]
it2_notmatch = cross[cross['prob']<=0.99]
len(it2_match)

In [None]:
# Select records just below the match threshold: iteration-2 not-matches
# (prob <= 0.99) whose probability is still above 0.8

it2_notmatch[it2_notmatch['prob']>0.8]

In [None]:
# Recalculate m values: fraction of iteration-2 matches in which each attribute agrees

mfi2 = it2_match['Fmatch'].sum()/len(it2_match)
mli2 = it2_match['Lmatch'].sum()/len(it2_match)
mci2 = it2_match['Cmatch'].sum()/len(it2_match)

In [None]:
# Display the iteration-2 m value for Firstname
mfi2

In [None]:
# Display the iteration-2 m value for Lastname
mli2

In [None]:
# Display the iteration-2 m value for Constituency
mci2

In [None]:
# Recalculate u values: fraction of iteration-2 not-matches in which each attribute agrees

ufi2 = it2_notmatch['Fmatch'].sum() / len(it2_notmatch)
uli2 = it2_notmatch['Lmatch'].sum() / len(it2_notmatch)
uci2 = it2_notmatch['Cmatch'].sum() / len(it2_notmatch)

In [None]:
# Display the iteration-2 u value for Firstname
ufi2

In [None]:
# Display the iteration-2 u value for Lastname
uli2

In [None]:
# Display the iteration-2 u value for Constituency
uci2

In [None]:
# Iteration-2 prior: share of all candidate pairs above the match threshold.
# Overwrites the iteration-1 `lmbda`.
lmbda = len(it2_match)/len(cross)
lmbda

# EM Iteration 3

In [None]:
# Calculate the match probability for each combination, using the iteration-2
# m / u values and prior. This overwrites the 'prob' column from the previous iteration.
# The three flag columns are iterated directly instead of DataFrame.apply(axis=1):
# match_prb receives the same arguments for every pair, but pandas no longer has
# to build a Series object for each row of the cross product.

cross['prob'] = [
    match_prb(fmatch, lmatch, cmatch,
              mfi2, mli2, mci2,
              ufi2, uli2, uci2,
              lmbda)
    for fmatch, lmatch, cmatch in zip(cross['Fmatch'], cross['Lmatch'], cross['Cmatch'])
]

# Set match threshold as > 0.99 probability

it3_match = cross[cross['prob']>0.99]
len(it3_match)

In [None]:
# Which records are above the match threshold even though Firstname or Lastname
# (or both) don't match?

it3_match[~it3_match['Fmatch'] | ~it3_match['Lmatch']]

# Splink

In [None]:
#%pip install splink
# After install you may need to restart the kernel and reload datasets in cell 1.

import splink
import numpy as np

In [None]:
# Add a unique_id column needed by Splink to both datasets.
# The row index is used as the id; note that this modifies df_w and df_t in place.

df_w['unique_id'] = df_w.index
df_t['unique_id'] = df_t.index

In [None]:
# Set an all-NaN 'Flink' column on df_w and an all-NaN 'Notes' column on df_t.
# NOTE(review): presumably each frame lacks that column and this gives both
# frames the same set of columns — confirm against the input CSVs.
df_w['Flink'] = np.nan
df_t['Notes'] = np.nan

In [None]:
# Keep the same six columns, in the same order, in both frames
df_w = df_w[['Firstname','Lastname','Constituency','Flink','Notes','unique_id']]
df_t = df_t[['Firstname','Lastname','Constituency','Flink','Notes','unique_id']]

In [None]:
# Splink settings: link the two datasets (link_only) using exact-match
# comparisons on Firstname, Lastname and Constituency

from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl

settings = {
    "link_type": "link_only",
    "comparisons": [
        cl.exact_match("Firstname"),
        cl.exact_match("Lastname"),
        cl.exact_match("Constituency"),
    ],
}
# Linker over both input frames, backed by DuckDB
linker = DuckDBLinker([df_w, df_t], settings)

In [None]:
# Examine the distribution of values in the columns to be matched

linker.profile_columns(['Firstname','Lastname','Constituency'])

In [None]:
# Estimate model parameters with Splink's expectation maximisation, leaving both
# the u probabilities and the probability that two random records match unfixed.
# NOTE(review): the first argument 'True' is presumably a blocking rule that
# admits every record pair — confirm against the Splink documentation.
em_session = linker.estimate_parameters_using_expectation_maximisation('True', fix_u_probabilities=False, fix_probability_two_random_records_match=False)
# Chart how the match weights changed over the EM iterations
em_session.match_weights_interactive_history_chart()

In [None]:
# Show the linker's settings as JSON; no output path is passed here
linker.save_settings_to_json()

In [None]:
# Calculate predictions, apply probability threshold of 0.9 and convert to dataframe

pres = linker.predict(threshold_match_probability = 0.9).as_pandas_dataframe()
len(pres)

In [None]:
# Predicted pairs that passed the prediction threshold but score below 0.99
pres[pres['match_probability']<0.99]

# Real Time Match

In [None]:
# Real time match example: score a single new record against the linker's data
record = {'unique_id': 999,
 'Firstname': "Dan",
 'Lastname': "Poulter",
'Constituency': 'Central Suffolk and North Ipswich'
}

# No blocking rules are supplied and the match weight threshold is -5;
# results are shown with the highest match weight first.
df_w_inc = linker.find_matches_to_new_records([record], blocking_rules=[], match_weight_threshold = -5).as_pandas_dataframe()
df_w_inc.sort_values("match_weight", ascending=False)