### Person Match Example

Match House of Commons MPs to Persons with Significant Control of UK Companies

In [4]:
import pandas as pd
import numpy as np
import json

In [5]:
# Load the UK House of Commons Popolo JSON and flatten its 'persons' records
# into a dataframe

with open('ep-popolo-v1.0.json', encoding="utf8") as popolo_file:
    json_dict = json.load(popolo_file)

# The top-level 'email' field is renamed to 'primaryemail'
# (presumably to avoid clashing with an 'email' column extracted later from
# contact_details - TODO confirm)
df = pd.json_normalize(json_dict, record_path=['persons']).rename(columns={'email':'primaryemail'})

In [6]:
# Replace nulls with an empty dict in a list to allow normalisation.
# Any value that is not already a list is treated as missing. An isinstance
# check is used instead of `x is np.NaN`: the `np.NaN` alias was removed in
# NumPy 2.0, and an identity test against NaN only works when the very same
# NaN object is stored in every cell.

df['contact_details'] = [
    details if isinstance(details, list) else [{}]
    for details in df['contact_details']
]

In [None]:
import warnings
# Suppress all warnings from this point on (no category or module filter is given)
warnings.filterwarnings('ignore')
# Extract the extra attributes embedded as JSON in a list in the contact_details column,
# then pivot them and join them to the dataframe as extra columns

# Flatten the contact_details lists, unstack, and expand each entry into its own fields
df_info = pd.json_normalize(df.to_dict('list'), ['contact_details']).unstack().apply(pd.Series)
# One column per contact 'type'; multiple values of the same type are comma-joined.
# Rows are keyed by level 1 of df_info's index
# (presumably the person's row position in df - TODO confirm)
df_extract = df_info.pivot_table(index=df_info.index.get_level_values(1), columns=['type'], 
                         values=['value'], aggfunc=','.join)
# Append the pivoted 'value' columns alongside the existing person columns
df = pd.concat([df, df_extract.xs('value', axis=1)], axis=1)

In [None]:
# Parse the birth date once, then pull out year and month as nullable integers
# (Int64 keeps missing birth dates as <NA> instead of forcing floats)

birth_dates = pd.to_datetime(df['birth_date'])
df['year'] = birth_dates.dt.year.astype('Int64')
df['month'] = birth_dates.dt.month.astype('Int64')

# Store the name fields as pandas strings; honorific_prefix is exposed as 'title'

for name_col in ('family_name', 'given_name'):
    df[name_col] = df[name_col].astype('string')
df['title'] = df['honorific_prefix'].astype('string')

In [None]:
# Create the unique id column needed by Splink from the dataframe index
# Add a blank company number column so the company number from the linked table can be included in results
# Subset down to the required columns

splink_columns = ['family_name','given_name','year','month','unique_id','company_number','title']

df["company_number"] = np.nan
df['unique_id'] = df.index
df = df[splink_columns]

In [20]:
# Read Persons with Significant Control file created in Download Data notebook.
# Names and company numbers are read as strings. Company numbers are identifiers
# (e.g. '09145694', 'OC316772'), so the dtype is pinned rather than inferred:
# this guarantees leading zeros are kept and avoids mixed-type inference.

df_psc = pd.read_csv(
    'psc_slim.csv',
    dtype={
        'company_number': 'string',
        'data.name_elements.surname': 'string',
        'data.name_elements.forename': 'string',
    },
)

In [21]:
# Give the PSC columns the same names and types as the MP dataframe for matching

# Birth year/month become nullable integers so missing values stay <NA>
for part in ('year', 'month'):
    df_psc[part] = df_psc[f'data.date_of_birth.{part}'].astype('Int64')

# Name elements are copied across under the shared column names
psc_name_sources = {
    'given_name': 'data.name_elements.forename',
    'family_name': 'data.name_elements.surname',
    'title': 'data.name_elements.title',
}
for target_col, source_col in psc_name_sources.items():
    df_psc[target_col] = df_psc[source_col]

# Create the unique id column needed by Splink from the dataframe index
# Subset down to the required columns

df_psc['unique_id'] = df_psc.index
psc_columns = ['company_number','given_name','family_name','year','month', 'unique_id','title']
df_psc = df_psc[psc_columns]

In [22]:
# Preview the first two rows of the prepared MP dataframe
df.head(2)

Unnamed: 0,family_name,given_name,year,month,unique_id,company_number,title
0,Frith,James,1977,4,0,,
1,,Anna,1971,1,1,,


In [23]:
# Display the prepared PSC dataframe (the rendered output is truncated to the first and last rows)
df_psc

Unnamed: 0,company_number,given_name,family_name,year,month,unique_id,title
0,09145694,Nga,Wildman,1977,2,0,Mrs
1,08581893,Stephen,Davies,1947,9,1,Mr
2,08581893,Quentin,Solt,1965,6,2,Mr
3,01605766,Jeremy,Wright,1960,10,3,Mr
4,03306517,Keith,Tarrant,1955,5,4,
...,...,...,...,...,...,...,...
1302365,OC316772,,,,,1302365,
1302366,04135166,,,,,1302366,
1302367,07496736,,,,,1302367,
1302368,09900667,,,,,1302368,


In [38]:
# Splink settings: predictions are generated only for pairs that share birth
# year and month; given and family names are then compared with Jaro-Winkler

from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl


def _name_comparison(column):
    """Build the Jaro-Winkler comparison used for a name column.

    Uses similarity thresholds 0.95, 0.9, 0.8 and 0.7 with term-frequency
    adjustments switched on.
    """
    return cl.jaro_winkler_at_thresholds(
        column, [0.95, 0.9, 0.8, 0.7], term_frequency_adjustments=True
    )


settings = dict(
    link_type="link_only",
    blocking_rules_to_generate_predictions=[
        "l.year = r.year and l.month = r.month",
    ],
    comparisons=[
        _name_comparison("given_name"),
        _name_comparison("family_name"),
        cl.exact_match("month"),
        cl.exact_match("year", term_frequency_adjustments=True),
    ],
)

In [39]:
# Build the linker over both dataframes

linker = DuckDBLinker(
    [df, df_psc],
    settings,
    input_table_aliases=["df", "df_psc"],
)

# Estimate the 'prior' (the probability that two random records match),
# using a deterministic rule on both names and birth month with an assumed recall of 0.8

deterministic_rules = [
    "l.given_name = r.given_name and l.family_name = r.family_name and l.month = r.month"
]
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.8)



Probability two random records match is estimated to be  3.62e-07.
This means that amongst all possible pairwise record comparisons, one in 2,762,370.02 are expected to match.  With 1,871,505,690 total possible comparisons, we expect a total of around 677.50 matching pairs


In [40]:
# Estimate the u probabilities using random sampling (target_rows=5e7);
# the m probabilities remain untrained after this step
linker.estimate_u_using_random_sampling(target_rows=5e7)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - given_name (no m values are trained).
    - family_name (no m values are trained).
    - month (no m values are trained).
    - year (no m values are trained).


In [41]:
# Estimate the m probabilities with two EM training sessions.
# A session cannot estimate parameters for the comparisons used in its own
# blocking rule, so the two rules are complementary: the first trains
# given_name and year, the second trains family_name and month.

surname_month_rule = "l.family_name = r.family_name and l.month = r.month"
forename_year_rule = "l.given_name = r.given_name and  l.year = r.year"

linker.estimate_parameters_using_expectation_maximisation(surname_month_rule)
linker.estimate_parameters_using_expectation_maximisation(forename_year_rule)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.family_name = r.family_name and l.month = r.month

Parameter estimates will be made for the following comparison(s):
    - given_name
    - year

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - family_name
    - month

Iteration 1: Largest change in params was -0.0345 in the m_probability of year, level `All other comparisons`
Iteration 2: Largest change in params was -0.0251 in the m_probability of given_name, level `Exact match`
Iteration 3: Largest change in params was -0.0441 in the m_probability of given_name, level `Exact match`
Iteration 4: Largest change in params was -0.0381 in the m_probability of year, level `Exact match`
Iteration 5: Largest change in params was 0.0537 in the m_probability of year, level `All other comparisons`
Iteration 6: Largest change in params was 0.0644 in the m_probability of yea

<EMTrainingSession, blocking on l.given_name = r.given_name and  l.year = r.year, deactivating comparisons given_name, year>

In [42]:
# Display the linker's match weights chart for the trained model
linker.match_weights_chart()

In [43]:
# Predict matches using a match-probability threshold of 0.7 and convert the result to a pandas dataframe
# TODO: investigate why such a low probability threshold is needed

results = linker.predict(threshold_match_probability=0.7)
pres = results.as_pandas_dataframe()
pres

Unnamed: 0,match_weight,match_probability,source_dataset_l,unique_id_l,source_dataset_r,unique_id_r,given_name_l,given_name_r,gamma_given_name,family_name_l,family_name_r,gamma_family_name,month_l,month_r,gamma_month,year_l,year_r,gamma_year
0,6.495777,0.989041,df,513,df_psc,21278,Andy,Andy,5,Reed,Reed,5,9,9,1,1964,1964,1
1,5.541517,0.978981,df,879,df_psc,419967,Charles,Charles,5,Hendry,Hendry,5,5,5,1,1959,1959,1
2,6.535698,0.989336,df,1048,df_psc,562214,Margaret,Margaret,5,Hodge,Hodge,5,9,9,1,1944,1944,1
3,9.989728,0.999017,df,346,df_psc,542129,Robert,Robert,5,Sheldon,Sheldon,5,9,9,1,1923,1923,1
4,11.152524,0.999561,df,1346,df_psc,1044585,Hilton,Hilton,5,Dawson,Dawson,5,9,9,1,1953,1953,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,5.652953,0.980513,df,822,df_psc,173003,Paul,Paul,5,Masterton,Masterton,5,11,11,1,1985,1985,1
131,14.078021,0.999942,df,786,df_psc,479788,Meg,Meg,5,Munn,Munn,5,8,8,1,1959,1959,1
132,1.240928,0.702691,df,419,df_psc,521265,Colleen,Carolyn,1,Fletcher,Fletcher,5,11,11,1,1954,1954,1
133,7.692262,0.995188,df,347,df_psc,716443,George,George,5,Hollingbery,Hollingbery,5,10,10,1,1963,1963,1


In [44]:
# Show predicted matches where the family name or the given name is not identical

surname_differs = pres['family_name_l'] != pres['family_name_r']
forename_differs = pres['given_name_l'] != pres['given_name_r']
pres[surname_differs | forename_differs]

Unnamed: 0,match_weight,match_probability,source_dataset_l,unique_id_l,source_dataset_r,unique_id_r,given_name_l,given_name_r,gamma_given_name,family_name_l,family_name_r,gamma_family_name,month_l,month_r,gamma_month,year_l,year_r,gamma_year
9,6.536685,0.989344,df,635,df_psc,528253,Bill,William,1,Rammell,Rammell,5,10,10,1,1959,1959,1
13,6.661395,0.990217,df,398,df_psc,692864,Christopher,Chris,2,Philp,Philp,5,7,7,1,1976,1976,1
14,1.309786,0.712565,df,1310,df_psc,118348,Angus,Angus,5,Robertson,Henderson,1,9,9,1,1969,1969,1
18,6.536685,0.989344,df,635,df_psc,742226,Bill,William,1,Rammell,Rammell,5,10,10,1,1959,1959,1
21,4.86953,0.966924,df,372,df_psc,1159821,Grahame,Grahame,5,Morris,Morris Mp,3,3,3,1,1961,1961,1
22,1.235106,0.701847,df,931,df_psc,608031,Tim,Timothy,2,Collins,Collins,5,5,5,1,1964,1964,1
25,7.884867,0.995787,df,469,df_psc,629843,Danny,Daniel,1,Kinahan,Kinahan,5,4,4,1,1958,1958,1
34,6.536685,0.989344,df,635,df_psc,272087,Bill,Willian,1,Rammell,Rammell,5,10,10,1,1959,1959,1
35,3.332254,0.909681,df,526,df_psc,403483,Drew,Andrew,1,Hendry,Hendry,5,5,5,1,1964,1964,1
40,4.755539,0.964301,df,109,df_psc,213178,Archie,Archibald,2,Hamilton,Hamilton,5,12,12,1,1941,1941,1


In [18]:
# Calculate exact matches using a simple join on names and birth year/month.
# pandas.merge pairs rows whose key is null on BOTH sides (unlike a SQL join,
# and unlike the Splink rules above), so records with any missing key are
# dropped first to stop "missing" matching "missing".

match_keys = ['family_name', 'given_name', 'year', 'month']
df_result = df.dropna(subset=match_keys).merge(
    df_psc.dropna(subset=match_keys),
    on=match_keys,
    suffixes=('_left', '_right'),
)
df_result

Unnamed: 0,family_name,given_name,year,month,unique_id_left,company_number_left,title_left,company_number_right,unique_id_right,title_right
0,Gibson,Ian,1938,9,20,,,12814692,1043871,Dr
1,Roche,Barbara,1954,4,24,,The Right Honourable,08544993,518135,Mrs
2,Clark,Colin,1969,5,58,,,SC274212,127735,Mr
3,Whittaker,Craig,1962,8,82,,,13029479,1082087,Mr
4,Graham,Richard,1958,4,83,,,03426607,678833,Mr
...,...,...,...,...,...,...,...,...,...,...
118,Hunt,Jeremy,1966,11,1311,,The Right Honourable,02471319,275571,Mr
119,Dawson,Hilton,1953,9,1346,,,10204648,1044585,Mr
120,Cameron,David,1966,10,1350,,The Right Honourable,05289086,4970,Rt Hon
121,May,Theresa,1956,10,1351,,The Right Honourable,00464224,335637,Rt Hon


In [19]:
import recordlinkage

ModuleNotFoundError: No module named 'recordlinkage'

In [None]:
# Block on year AND month: only pairs sharing both values become candidates.
# Each separate indexer.block() call registers its own algorithm and the
# resulting pairs are combined as a union (year OR month), which is not what
# is wanted here, so both keys are passed to a single block() call.

indexer = recordlinkage.Index()
indexer.block(["year", "month"])
candidate_links = indexer.index(df, df_psc)
len(candidate_links)

In [None]:
# Compute one exact-match feature per compared column for every candidate pair

exact_features = [
    ("given_name", "gname"),
    ("family_name", "fname"),
    ("year", "year_of_birth"),
    ("month", "month_of_birth"),
]

compare_cl = recordlinkage.Compare()
for column, feature_label in exact_features:
    # The same column name is used on both dataframes
    compare_cl.exact(column, column, label=feature_label)

features = compare_cl.compute(candidate_links, df, df_psc)
features

In [None]:
# Count candidate pairs by the number of features that agree (the row sum),
# listed from the highest number of agreeing features down

features.sum(axis=1).value_counts().sort_index(ascending=False)

In [None]:
# Select candidate pairs where more than two of the four exact-match features agree
# NOTE(review): this is a row-sum test, so it does not require both names to match;
# use a sum equal to 4 if an exact match on every field is intended - confirm

features[features.sum(axis=1) > 2]