### Person Match Example

Match House of Commons MPs to Persons with Significant Control of UK Companies

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Open UK House of Commons JSON, normalise, load into dataframe and rename columns

with open('ep-popolo-v1.0.json', encoding="utf8") as f:
    json_dict = json.load(f)
    
df = pd.json_normalize(json_dict, record_path=['persons'])
df = df.rename(columns={'email':'primaryemail'})

In [3]:
# Replace nulls with empty dict in a list to allow normalisation

df['contact_details'] = [ [{}] if x is np.NaN else x for x in df['contact_details'] ]

In [4]:
# Extract the extra attributes embedded as json in a list in contact details column
# Pivot and join with dataframe, adding as extra columns

df_info = pd.json_normalize(df.to_dict('list'), ['contact_details']).unstack().apply(pd.Series)
df_extract = df_info.pivot_table(index=df_info.index.get_level_values(1), columns=['type'], 
                         values=['value'], aggfunc=','.join)
df = pd.concat([df, df_extract.xs('value', axis=1)], axis=1)

In [5]:
# Extract year and month and convert to Int 

df['year'] = pd.to_datetime(df['birth_date']).dt.year.astype('Int64')
df['month'] = pd.to_datetime(df['birth_date']).dt.month.astype('Int64')

# Convert to string and rename

df['family_name']=df['family_name'].astype('string')
df['given_name']=df['given_name'].astype('string')
df['title']=df['honorific_prefix'].astype('string')

In [6]:
# Create unique index column needed by Splink from datafram index
# Create blank company number company to enable company number from linked table to be included in results
# Subset down to required columns

df['unique_id'] = df.index
df["company_number"] = np.nan
df = df[['family_name','given_name','year','month','unique_id','company_number','title']]

In [8]:
# Read Persons with Significant Control file created in Download Data notebook

df_psc = pd.read_csv('psc_slim.csv', dtype={'data.name_elements.surname':'string','data.name_elements.forename':'string'})

In [9]:
# Rename and convert columns for matching

df_psc['year'] = df_psc['data.date_of_birth.year'].astype('Int64')
df_psc['month'] = df_psc['data.date_of_birth.month'].astype('Int64')
df_psc['given_name']=df_psc['data.name_elements.forename']
df_psc['family_name']=df_psc['data.name_elements.surname']
df_psc['title']=df_psc['data.name_elements.title']

# Create unique index column needed by Splink from datafram index
# Subset down to required columns

df_psc['unique_id'] = df_psc.index
df_psc = df_psc[['company_number','given_name','family_name','year','month', 'unique_id','title']]

In [11]:
import splink

In [12]:
# Splink settings to block on year and month matches and then compare given and family names

from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl
settings = {
    "link_type": "link_only",
    "blocking_rules_to_generate_predictions": [
        "l.year = r.year and l.month = r.month"
    ],
    "comparisons": [
        cl.jaccard_at_thresholds("given_name"),
        cl.jaccard_at_thresholds("family_name"),
    ],       
}

In [13]:
# Call linker and estimate u values

linker = DuckDBLinker([df, df_psc], settings, input_table_aliases=["df", "df_psc"])
linker.estimate_u_using_random_sampling(target_rows=1e6)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - given_name (no m values are trained).
    - family_name (no m values are trained).


In [14]:
# Calculate m values 

linker.estimate_parameters_using_expectation_maximisation("l.family_name = r.family_name")
linker.estimate_parameters_using_expectation_maximisation("l.given_name = r.given_name")


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.family_name = r.family_name

Parameter estimates will be made for the following comparison(s):
    - given_name

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - family_name

Iteration 1: Largest change in params was 0.0461 in the m_probability of given_name, level `Exact match`
Iteration 2: Largest change in params was 0.0215 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.00823 in probability_two_random_records_match
Iteration 4: Largest change in params was 0.00156 in probability_two_random_records_match
Iteration 5: Largest change in params was 0.000248 in probability_two_random_records_match
Iteration 6: Largest change in params was 3.83e-05 in probability_two_random_records_match

EM converged after 6 iterations

Your model is not yet fully trained. Missing estimates for:
  

<EMTrainingSession, blocking on l.given_name = r.given_name, deactivating comparisons given_name>

In [15]:
# Predict matches and convert to dataframe
# To fix: Why such a low probability needed?

results = linker.predict(threshold_match_probability=0.001)
pres = results.as_pandas_dataframe()
pres

Unnamed: 0,match_weight,match_probability,source_dataset_l,unique_id_l,source_dataset_r,unique_id_r,given_name_l,given_name_r,gamma_given_name,family_name_l,family_name_r,gamma_family_name,month_l,month_r,year_l,year_r
0,-0.881856,0.351773,df,786,df_psc,477783,Meg,Meg,3,Munn,Munn,3,8,8,1959,1959
1,-0.881856,0.351773,df,347,df_psc,667876,George,George,3,Hollingbery,Hollingbery,3,10,10,1963,1963
2,-0.881856,0.351773,df,347,df_psc,667893,George,George,3,Hollingbery,Hollingbery,3,10,10,1963,1963
3,-0.881856,0.351773,df,1195,df_psc,89710,Leo,Leo,3,Docherty,Docherty,3,10,10,1976,1976
4,-0.881856,0.351773,df,695,df_psc,129120,Ian,Ian,3,Pearson,Pearson,3,4,4,1959,1959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,-0.881856,0.351773,df,1157,df_psc,1225659,Robert,Robert,3,Buckland,Buckland,3,9,9,1968,1968
131,-0.881856,0.351773,df,1089,df_psc,1112259,Shahid,Shahid,3,Malik,Malik,3,11,11,1967,1967
132,-6.731982,0.009320,df,372,df_psc,1154608,Grahame,Grahame,3,Morris,Morris Mp,1,3,3,1961,1961
133,-0.881856,0.351773,df,1000,df_psc,1198154,Natascha,Natascha,3,Engel,Engel,3,4,4,1967,1967


In [16]:
# Select matches that aren't exact

pres[(pres['family_name_l']!=pres['family_name_r']) | (pres['given_name_l']!=pres['given_name_r'])]

Unnamed: 0,match_weight,match_probability,source_dataset_l,unique_id_l,source_dataset_r,unique_id_r,given_name_l,given_name_r,gamma_given_name,family_name_l,family_name_r,gamma_family_name,month_l,month_r,year_l,year_r
16,-6.731982,0.00932,df,1320,df_psc,132641,Stephen,Stephen,3,McPartland,Mcpartland,1,8,8,1976,1976
38,-6.731982,0.00932,df,695,df_psc,612231,Ian,Ian,3,Pearson,Parison,1,4,4,1959,1959
51,-6.731982,0.00932,df,1155,df_psc,657226,Paul,Paul,3,Marsden,Mears,1,3,3,1968,1968
52,-6.731982,0.00932,df,1155,df_psc,929922,Paul,Paul,3,Marsden,Mears,1,3,3,1968,1968
53,-6.731982,0.00932,df,1067,df_psc,741765,Jack,Jack,3,Brereton,Stoner,1,5,5,1991,1991
56,-6.731982,0.00932,df,1233,df_psc,257038,Peter,Peter,3,Bradley,Brearley,1,4,4,1953,1953
64,-6.731982,0.00932,df,1102,df_psc,703241,Mark,Mark,3,Garnier,Gardner,1,2,2,1963,1963
65,-6.731982,0.00932,df,1102,df_psc,703242,Mark,Mark,3,Garnier,Gardner,1,2,2,1963,1963
82,-6.731982,0.00932,df,265,df_psc,659818,Andrew,Andrew,3,Jones,Jewson,1,11,11,1963,1963
95,-6.731982,0.00932,df,168,df_psc,951413,Stephen,Stephen,3,Phillips,Phillippo,1,3,3,1970,1970


In [17]:
# Calculate exact match using a simply join

df_result = df.merge(df_psc, left_on=['family_name','given_name','year','month'], right_on=['family_name','given_name','year','month'],
          suffixes=('_left', '_right'))
df_result

Unnamed: 0,family_name,given_name,year,month,unique_id_left,company_number_left,title_left,company_number_right,unique_id_right,title_right
0,Gibson,Ian,1938,9,20,,,12814692,1039673,Dr
1,Roche,Barbara,1954,4,24,,The Right Honourable,08544993,516016,Mrs
2,Clark,Colin,1969,5,58,,,SC274212,127379,Mr
3,Whittaker,Craig,1962,8,82,,,13029479,1077609,Mr
4,Graham,Richard,1958,4,83,,,03426607,676201,Mr
...,...,...,...,...,...,...,...,...,...,...
116,Hunt,Jeremy,1966,11,1311,,The Right Honourable,02471319,274340,Mr
117,Dawson,Hilton,1953,9,1346,,,10204648,1040372,Mr
118,Cameron,David,1966,10,1350,,The Right Honourable,05289086,4880,Rt Hon
119,May,Theresa,1956,10,1351,,The Right Honourable,00464224,334190,Rt Hon


In [18]:
import recordlinkage

In [None]:
# Block on year and month matches

indexer = recordlinkage.Index()
indexer.block("year")
indexer.block("month")
candidate_links = indexer.index(df, df_psc)
len(candidate_links)

In [12]:
# Calculate matches as features

compare_cl = recordlinkage.Compare()
compare_cl.exact("given_name", "given_name", label="gname")
compare_cl.exact("family_name", "family_name", label="fname")
compare_cl.exact("year", "year", label="year_of_birth")
compare_cl.exact("month", "month", label="month_of_birth")
features = compare_cl.compute(candidate_links, df, df_psc)
features

In [14]:
# Get statistics on frequency of number of records matching 

features.sum(axis=1).value_counts().sort_index(ascending=False)

3      26
2     153
1    1574
dtype: int64

In [15]:
# Select those with name, year and month match

features[features.sum(axis=1) > 2]

Unnamed: 0,Unnamed: 1,name,year_of_birth,month_of_birth
50,3936482,1,1,1
135,1593395,1,1,1
135,1593770,1,1,1
135,3734393,1,1,1
314,5929680,1,1,1
314,6427452,1,1,1
314,7418248,1,1,1
332,4622988,1,1,1
397,447090,1,1,1
432,6893411,1,1,1
