### Person Match Example

Match House of Commons MPs to Persons with Significant Control of UK Companies

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Load the UK House of Commons JSON file, flatten its 'persons' records into a dataframe
# and rename the 'email' column to 'primaryemail'

with open('ep-popolo-v1.0.json', encoding="utf8") as popolo_file:
    json_dict = json.load(popolo_file)

df = (
    pd.json_normalize(json_dict, record_path=['persons'])
    .rename(columns={'email':'primaryemail'})
)

In [3]:
# Replace missing contact details with an empty dict in a list to allow normalisation.
# Anything that is not already a list (NaN, None, ...) is treated as missing; this avoids
# the `np.NaN` alias (removed in NumPy 2.0) and an identity check against the NaN singleton.

df['contact_details'] = [
    details if isinstance(details, list) else [{}]
    for details in df['contact_details']
]

In [4]:
import warnings
# Installs an 'ignore' filter for all warnings; it stays active for later cells too
warnings.filterwarnings('ignore')
# Extract the extra attributes embedded as json in a list in contact details column
# Pivot and join with dataframe, adding as extra columns

# Normalise the contact_details lists, unstack the result and expand each entry with pd.Series
# NOTE(review): presumably each entry is a dict with 'type' and 'value' keys - confirm against the JSON
df_info = pd.json_normalize(df.to_dict('list'), ['contact_details']).unstack().apply(pd.Series)
# Pivot on index level 1 of the unstacked data with one column per 'type';
# several values of the same type for one index entry are joined with commas
# NOTE(review): level 1 is presumably the row position of the person in df - confirm
df_extract = df_info.pivot_table(index=df_info.index.get_level_values(1), columns=['type'], 
                         values=['value'], aggfunc=','.join)
# Take the 'value' level of the pivoted columns and append it to df column-wise
df = pd.concat([df, df_extract.xs('value', axis=1)], axis=1)

In [5]:
# Parse the birth date once, then store its year and month as nullable integers (Int64)

birth_dates = pd.to_datetime(df['birth_date'])
df['year'] = birth_dates.dt.year.astype('Int64')
df['month'] = birth_dates.dt.month.astype('Int64')

# Cast both name columns to the pandas string dtype

for name_column in ('family_name', 'given_name'):
    df[name_column] = df[name_column].astype('string')

In [6]:
# Splink needs a unique id column, so derive one from the dataframe index
# Add a blank company_number column so the company number from the linked table can be included in results
# Keep only the columns required for matching

mp_columns = ['family_name','given_name','year','month','unique_id','company_number']
df = df.assign(unique_id=df.index, company_number=np.nan)[mp_columns]

In [7]:
# Read Persons with Significant Control file created in Download Data notebook
# Company numbers are identifiers (they can start with zeros or letters such as 'SC'),
# so read them as strings, like the name columns, instead of leaving them to type inference

df_psc = pd.read_csv(
    'psc_slim.csv',
    dtype={
        'company_number': 'string',
        'data.name_elements.surname': 'string',
        'data.name_elements.forename': 'string',
    },
)

In [8]:
# Build the matching columns: nullable-integer year and month of birth, plus
# given and family names copied from the PSC name elements
# Splink needs a unique id column, so derive one from the dataframe index

df_psc = df_psc.assign(
    year=df_psc['data.date_of_birth.year'].astype('Int64'),
    month=df_psc['data.date_of_birth.month'].astype('Int64'),
    given_name=df_psc['data.name_elements.forename'],
    family_name=df_psc['data.name_elements.surname'],
    unique_id=df_psc.index,
)

# Keep only the columns required for matching

psc_columns = ['company_number','given_name','family_name','year','month', 'unique_id']
df_psc = df_psc.loc[:, psc_columns]

## Splink

In [9]:
# Splink settings to block on year and month matches and then compare given and family names

from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl
def _name_similarity(column_name):
    """Build the term-frequency-adjusted Jaro-Winkler comparison used for a name column."""
    return cl.jaro_winkler_at_thresholds(
        column_name, [0.95, 0.9, 0.8, 0.7], term_frequency_adjustments=True
    )


# Link the two tables only (no de-duplication), generating candidate pairs
# that share the same birth year and month
settings = {
    "link_type": "link_only",
    "blocking_rules_to_generate_predictions": [
        "l.year = r.year and l.month = r.month"
    ],
    "comparisons": [
        _name_similarity("given_name"),
        _name_similarity("family_name"),
        cl.exact_match("month"),
        cl.exact_match("year", term_frequency_adjustments=True),
    ],
}

In [10]:
# Create the DuckDB linker over both tables, then profile the columns used for matching

input_tables = [df, df_psc]
profiled_columns = ["given_name","family_name", "year", "month"]

linker = DuckDBLinker(input_tables, settings, input_table_aliases=["df", "df_psc"])
linker.profile_columns(profiled_columns, top_n=10, bottom_n=5)

In [11]:
# Check for missing values by displaying the linker's missingness chart

linker.missingness_chart()

In [12]:
# Estimate the 'prior' (the probability that two random records match) from a
# deterministic rule on given name, family name and birth month, with recall of 0.8

deterministic_rules = [
    "l.given_name = r.given_name and l.family_name = r.family_name and l.month = r.month"
]
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.8)

Probability two random records match is estimated to be  4.3e-07.
This means that amongst all possible pairwise record comparisons, one in 2,325,490.03 are expected to match.  With 1,595,867,535 total possible comparisons, we expect a total of around 686.25 matching pairs


In [13]:
# Estimate u values (u probabilities) using random sampling, with target_rows set to 5e7
# The m values are not trained by this step
# NOTE(review): no seed is passed, so the sample - and the estimates - may differ between runs;
# confirm whether the installed Splink version supports seeding

linker.estimate_u_using_random_sampling(target_rows=5e7)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - given_name (no m values are trained).
    - family_name (no m values are trained).
    - month (no m values are trained).
    - year (no m values are trained).


In [14]:
# Calculate m values with two EM training sessions
# A session cannot estimate the comparisons used in its own blocking rule,
# so the two rules block on different column pairs

block_on_family_name_month = "l.family_name = r.family_name and l.month = r.month"
block_on_given_name_year = "l.given_name = r.given_name and  l.year = r.year"

linker.estimate_parameters_using_expectation_maximisation(block_on_family_name_month)
linker.estimate_parameters_using_expectation_maximisation(block_on_given_name_year)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.family_name = r.family_name and l.month = r.month

Parameter estimates will be made for the following comparison(s):
    - given_name
    - year

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - family_name
    - month

Iteration 1: Largest change in params was 0.126 in the m_probability of year, level `All other comparisons`
Iteration 2: Largest change in params was -0.046 in the m_probability of year, level `Exact match`
Iteration 3: Largest change in params was 0.0331 in the m_probability of year, level `All other comparisons`
Iteration 4: Largest change in params was -0.0284 in the m_probability of year, level `Exact match`
Iteration 5: Largest change in params was -0.0241 in the m_probability of year, level `Exact match`
Iteration 6: Largest change in params was 0.0201 in the m_probability of year, level `All o

<EMTrainingSession, blocking on l.given_name = r.given_name and  l.year = r.year, deactivating comparisons given_name, year>

In [15]:
# Display the match weights chart for the linker's model
linker.match_weights_chart()

In [16]:
# Display the chart of the model's m and u parameters
linker.m_u_parameters_chart()

In [17]:
# Predict matches and convert to dataframe
# threshold_match_probability=0.7 restricts the predictions by match probability;
# the final bare `pres` displays the dataframe as the cell output

results = linker.predict(threshold_match_probability=0.7)
pres = results.as_pandas_dataframe()
pres

Unnamed: 0,match_weight,match_probability,source_dataset_l,unique_id_l,source_dataset_r,unique_id_r,given_name_l,given_name_r,gamma_given_name,family_name_l,family_name_r,gamma_family_name,month_l,month_r,gamma_month,year_l,year_r,gamma_year
0,8.036625,0.996206,df,993,df_psc,1052239,Ben,Benedict,2,Gummer,Gummer,5,2,2,1,1978,1978,1
1,16.521976,0.999989,df,1000,df_psc,1070477,Natascha,Natascha,5,Engel,Engel,5,4,4,1,1967,1967,1
2,6.278989,0.987286,df,1157,df_psc,1096194,Robert,Robert,5,Buckland,Buckland,5,9,9,1,1968,1968,1
3,1.496733,0.738359,df,325,df_psc,1096892,Roger,Roger,5,Williams,Wild,1,1,1,1,1948,1948,1
4,2.059880,0.806558,df,1064,df_psc,1099720,Malcolm,Malcolm,5,Wicks,Quick,1,7,7,1,1947,1947,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,10.347116,0.999233,df,590,df_psc,188533,Kerry,Kerry,5,Pollard,Pollard,5,4,4,1,1944,1944,1
152,6.911494,0.991762,df,20,df_psc,927461,Ian,Ian,5,Gibson,Gibson,5,9,9,1,1938,1938,1
153,13.951945,0.999937,df,1346,df_psc,928114,Hilton,Hilton,5,Dawson,Dawson,5,9,9,1,1953,1953,1
154,13.286275,0.999900,df,611,df_psc,167618,Alister,Alister,5,Jack,Jack,5,7,7,1,1963,1963,1


In [18]:
# Show predicted pairs where the family name or the given name is not identical

family_name_differs = pres['family_name_l'] != pres['family_name_r']
given_name_differs = pres['given_name_l'] != pres['given_name_r']
pres[family_name_differs | given_name_differs]

Unnamed: 0,match_weight,match_probability,source_dataset_l,unique_id_l,source_dataset_r,unique_id_r,given_name_l,given_name_r,gamma_given_name,family_name_l,family_name_r,gamma_family_name,month_l,month_r,gamma_month,year_l,year_r,gamma_year
0,8.036625,0.996206,df,993,df_psc,1052239,Ben,Benedict,2,Gummer,Gummer,5,2,2,1,1978,1978,1
3,1.496733,0.738359,df,325,df_psc,1096892,Roger,Roger,5,Williams,Wild,1,1,1,1,1948,1948,1
4,2.05988,0.806558,df,1064,df_psc,1099720,Malcolm,Malcolm,5,Wicks,Quick,1,7,7,1,1947,1947,1
8,1.747773,0.770558,df,25,df_psc,849309,Gordon,Gordon,5,Marsden,Paterson,1,11,11,1,1953,1953,1
9,1.747773,0.770558,df,25,df_psc,849370,Gordon,Gordon,5,Marsden,Paterson,1,11,11,1,1953,1953,1
10,5.021392,0.97013,df,1407,df_psc,881270,Gerry,Gerard,2,Sutcliffe,Sutcliffe,5,5,5,1,1953,1953,1
15,1.496733,0.738359,df,325,df_psc,514682,Roger,Roger,5,Williams,Wild,1,1,1,1,1948,1948,1
16,9.128658,0.998217,df,622,df_psc,518735,Alex,Alexander,2,Salmond,Salmond,5,12,12,1,1954,1954,1
23,7.527267,0.994608,df,469,df_psc,559452,Danny,Daniel,1,Kinahan,Kinahan,5,4,4,1,1958,1958,1
30,1.543226,0.744537,df,695,df_psc,546348,Ian,Ian,5,Pearson,Parison,3,4,4,1,1959,1959,1


In [19]:
# Exact matches for comparison: inner join on identical family name, given name,
# birth year and birth month

exact_match_keys = ['family_name','given_name','year','month']
df_result = pd.merge(df, df_psc, on=exact_match_keys, suffixes=('_left', '_right'))
df_result

Unnamed: 0,family_name,given_name,year,month,unique_id_left,company_number_left,company_number_right,unique_id_right
0,Gibson,Ian,1938,9,20,,12814692,927461
1,Roche,Barbara,1954,4,24,,08544993,462175
2,Clark,Colin,1969,5,58,,SC274212,114205
3,Whittaker,Craig,1962,8,82,,13029479,961966
4,Graham,Richard,1958,4,83,,03426607,602104
...,...,...,...,...,...,...,...,...
123,Hunt,Jeremy,1966,11,1311,,02471319,245609
124,Dawson,Hilton,1953,9,1346,,10204648,928114
125,Cameron,David,1966,10,1350,,05289086,4439
126,May,Theresa,1956,10,1351,,00464224,299374
