In [1]:
# Example of an entity resolution approach: compiling available information about entities 
# (here, fictional persons) from multiple tables. This starts with a (multi-table) record 
# linkage step as before, and then clusters the results by entity.

In [2]:
import numpy as np
import numbers
import pandas as pd

import phonetics

from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl

In [3]:
import IPython

In [4]:
import vega

Setup
==

Vega setup for charts
--
Splink uses Vega for inline charts. We need a bit of setup to make sure these are displayed correctly.

In [5]:
# !jupyter nbextension install --sys-prefix --py vega

In [6]:
!jupyter nbextension enable vega --py --sys-prefix

Enabling notebook extension jupyter-vega/extension...
      - Validating: ok


In [7]:
# # A minimal example for debugging:
# vega.Vega({
#     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
#     "data": {
#         "values": [
#             {"a": "A", "b": 28},
#             {"a": "B", "b": 55},
#             {"a": "C", "b": 43},
#         ]
#     },
#     "mark": "bar",
#     "encoding": {
#         "x": {"field": "b", "type": "quantitative"},
#         "y": {"field": "a", "type": "nominal"}
#     }
# })

Data
==

Tables
--

In [8]:
table1 = pd.read_csv('data/interim/table1_with_duplicates.csv', dtype=str)
table2 = pd.read_csv('data/interim/table2_with_duplicates.csv', dtype=str)
table3 = pd.read_csv('data/interim/table3_with_duplicates.csv', dtype=str)
table4 = pd.read_csv('data/interim/table4_with_duplicates.csv', dtype=str)

Feature vector
--

In [9]:
# In practice we would first need to preprocess all tables to make them fit a standardised format --
# we can skip this here as the test data already comes standardised.

In [10]:
# Label them
table1.insert(0, 'Source', 'Table 1')
table2.insert(0, 'Source', 'Table 2')
table3.insert(0, 'Source', 'Table 3')
table4.insert(0, 'Source', 'Table 4')

# Combine to single DF
features = pd.concat([
    table1, table2, table3, table4
])
features.sample(5)

Unnamed: 0,Source,unique_id,city,postcode,full_name,first_and_surname,first_name,surname,dob,gender,cluster
5763,Table 1,Q389497-3,barnsbury,n7 6lj,benjamin glover laws,benjamin laws,benjamin,laws,1861-02-06,male,Q389497
920,Table 1,Q98761006-3,birmingham,b29 4jh,frank henry partridge,frank partridge,frank,partridge,1849-01-11,male,Q98761006
6047,Table 3,Q7790900-5,"kingston upon hull, city of",hu8 8bz,humphry ward,humphry ward,humphry,ward,1842-11-09,male,Q7790900
1911,Table 2,Q43127454-7,wales,,david bevan,david bevan,david,bevan,1852-01-01,male,Q43127454
9018,Table 1,Q5393276-4,hillingdon,ub5 6hw,ernest hirst,ernest hirst,ernest,hirst,1855-02-27,male,Q5393276


Entity resolution
==
Based on https://moj-analytical-services.github.io/splink/demos/examples/duckdb/deduplicate_50k_synthetic.html

Entity resolution of person names across all data sets. This then also effectively deduplicates any entries within each individual source table.

Settings
--

In [11]:
person_match_settings = {
    "link_type": "dedupe_only",
    "unique_id_column_name": "unique_id",
    
    # Blocking, for performance
    "blocking_rules_to_generate_predictions": [
        "l.postcode = r.postcode",
    ],
    "comparisons": [
        cl.exact_match("city", term_frequency_adjustments=False),
        
        ctl.name_comparison("first_name"), 
        ctl.name_comparison("surname"), 
#         ctl.name_comparison("full_name"), 
#         ctl.forename_surname_comparison("first_name", "surname"),
    ],
    
    # Needed for waterfall charts of results
    "retain_intermediate_calculation_columns": True,
    "retain_matching_columns": True,
}

Training
--

In [12]:
person_linker = DuckDBLinker(
    [features],
    person_match_settings)

In [13]:
person_linker.missingness_chart()

In [14]:
person_linker.estimate_probability_two_random_records_match(
    #  A list of deterministic matching rules that should be designed to admit 
    # very few (none if possible) false positives:
    [
        "l.city = r.city and l.full_name = r.full_name",
    ], 
    recall=0.9)

Probability two random records match is estimated to be  3.77e-05.
This means that amongst all possible pairwise record comparisons, one in 26,534.30 are expected to match.  With 1,279,041,753 total possible comparisons, we expect a total of around 48,203.33 matching pairs


In [15]:
# Estimate U
person_linker.estimate_u_using_random_sampling(max_pairs=1e6, seed=1)

----- Estimating u probabilities using random sampling -----


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))


Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - city (no m values are trained).
    - first_name (no m values are trained).
    - surname (no m values are trained).


In [16]:
session_person_uid = person_linker.estimate_parameters_using_expectation_maximisation(
    "l.city = r.city")


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.city = r.city

Parameter estimates will be made for the following comparison(s):
    - first_name
    - surname

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - city


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))


Iteration 1: Largest change in params was -0.25 in the m_probability of first_name, level `Exact match first_name`
Iteration 2: Largest change in params was -0.0882 in the m_probability of first_name, level `Exact match first_name`
Iteration 3: Largest change in params was 0.0787 in the m_probability of first_name, level `All other comparisons`
Iteration 4: Largest change in params was 0.0375 in the m_probability of first_name, level `All other comparisons`
Iteration 5: Largest change in params was 0.0122 in the m_probability of first_name, level `All other comparisons`
Iteration 6: Largest change in params was 0.00368 in the m_probability of first_name, level `All other comparisons`
Iteration 7: Largest change in params was 0.0011 in the m_probability of first_name, level `All other comparisons`
Iteration 8: Largest change in params was -0.000451 in the m_probability of surname, level `Exact match surname`
Iteration 9: Largest change in params was 0.000485 in the m_probability of sur

In [17]:
session_person_fsn = person_linker.estimate_parameters_using_expectation_maximisation(
    "l.first_name = r.first_name and l.surname = r.surname")


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name = r.first_name and l.surname = r.surname

Parameter estimates will be made for the following comparison(s):
    - city

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name
    - surname

Iteration 1: Largest change in params was -0.199 in probability_two_random_records_match
Iteration 2: Largest change in params was -0.0281 in probability_two_random_records_match
Iteration 3: Largest change in params was -0.00403 in probability_two_random_records_match
Iteration 4: Largest change in params was -0.000579 in probability_two_random_records_match
Iteration 5: Largest change in params was -8.34e-05 in probability_two_random_records_match

EM converged after 5 iterations

Your model is fully trained. All comparisons have at least one estimate for their m and u values


In [18]:
session_person_fulln = person_linker.estimate_parameters_using_expectation_maximisation(
    "l.full_name  = r.full_name ")


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.full_name  = r.full_name 

Parameter estimates will be made for the following comparison(s):
    - city
    - first_name
    - surname

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Level Damerau_levenshtein <= 1 on comparison first_name not observed in dataset, unable to train m value

Level Jaro_winkler_similarity >= 0.9 on comparison first_name not observed in dataset, unable to train m value

Level Jaro_winkler_similarity >= 0.8 on comparison first_name not observed in dataset, unable to train m value

Level All other comparisons on comparison first_name not observed in dataset, unable to train m value

Level Damerau_levenshtein <= 1 on comparison surname not observed in dataset, unable to train m value

Level Jaro_winkler_similarity >= 0.9 on comparison surname not observed in dataset, unable to train m value

Le

Review the parameters
--

In [19]:
person_linker.match_weights_chart()

In [20]:
person_linker.parameter_estimate_comparisons_chart()

In [21]:
person_linker.unlinkables_chart()

Linkage & deduplication
--

In [22]:
# Match probability threshold
# See the unlinkables chart above to determine a good value for this threshold, 
# and the match probabilities below for added context
person_match_threshold = 0.99

person_match_results = person_linker.predict(threshold_match_probability=person_match_threshold) 
person_match = person_match_results.as_pandas_dataframe()
person_match.sample(10)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,match_weight,match_probability,unique_id_l,unique_id_r,city_l,city_r,gamma_city,bf_city,first_name_l,first_name_r,gamma_first_name,bf_first_name,surname_l,surname_r,gamma_surname,bf_surname,postcode_l,postcode_r
37873,8.448621,0.997146,Q17627027-13,Q17627027-9,reading,reading,1,149.071901,martin,martin,4,58.559164,ridley,ridley,4,1061.913096,rg1 5bn,rg1 5bn
4884,8.448621,0.997146,Q75705682-1,Q75705682-8,hertsmere,hertsmere,1,149.071901,frederick,frederick,4,58.559164,minchin,minchin,4,1061.913096,wd23 4gt,wd23 4gt
32331,8.448621,0.997146,Q538999-2,Q538999-3,aldingham,aldingham,1,149.071901,reginald,reginald,4,58.559164,blomfield,blomfield,4,1061.913096,la12 9ry,la12 9ry
3940,7.592126,0.994844,Q2585321-15,Q2585321-8,yorkshire,yorkshire,1,149.071901,zmith,smith,3,32.341796,wigglesworth,wigglesworth,4,1061.913096,yo31 8jl,yo31 8jl
10074,8.448621,0.997146,Q21456207-1,Q21456207-15,kingston upon hull,kingston upon hull,1,149.071901,edward,edward,4,58.559164,redmore,redmore,4,1061.913096,hu2 9pb,hu2 9pb
30201,8.448621,0.997146,Q16832027-1,Q16832027-5,bexley,bexley,1,149.071901,violet,violet,4,58.559164,melnotte,melnotte,4,1061.913096,da8 3ay,da8 3ay
7970,8.448621,0.997146,Q20733457-2,Q20733457-6,newark and sherwood,newark and sherwood,1,149.071901,john,john,4,58.559164,hughes,hughes,4,1061.913096,ng24 3wz,ng24 3wz
18299,8.448621,0.997146,Q81207821-3,Q81207821-4,west midlands,west midlands,1,149.071901,georgina,georgina,4,58.559164,aubinière,aubinière,4,1061.913096,dy7 5lj,dy7 5lj
1627,7.260562,0.993521,Q3526667-2,Q3526667-3,wakefield,wakefield,1,149.071901,théophile,theophilus,1,25.701209,lobb,lobb,4,1061.913096,wf7 6lb,wf7 6lb
7189,8.448621,0.997146,Q18810722-10,Q18810722-5,islington,islington,1,149.071901,frederick,frederick,4,58.559164,goulding,goulding,4,1061.913096,n4 4at,n4 4at


In [23]:
len(person_match)

43412

In [24]:
person_match.sort_values(by='match_probability', ascending=True).head(15)

Unnamed: 0,match_weight,match_probability,unique_id_l,unique_id_r,city_l,city_r,gamma_city,bf_city,first_name_l,first_name_r,gamma_first_name,bf_first_name,surname_l,surname_r,gamma_surname,bf_surname,postcode_l,postcode_r
0,6.910146,0.991754,Q2296770-10,Q2296770-11,devon,devon,1,149.071901,thomas,thomas,4,58.559164,chudleigh,chydleigh,3,365.562651,tq13 8jr,tq13 8jr
10597,6.910146,0.991754,Q21460055-3,Q21460055-6,pembrokeshire,pembrokeshire,1,149.071901,thomas,thomas,4,58.559164,butl4r,butler,3,365.562651,sa43 3gf,sa43 3gf
10593,6.910146,0.991754,Q18734711-3,Q18734711-7,dorset,dorset,1,149.071901,samuel,samuel,4,58.559164,macpherson,mcpherson,3,365.562651,dt4 9ry,dt4 9ry
10591,6.910146,0.991754,Q7879758-15,Q7879758-16,vale of white horse,vale of white horse,1,149.071901,ulpian,ulpian,4,58.559164,ful2ell,fulwell,3,365.562651,sn7 7eh,sn7 7eh
10590,6.910146,0.991754,Q7879758-15,Q7879758-9,vale of white horse,vale of white horse,1,149.071901,ulpian,ulpian,4,58.559164,ful2ell,fulwell,3,365.562651,sn7 7eh,sn7 7eh
10580,6.910146,0.991754,Q15485597-1,Q15485597-9,wakefield,wakefield,1,149.071901,holliday,holliday,4,58.559164,kendall,kendal,3,365.562651,wf1 5dl,wf1 5dl
10560,6.910146,0.991754,Q3809493-1,Q3809493-3,flintshire,flintshire,1,149.071901,john,john,4,58.559164,milton,milyon,3,365.562651,ll12 9ru,ll12 9ru
35842,6.910146,0.991754,Q563069-14,Q563069-3,anstruther,anstruther,1,149.071901,thomas,thomas,4,58.559164,chalners,chalmers,3,365.562651,ky10 3ju,ky10 3ju
35843,6.910146,0.991754,Q563069-14,Q563069-9,anstruther,anstruther,1,149.071901,thomas,thomas,4,58.559164,chalners,chalmers,3,365.562651,ky10 3ju,ky10 3ju
10548,6.910146,0.991754,Q3431381-5,Q3431381-6,"herefordshire, county of","herefordshire, county of",1,149.071901,richard,richard,4,58.559164,villakil,villamil,3,365.562651,wr13 6dp,wr13 6dp


Inspect the results
--

In [25]:
person_linker.waterfall_chart(person_match_results.as_record_dict(limit=20))

In [26]:
# Comparison viewer
person_linker.comparison_viewer_dashboard(person_match_results, 
                                          "data/interim/entity_resolution-report-scv.html", 
                                          overwrite=True, num_example_rows=20)
IPython.display.IFrame(src="data/interim/entity_resolution-report-scv.html", width="100%", height=1200)

Clusters
--

In [27]:
# Lower threshold = larger (more permissive) clusters
cluster_match_threshold = 0.99

person_clusters_result = person_linker.cluster_pairwise_predictions_at_threshold(person_match_results,
                                                                                 threshold_match_probability=cluster_match_threshold)
person_clusters = person_clusters_result.as_pandas_dataframe().\
    sort_values(by=['cluster_id', 'full_name'])
person_clusters.cluster_id.nunique()

Completed iteration 1, root rows count 9
Completed iteration 2, root rows count 1
Completed iteration 3, root rows count 0


33869

In [28]:
person_clusters.sort_values(by=['postcode', 'cluster_id']).head(100).tail(20)

Unnamed: 0,cluster_id,Source,unique_id,city,postcode,full_name,first_and_surname,first_name,surname,dob,gender,cluster
20895,Q7791193-17,Table 4,Q7791193-17,aberdeen,ab24 3bs,sir thomas jaffrey,sir jaffrey,sir,jaffrey,,male,Q7791193
24666,Q4718978-11,Table 1,Q4718978-11,aberdeen,ab24 3ex,alexander gurdon,alexander gurdon,alexander,gurdon,1692-01-01,male,Q4718978
1811,Q4718978-1,Table 3,Q4718978-7,aberdeen,ab24 3hd,alexander gordon,alexander gordon,alexander,gordon,1692-01-81,male,Q4718978
18527,Q4718978-1,Table 1,Q4718978-4,aberdeen,ab24 3hd,alexander gordon,alexander gordon,alexander,gordon,1692-01-01,male,Q4718978
22191,Q4718978-1,Table 2,Q4718978-10,aberdeen,ab24 3hd,alexander gordon,alexander gordon,alexander,gordon,,,Q4718978
24669,Q4718978-1,Table 3,Q4718978-3,aberdeen,ab24 3hd,alexander gordon,alexander gordon,alexander,gordon,1692-01-01,male,Q4718978
31132,Q4718978-1,Table 2,Q4718978-2,aberdeen,ab24 3hd,alexander gordon,alexander gordon,alexander,gordon,1692-01-01,male,Q4718978
46692,Q4718978-1,Table 2,Q4718978-1,aberdeen,ab24 3hd,alexander gordon,alexander gordon,alexander,gordon,1692-01-01,male,Q4718978
46693,Q4718978-1,Table 3,Q4718978-6,aberdeen,ab24 3hd,alexander gordon,alexander gordon,alexander,gordon,1692-01-01,male,Q4718978
50295,Q4718978-12,Table 4,Q4718978-12,,ab24 3hd,elsie gurdon,elsie gurdon,elsie,gurdon,,,Q4718978


In [29]:
# The n largest clusters
largest_clusters = person_clusters.groupby('cluster_id').cluster_id.count().rename('count').\
    sort_values(ascending=False).\
    head(20)
largest_clusters.head()

cluster_id
Q319331-1      15
Q5548996-1     14
Q18750552-1    14
Q75510166-1    13
Q59549587-1    13
Name: count, dtype: int64

In [30]:
# Cluster viewer
person_linker.cluster_studio_dashboard(person_match_results, person_clusters_result, 
                                       "data/interim/entity_resolution-report-cluster_studio.html", 
#                                        cluster_ids=person_clusters.cluster_id,
#                                        cluster_names=list(person_clusters.label),
#                                        sampling_method="by_cluster_size",
#                                        sampling_method="random", 
#                                        sample_size=100, # This doesn't work when sampling by cluster size -- only ever returns 4
                                       overwrite=True)
IPython.display.IFrame(src="data/interim/entity_resolution-report-cluster_studio.html", width="100%", height=1200)

Export
--

In [31]:
person_match.to_csv('data/processed/entity_resolution-edges.csv', index=False)
person_clusters.to_csv('data/processed/entity_resolution-clusters.csv', index=False)