In [1]:
# Example of a phonetic match across multiple columns, with blocking.

In [2]:
import numpy as np
import numbers
import pandas as pd

import phonetics

from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl

In [3]:
import IPython

In [4]:
import vega

Setup
==

Vega setup for charts
--
Splink uses Vega for inline charts. We need a bit of setup to make sure these are displayed correctly.

In [5]:
# !jupyter nbextension install --sys-prefix --py vega

In [6]:
!jupyter nbextension enable vega --py --sys-prefix

Enabling notebook extension jupyter-vega/extension...
      - Validating: ok


In [7]:
# # A minimal example for debugging:
# vega.Vega({
#     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
#     "data": {
#         "values": [
#             {"a": "A", "b": 28},
#             {"a": "B", "b": 55},
#             {"a": "C", "b": 43},
#         ]
#     },
#     "mark": "bar",
#     "encoding": {
#         "x": {"field": "b", "type": "quantitative"},
#         "y": {"field": "a", "type": "nominal"}
#     }
# })

Data
==

Tables
--

In [8]:
table1 = pd.read_csv('data/interim/table1.csv', dtype=str)
table1.head(2)

Unnamed: 0,unique_id,city,postcode,full_name,first_and_surname,first_name,surname,dob,gender,cluster
0,Q2296770-2,devon,tq13 8df,thomas of chudleigh,thomas chudleigh,thomas,chudleigh,1630-08-01,male,Q2296770
1,Q1443188-1,bristol,bs2 0el,frank edward brightman,frank brightman,frank,brightman,1856-06-18,male,Q1443188


In [9]:
table2 = pd.read_csv('data/interim/table2.csv', dtype=str)
table2.head(2)

Unnamed: 0,unique_id,city,postcode,full_name,first_and_surname,first_name,surname,dob,gender,cluster
0,Q2296770-4,devon,tq13 8hu,thomas 1st chudleigh,thomas chudleigh,thomas,chudleigh,1630-08-01,,Q2296770
1,Q1443188-2,bristol,bs2 0el,frank edward brightman,frank brightman,frank,brightman,1856-06-18,male,Q1443188


Preprocessing
==

Phonetic match
--

In [10]:
def is_null(v):
    if v is None:
        return True
    if isinstance(v, numbers.Number):
        return np.isnan(v)
    return False

In [11]:
# Metaphone (vs Soundex)
def metaphone(s):
    if is_null(s):
        return None
    try:
        return phonetics.metaphone(s)
    except IndexError:
        print(f"Could not transform string: {s}")
        return None

In [12]:
metaphone('Karen')

'KRN'

In [13]:
metaphone('Karin')

'KRN'

In [14]:
# Table 1
for col in ['full_name', 'first_and_surname', 'first_name', 'surname']:
    print(col)
    table1[f'{col}_metaphone'] = table1[col].apply(lambda s: metaphone(s))
table1.sample(5)

full_name
first_and_surname
first_name
surname


Unnamed: 0,unique_id,city,postcode,full_name,first_and_surname,first_name,surname,dob,gender,cluster,full_name_metaphone,first_and_surname_metaphone,first_name_metaphone,surname_metaphone
1771,Q105824571-1,ashfield,ng17 5gu,richard staunton cahill,richard cahill,richard,cahill,1827-01-01,male,Q105824571,RXRTSTNTNKHL,RXRTKHL,RXRT,KHL
607,Q5718119-2,london,sw1p 1jw,henry bernard chalon,henry chalon,henry,chalon,1770-01-01,male,Q5718119,HNRPRNRTXLN,HNRXLN,HNR,XLN
515,Q7328478-10,stoke-on-trent,st2 8lh,dick pullan,dick pullan,dick,pullan,,male,Q7328478,TKPLN,TKPLN,TK,PLN
2235,Q4718588-4,hatcham,se4 2qn,alexander chinnery-haldane,alexander chinnery-haldane,alexander,chinnery-haldane,1842-08-14,male,Q4718588,ALKSNTRXNRLTN,ALKSNTRXNRLTN,ALKSNTR,XNRLTN
1758,Q16551731-1,north east derbyshire,s44 6uz,"elizabeth cecil, countess of devonshire",elizabeth devonshire,elizabeth,devonshire,1619-01-01,female,Q16551731,ALSP0SSLKNTSFTFNXR,ALSP0TFNXR,ALSP0,TFNXR


In [15]:
# Table 2
for col in ['full_name', 'first_and_surname', 'first_name', 'surname']:
    print(col)
    table2[f'{col}_metaphone'] = table2[col].apply(lambda s: metaphone(s))
table1.sample(5)

full_name
first_and_surname
first_name
surname


Unnamed: 0,unique_id,city,postcode,full_name,first_and_surname,first_name,surname,dob,gender,cluster,full_name_metaphone,first_and_surname_metaphone,first_name_metaphone,surname_metaphone
3971,Q2646506-3,birkenhead,ch41 2yr,sir algernon thomas,sir thomas,sir,thomas,1857-08-03,male,Q2646506,SRLKRNNTMS,SRTMS,SR,TMS
6,Q631006-2,bucharest,ex20 3pz,moses gaster,moses gaster,moses,gaster,1856-09-17,male,Q631006,MSSKSTR,MSSKSTR,MSS,KSTR
2003,Q8018606-3,howden,dn14 7bg,william somergell,william somergell,william,somergell,1860-04-05,male,Q8018606,ALMSMRJL,ALMSMRJL,ALM,SMRJL
482,Q3530860-1,london,wc2r 1ab,tom ricketts,tom ricketts,tom,ricketts,1853-01-15,male,Q3530860,TMRKTS,TMRKTS,TM,RKTS
1746,Q1702110-2,glasgow,g4 0le,john watson,john watson,john,watson,1847-02-25,male,Q1702110,JNTSN,JNTSN,JN,ATSN


Phonetic match
==
Based on https://moj-analytical-services.github.io/splink/demos/examples/duckdb/link_only.html

Linking two tables without deduplication.

Settings
--

In [16]:
person_match_settings = {
    "link_type": "link_only",
    "unique_id_column_name": "unique_id",
    
    # Blocking, for performance
    "blocking_rules_to_generate_predictions": [
        "l.postcode = r.postcode",
    ],
    "comparisons": [
        cl.exact_match("city", term_frequency_adjustments=False),
        
        # Deterministic name match (phonetic)
        cl.exact_match("first_name_metaphone", term_frequency_adjustments=False),
        cl.exact_match("surname_metaphone", term_frequency_adjustments=False),
#         cl.exact_match("full_name_metaphone", term_frequency_adjustments=False),
    ],
    
    # Needed for waterfall charts of results
    "retain_intermediate_calculation_columns": True,
    "retain_matching_columns": True,
}

Training
--

In [17]:
person_linker = DuckDBLinker(
    [table1, table2],
    person_match_settings)

In [18]:
person_linker.missingness_chart()

In [19]:
person_linker.estimate_probability_two_random_records_match(
    #  A list of deterministic matching rules that should be designed to admit 
    # very few (none if possible) false positives:
    [
        "l.city = r.city and l.full_name = r.full_name",
    ], 
    recall=0.9)

Probability two random records match is estimated to be  7.9e-05.
This means that amongst all possible pairwise record comparisons, one in 12,652.24 are expected to match.  With 19,386,048 total possible comparisons, we expect a total of around 1,532.22 matching pairs


In [20]:
# Estimate U
person_linker.estimate_u_using_random_sampling(max_pairs=1e6, seed=1)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - city (no m values are trained).
    - first_name_metaphone (no m values are trained).
    - surname_metaphone (no m values are trained).


In [21]:
session_person_uid = person_linker.estimate_parameters_using_expectation_maximisation(
    "l.city = r.city")


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.city = r.city

Parameter estimates will be made for the following comparison(s):
    - first_name_metaphone
    - surname_metaphone

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - city

Iteration 1: Largest change in params was -0.0237 in the m_probability of surname_metaphone, level `All other comparisons`
Iteration 2: Largest change in params was -0.0589 in the m_probability of first_name_metaphone, level `Exact match`
Iteration 3: Largest change in params was -0.0408 in the m_probability of first_name_metaphone, level `Exact match`
Iteration 4: Largest change in params was -0.0176 in the m_probability of first_name_metaphone, level `Exact match`
Iteration 5: Largest change in params was 0.00624 in the m_probability of first_name_metaphone, level `All other comparisons`
Iteration 6: Largest change in params was 

In [22]:
session_person_fsn = person_linker.estimate_parameters_using_expectation_maximisation(
    "l.first_name_metaphone = r.first_name_metaphone and l.surname_metaphone = r.surname_metaphone")


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name_metaphone = r.first_name_metaphone and l.surname_metaphone = r.surname_metaphone

Parameter estimates will be made for the following comparison(s):
    - city

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name_metaphone
    - surname_metaphone

Iteration 1: Largest change in params was -0.218 in probability_two_random_records_match
Iteration 2: Largest change in params was -0.014 in probability_two_random_records_match
Iteration 3: Largest change in params was -0.000986 in probability_two_random_records_match
Iteration 4: Largest change in params was -7.03e-05 in probability_two_random_records_match

EM converged after 4 iterations

Your model is fully trained. All comparisons have at least one estimate for their m and u values


In [23]:
session_person_fulln = person_linker.estimate_parameters_using_expectation_maximisation(
    "l.full_name_metaphone  = r.full_name_metaphone")


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.full_name_metaphone  = r.full_name_metaphone

Parameter estimates will be made for the following comparison(s):
    - city
    - first_name_metaphone
    - surname_metaphone

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Iteration 1: Largest change in params was 0.64 in probability_two_random_records_match
Iteration 2: Largest change in params was 0.27 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.0152 in probability_two_random_records_match
Iteration 4: Largest change in params was 0.0207 in the m_probability of surname_metaphone, level `All other comparisons`
Iteration 5: Largest change in params was 0.0353 in the m_probability of surname_metaphone, level `All other comparisons`
Iteration 6: Largest change in params was -0.00781 in the m_probability of surname_metaphone, level `

Review the parameters
--

In [24]:
person_linker.match_weights_chart()

In [25]:
person_linker.parameter_estimate_comparisons_chart()

In [26]:
person_linker.unlinkables_chart()

Linking
--

In [27]:
# Match probability threshold
# See the unlinkables chart above to determine a good value for this threshold, 
# and the match probabilities below for added context
person_match_threshold = 0.99

person_match_results = person_linker.predict(threshold_match_probability=person_match_threshold) 
person_match = person_match_results.as_pandas_dataframe()
person_match.sample(10)

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,city_l,city_r,gamma_city,bf_city,first_name_metaphone_l,first_name_metaphone_r,gamma_first_name_metaphone,bf_first_name_metaphone,surname_metaphone_l,surname_metaphone_r,gamma_surname_metaphone,bf_surname_metaphone,postcode_l,postcode_r
997,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q8013972-1,Q8013972-2,barnet,barnet,1,132.707313,ALM,ALM,1,40.357551,KR,KR,1,486.570282,nw7 4jy,nw7 4jy
16,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q1391460-4,Q1391460-1,focșani,focșani,1,132.707313,SLMN,SLMN,1,40.357551,XKTR,XKTR,1,486.570282,b42 2hs,b42 2hs
439,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q3108513-2,Q3108513-1,dumfries and galloway,dumfries and galloway,1,132.707313,PTR,PTR,1,40.357551,ART,ART,1,486.570282,dg1 2da,dg1 2da
1113,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q4722779-1,Q4722779-3,fulham,fulham,1,132.707313,ALFRT,ALFRT,1,40.357551,HKMN,HKMN,1,486.570282,sw15 2ny,sw15 2ny
1205,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q7526494-3,Q7526494-2,manchester,manchester,1,132.707313,SR,SR,1,40.357551,PRNT,PRNT,1,486.570282,m15 6wj,m15 6wj
1260,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q97801414-5,Q97801414-10,barnet,barnet,1,132.707313,MR,MR,1,40.357551,XRP,XRP,1,486.570282,ha8 8tr,ha8 8tr
1252,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q50210878-3,Q50210878-4,norwich,norwich,1,132.707313,FRTRK,FRTRK,1,40.357551,FTT,FTT,1,486.570282,nr5 8dd,nr5 8dd
490,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q28873162-1,Q28873162-3,medway,medway,1,132.707313,ATRT,ATRT,1,40.357551,AT,AT,1,486.570282,me2 4na,me2 4na
418,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q613852-5,Q613852-1,london,london,1,132.707313,JN,JN,1,40.357551,KSNS,KSNS,1,486.570282,w1t 2rf,w1t 2rf
1151,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q5342415-2,Q5342415-8,liverpool,liverpool,1,132.707313,ATRT,ATRT,1,40.357551,KR,KR,1,486.570282,l3 1qw,l3 1qw


In [28]:
# Exact match is 1370
len(person_match)

1429

In [29]:
person_match.sort_values(by='match_probability', ascending=True).head(15)

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,city_l,city_r,gamma_city,bf_city,first_name_metaphone_l,first_name_metaphone_r,gamma_first_name_metaphone,bf_first_name_metaphone,surname_metaphone_l,surname_metaphone_r,gamma_surname_metaphone,bf_surname_metaphone,postcode_l,postcode_r
0,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q1443188-1,Q1443188-2,bristol,bristol,1,132.707313,FRNK,FRNK,1,40.357551,PRTMN,PRTMN,1,486.570282,bs2 0el,bs2 0el
957,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q5718316-1,Q5718316-2,hertfordshire,hertfordshire,1,132.707313,HNR,HNR,1,40.357551,PLNT,PLNT,1,486.570282,sg1 2da,sg1 2da
956,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q21455451-4,Q21455451-6,southend-on-sea,southend-on-sea,1,132.707313,ASK,ASK,1,40.357551,KLN,KLN,1,486.570282,ss9 4js,ss9 4js
955,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q8019371-4,Q8019371-3,tattyreagh,tattyreagh,1,132.707313,ALM,ALM,1,40.357551,TMPSN,TMPSN,1,486.570282,bt78 2eg,bt78 2eg
954,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q8004400-3,Q8004400-1,glasgow,glasgow,1,132.707313,ALM,ALM,1,40.357551,ANTRSN,ANTRSN,1,486.570282,g1 1nq,g1 1nq
953,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q56682735-2,Q56682735-3,reading,reading,1,132.707313,H,H,1,40.357551,PRJS,PRJS,1,486.570282,rg4 7tf,rg4 7tf
952,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q7793299-5,Q7793299-2,scotland,scotland,1,132.707313,TMS,TMS,1,40.357551,PRNKL,PRNKL,1,486.570282,ip31 3el,ip31 3el
951,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q16023428-1,Q16023428-2,stafford,stafford,1,132.707313,FRTRK,FRTRK,1,40.357551,KPNS,KPNS,1,486.570282,st15 0lj,st15 0lj
950,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q105533631-1,Q105533631-4,cornwall,cornwall,1,132.707313,HNR,HNR,1,40.357551,ARNT,ARNT,1,486.570282,pl24 2rw,pl24 2rw
958,7.686384,0.995169,__splink__input_table_0,__splink__input_table_1,Q7526211-4,Q7526211-1,nottingham,nottingham,1,132.707313,SR,SR,1,40.357551,PRNT,PRNT,1,486.570282,ng1 5hr,ng1 5hr


In [30]:
person_match.match_weight.value_counts()

match_weight
7.686384    1429
Name: count, dtype: int64

In [31]:
person_match.match_probability.value_counts()

match_probability
0.995169    1429
Name: count, dtype: int64

Inspect the results
--

In [32]:
person_linker.waterfall_chart(person_match_results.as_record_dict(limit=20))

In [33]:
# Comparison viewer
person_linker.comparison_viewer_dashboard(person_match_results, 
                                          "data/interim/person_match-phonetic-report-scv.html", 
                                          overwrite=True, num_example_rows=20)
IPython.display.IFrame(src="data/interim/person_match-phonetic-report-scv.html", width="100%", height=1200)

Export
--

In [34]:
person_match.to_csv('data/processed/person_match-phonetic.csv', index=False)