In [1]:
# Example of an exact match across multiple columns, with blocking.

In [2]:
import numpy as np
import numbers
import pandas as pd

import phonetics

from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl

In [3]:
import IPython

In [4]:
import vega

Setup
==

Vega setup for charts
--
Splink uses Vega for inline charts. We need a bit of setup to make sure these are displayed correctly.

In [5]:
# !jupyter nbextension install --sys-prefix --py vega

In [6]:
# Enable the vega notebook extension for the current environment (--sys-prefix).
!jupyter nbextension enable vega --py --sys-prefix

Enabling notebook extension jupyter-vega/extension...
      - Validating: ok


In [7]:
# # A minimal example for debugging (the spec below is Vega-Lite, so render it with VegaLite):
# vega.VegaLite({
#     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
#     "data": {
#         "values": [
#             {"a": "A", "b": 28},
#             {"a": "B", "b": 55},
#             {"a": "C", "b": 43},
#         ]
#     },
#     "mark": "bar",
#     "encoding": {
#         "x": {"field": "b", "type": "quantitative"},
#         "y": {"field": "a", "type": "nominal"}
#     }
# })

Data
==

Tables
--

In [8]:
# Read the first input table, keeping every column as a string.
table1 = pd.read_csv(
    'data/interim/table1.csv',
    dtype=str,
)
table1.head(n=2)

Unnamed: 0,unique_id,city,postcode,full_name,first_and_surname,first_name,surname,dob,gender,cluster
0,Q2296770-2,devon,tq13 8df,thomas of chudleigh,thomas chudleigh,thomas,chudleigh,1630-08-01,male,Q2296770
1,Q1443188-1,bristol,bs2 0el,frank edward brightman,frank brightman,frank,brightman,1856-06-18,male,Q1443188


In [9]:
# Read the second input table, keeping every column as a string.
table2 = pd.read_csv(
    'data/interim/table2.csv',
    dtype=str,
)
table2.head(n=2)

Unnamed: 0,unique_id,city,postcode,full_name,first_and_surname,first_name,surname,dob,gender,cluster
0,Q2296770-4,devon,tq13 8hu,thomas 1st chudleigh,thomas chudleigh,thomas,chudleigh,1630-08-01,,Q2296770
1,Q1443188-2,bristol,bs2 0el,frank edward brightman,frank brightman,frank,brightman,1856-06-18,male,Q1443188


Exact match
==
Based on https://moj-analytical-services.github.io/splink/demos/examples/duckdb/link_only.html

Linking two tables without deduplication.

Settings
--

In [10]:
# Splink settings for linking the two tables (link only, no deduplication).
person_match_settings = {
    "link_type": "link_only",
    "unique_id_column_name": "unique_id",

    # Blocking, for performance
    "blocking_rules_to_generate_predictions": ["l.postcode = r.postcode"],

    # Deterministic (exact match) comparisons, one per column: city plus the
    # two name parts.
    # Currently disabled:
    #     cl.exact_match("full_name", term_frequency_adjustments=False)
    # TODO: Dates need to be preprocessed first, then enable:
    #     ctl.date_comparison("dob",
    #                         cast_strings_to_date=True,
    #                         invalid_dates_as_null=True)
    "comparisons": [
        cl.exact_match(column_name, term_frequency_adjustments=False)
        for column_name in ("city", "first_name", "surname")
    ],

    # Needed for waterfall charts of results
    "retain_intermediate_calculation_columns": True,
    "retain_matching_columns": True,
}

Training
--

In [11]:
# Build the DuckDB-backed linker over both input tables with the person settings.
input_tables = [table1, table2]
person_linker = DuckDBLinker(input_tables, person_match_settings)

In [12]:
# Display Splink's missingness chart for the linker's input tables.
person_linker.missingness_chart()

In [13]:
# Deterministic matching rules used for the estimate; they should be designed
# to admit very few (none if possible) false positives.
deterministic_rules = ["l.city = r.city and l.full_name = r.full_name"]

person_linker.estimate_probability_two_random_records_match(
    deterministic_rules,
    recall=0.9,
)

Probability two random records match is estimated to be  7.9e-05.
This means that amongst all possible pairwise record comparisons, one in 12,652.24 are expected to match.  With 19,386,048 total possible comparisons, we expect a total of around 1,532.22 matching pairs


In [14]:
# Estimate the u probabilities by random sampling, with a fixed seed.
person_linker.estimate_u_using_random_sampling(
    seed=1,
    max_pairs=1e6,
)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - city (no m values are trained).
    - first_name (no m values are trained).
    - surname (no m values are trained).


In [15]:
# EM session blocked on city. Per the training log it estimates m for
# first_name and surname, but not for city (city is used in the blocking rule).
city_blocking_rule = "l.city = r.city"
session_person_uid = person_linker.estimate_parameters_using_expectation_maximisation(
    city_blocking_rule
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.city = r.city

Parameter estimates will be made for the following comparison(s):
    - first_name
    - surname

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - city

Iteration 1: Largest change in params was 0.0601 in the m_probability of first_name, level `All other comparisons`
Iteration 2: Largest change in params was -0.0766 in the m_probability of first_name, level `Exact match`
Iteration 3: Largest change in params was -0.0263 in the m_probability of first_name, level `Exact match`
Iteration 4: Largest change in params was -0.00618 in the m_probability of first_name, level `Exact match`
Iteration 5: Largest change in params was -0.00146 in the m_probability of surname, level `Exact match`
Iteration 6: Largest change in params was 0.00153 in the m_probability of surname, level `All other comparisons`
Iteratio

In [16]:
# EM session blocked on first name + surname. Per the training log it
# estimates m for city only, since both name columns are in the blocking rule.
name_blocking_rule = "l.first_name = r.first_name and l.surname = r.surname"
session_person_fsn = person_linker.estimate_parameters_using_expectation_maximisation(
    name_blocking_rule
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name = r.first_name and l.surname = r.surname

Parameter estimates will be made for the following comparison(s):
    - city

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name
    - surname

Iteration 1: Largest change in params was -0.333 in probability_two_random_records_match
Iteration 2: Largest change in params was 0.017 in the m_probability of city, level `All other comparisons`
Iteration 3: Largest change in params was -0.000974 in the m_probability of city, level `Exact match`
Iteration 4: Largest change in params was -5.44e-05 in the m_probability of city, level `Exact match`

EM converged after 4 iterations

Your model is fully trained. All comparisons have at least one estimate for their m and u values


In [17]:
# EM session blocked on full_name.
# NOTE(review): the training log reports that the `All other comparisons`
# level of first_name and surname is not observed under this rule, so those m
# values cannot be trained in this session - confirm this pass is still wanted.
full_name_blocking_rule = "l.full_name  = r.full_name "
session_person_fulln = person_linker.estimate_parameters_using_expectation_maximisation(
    full_name_blocking_rule
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.full_name  = r.full_name 

Parameter estimates will be made for the following comparison(s):
    - city
    - first_name
    - surname

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Level All other comparisons on comparison first_name not observed in dataset, unable to train m value

Level All other comparisons on comparison surname not observed in dataset, unable to train m value

Iteration 1: Largest change in params was 0.829 in probability_two_random_records_match
Iteration 2: Largest change in params was 0.169 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.00213 in probability_two_random_records_match
Iteration 4: Largest change in params was 9.93e-06 in probability_two_random_records_match

EM converged after 4 iterations
m probability not trained for first_name - All other c

Review the parameters
--

In [18]:
# Display the match weights chart for the trained model.
person_linker.match_weights_chart()

In [19]:
# Display the parameter estimate comparisons chart.
# presumably this compares estimates across the training sessions above - TODO confirm
person_linker.parameter_estimate_comparisons_chart()

In [20]:
# Display the unlinkables chart; it is used below to choose the match
# probability threshold.
person_linker.unlinkables_chart()

Linking
--

In [21]:
# Match probability threshold.
# See the unlinkables chart above to determine a good value for this threshold,
# and the match probabilities below for added context.
person_match_threshold = 0.99

# Score the blocked record pairs and keep those that pass the threshold.
person_match_results = person_linker.predict(
    threshold_match_probability=person_match_threshold
)
person_match = person_match_results.as_pandas_dataframe()

# Preview a random sample of the predictions. The fixed random_state keeps the
# displayed rows stable across re-runs, and capping n at the frame length
# avoids the ValueError pandas raises when asked for more rows than exist
# (e.g. if a stricter threshold leaves fewer than 10 pairs).
person_match.sample(n=min(10, len(person_match)), random_state=1)

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,city_l,city_r,gamma_city,bf_city,first_name_l,first_name_r,gamma_first_name,bf_first_name,surname_l,surname_r,gamma_surname,bf_surname,postcode_l,postcode_r
1023,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q19729649-5,Q19729649-3,hinckley and bosworth,hinckley and bosworth,1,135.419288,frances,frances,1,45.488644,redgrave,redgrave,1,1021.308718,cv13 0dn,cv13 0dn
1043,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q65116344-2,Q65116344-1,winchester,winchester,1,135.419288,norman,norman,1,45.488644,hunter,hunter,1,1021.308718,so32 1fr,so32 1fr
285,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q7348896-2,Q7348896-1,fareham,fareham,1,135.419288,robert,robert,1,45.488644,pierpont,pierpont,1,1021.308718,po15 5ey,po15 5ey
1091,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q4783329-4,Q4783329-1,brighton,brighton,1,135.419288,arabella,arabella,1,45.488644,buckley,buckley,1,1021.308718,bn2 1ej,bn2 1ej
1108,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q16197388-1,Q16197388-3,dartford,dartford,1,135.419288,thomas,thomas,1,45.488644,bell,bell,1,1021.308718,da1 5bs,da1 5bs
461,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q8020391-1,Q8020391-3,ruabon,ruabon,1,135.419288,william,william,1,45.488644,williams,williams,1,1021.308718,ll14 6dw,ll14 6dw
652,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q7792189-2,Q7792189-1,west devon,west devon,1,135.419288,thomas,thomas,1,45.488644,marshall,marshall,1,1021.308718,pl19 8pt,pl19 8pt
460,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q61354618-1,Q61354618-3,westminster,westminster,1,135.419288,robert,robert,1,45.488644,noyes,noyes,1,1021.308718,w2 6qe,w2 6qe
861,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q3752781-1,Q3752781-6,united kingdom,united kingdom,1,135.419288,frederick,frederick,1,45.488644,warde,warde,1,1021.308718,dl12 0dh,dl12 0dh
839,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q18912750-1,Q18912750-2,twigworth,twigworth,1,135.419288,walter,walter,1,45.488644,manners,manners,1,1021.308718,gl2 9pu,gl2 9pu


In [22]:
# Number of predicted pairs returned by predict() for the chosen threshold.
len(person_match)

1352

In [23]:
# Show the 15 predicted pairs with the lowest match probability.
person_match.sort_values('match_probability').head(n=15)

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,city_l,city_r,gamma_city,bf_city,first_name_l,first_name_r,gamma_first_name,bf_first_name,surname_l,surname_r,gamma_surname,bf_surname,postcode_l,postcode_r
0,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q1443188-1,Q1443188-2,bristol,bristol,1,135.419288,frank,frank,1,45.488644,brightman,brightman,1,1021.308718,bs2 0el,bs2 0el
905,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q2383293-9,Q2383293-1,cambridge,cambridge,1,135.419288,thomas,thomas,1,45.488644,walker,walker,1,1021.308718,cb2 1tl,cb2 1tl
904,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q1052303-1,Q1052303-2,staines-upon-thames,staines-upon-thames,1,135.419288,cecil,cecil,1,45.488644,smith,smith,1,1021.308718,tw18 1nf,tw18 1nf
903,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q4718530-1,Q4718530-2,bath,bath,1,135.419288,alexander,alexander,1,45.488644,cardew,cardew,1,1021.308718,ba1 1lb,ba1 1lb
902,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q20733457-5,Q20733457-2,newark and sherwood,newark and sherwood,1,135.419288,john,john,1,45.488644,hughes,hughes,1,1021.308718,ng24 3wz,ng24 3wz
901,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q105440079-1,Q105440079-4,selby,selby,1,135.419288,mayo,mayo,1,45.488644,hazeltine,hazeltine,1,1021.308718,ls24 9bx,ls24 9bx
900,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q518407-1,Q518407-3,edinburgh,edinburgh,1,135.419288,james,james,1,45.488644,syme,syme,1,1021.308718,eh1 2ng,eh1 2ng
899,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q7526211-4,Q7526211-1,nottingham,nottingham,1,135.419288,sir,sir,1,45.488644,baronet,baronet,1,1021.308718,ng1 5hr,ng1 5hr
898,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q5718316-1,Q5718316-2,hertfordshire,hertfordshire,1,135.419288,henry,henry,1,45.488644,blount,blount,1,1021.308718,sg1 2da,sg1 2da
897,8.957936,0.997993,__splink__input_table_0,__splink__input_table_1,Q21455451-4,Q21455451-6,southend-on-sea,southend-on-sea,1,135.419288,isaac,isaac,1,45.488644,cullin,cullin,1,1021.308718,ss9 4js,ss9 4js


In [24]:
# Distribution of match weights among the predicted pairs.
person_match['match_weight'].value_counts()

match_weight
8.957936    1352
Name: count, dtype: int64

In [25]:
# Distribution of match probabilities among the predicted pairs.
person_match['match_probability'].value_counts()

match_probability
0.997993    1352
Name: count, dtype: int64

Inspect the results
--

In [26]:
# Waterfall chart for up to 20 of the predicted pairs.
waterfall_records = person_match_results.as_record_dict(limit=20)
person_linker.waterfall_chart(waterfall_records)

In [27]:
# Comparison viewer: write the dashboard to an HTML report, then embed that
# same file inline.
person_match_report_path = "data/interim/person_match-exact-report-scv.html"

person_linker.comparison_viewer_dashboard(
    person_match_results,
    person_match_report_path,
    overwrite=True,
    num_example_rows=20,
)
IPython.display.IFrame(src=person_match_report_path, width="100%", height=1200)

Export
--

In [28]:
# Save the predicted pairs as CSV, without the DataFrame index.
person_match.to_csv(
    'data/processed/person_match-exact.csv',
    index=False,
)