In [1]:
#!conda install -c conda-forge splink=4.0 --yes

In [1]:
from splink import splink_datasets

# Load the `historical_50k` dataset exposed by `splink_datasets`.
# The rest of the notebook deduplicates this table and references its columns
# first_name, surname, dob, postcode_fake, birth_place and occupation.
df = splink_datasets.historical_50k

In [2]:
from splink import DuckDBAPI
# Backend handle that is handed to the Linker further down (`db_api=db_api`);
# the saved model settings at the end of the notebook report
# `sql_dialect: duckdb`.
db_api = DuckDBAPI()

In [3]:
from splink import DuckDBAPI, block_on
from splink.blocking_analysis import (
    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
)

# Blocking rules used when generating predictions.
# Each tuple holds the SQL expressions that must ALL agree between two records
# for the pair to be produced by that rule (block_on joins them with AND, as
# the saved model settings show). One rule is built per tuple, in this order.
blocking_rules = [
    block_on(*key_expressions)
    for key_expressions in (
        ("substr(first_name,1,3)", "substr(surname,1,4)"),
        ("surname", "dob"),
        ("first_name", "dob"),
        ("postcode_fake", "first_name"),
        ("postcode_fake", "surname"),
        ("dob", "birth_place"),
        ("substr(postcode_fake,1,3)", "dob"),
        ("substr(postcode_fake,1,3)", "first_name"),
        ("substr(postcode_fake,1,3)", "surname"),
        (
            "substr(first_name,1,2)",
            "substr(surname,1,2)",
            "substr(dob,1,4)",
        ),
    )
]

In [4]:
import splink.comparison_library as cl

from splink import Linker, SettingsCreator

# One comparison per column the model scores. Term-frequency adjustments are
# explicitly switched off wherever the comparison is configured below.
model_comparisons = [
    cl.NameComparison("first_name").configure(term_frequency_adjustments=False),
    cl.NameComparison("surname").configure(term_frequency_adjustments=False),
    # dob is held as a string in this dataset, hence input_is_string=True.
    cl.DateOfBirthComparison("dob", input_is_string=True),
    cl.PostcodeComparison("postcode_fake"),
    cl.ExactMatch("birth_place").configure(term_frequency_adjustments=False),
    cl.ExactMatch("occupation").configure(term_frequency_adjustments=False),
]

# Deduplication of a single table: the blocking rules defined earlier generate
# the candidate pairs, and intermediate calculation columns are kept in the
# prediction output.
settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=model_comparisons,
    blocking_rules_to_generate_predictions=blocking_rules,
    retain_intermediate_calculation_columns=True,
)

# Bind the data, the settings and the DuckDB backend together.
linker = Linker(df, settings, db_api=db_api)

In [5]:
# Deterministic matching rules (SQL conditions over the left `l` and right `r`
# record of a pair) used to estimate the prior probability that two random
# records match.
deterministic_rules = [
    "l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob",
    "substr(l.first_name,1,2) = substr(r.first_name,1,2) and l.surname = r.surname and substr(l.postcode_fake,1,2) = substr(r.postcode_fake,1,2)",
    "l.dob = r.dob and l.postcode_fake = r.postcode_fake",
]

# NOTE(review): per the Splink docs, `recall` is the assumed share of all true
# matches that the rules above recover; 0.6 is a judgement call, so confirm it
# suits this dataset.
linker.training.estimate_probability_two_random_records_match(
    deterministic_rules, recall=0.6
)

Probability two random records match is estimated to be  0.000136.
This means that amongst all possible pairwise record comparisons, one in 7,362.31 are expected to match.  With 1,279,041,753 total possible comparisons, we expect a total of around 173,728.33 matching pairs


In [6]:
# u probabilities are estimated from a random sample of record pairs (capped
# at max_pairs). Without a seed, every "Restart Kernel -> Run All" draws a
# different sample, so the u estimates -- and the EM results trained on top of
# them -- drift between runs. Fixing the seed makes the sample repeatable
# (Splink documents seeded sampling as supported on the DuckDB backend).
linker.training.estimate_u_using_random_sampling(max_pairs=5e6, seed=42)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - surname (no m values are trained).
    - dob (no m values are trained).
    - postcode_fake (no m values are trained).
    - birth_place (no m values are trained).
    - occupation (no m values are trained).


In [7]:
# EM pass 1: block on exact first_name + surname. Comparisons on the blocked
# columns cannot be estimated in this pass, so it trains m probabilities for
# dob, postcode_fake, birth_place and occupation only.
# NOTE(review): estimate_without_term_frequencies=True presumably skips
# term-frequency adjustments during EM -- confirm against the Splink docs.
training_blocking_rule = block_on("first_name", "surname")
training_session_names = linker.training.estimate_parameters_using_expectation_maximisation(
    training_blocking_rule,
    estimate_without_term_frequencies=True,
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
(l."first_name" = r."first_name") AND (l."surname" = r."surname")

Parameter estimates will be made for the following comparison(s):
    - dob
    - postcode_fake
    - birth_place
    - occupation

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name
    - surname

Iteration 1: Largest change in params was -0.515 in probability_two_random_records_match
Iteration 2: Largest change in params was -0.0362 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.0135 in the m_probability of birth_place, level `Exact match on birth_place`
Iteration 4: Largest change in params was -0.00654 in the m_probability of birth_place, level `All other comparisons`
Iteration 5: Largest change in params was 0.00378 in the m_probability of birth_place, level `Exact match on birth_place`
Iteration 6: La

In [8]:
# EM pass 2: block on exact dob instead, so the comparisons that could not be
# estimated while blocking on the names (first_name, surname) get m
# probabilities too; dob itself is excluded from this pass.
# Note that `training_blocking_rule` is rebound here to the dob rule.
training_blocking_rule = block_on("dob")
training_session_dob = linker.training.estimate_parameters_using_expectation_maximisation(
    training_blocking_rule,
    estimate_without_term_frequencies=True,
)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l."dob" = r."dob"

Parameter estimates will be made for the following comparison(s):
    - first_name
    - surname
    - postcode_fake
    - birth_place
    - occupation

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - dob

Iteration 1: Largest change in params was -0.36 in the m_probability of first_name, level `Exact match on first_name`
Iteration 2: Largest change in params was 0.0382 in the m_probability of first_name, level `All other comparisons`
Iteration 3: Largest change in params was 0.00824 in the m_probability of surname, level `All other comparisons`
Iteration 4: Largest change in params was 0.00266 in the m_probability of surname, level `All other comparisons`
Iteration 5: Largest change in params was 0.000806 in the m_probability of surname, level `All other comparisons`
Iteration 6: Largest change in

In [9]:
# Persist the trained model settings to model_h50k.json, replacing any existing
# file (overwrite=True). The call is the cell's last expression, so the
# returned settings dict is echoed as the cell output.
linker.misc.save_model_to_json("model_h50k.json", overwrite=True)

{'link_type': 'dedupe_only',
 'probability_two_random_records_match': 0.00013582694460587586,
 'retain_matching_columns': True,
 'retain_intermediate_calculation_columns': True,
 'additional_columns_to_retain': [],
 'sql_dialect': 'duckdb',
 'linker_uid': '66au8ius',
 'em_convergence': 0.0001,
 'max_iterations': 25,
 'bayes_factor_column_prefix': 'bf_',
 'term_frequency_adjustment_column_prefix': 'tf_',
 'comparison_vector_value_column_prefix': 'gamma_',
 'unique_id_column_name': 'unique_id',
 'source_dataset_column_name': 'source_dataset',
 'blocking_rules_to_generate_predictions': [{'blocking_rule': '(SUBSTR(l.first_name, 1, 3) = SUBSTR(r.first_name, 1, 3)) AND (SUBSTR(l.surname, 1, 4) = SUBSTR(r.surname, 1, 4))',
   'sql_dialect': 'duckdb'},
  {'blocking_rule': '(l."surname" = r."surname") AND (l."dob" = r."dob")',
   'sql_dialect': 'duckdb'},
  {'blocking_rule': '(l."first_name" = r."first_name") AND (l."dob" = r."dob")',
   'sql_dialect': 'duckdb'},
  {'blocking_rule': '(l."postco