## Historical people: Quick and dirty

This example shows how to get some initial record linkage results as quickly as possible.

There are many ways to improve the accuracy of this model. But this may be a good place to start if you just want to give Splink a try and see what it's capable of.


<a target="_blank" href="https://colab.research.google.com/github/moj-analytical-services/splink/blob/splink4_examples_notebooks/docs/demos/examples/duckdb/quick_and_dirty_persons.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>


In [1]:
# Uncomment and run this cell if you're running in Google Colab.
# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_examples_notebooks

In [2]:
from splink.datasets import splink_datasets

df = splink_datasets.historical_50k
df.head(5)

Unnamed: 0,unique_id,cluster,full_name,first_and_surname,first_name,surname,dob,birth_place,postcode_fake,gender,occupation
0,Q2296770-1,Q2296770,"thomas clifford, 1st baron clifford of chudleigh",thomas chudleigh,thomas,chudleigh,1630-08-01,devon,tq13 8df,male,politician
1,Q2296770-2,Q2296770,thomas of chudleigh,thomas chudleigh,thomas,chudleigh,1630-08-01,devon,tq13 8df,male,politician
2,Q2296770-3,Q2296770,tom 1st baron clifford of chudleigh,tom chudleigh,tom,chudleigh,1630-08-01,devon,tq13 8df,male,politician
3,Q2296770-4,Q2296770,thomas 1st chudleigh,thomas chudleigh,thomas,chudleigh,1630-08-01,devon,tq13 8hu,,politician
4,Q2296770-5,Q2296770,"thomas clifford, 1st baron chudleigh",thomas chudleigh,thomas,chudleigh,1630-08-01,devon,tq13 8df,,politician


In [3]:
from splink.blocking_rule_library import block_on
import splink.comparison_library as cl
import splink.comparison_template_library as ctl

settings = {
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": [
        block_on("full_name"),
        block_on("substr(full_name,1,6)", "dob", "birth_place"),
        block_on("dob", "birth_place"),
        block_on("postcode_fake"),
    ],
    "comparisons": [
        cl.JaroWinklerAtThresholds("full_name", [0.9, 0.7]).configure(
            term_frequency_adjustments=True
        ),
        ctl.DateComparison(
            "date_of_birth",
            input_is_string=True,
            datetime_metrics=["day", "month", "year"],
            datetime_thresholds=[5, 1, 5],

        ),
        cl.LevenshteinAtThresholds("postcode_fake", 2),
        cl.JaroWinklerAtThresholds("birth_place", 0.9).configure(
            term_frequency_adjustments=True
        ),
        cl.ExactMatch("occupation").configure(term_frequency_adjustments=True),
    ],
}

In [4]:
from splink.linker import Linker
from splink.database_api import DuckDBAPI

linker = Linker(df, settings, database_api=DuckDBAPI(), set_up_basic_logging=False)
deterministic_rules = [
    "l.full_name = r.full_name",
    "l.postcode_fake = r.postcode_fake and l.dob = r.dob",
]

linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.6)

SplinkException: Error executing the following sql for table `__splink__df_count_cumulative_blocks`(__splink__df_count_cumulative_blocks_eeed0403b):
CREATE TABLE __splink__df_count_cumulative_blocks_eeed0403b AS WITH __splink__df_concat as (select * from __splink__df_concat_ff61fdaee), 
__splink__df_blocked as (
            select
            "l"."unique_id" AS "unique_id_l", "r"."unique_id" AS "unique_id_r", "l"."full_name" AS "full_name_l", "r"."full_name" AS "full_name_r", "l"."date_of_birth" AS "date_of_birth_l", "r"."date_of_birth" AS "date_of_birth_r", "l"."postcode_fake" AS "postcode_fake_l", "r"."postcode_fake" AS "postcode_fake_r", "l"."birth_place" AS "birth_place_l", "r"."birth_place" AS "birth_place_r", "l"."occupation" AS "occupation_l", "r"."occupation" AS "occupation_r"
            , '0' as match_key
            
            from __splink__df_concat as l
            inner join __splink__df_concat as r
            on
            (l.full_name = r.full_name)
            where l."unique_id" < r."unique_id"
            
             UNION ALL 
            select
            "l"."unique_id" AS "unique_id_l", "r"."unique_id" AS "unique_id_r", "l"."full_name" AS "full_name_l", "r"."full_name" AS "full_name_r", "l"."date_of_birth" AS "date_of_birth_l", "r"."date_of_birth" AS "date_of_birth_r", "l"."postcode_fake" AS "postcode_fake_l", "r"."postcode_fake" AS "postcode_fake_r", "l"."birth_place" AS "birth_place_l", "r"."birth_place" AS "birth_place_r", "l"."occupation" AS "occupation_l", "r"."occupation" AS "occupation_r"
            , '1' as match_key
            
            from __splink__df_concat as l
            inner join __splink__df_concat as r
            on
            (l.postcode_fake = r.postcode_fake and l.dob = r.dob)
            where l."unique_id" < r."unique_id"
            AND NOT (coalesce((l.full_name = r.full_name),false))
            ) 
        select
        count(*) as row_count,
        match_key
        from __splink__df_blocked
        group by match_key
        order by cast(match_key as int) asc
    

Error was: Binder Error: Values list "l" does not have a column named "date_of_birth"
LINE 4: ...plink__df_concat_ff61fdaee), 
__splink__df_blocked as (
            select
   ...
                                                  ^

In [None]:
linker.estimate_u_using_random_sampling(max_pairs=2e6)

In [None]:
results = linker.predict(threshold_match_probability=0.9)


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'full_name':
    m values not fully trained
Comparison: 'dob':
    m values not fully trained
Comparison: 'postcode_fake':
    m values not fully trained
Comparison: 'birth_place':
    m values not fully trained
Comparison: 'occupation':
    m values not fully trained


In [None]:
results.as_pandas_dataframe(limit=5)

Unnamed: 0,match_weight,match_probability,unique_id_l,unique_id_r,full_name_l,full_name_r,gamma_full_name,dob_l,dob_r,gamma_dob,postcode_fake_l,postcode_fake_r,gamma_postcode_fake,birth_place_l,birth_place_r,gamma_birth_place,occupation_l,occupation_r,gamma_occupation,match_key
0,3.173575,0.900227,Q3059269-7,Q3059269-8,ethelburga,æthelburh of,0,0601-01-01,0601-01-01,3,se2 0sg,rm13 9jy,0,greenwich,greenwich,2,nun,nun,1,2
1,3.173833,0.900244,Q4864918-11,Q4864918-2,"barry yekverton, viscount avonmore",barry avonmore,0,,1790-02-21,-1,s40 2fy,s40 2fy,2,chesterfield,chesterfield,2,,,-1,3
2,3.175729,0.900361,Q105612158-5,Q105612158-7,charles baron-suckling,charles baron-suckling,3,1862-01-01,1861-01-01,2,np11 7qn,np29 8fz,0,newport,newport,2,,,-1,0
3,3.175729,0.900361,Q6969685-2,Q6969685-9,nathaniel hooke,nathaniel hooke,3,1690-01-01,1698-01-01,2,np20 2fn,np20 7xq,0,newport,newport,2,historian,,-1,0
4,3.175729,0.900361,Q6969685-1,Q6969685-9,nathaniel hooke,nathaniel hooke,3,1690-01-01,1698-01-01,2,np20 2fn,np20 7xq,0,newport,newport,2,historian,,-1,0
