# Cookbook

This notebook contains a miscellaneous collection of runnable examples illustrating various Splink techniques.

## Array columns

### Comparing array columns

This example shows how we can use `ArrayIntersectAtSizes` to assess the similarity of columns containing arrays.

In [None]:
# Uncomment and run this cell if you're running in Google Colab.
# !pip install splink

In [1]:
import logging
# Only let Splink messages at ERROR level or above through, so that the
# example outputs below are not cluttered with log lines.
splink_logger = logging.getLogger("splink")
splink_logger.setLevel(logging.ERROR)


In [2]:
import pandas as pd

import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on


# Five toy records that share a first name; only the postcode arrays differ,
# so differences in the scores below come from the array comparison alone.
data = [
    {"unique_id": record_id, "first_name": "John", "postcode": postcodes}
    for record_id, postcodes in enumerate(
        [["A", "B"], ["B"], ["A"], ["A", "B"], ["C"]],
        start=1,
    )
]

df = pd.DataFrame(data)

settings = SettingsCreator(
    comparisons=[
        # Size thresholds 2 and 1: in the output, gamma_postcode is 2 when
        # both postcodes are shared, 1 when one is shared, and 0 otherwise.
        cl.ArrayIntersectAtSizes("postcode", [2, 1]),
        cl.ExactMatch("first_name"),
    ],
    # Every record is called "John", so this rule pairs up all records.
    blocking_rules_to_generate_predictions=[block_on("first_name")],
    link_type="dedupe_only",
)

linker = Linker(df, settings, DuckDBAPI(), set_up_basic_logging=False)

# The final expression is the cell's displayed output: the scored pairs.
(
    linker.inference
    .predict()
    .as_pandas_dataframe()
)

Unnamed: 0,match_weight,match_probability,unique_id_l,unique_id_r,postcode_l,postcode_r,gamma_postcode,first_name_l,first_name_r,gamma_first_name
0,-8.287568,0.00319,4,5,"[A, B]",[C],0,John,John,1
1,-0.287568,0.450333,3,4,[A],"[A, B]",1,John,John,1
2,-8.287568,0.00319,3,5,[A],[C],0,John,John,1
3,-8.287568,0.00319,2,3,[B],[A],0,John,John,1
4,-0.287568,0.450333,2,4,[B],"[A, B]",1,John,John,1
5,-8.287568,0.00319,2,5,[B],[C],0,John,John,1
6,-0.287568,0.450333,1,2,"[A, B]",[B],1,John,John,1
7,-0.287568,0.450333,1,3,"[A, B]",[A],1,John,John,1
8,6.712432,0.990554,1,4,"[A, B]","[A, B]",2,John,John,1
9,-8.287568,0.00319,1,5,"[A, B]",[C],0,John,John,1


### Blocking on array columns

This example shows how we can use `block_on` to block on the individual elements of an array column - that is, pairwise comparisons are created for pairs of records where any of the elements in the array columns match.

In [11]:
import pandas as pd

import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on


# Three toy records: 1 and 2 share postcode "B", while 3 shares no postcode
# with the others.
data = [
    {"unique_id": record_id, "first_name": "John", "postcode": postcodes}
    for record_id, postcodes in enumerate(
        [["A", "B"], ["B"], ["C"]],
        start=1,
    )
]

df = pd.DataFrame(data)

settings = SettingsCreator(
    comparisons=[
        cl.ArrayIntersectAtSizes("postcode", [2, 1]),
        cl.ExactMatch("first_name"),
    ],
    blocking_rules_to_generate_predictions=[
        # Block on the individual array elements rather than the whole array:
        # a pair is generated when any postcode element matches.
        block_on("postcode", arrays_to_explode=["postcode"]),
    ],
    link_type="dedupe_only",
)

linker = Linker(df, settings, DuckDBAPI(), set_up_basic_logging=False)

# The final expression is the cell's displayed output; only the (1, 2) pair
# appears in it.
(
    linker.inference
    .predict()
    .as_pandas_dataframe()
)


Unnamed: 0,match_weight,match_probability,unique_id_l,unique_id_r,postcode_l,postcode_r,gamma_postcode,first_name_l,first_name_r,gamma_first_name
0,-0.287568,0.450333,1,2,"[A, B]",[B],1,John,John,1
