## Remove None

In [None]:
from splink.datasets import splink_datasets
from splink.duckdb.blocking_rule_library import block_on, exact_match_rule
from splink.duckdb.comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
)
from splink.duckdb.linker import DuckDBLinker
import ssl

# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, i.e. certificate verification is disabled for the whole
# kernel process, not just for the dataset download below. Confirm this
# workaround is still needed before sharing the notebook.
ssl._create_default_https_context = ssl._create_unverified_context
df = splink_datasets.fake_1000

# Dedupe-only model: two blocking rules for prediction and five comparisons.
# Intermediate calculation columns and the "cluster" column are retained in
# the prediction output.
settings = {
    "probability_two_random_records_match": 0.01,
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": [
        block_on(["first_name"]),
        exact_match_rule("surname"),
    ],
    "comparisons": [
        levenshtein_at_thresholds("first_name", 2),
        exact_match("surname"),
        exact_match("dob"),
        exact_match("city", term_frequency_adjustments=True),
        exact_match("email"),
    ],
    "retain_intermediate_calculation_columns": True,
    "additional_columns_to_retain": ["cluster"],
    "max_iterations": 10,
    "em_convergence": 0.01,
}

linker = DuckDBLinker(df, settings)

# Optional column profiling, left disabled:
# linker.profile_columns(
#     ["first_name", "surname", "first_name || surname", "concat(city, first_name)"]
# )

# Training: u estimation by random sampling first, then one EM pass per
# blocking rule, in this order.
linker.estimate_u_using_random_sampling(target_rows=1e6)

for blocking_rule in (
    "l.first_name = r.first_name and l.surname = r.surname",
    "l.dob = r.dob",
):
    linker.estimate_parameters_using_expectation_maximisation(blocking_rule)

# Score candidate pairs, then cluster them at a 0.9 threshold.
df_predict = linker.predict()
df_clustered = linker.cluster_pairwise_predictions_at_threshold(df_predict, 0.9)



In [2]:
# Cluster metrics for the predictions/clusters built in the previous cell,
# computed with the same 0.9 threshold that was used for clustering.
metrics = linker._compute_cluster_metrics(
    df_predict,
    df_clustered,
    threshold_match_probability=0.9,
)

# Last expression of the cell: render the metrics as a pandas DataFrame.
metrics.as_pandas_dataframe()

Unnamed: 0,cluster_id,n_nodes,n_edges,density
0,0,2,1.0,1.000000
1,1,2,1.0,1.000000
2,4,2,1.0,1.000000
3,6,1,0.0,
4,7,1,0.0,
...,...,...,...,...
523,989,1,0.0,
524,991,1,0.0,
525,992,1,0.0,
526,993,4,5.0,0.833333


In [None]:
from splink.unique_id_concat import _composite_unique_id_from_nodes_sql

# Unique-id input columns as held on the linker's settings object.
uid_cols = linker._settings_obj._unique_id_input_columns

# Build the composite unique id for nodes from those columns.
# NOTE(review): presumably this is a SQL expression string; the type() call
# below is here to check what the helper actually returns.
composite_uid_clusters = _composite_unique_id_from_nodes_sql(uid_cols)
type(composite_uid_clusters)

## Add source dataset

The problem is that `person_id` is not unique across datasets,
so make a new unique ID for each person from the `unique_id` and `source_dataset` fields.

In [None]:
pip install -e .

In [None]:
import pandas as pd
from IPython.display import display

# Same module paths as the first cell of this notebook
# (splink.duckdb.comparison_library / splink.duckdb.linker).
from splink.duckdb.comparison_library import (
    exact_match,
)
from splink.duckdb.linker import DuckDBLinker

# Exact-match model on three columns, linking across AND within the inputs.
settings = {
    "probability_two_random_records_match": 0.01,
    # "link_type": "link_only",
    "link_type": "link_and_dedupe",
    "comparisons": [
        exact_match("first_name"),
        exact_match("surname"),
        exact_match("dob"),
    ],
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
}

# Two small inputs whose unique_id values overlap (1 and 2 appear in both),
# so unique_id alone does not identify a record across datasets.
df_1 = pd.DataFrame(
    [
        {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"},
        {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"},
        {"unique_id": 3, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"},
        {"unique_id": 4, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"},
        {"unique_id": 5, "first_name": "Bob", "surname": "Ray", "dob": "1999-09-22"},
    ]
)
df_2 = pd.DataFrame(
    [
        {"unique_id": 1, "first_name": "Bob", "surname": "Ray", "dob": "1999-09-22"},
        {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"},
    ]
)

linker = DuckDBLinker(
    [df_1, df_2], settings, input_table_aliases=["df_left", "df_right"]
)

# Pairwise predictions, shown in full (the inputs are tiny).
df_predict = linker.predict()
display(df_predict.as_pandas_dataframe())

# Clusters at a 0.9 threshold, ordered by cluster for readability.
df_clustered = linker.cluster_pairwise_predictions_at_threshold(df_predict, 0.9)
display(df_clustered.as_pandas_dataframe().sort_values("cluster_id"))

# Debug mode is switched on only for the metrics computation below.
linker.debug_mode = True
df_result = linker._compute_cluster_metrics(
    df_predict, df_clustered, 0.9
).as_pandas_dataframe()
df_result

In [None]:
# Column dtypes of the cluster-metrics frame computed in the previous cell.
print(df_result.dtypes)

In [None]:
# Worked example of the density formula: twice the number of edges divided
# by n_nodes * (n_nodes - 1), i.e. edges present over edges possible in an
# undirected graph.
n_nodes = 10
n_edges = 1

density = 2 * n_edges / (n_nodes * (n_nodes - 1))
density

In [None]:
# Unique-id input columns as held on the linker's settings object.
cols = linker._settings_obj._unique_id_input_columns

# Printed to inspect what the attribute contains for the current linker.
print(cols)

# Earlier alternative that read the single unique-id column name instead:
# unique_id_col = linker._settings_obj._unique_id_column_name
# unique_id_col

In [None]:
from splink.unique_id_concat import _composite_unique_id_from_edges_sql
from splink.input_column import InputColumn


# Unique-id input columns as held on the linker's settings object.
uid_cols = linker._settings_obj._unique_id_input_columns
# Earlier approach that built the InputColumn objects by hand (disabled):
# unique_id = InputColumn(unique_id_col)
# source_dataset = InputColumn("source_dataset")
# unique_id_col_l = input_col.name_l

# Composite unique id for the edges table, using the "l" argument.
# NOTE(review): "l" presumably selects the left-hand side of each edge; confirm.
uid_edges_l = _composite_unique_id_from_edges_sql(uid_cols, "l")

# Last expression of the cell: show what the helper produced.
uid_edges_l

# Updating test from dedupe to dedupe and link

In [None]:
import pandas as pd
from pandas.testing import assert_frame_equal

# Same module paths as the first cell of this notebook
# (splink.duckdb.comparison_library / splink.duckdb.linker).
from splink.duckdb.comparison_library import (
    exact_match,
)
from splink.duckdb.linker import DuckDBLinker

# Fixture frames shared by the tests below. The unique_id values 1 and 2
# appear in both frames.
df_1 = pd.DataFrame(
    [
        {"unique_id": 1, "first_name": "Tom", "surname": "Fox", "dob": "1980-01-01"},
        {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"},
        {"unique_id": 3, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"},
    ]
)

df_2 = pd.DataFrame(
    [
        {"unique_id": 1, "first_name": "Bob", "surname": "Ray", "dob": "1999-09-22"},
        {"unique_id": 2, "first_name": "Amy", "surname": "Lee", "dob": "1980-01-01"},
    ]
)

In [None]:
# Function insides
from IPython.display import display

# The same steps as the body of test_size_density_dedupe, run at cell level
# so the actual and expected frames can be compared by eye.
settings = {
    "probability_two_random_records_match": 0.01,
    "link_type": "dedupe_only",
    "comparisons": [
        exact_match("first_name"),
        exact_match("surname"),
        exact_match("dob"),
    ],
}
linker = DuckDBLinker(df_1, settings)

df_predict = linker.predict()
df_clustered = linker.cluster_pairwise_predictions_at_threshold(df_predict, 0.9)

# Actual metrics.
df_result = linker._compute_cluster_metrics(
    df_predict, df_clustered, threshold_match_probability=0.9
)
df_result = df_result.as_pandas_dataframe()
display(df_result)

# Expected metrics: a single-node cluster with no density value and a
# two-node cluster with one edge.
data_expected = [
    {"cluster_id": 1, "n_nodes": 1, "n_edges": 0.0, "density": None},
    {"cluster_id": 2, "n_nodes": 2, "n_edges": 1.0, "density": 1.0},
]
df_expected = pd.DataFrame(data_expected)
display(df_expected)

In [None]:
# dedupe test


def test_size_density_dedupe():
    """Check cluster size and density metrics for a dedupe-only linker.

    Builds a DuckDB linker over the notebook-level ``df_1`` frame with three
    exact-match comparisons, predicts, clusters at 0.9 and compares the
    output of ``_compute_cluster_metrics`` with a hand-written expectation.

    Raises:
        AssertionError: if the computed metrics differ from the expectation.
    """
    # Linker with basic settings
    settings = {
        "probability_two_random_records_match": 0.01,
        "link_type": "dedupe_only",
        "comparisons": [
            exact_match("first_name"),
            exact_match("surname"),
            exact_match("dob"),
        ],
    }
    linker = DuckDBLinker(df_1, settings)

    df_predict = linker.predict()
    df_clustered = linker.cluster_pairwise_predictions_at_threshold(df_predict, 0.9)

    # Sort and renumber so the comparison does not depend on the order in
    # which the backend happens to return the rows.
    df_result = (
        linker._compute_cluster_metrics(
            df_predict, df_clustered, threshold_match_probability=0.9
        )
        .as_pandas_dataframe()
        .sort_values(by="cluster_id")
        .reset_index(drop=True)
    )

    data_expected = [
        {"cluster_id": 1, "n_nodes": 1, "n_edges": 0.0, "density": None},
        {"cluster_id": 2, "n_nodes": 2, "n_edges": 1.0, "density": 1.0},
    ]
    df_expected = (
        pd.DataFrame(data_expected)
        .sort_values(by="cluster_id")
        .reset_index(drop=True)
    )

    assert_frame_equal(df_result, df_expected, check_index_type=False)

In [None]:
# Run the dedupe test defined above; it raises AssertionError on a mismatch.
test_size_density_dedupe()

In [None]:
# function insides - link_only
from IPython.display import display

# The same steps as the body of test_size_density_link, run at cell level so
# the actual and expected frames can be compared by eye.
settings = {
    "probability_two_random_records_match": 0.01,
    "link_type": "link_only",
    "comparisons": [
        exact_match("first_name"),
        exact_match("surname"),
        exact_match("dob"),
    ],
}
linker = DuckDBLinker(
    [df_1, df_2],
    settings,
    input_table_aliases=["df_left", "df_right"],
)

linker.debug_mode = True

df_predict = linker.predict()
df_clustered = linker.cluster_pairwise_predictions_at_threshold(df_predict, 0.9)

# NOTE(review): clustering uses 0.9 but the metrics call uses 0.99; confirm
# the different thresholds are intentional.
df_result = linker._compute_cluster_metrics(
    df_predict, df_clustered, threshold_match_probability=0.99
)
df_result = df_result.as_pandas_dataframe().sort_values(by="cluster_id")

display(df_result)

# Expected metrics; cluster ids carry the input table alias as a prefix.
data_expected = [
    {"cluster_id": "df_left-__-1", "n_nodes": 1, "n_edges": 0.0, "density": None},
    {"cluster_id": "df_left-__-2", "n_nodes": 3, "n_edges": 2.0, "density": 0.666667},
    {"cluster_id": "df_right-__-1", "n_nodes": 1, "n_edges": 0.0, "density": None},
]
df_expected = pd.DataFrame(data_expected)
df_expected = df_expected.sort_values(by="cluster_id")
display(df_expected)

In [None]:
# link test


def test_size_density_link():
    """Check cluster size and density metrics for a link-only linker.

    Builds a DuckDB linker over the notebook-level ``df_1`` and ``df_2``
    frames (aliased ``df_left`` / ``df_right``) with three exact-match
    comparisons, predicts, clusters at 0.9 and compares the output of
    ``_compute_cluster_metrics`` with a hand-written expectation.

    Raises:
        AssertionError: if the computed metrics differ from the expectation.
    """
    # Linker with basic settings
    settings = {
        "probability_two_random_records_match": 0.01,
        "link_type": "link_only",
        "comparisons": [
            exact_match("first_name"),
            exact_match("surname"),
            exact_match("dob"),
        ],
    }
    linker = DuckDBLinker(
        [df_1, df_2], settings, input_table_aliases=["df_left", "df_right"]
    )

    df_predict = linker.predict()
    df_clustered = linker.cluster_pairwise_predictions_at_threshold(df_predict, 0.9)

    # NOTE(review): clustering uses 0.9 but the metrics call uses 0.99;
    # confirm the different thresholds are intentional.
    # sort_values keeps the pre-sort index labels and assert_frame_equal
    # compares them, so both frames are renumbered after sorting. Without
    # this the test only passes when the backend returns rows in the same
    # order as data_expected.
    df_result = (
        linker._compute_cluster_metrics(
            df_predict, df_clustered, threshold_match_probability=0.99
        )
        .as_pandas_dataframe()
        .sort_values(by="cluster_id")
        .reset_index(drop=True)
    )

    data_expected = [
        {
            "cluster_id": "df_left-__-1",
            "n_nodes": 1,
            "n_edges": 0.0,
            "density": None,
        },
        {
            "cluster_id": "df_right-__-1",
            "n_nodes": 1,
            "n_edges": 0.0,
            "density": None,
        },
        {
            "cluster_id": "df_left-__-2",
            "n_nodes": 3,
            "n_edges": 2.0,
            "density": 0.666667,
        },
    ]
    df_expected = (
        pd.DataFrame(data_expected)
        .sort_values(by="cluster_id")
        .reset_index(drop=True)
    )

    assert_frame_equal(df_result, df_expected)

In [None]:
# Run the link test defined above; it raises AssertionError on a mismatch.
test_size_density_link()

## Test general functionality

In [None]:
pip install -e .