### Imports and linker set up

In [None]:
from splink.duckdb.linker import DuckDBLinker
import pandas as pd
from splink.splink_dataframe import SplinkDataFrame
from typing import TYPE_CHECKING
# https://stackoverflow.com/questions/39740632/python-type-hinting-without-cyclic-imports
if TYPE_CHECKING:
    from .linker import Linker

In [None]:
# Set up linker

# Simple dummy df
person_ids = [i + 1 for i in range(5)]
df = pd.DataFrame({"person_id": person_ids})

settings = {
    "link_type": "dedupe_only",
    "unique_id_column_name": "person_id",
    "retain_intermediate_calculation_columns": True,
}
linker = DuckDBLinker(df, settings)

### Trialing on simple dummy data

In [None]:
# Set up edges, clusters and cluster metrics tables

# Dummy edges df
person_ids_l = [1, 1, 4, 10, 12]
person_ids_r = [2, 3, 5, 11, 13]
match_probabilities = [0.99, 0.99, 0.99, 0.99, 0.95]

edges_data = {
    "match_probability": match_probabilities,
    "person_id_l": person_ids_l,
    "person_id_r": person_ids_r,
}
edges = pd.DataFrame(edges_data)

# Dummy clusters df
cluster_ids = ["A", "A", "A", "B", "B"]
clusters_data = {"cluster_id": cluster_ids, "person_id": person_ids}
clusters = pd.DataFrame(clusters_data)

# Dummy cluster metrics table
cluster = ["A", "B", "C", "D", "E","F", "G", "H", "I"]
n_nodes = [2, 3, 3, 10, 10, 10, 20, 20, 20]
n_edges = [2, 2, 2, 9, 25, 36, 22, 29, 45]
density = [
    (n_edges * 2) / (n_nodes * (n_nodes - 1))
    for n_nodes, n_edges in zip(n_nodes, n_edges)
]
df_metrics = pd.DataFrame(
    {"cluster_id": cluster, "n_nodes": n_nodes, "n_edges": n_edges, "density": density}
)
df_metrics

# Create splink dataframes from tables
df_cluster_metrics = linker.register_table(
    df_metrics, "df_cluster_metrics", overwrite=True
)
df_predict = linker.register_table(edges, "df_predict", overwrite=True)
df_clustered = linker.register_table(clusters, "df_clustered", overwrite=True)

## Stratified sampling by density

unique id persists throughout the linking process

User journey:
Have run clusters and metrics and generate the cluster dashboard (sample by density)
Delete it and come back to remake the next day - want the same clusters to appear (assuming data hasn't changed here)
It might not be exactly the same clusters if random sampling happens but will be the same population and in the same order as the data hasn't changed.

User journey:
Do some cluster QA, make changes, rerun clustering and remake dashboard (now the clusters data has possibly changed)
The cluster labels remain the same as long as the cluster membership hasn't changed?
The ordering of clusters may well have changed though.
It would be nice to ensure that the same lowest density clusters are always chosen where possible to see the impact of the changes the user is making.
How could I ensure this? I could order the clusters within the partition according to density AND cluster id?
Alternatively, could used the connected components table, order asw in by_size and join on the density table.



In [None]:
def _get_lowest_density_clusters(
    linker: "Linker",
    df_cluster_metrics: SplinkDataFrame,
    rows_per_partition: int,
    min_nodes: int,
):
    """Returns ids of lowest density clusters of different sizes by
    performing stratified sampling.

    Args:
        linker: An instance of the Splink Linker class.
        df_cluster_metrics (SplinkDataFrame): dataframe containing
        cluster metrics including density.
        rows_per_partition (int): number of rows in each strata (partition)
        min_nodes (int): minimum number of nodes a cluster must contain
        to be included in the sample.

    Returns:
        list: A list of cluster ids of lowest density clusters of different sizes.
    """

    sql = f"""
    select
        cluster_id,
        n_nodes,
        density,
        row_number() over (partition by n_nodes order by density, cluster_id desc) as row_num
    from {df_cluster_metrics.physical_name}
    where n_nodes >= {min_nodes}
    """

    linker._enqueue_sql(sql, "__splink__partition_clusters_by_size")

    sql = f"""
    select
        cluster_id,
        round(density, 4) as density_4dp,
        n_nodes
    from __splink__partition_clusters_by_size
    where row_num <= {rows_per_partition}
    order by n_nodes
    """

    linker._enqueue_sql(sql, "__splink__lowest_density_clusters")
    df_lowest_density_clusters = linker._execute_sql_pipeline()

    return df_lowest_density_clusters.as_pandas_dataframe()

In [None]:
test = _get_lowest_density_clusters(linker, df_cluster_metrics, 1, 3)

test

In [None]:
cluster_ids = [r['cluster_id'] for r in test]
cluster_ids

In [None]:
pip install -e .

In [None]:
df_cluster_metrics.as_pandas_dataframe()

In [None]:
# Test dashboard - doesn't work on dummy data because not set up accurately?

linker.cluster_studio_dashboard(
    df_predict,
    df_clustered,
    out_path="cluster_studio.html",
    sampling_method="lowest_density_clusters_by_size",
    overwrite=True,
    _df_cluster_metrics=df_cluster_metrics,
)

In [None]:
# Test dashboard - doesn't work on dummy data because not set up accurately?

linker.cluster_studio_dashboard(
    df_predict,
    df_clustered,
    out_path="cluster_studio.html",
    sampling_method="by_cluster_size",
    overwrite=True,
    _df_cluster_metrics=df_cluster_metrics,
)

In [None]:
# Test exception - working
linker.cluster_studio_dashboard(
    df_predict,
    df_clustered,
    out_path="cluster_studio.html",
    sampling_method="lowest_density_clusters_by_size",
    sample_size=10,
    overwrite=True,
)

### Test on realistic dummy data

In [1]:
pip install -e .

Obtaining file:///Users/zoe.slade/coding_projects/splink_folder/splink
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: splink
  Building editable for splink (pyproject.toml) ... [?25ldone
[?25h  Created wheel for splink: filename=splink-3.9.10-py3-none-any.whl size=6542 sha256=53134cc626ca8699f5bbcf731c4fc33477ade7bd6835fccd0ab29d25e86b5487
  Stored in directory: /private/var/folders/nd/c3xr518x3txg5kcqp1h7zwc80000gp/T/pip-ephem-wheel-cache-di1qgpdv/wheels/88/cc/30/b622996cc540fe1fab8acfdbf9b4822b20095ff2d22050eba4
Successfully built splink
Installing collected packages: splink
  Attempting uninstall: splink
    Found existing installation: splink 3.9.10
    Uninstalling splink-3.9.10:
      Successfully uninstalled splink-3.9.10
Successfully ins

In [2]:
from splink.datasets import splink_datasets
from splink.duckdb.blocking_rule_library import block_on, exact_match_rule
from splink.duckdb.comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
)
from splink.duckdb.linker import DuckDBLinker

df = splink_datasets.fake_1000

settings = {
    "probability_two_random_records_match": 0.01,
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": [
        block_on(["first_name"]),
        exact_match_rule("surname"),
    ],
    "comparisons": [
        levenshtein_at_thresholds("first_name", 2),
        exact_match("surname"),
        exact_match("dob"),
        exact_match("city", term_frequency_adjustments=True),
        exact_match("email"),
    ],
    "retain_intermediate_calculation_columns": True,
    "additional_columns_to_retain": ["cluster"],
    "max_iterations": 10,
    "em_convergence": 0.01,
}


linker = DuckDBLinker(df, settings)

linker.estimate_u_using_random_sampling(target_rows=1e6)


blocking_rule = "l.first_name = r.first_name and l.surname = r.surname"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule)


blocking_rule = "l.dob = r.dob"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule)


df_predict = linker.predict()
df_clustered = linker.cluster_pairwise_predictions_at_threshold(df_predict, 0.9)
df_cluster_metrics = linker._compute_graph_metrics(df_predict, df_clustered, threshold_match_probability=0.9)

  exact_match_rule("surname"),
/var/folders/nd/c3xr518x3txg5kcqp1h7zwc80000gp/T/ipykernel_42678/4141705158.py:34: SplinkDeprecated: target_rows is deprecated; use max_pairs
  linker.estimate_u_using_random_sampling(target_rows=1e6)
----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - surname (no m values are trained).
    - dob (no m values are trained).
    - city (no m values are trained).
    - email (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name = r.first_name and l.surname = r.surname

Parameter estimates will be made for the following comparison(s):
    - dob
    - city
    - email

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name
    - surnam

In [3]:
linker.cluster_studio_dashboard(
    df_predict,
    df_clustered,
    out_path="cluster_studio.html",
    sampling_method="lowest_density_clusters_by_size",
    sample_size =20,
    overwrite=True,
    _df_cluster_metrics=df_cluster_metrics,
)

In [4]:
df_predict = linker.predict()
df_clustered = linker.cluster_pairwise_predictions_at_threshold(df_predict, 0.9)
df_cluster_metrics

Completed iteration 1, root rows count 0


Table name in database: `__splink__cluster_metrics_clusters_5ba854f77`

To retrieve records, you can call the following methods on this object:
`.as_record_dict(limit=5)` or `.as_pandas_dataframe(limit=5)`.

You may omit the `limit` argument to return all records.

This table represents the following splink entity: __splink__cluster_metrics_clusters

In [50]:
import pandas as pd

from splink.cluster_studio import _get_lowest_density_clusters
from splink.duckdb.linker import DuckDBLinker


def test_density_sample():
    # Simple df and settings for linker
    person_ids = [i + 1 for i in range(5)]
    df = pd.DataFrame({"person_id": person_ids})

    settings = {
        "link_type": "dedupe_only",
        "unique_id_column_name": "person_id",
    }
    linker = DuckDBLinker(df, settings)

    # Dummy cluster metrics table
    cluster = ["A", "B", "C", "D", "E", "F"]
    n_nodes = [2, 3, 3, 3, 10, 10]
    n_edges = [1, 2, 2, 3, 9, 20]
    density = [
        (n_edges * 2) / (n_nodes * (n_nodes - 1))
        for n_nodes, n_edges in zip(n_nodes, n_edges)
    ]
    pd_metrics = pd.DataFrame(
        {
            "cluster_id": cluster,
            "n_nodes": n_nodes,
            "n_edges": n_edges,
            "density": density,
        }
    )

    # Convert to Splink dataframe
    df_cluster_metrics = linker.register_table(
        pd_metrics, "df_cluster_metrics", overwrite=True
    )
    result = _get_lowest_density_clusters(
        linker, df_cluster_metrics, rows_per_partition=1, min_nodes=3
    )

    result = sorted(result, key=lambda x: x["cluster_id"])

    expect = [
        {"cluster_id": "C", "density_4dp": 0.6667},
        {"cluster_id": "E", "density_4dp": 0.2},
    ]

    # assert result == expect
    return result

In [51]:
test_density_sample()
# type(result)
# result

[{'cluster_id': 'B', 'density_4dp': 0.6667},
 {'cluster_id': 'E', 'density_4dp': 0.2}]

### Test on historical 50k - working

In [None]:
clusters_hist = pd.read_csv("clusters_hist_50k.csv")
edges_hist = pd.read_csv("edges_hist_50k.csv")

# Update linker unique id to match data
linker._settings_obj._unique_id_column_name = "unique_id"

# Convert to splink dataframes
df_predict = linker.register_table(edges_hist, "df_predict", overwrite=True)
df_clustered = linker.register_table(clusters_hist, "df_clustered", overwrite=True)

In [None]:
# Generate density metrics

df_cluster_metrics = linker._compute_cluster_metrics(df_predict, df_clustered, 0.99)
df_cluster_metrics.as_pandas_dataframe().sort_values(by='density').head(20)

In [None]:
linker.cluster_studio_dashboard(
    df_predict,
    df_clustered,
    out_path="cluster_studio.html",
    sampling_method="lowest_density_clusters",
    sample_size=10,
    overwrite=True,
    _df_cluster_metrics=df_cluster_metrics,
)

In [None]:
# Check out density produced with modified version of function

def _get_cluster_id_by_density(
    linker, df_cluster_metrics, sample_size: int, min_nodes: int
):
    # Ordering: least dense clusters first
    sql = f"""
    SELECT cluster_id, density, n_nodes, n_edges, 
    (n_edges * 2)/(n_nodes * (n_nodes-1)) AS density_check
    FROM {df_cluster_metrics.physical_name}
    WHERE n_nodes >= {min_nodes}
    ORDER BY density
    LIMIT {sample_size}
    """

    df_density_sample = linker._sql_to_splink_dataframe_checking_cache(
        sql, "__splink__density_sample"
    )

    return df_density_sample.as_pandas_dataframe()

In [None]:
_get_cluster_id_by_density(linker, df_cluster_metrics, sample_size=10, min_nodes=3)

Results:
- Correct (lowest density) clusters being found and put into splink cluster studio
- Ordering of clusters isn't from low to high density - density info is lost so might be easier to add the density to drop down menu
- Density being calculated correctly

## Testing on Nomis data

Problem is that have two nodes in clusters table for 6 edges legit edges with threshold above 0.95

In [None]:
pip install -e .

In [None]:
from splink.duckdb.linker import DuckDBLinker
import pandas as pd

# Set up linker

# Simple dummy df
person_ids = [i + 1 for i in range(5)]
df = pd.DataFrame({"person_id": person_ids})

settings = {
    "link_type": "dedupe_only",
    "unique_id_column_name": "person_id",
    "retain_intermediate_calculation_columns": True,
}
linker = DuckDBLinker(df, settings)

In [None]:
# Set up edges and clusters data

edges_nomis = pd.read_csv("nomis_edges_anonymised.csv")
# edges_nomis["person_id_l"] = edges_nomis["person_id_l"].astype(int)
clusters_nomis = pd.read_csv("nomis_clusters_anonymised.csv")

# Give cols conventional names
# Change cluster_low to cluster_x for threshold x
clusters_nomis = clusters_nomis.rename(columns={"cluster_low": "cluster_id"})

# Transform to Splink dataframes
df_edges_nomis = linker.register_table(edges_nomis, "edges_nomis", overwrite=True)
df_clusters_nomis = linker.register_table(
    clusters_nomis, "clusters_nomis", overwrite=True
)

In [None]:
# Generate density metrics

# linker.debug_mode=True

df_cluster_metrics_nomis = linker._compute_cluster_metrics(
    df_edges_nomis, df_clusters_nomis, threshold_match_probability=0.95
)
# df_cluster_metrics_nomis.as_pandas_dataframe()

# df_cluster_metrics_nomis.as_pandas_dataframe().groupby("n_nodes").min("density").head()

df_cluster_metrics_nomis.as_pandas_dataframe().groupby("density").min('density').head(20)

In [None]:
n_nodes = 10
n_edges = 30

density = (n_edges * 2) / (n_nodes * (n_nodes - 1))
density

# Density calculated correctly
# Thing that is wrong is having 6 edges when only 2 nodes

In [None]:
cluster id = d79b5dfd903fb222e662b0eb96ccfc73
Appears twice in the clusters table - so already not enough nodes for the number of edges

dcdcba59f8e31e4bfebc6aa1e99e3f1f = person id, occurs 5 times in edges left at or above 0.95 threshold

9be9a90df36aad592ea1e88b136859b3 = person id, occurs 1 time in edge left at or above the threshold. This is to be expected


#### Try generating the clusters data again from nomis edges

In [None]:
linker.debug_mode = False

nomis_predict = pd.read_csv("nomis_edges_anonymised.csv")
# nomis_predict["person_id_l"] = nomis_predict["person_id_l"].astype(int)
# nomis_predict["person_id_r"] = nomis_predict["person_id_r"].astype(int)


# Transform to Splink dataframes
df_nomis_predict = linker.register_table(nomis_predict, "nomis_predict", overwrite=True)

new_nomis_clusters = linker.cluster_pairwise_predictions_at_threshold(
    df_nomis_predict, 0.9
)
display(new_nomis_clusters.as_pandas_dataframe().sort_values("cluster_id"))

In [None]:
linker.cluster_studio_dashboard(
    df_edges_nomis,
    df_clusters_nomis,
    out_path="cluster_studio.html",
    sampling_method="by_cluster_density",
    sample_size=10,
    overwrite=True,
    _df_cluster_metrics=df_cluster_metrics_nomis,
)

## Building actual test

In [None]:
import pandas as pd
from splink.cluster_studio import _get_cluster_id_by_density

from splink.duckdb.linker import DuckDBLinker

# Dummy df and settings for linker
person_ids = [i + 1 for i in range(5)]
df = pd.DataFrame({"person_id": person_ids})

settings = {
    "link_type": "dedupe_only",
    "unique_id_column_name": "person_id",
}
linker = DuckDBLinker(df, settings)

# Dummy cluster metrics table
cluster = ["A", "B", "C", "D", "E"]
n_nodes = [3, 2, 10, 3, 19]
n_edges = [2, 1, 5, 2, 25]
density = [
    (n_edges * 2) / (n_nodes * (n_nodes - 1))
    for n_nodes, n_edges in zip(n_nodes, n_edges)
]
df_metrics = pd.DataFrame(
    {"cluster_id": cluster, "n_nodes": n_nodes, "n_edges": n_edges, "density": density}
)
df_metrics

# Convert to Splink dataframe
df_cluster_metrics = linker.register_table(
    df_metrics, "df_cluster_metrics", overwrite=True
)

In [None]:
df_metrics

In [None]:
# Testing this function

def _get_cluster_id_by_density(
    linker, df_cluster_metrics, sample_size: int, min_nodes: int
):
    # Ordering: least dense clusters first
    sql = f"""
    SELECT cluster_id
    FROM {df_cluster_metrics.physical_name}
    WHERE n_nodes >= {min_nodes}
    ORDER BY density
    LIMIT {sample_size}
    """

    df_density_sample = linker._sql_to_splink_dataframe_checking_cache(
        sql, "__splink__density_sample"
    )

    return [r["cluster_id"] for r in df_density_sample.as_record_dict()]

In [None]:
result = _get_cluster_id_by_density(linker, df_cluster_metrics, sample_size=3, min_nodes=3)
result

In [None]:
# Better to put the linker inside the function?

def test_density_sample():
    df_result = _get_cluster_id_by_density(
        linker, df_cluster_metrics, sample_size=3, min_nodes=3
    )
    df_expect = ["C", "E", "A"]
    assert df_result == df_expect

test_density_sample()

In [None]:
def test_size_density():
    # Linker with basic settings
    settings = {"link_type": "dedupe_only", "unique_id_column_name": "person_id"}
    linker = DuckDBLinker(df, settings)

    # Register as Splink dataframes
    df_predict = linker.register_table(edges, "df_predict", overwrite=True)
    df_clustered = linker.register_table(clusters, "df_clustered", overwrite=True)

    df_cluster_metrics = linker._compute_cluster_metrics(
        df_predict, df_clustered, threshold_match_probability=0.99
    )
    df_cluster_metrics = df_cluster_metrics.as_pandas_dataframe()

    assert_frame_equal(df_cluster_metrics, df_expected)