### Imports and linker setup

In [8]:
from splink.duckdb.linker import DuckDBLinker
import pandas as pd

In [9]:
# Build a DuckDB-backed linker over a minimal table.

# Six sequential person ids (1..6); the frame has no other columns.
person_ids = list(range(1, 7))
df = pd.DataFrame(data={"person_id": person_ids})

# Dedupe-only configuration keyed on person_id.
settings = dict(
    link_type="dedupe_only",
    unique_id_column_name="person_id",
    retain_intermediate_calculation_columns=True,
)
linker = DuckDBLinker(df, settings)

### Trialing on simple dummy data

In [10]:
# Hand-written edges and clusters tables, registered with the linker.

# Five pairwise edges. Ids 11, 12 and 13 do not appear in the clusters table
# below, and the 6-11 edge scores 0.80.
person_ids_l = [1, 1, 4, 6, 12]
person_ids_r = [2, 3, 5, 11, 13]
match_probabilities = [0.99, 0.99, 0.99, 0.80, 0.95]

edges_data = dict(
    match_probability=match_probabilities,
    person_id_l=person_ids_l,
    person_id_r=person_ids_r,
)
edges = pd.DataFrame(data=edges_data)

# One cluster label per entry of person_ids (defined in the setup cell):
# three in A, two in B, one in C.
cluster_ids = list("AAABBC")
clusters_data = dict(cluster_id=cluster_ids, person_id=person_ids)
clusters = pd.DataFrame(data=clusters_data)

# The registered handles are what the metrics methods below take as input.
df_predict = linker.register_table(edges, "df_predict", overwrite=True)
df_clustered = linker.register_table(clusters, "df_clustered", overwrite=True)

In [11]:
# Testing node metrics - working as expected
# Computes node metrics for the dummy tables with a 0.9 threshold and sorts by
# cluster so the result is easy to check by eye. The displayed frame has
# composite_unique_id, cluster_id and node_degree columns.

linker._compute_metrics_nodes(
    df_predict, df_clustered, threshold_match_probability=0.9
).as_pandas_dataframe().sort_values('cluster_id')

Unnamed: 0,composite_unique_id,cluster_id,node_degree
1,1,A,2
2,2,A,1
3,3,A,1
0,4,B,1
5,5,B,1
4,6,C,0


In [12]:
# Testing cluster-level metrics on the dummy data - working
# NOTE(review): the displayed result has cluster_id, n_nodes, n_edges and
# density columns only; no centralisation column is shown.

# linker.debug_mode=True

# Node metrics are the input to the cluster-level computation.
df_node_metrics = linker._compute_metrics_nodes(
    df_predict, df_clustered, threshold_match_probability=0.9
)

linker._compute_metrics_clusters(df_node_metrics).as_pandas_dataframe()

Unnamed: 0,cluster_id,n_nodes,n_edges,density
0,A,3,2.0,0.666667
1,B,2,1.0,1.0
2,C,1,0.0,


In [14]:
# Combined entry point; the result is indexed by key ("clusters") in the next
# cell. 0.9 is presumably the match-probability threshold — TODO confirm.
dictionary = linker._compute_cluster_metrics(df_predict, df_clustered, 0.9)

In [18]:
# Pull the "clusters" entry out of the result and display it as pandas.
df_out = dictionary["clusters"]
df_out.as_pandas_dataframe()

Unnamed: 0,cluster_id,n_nodes,n_edges,density
0,A,3,2.0,0.666667
1,B,2,1.0,1.0
2,C,1,0.0,


### Test on historical 50k

In [None]:
# Load the cluster and edge extracts from CSV files in the working directory.
clusters_hist = pd.read_csv("clusters_hist_50k.csv")
edges_hist = pd.read_csv("edges_hist_50k.csv")

# Update linker unique id to match data
# NOTE(review): this overwrites a private attribute on the existing linker's
# settings object instead of building a new linker — confirm nothing has
# already captured the previous column name.
linker._settings_obj._unique_id_column_name = "unique_id"

# Convert to splink dataframes
# From here on df_predict and df_clustered refer to the historical data, not
# the dummy tables registered earlier under the same names.
df_predict = linker.register_table(edges_hist, "df_predict", overwrite=True)
df_clustered = linker.register_table(clusters_hist, "df_clustered", overwrite=True)

In [None]:
# Testing node metrics - runs
# Same call as for the dummy data, now against whatever df_predict and
# df_clustered currently point to (the historical tables if the cell above
# has been run).

linker._compute_metrics_nodes(
    df_predict, df_clustered, threshold_match_probability=0.9
).as_pandas_dataframe().sort_values("cluster_id")

In [None]:
# Testing cluster-level metrics on the historical data - working
# NOTE(review): the original note also lists centralisation; no output is
# shown for this cell, so which columns are returned is unconfirmed.

# linker.debug_mode=True

# Node metrics are the input to the cluster-level computation.
df_node_metrics = linker._compute_metrics_nodes(
    df_predict, df_clustered, threshold_match_probability=0.9
)

linker._compute_metrics_clusters(df_node_metrics).as_pandas_dataframe()

## Testing on Nomis data

In [None]:
pip install -e .

In [None]:
# Set up edges and clusters data

edges_nomis = pd.read_csv("nomis_edges_anonymised.csv")
# edges_nomis["person_id_l"] = edges_nomis["person_id_l"].astype(int)
clusters_nomis = pd.read_csv("nomis_clusters_anonymised.csv")
# NOTE(review): the commented-out cast suggests these edges are keyed on
# person_id_l; if the historical-data cell has been run, the linker's unique id
# column is "unique_id" — confirm the two agree before trusting the metrics.

# Give cols conventional names
# The chosen cluster column is renamed to cluster_id, presumably the name the
# metrics code looks for. To analyse threshold x instead, change "cluster_low"
# to the matching "cluster_x" column.
clusters_nomis = clusters_nomis.rename(columns={"cluster_low": "cluster_id"})

# Transform to Splink dataframes
df_edges_nomis = linker.register_table(edges_nomis, "edges_nomis", overwrite=True)
df_clusters_nomis = linker.register_table(
    clusters_nomis, "clusters_nomis", overwrite=True
)

In [None]:
# node degree - working
# Node metrics for the Nomis tables with a 0.9 threshold, displayed unsorted.

linker._compute_metrics_nodes(
    df_edges_nomis, df_clusters_nomis, threshold_match_probability=0.9
).as_pandas_dataframe()

In [None]:
# Testing cluster-level metrics on the Nomis data - working
# NOTE(review): the original note also lists centralisation; no output is
# shown for this cell, so which columns are returned is unconfirmed.

# linker.debug_mode=True

# Rebinds df_node_metrics to the Nomis node metrics. The cluster-level result
# below is only displayed, not assigned to a name.
df_node_metrics = linker._compute_metrics_nodes(
    df_edges_nomis, df_clusters_nomis, threshold_match_probability=0.9
)

linker._compute_metrics_clusters(df_node_metrics).as_pandas_dataframe()

#### Try generating the clusters data again from nomis edges

In [None]:
# Make sure debug mode is off before clustering.
linker.debug_mode = False

# Same CSV file that edges_nomis was read from, loaded under a separate name.
nomis_predict = pd.read_csv("nomis_edges_anonymised.csv")
# nomis_predict["person_id_l"] = nomis_predict["person_id_l"].astype(int)
# nomis_predict["person_id_r"] = nomis_predict["person_id_r"].astype(int)


# Transform to Splink dataframes
df_nomis_predict = linker.register_table(nomis_predict, "nomis_predict", overwrite=True)

# Rebuild clusters from the edges at 0.9 (presumably a match-probability
# threshold — TODO confirm) and show them ordered by cluster.
new_nomis_clusters = linker.cluster_pairwise_predictions_at_threshold(
    df_nomis_predict, 0.9
)
display(new_nomis_clusters.as_pandas_dataframe().sort_values("cluster_id"))

In [None]:
# Cluster metrics for the Nomis data, passed to the dashboard for
# density-based sampling. They are computed in this cell because no visible
# cell assigns df_cluster_metrics_nomis, so the call below would otherwise
# raise NameError.
df_node_metrics_nomis = linker._compute_metrics_nodes(
    df_edges_nomis, df_clusters_nomis, threshold_match_probability=0.9
)
df_cluster_metrics_nomis = linker._compute_metrics_clusters(df_node_metrics_nomis)

# Write a cluster studio dashboard for 10 clusters chosen by density,
# replacing any existing cluster_studio.html.
linker.cluster_studio_dashboard(
    df_edges_nomis,
    df_clusters_nomis,
    out_path="cluster_studio.html",
    sampling_method="by_cluster_density",
    sample_size=10,
    overwrite=True,
    _df_cluster_metrics=df_cluster_metrics_nomis,
)

In [None]:
# Should these have selects in them though?


def _centralisation_sql():
    """Sketch of the first pipeline step: a query over the node metrics table.

    Reads the notebook-level ``df_node_metrics`` rather than taking it as an
    argument. Returns a dict with the SQL text under "sql" and the name of the
    table the step should produce under "output_table_name".
    """
    source_table = df_node_metrics.physical_name
    return {
        "sql": f"SELECT stuff FROM {source_table}",
        "output_table_name": "__splink__counts_per_cluster",
    }


def _density_sql():
    """Sketch of the second pipeline step, returned as a queueable SQL spec.

    Returns:
        A dict with the SQL text under "sql" and the name of the table the
        step produces under "output_table_name".
    """
    # NOTE(review): this reads from "__splink_centralisation" (one underscore
    # after "splink"), while the step queued before it in linker_method writes
    # "__splink__counts_per_cluster" — confirm which table is intended.
    return {
        # Plain literal: there are no placeholders, so no f-string prefix.
        "sql": "SELECT stuff FROM __splink_centralisation",
        "output_table_name": "__splink__cluster_metrics_clusters",
    }


def linker_method(self):
    """Sketch of a linker method that queues both SQL steps and runs them.

    Args:
        self: The linker. It must provide ``_enqueue_sql`` and
            ``_execute_sql_pipeline``. Without this parameter the body's
            references to ``self`` raise NameError.

    Returns:
        Whatever ``_execute_sql_pipeline`` returns once both steps are queued.
    """
    # Queue the centralisation step first, then the density step.
    for build_step in (_centralisation_sql, _density_sql):
        step = build_step()
        self._enqueue_sql(step["sql"], step["output_table_name"])

    return self._execute_sql_pipeline()

In [None]:
def _centralisation_sql(
    df_node_metrics: "SplinkDataFrame",
) -> str:
    """Build the per-cluster size, edge-count and centralisation query.

    Args:
        df_node_metrics: Table handle whose ``physical_name`` is queried. The
            SQL reads its ``cluster_id`` and ``node_degree`` columns.

    Returns:
        The SQL text. The query yields one row per cluster_id with n_nodes,
        n_edges, max_degree and cluster_centralisation.
    """
    # The parameter annotation is a string so that defining this function does
    # not require SplinkDataFrame to be imported; the return annotation is str
    # because the SQL text itself is returned.
    #
    # n_edges halves the degree sum: assuming node_degree counts incident
    # edges, each edge is counted once at each of its two ends.
    # cluster_centralisation is NULL for clusters of one or two nodes, where
    # the denominator (n - 1) * (n - 2) would be zero.
    sql = f"""
        SELECT
            cluster_id,
            COUNT(*) AS n_nodes,
            SUM(node_degree)/2.0 AS n_edges,
            MAX(node_degree) AS max_degree,
            CASE
                WHEN COUNT(*) > 2 THEN
                    1.0*(COUNT(*) * MAX(node_degree) -  SUM(node_degree)) /
                    ((COUNT(*) - 1) * (COUNT(*) - 2))
                ELSE
                    NULL
            END AS cluster_centralisation
        FROM {df_node_metrics.physical_name}
        GROUP BY
            cluster_id
    """
    return sql

## Building actual test

In [None]:
import pandas as pd
from splink.cluster_studio import _get_cluster_id_by_density

from splink.duckdb.linker import DuckDBLinker

# Dummy df and settings for linker
person_ids = list(range(1, 6))
df = pd.DataFrame({"person_id": person_ids})

settings = {
    "link_type": "dedupe_only",
    "unique_id_column_name": "person_id",
}
linker = DuckDBLinker(df, settings)

# Dummy cluster metrics table
cluster = ["A", "B", "C", "D", "E"]
n_nodes = [3, 2, 10, 3, 19]
n_edges = [2, 1, 5, 2, 25]
# density = 2 * edges / (nodes * (nodes - 1)). The loop variables have their
# own names so they do not shadow the n_nodes / n_edges lists.
# NOTE(review): A and D both come out at 2/3, so ordering by density alone
# cannot separate them.
density = [
    (edge_count * 2) / (node_count * (node_count - 1))
    for node_count, edge_count in zip(n_nodes, n_edges)
]
df_metrics = pd.DataFrame(
    {"cluster_id": cluster, "n_nodes": n_nodes, "n_edges": n_edges, "density": density}
)

# Convert to Splink dataframe
df_cluster_metrics = linker.register_table(
    df_metrics, "df_cluster_metrics", overwrite=True
)

In [None]:
# Display the dummy cluster metrics table built in the previous cell.
df_metrics

In [None]:
# Testing this function

def _get_cluster_id_by_density(
    linker, df_cluster_metrics, sample_size: int, min_nodes: int
):
    """Return the ids of the least dense clusters that are large enough.

    Args:
        linker: Object exposing ``_sql_to_splink_dataframe_checking_cache``,
            which is used to run the query.
        df_cluster_metrics: Table handle whose ``physical_name`` is queried.
            The SQL reads its ``cluster_id``, ``n_nodes`` and ``density``
            columns.
        sample_size: Maximum number of cluster ids to return.
        min_nodes: Clusters with fewer nodes than this are excluded.

    Returns:
        A list of cluster ids, least dense first. Clusters with equal density
        are ordered by cluster_id.
    """
    # Ordering: least dense clusters first. cluster_id is a secondary key so
    # that ties on density are broken the same way on every run; with density
    # alone, which of two equally dense clusters survives the LIMIT is left to
    # the database.
    sql = f"""
    SELECT cluster_id
    FROM {df_cluster_metrics.physical_name}
    WHERE n_nodes >= {min_nodes}
    ORDER BY density, cluster_id
    LIMIT {sample_size}
    """

    df_density_sample = linker._sql_to_splink_dataframe_checking_cache(
        sql, "__splink__density_sample"
    )

    return [r["cluster_id"] for r in df_density_sample.as_record_dict()]

In [None]:
# Up to three cluster ids, restricted to clusters with at least three nodes.
result = _get_cluster_id_by_density(linker, df_cluster_metrics, sample_size=3, min_nodes=3)
result

In [None]:
# Better to put the linker inside the function?

def test_density_sample():
    """Check the density sampler against the notebook-level fixtures.

    Uses the ``linker`` and ``df_cluster_metrics`` objects defined in earlier
    cells rather than building its own.
    """
    expected_ids = ["C", "E", "A"]
    sampled_ids = _get_cluster_id_by_density(
        linker, df_cluster_metrics, sample_size=3, min_nodes=3
    )
    assert sampled_ids == expected_ids


# Run the check immediately so the cell fails loudly on a mismatch.
test_density_sample()

In [None]:
def test_size_density():
    """Draft test comparing computed cluster metrics with an expected frame.

    NOTE(review): the body relies on names it does not define — ``df``,
    ``edges``, ``clusters``, ``df_expected`` and ``assert_frame_equal``.
    ``df_expected`` and ``assert_frame_equal`` are not defined or imported in
    any visible cell.
    """
    # Linker with basic settings
    settings = {"link_type": "dedupe_only", "unique_id_column_name": "person_id"}
    linker = DuckDBLinker(df, settings)

    # Register as Splink dataframes
    df_predict = linker.register_table(edges, "df_predict", overwrite=True)
    df_clustered = linker.register_table(clusters, "df_clustered", overwrite=True)

    # NOTE(review): an earlier cell indexes the result of
    # _compute_cluster_metrics with ["clusters"]; if it returns a mapping, the
    # .as_pandas_dataframe() call below will fail — confirm the return type.
    df_cluster_metrics = linker._compute_cluster_metrics(
        df_predict, df_clustered, threshold_match_probability=0.99
    )
    df_cluster_metrics = df_cluster_metrics.as_pandas_dataframe()

    assert_frame_equal(df_cluster_metrics, df_expected)