### Imports and linker set up

In [1]:
from splink.duckdb.linker import DuckDBLinker
import pandas as pd

In [3]:
# Build a dedupe-only linker over a tiny placeholder table

# Placeholder input: person ids 1..5 and no other columns
person_ids = list(range(1, 6))
df = pd.DataFrame(data={"person_id": person_ids})

settings = dict(
    link_type="dedupe_only",
    unique_id_column_name="person_id",
    retain_intermediate_calculation_columns=True,
)
linker = DuckDBLinker(df, settings)

### Trialing on simple dummy data

In [4]:
# Set up edges, clusters and cluster metrics tables

# Dummy pairwise edges: five scored links between person ids
person_ids_l = [1, 1, 4, 10, 12]
person_ids_r = [2, 3, 5, 11, 13]
match_probabilities = [0.99] * 4 + [0.95]

edges_data = dict(
    match_probability=match_probabilities,
    person_id_l=person_ids_l,
    person_id_r=person_ids_r,
)
edges = pd.DataFrame(data=edges_data)

# Dummy cluster assignments: the first three person ids go to "A", the last two to "B"
cluster_ids = ["A"] * 3 + ["B"] * 2
clusters_data = dict(cluster_id=cluster_ids, person_id=person_ids)
clusters = pd.DataFrame(data=clusters_data)

# Hand-built cluster metrics table
cluster = list("ABCDE")
n_nodes = [3, 2, 10, 3, 19]
n_edges = [2, 1, 5, 2, 25]
# Density per cluster: twice the edge count over nodes * (nodes - 1)
density = [
    (edge_count * 2) / (node_count * (node_count - 1))
    for node_count, edge_count in zip(n_nodes, n_edges)
]
df_metrics = pd.DataFrame(
    data=dict(cluster_id=cluster, n_nodes=n_nodes, n_edges=n_edges, density=density)
)

# Register the three pandas tables with the linker, in the same order as before:
# metrics, then edges, then clusters
df_cluster_metrics, df_predict, df_clustered = (
    linker.register_table(table, alias, overwrite=True)
    for table, alias in (
        (df_metrics, "df_cluster_metrics"),
        (edges, "df_predict"),
        (clusters, "df_clustered"),
    )
)

In [5]:
# Preview (at most 5 rows) of the metrics computed from the dummy edges and clusters
(
    linker._compute_cluster_metrics(
        df_predict, df_clustered, threshold_match_probability=0.9
    )
    .as_pandas_dataframe(limit=5)
)

Unnamed: 0,cluster_id,n_nodes,n_edges,density
0,A,3,2.0,0.666667
1,B,2,1.0,1.0


In [8]:
# Dashboard works on the above data!
# The hand-built metrics table is passed in via `_df_cluster_metrics`, and the
# sample is requested with the "lowest_density_clusters" method. The output path
# is fixed and overwrite=True, so re-running replaces cluster_studio.html.

linker.cluster_studio_dashboard(
    df_predict,
    df_clustered,
    out_path="cluster_studio.html",
    sampling_method="lowest_density_clusters",
    sample_size=10,
    overwrite=True,
    _df_cluster_metrics=df_cluster_metrics,
)

In [None]:
# Test exception
# This call is meant to fail (presumably because no `_df_cluster_metrics` is
# supplied - TODO confirm the exact cause and exception type). The error is caught
# and shown so that "Restart & Run All" can carry on past this cell.
try:
    linker.cluster_studio_dashboard(
        df_predict,
        df_clustered,
        out_path="cluster_studio.html",
        sampling_method="by_cluster_density",
        sample_size=10,
        overwrite=True,
    )
except Exception as exc:
    print(f"{type(exc).__name__}: {exc}")
else:
    print("No exception was raised")

### Test on historical 50k - working

In [9]:
# Load the historical 50k clusters and edges from CSV files addressed relative to
# the notebook's working directory.
clusters_hist = pd.read_csv("clusters_hist_50k.csv")
edges_hist = pd.read_csv("edges_hist_50k.csv")

# Update linker unique id to match data
# NOTE(review): this overwrites a private attribute on the linker's settings object
# instead of building a new linker; presumably the CSVs key records on "unique_id" -
# confirm against the files.
linker._settings_obj._unique_id_column_name = "unique_id"

# Convert to splink dataframes
# The table names and the df_predict / df_clustered variables from the dummy-data
# section are reused (overwrite=True), so from here on they hold the historical data.
df_predict = linker.register_table(edges_hist, "df_predict", overwrite=True)
df_clustered = linker.register_table(clusters_hist, "df_clustered", overwrite=True)

In [11]:
# Density metrics for the historical data, then the 20 least dense clusters

df_cluster_metrics = linker._compute_cluster_metrics(df_predict, df_clustered, 0.99)
(
    df_cluster_metrics.as_pandas_dataframe()
    .sort_values("density")
    .head(20)
)

Unnamed: 0,cluster_id,n_nodes,n_edges,density
10322,Q25207068-1,26,85.0,0.261538
12192,Q6290227-1,13,21.0,0.269231
7266,Q15990022-1,19,48.0,0.280702
7033,Q17627000-1,18,43.0,0.281046
5175,Q10477507-1,31,136.0,0.292473
10553,Q16199347-1,16,36.0,0.3
12011,Q6142835-1,14,28.0,0.307692
12157,Q20732788-1,11,17.0,0.309091
12078,Q20734666-1,25,94.0,0.313333
3390,Q24254543-1,27,111.0,0.316239


In [None]:
# Cluster studio dashboard for the historical 50k data: same arguments as the
# dummy-data run, with df_predict / df_clustered / df_cluster_metrics now bound to
# the historical tables.
linker.cluster_studio_dashboard(
    df_predict,
    df_clustered,
    out_path="cluster_studio.html",
    sampling_method="lowest_density_clusters",
    sample_size=10,
    overwrite=True,
    _df_cluster_metrics=df_cluster_metrics,
)

In [None]:
# Check out density produced with modified version of function

def _get_cluster_id_by_density(
    linker, df_cluster_metrics, sample_size: int, min_nodes: int
):
    """Return the least dense clusters together with a recomputed density.

    Debugging variant of the sampling query: next to the stored `density` it
    selects `density_check`, recomputed from `n_nodes` and `n_edges`, so the two
    can be compared row by row.

    Args:
        linker: Object exposing `_sql_to_splink_dataframe_checking_cache(sql, name)`,
            which is used to run the query.
        df_cluster_metrics: Table handle exposing `physical_name`; the table must
            have `cluster_id`, `density`, `n_nodes` and `n_edges` columns.
        sample_size: Maximum number of clusters to return.
        min_nodes: Only clusters with at least this many nodes are considered.

    Returns:
        Whatever `as_pandas_dataframe()` returns for the query result, ordered by
        ascending density and then by `cluster_id`.
    """
    # Ordering: least dense clusters first. `cluster_id` breaks ties, otherwise the
    # order (and LIMIT cut-off) among equally dense clusters is up to the SQL engine.
    # `2.0` keeps density_check a floating-point division even if both count
    # columns are stored as integers.
    sql = f"""
    SELECT cluster_id, density, n_nodes, n_edges,
    (n_edges * 2.0)/(n_nodes * (n_nodes-1)) AS density_check
    FROM {df_cluster_metrics.physical_name}
    WHERE n_nodes >= {min_nodes}
    ORDER BY density, cluster_id
    LIMIT {sample_size}
    """

    df_density_sample = linker._sql_to_splink_dataframe_checking_cache(
        sql, "__splink__density_sample"
    )

    return df_density_sample.as_pandas_dataframe()

In [None]:
# Ten least dense clusters among those with at least three nodes
_get_cluster_id_by_density(
    linker,
    df_cluster_metrics,
    sample_size=10,
    min_nodes=3,
)

Results:
- Correct (lowest density) clusters being found and put into splink cluster studio
- Clusters are not ordered from low to high density in the dashboard: the density information is lost, so it might be easier to add each cluster's density to the drop-down menu
- Density being calculated correctly

## Testing on Nomis data

The problem: one cluster has only two nodes in the clusters table but six legitimate edges that pass the 0.95 threshold.

In [None]:
pip install -e .

In [12]:
from splink.duckdb.linker import DuckDBLinker
import pandas as pd

# Fresh dedupe-only linker for the Nomis checks

# Same five-row placeholder input as before (person ids 1..5)
person_ids = list(range(1, 6))
df = pd.DataFrame(data={"person_id": person_ids})

settings = dict(
    link_type="dedupe_only",
    unique_id_column_name="person_id",
    retain_intermediate_calculation_columns=True,
)
linker = DuckDBLinker(df, settings)

In [13]:
# Load the anonymised Nomis edges and clusters

edges_nomis = pd.read_csv("nomis_edges_anonymised.csv")
# edges_nomis["person_id_l"] = edges_nomis["person_id_l"].astype(int)

# Give the cluster column its conventional name straight after loading
# (change cluster_low to cluster_x for threshold x)
clusters_nomis = pd.read_csv("nomis_clusters_anonymised.csv").rename(
    columns={"cluster_low": "cluster_id"}
)

# Register both tables with the linker as Splink dataframes
df_edges_nomis = linker.register_table(
    edges_nomis, "edges_nomis", overwrite=True
)
df_clusters_nomis = linker.register_table(
    clusters_nomis, "clusters_nomis", overwrite=True
)

In [28]:
# Generate density metrics for the Nomis data (threshold_match_probability=0.95)

# linker.debug_mode=True

df_cluster_metrics_nomis = linker._compute_cluster_metrics(
    df_edges_nomis, df_clusters_nomis, threshold_match_probability=0.95
)
# df_cluster_metrics_nomis.as_pandas_dataframe()

# df_cluster_metrics_nomis.as_pandas_dataframe().groupby("n_nodes").min("density").head()

# For each distinct density value, show the smallest n_nodes / n_edges observed.
# The previous `.min('density')` handed the string to GroupBy.min's first
# positional parameter, `numeric_only`, where it only worked by being truthy;
# the intent is now spelled out explicitly.
(
    df_cluster_metrics_nomis.as_pandas_dataframe()
    .groupby("density")
    .min(numeric_only=True)
    .head(20)
)

Unnamed: 0_level_0,n_nodes,n_edges
density,Unnamed: 1_level_1,Unnamed: 2_level_1
0.25,8,7.0
0.254545,11,14.0
0.285714,7,6.0
0.321429,8,9.0
0.333333,6,5.0
0.357143,8,10.0
0.380952,7,8.0
0.392857,8,11.0
0.4,5,4.0
0.428571,7,9.0


In [7]:
# Hand check of the density formula for a 10-node, 30-edge cluster
n_nodes, n_edges = 10, 30

density = 2 * n_edges / (n_nodes * (n_nodes - 1))
density

# The formula gives the expected value, so density is calculated correctly.
# What is wrong is a cluster having 6 edges when it only has 2 nodes.
0.6666666666666666

In [None]:
Cluster id d79b5dfd903fb222e662b0eb96ccfc73 appears only twice in the clusters table, so the cluster already has too few nodes for its number of edges.

Person id dcdcba59f8e31e4bfebc6aa1e99e3f1f occurs 5 times on the left-hand side of edges at or above the 0.95 threshold.

Person id 9be9a90df36aad592ea1e88b136859b3 occurs once on the left-hand side of an edge at or above the threshold. This is to be expected.


#### Try generating the clusters data again from nomis edges

In [None]:
# Rebuild the clusters directly from the Nomis edges and show them by cluster id
linker.debug_mode = False

nomis_predict = pd.read_csv("nomis_edges_anonymised.csv")
# Optional id casts, left disabled:
# nomis_predict["person_id_l"] = nomis_predict["person_id_l"].astype(int)
# nomis_predict["person_id_r"] = nomis_predict["person_id_r"].astype(int)

# Register the edges as a Splink dataframe, then cluster at threshold 0.9
df_nomis_predict = linker.register_table(
    nomis_predict, "nomis_predict", overwrite=True
)
new_nomis_clusters = linker.cluster_pairwise_predictions_at_threshold(
    df_nomis_predict, 0.9
)

display(
    new_nomis_clusters.as_pandas_dataframe().sort_values(by="cluster_id")
)

In [None]:
# Cluster studio dashboard for the Nomis data, passing in the Nomis density metrics.
# NOTE(review): sampling_method here is "by_cluster_density", whereas the executed
# dashboard cell on the dummy data uses "lowest_density_clusters" - confirm which
# name the installed splink version accepts.
linker.cluster_studio_dashboard(
    df_edges_nomis,
    df_clusters_nomis,
    out_path="cluster_studio.html",
    sampling_method="by_cluster_density",
    sample_size=10,
    overwrite=True,
    _df_cluster_metrics=df_cluster_metrics_nomis,
)

## Building actual test

In [None]:
import pandas as pd
from splink.cluster_studio import _get_cluster_id_by_density

from splink.duckdb.linker import DuckDBLinker

# Placeholder table (person ids 1..5) and minimal settings for the linker
person_ids = list(range(1, 6))
df = pd.DataFrame(data={"person_id": person_ids})

settings = dict(
    link_type="dedupe_only",
    unique_id_column_name="person_id",
)
linker = DuckDBLinker(df, settings)

# Hand-built cluster metrics table used as the test fixture
cluster = list("ABCDE")
n_nodes = [3, 2, 10, 3, 19]
n_edges = [2, 1, 5, 2, 25]
# Density per cluster: twice the edge count over nodes * (nodes - 1)
density = [
    (edge_count * 2) / (node_count * (node_count - 1))
    for node_count, edge_count in zip(n_nodes, n_edges)
]
df_metrics = pd.DataFrame(
    data=dict(cluster_id=cluster, n_nodes=n_nodes, n_edges=n_edges, density=density)
)

# Convert to Splink dataframe
# Registered under the name "df_cluster_metrics"; overwrite=True is passed so the
# cell can be re-run when a table of that name is already registered.
df_cluster_metrics = linker.register_table(
    df_metrics, "df_cluster_metrics", overwrite=True
)

In [None]:
# Display the hand-built metrics table (cluster_id, n_nodes, n_edges, density)
df_metrics

In [None]:
# Testing this function

def _get_cluster_id_by_density(
    linker, df_cluster_metrics, sample_size: int, min_nodes: int
):
    """Return the ids of the least dense clusters, least dense first.

    Args:
        linker: Object exposing `_sql_to_splink_dataframe_checking_cache(sql, name)`,
            which is used to run the query.
        df_cluster_metrics: Table handle exposing `physical_name`; the table must
            have `cluster_id`, `n_nodes` and `density` columns.
        sample_size: Maximum number of cluster ids to return.
        min_nodes: Only clusters with at least this many nodes are considered.

    Returns:
        list: `cluster_id` values ordered by ascending density. Clusters with equal
        density are ordered by `cluster_id`, so the result is reproducible.
    """
    # Ordering: least dense clusters first. `cluster_id` acts as a tie-breaker:
    # without it, which of several equally dense clusters survives the LIMIT (and
    # in what order) is left to the SQL engine.
    sql = f"""
    SELECT cluster_id
    FROM {df_cluster_metrics.physical_name}
    WHERE n_nodes >= {min_nodes}
    ORDER BY density, cluster_id
    LIMIT {sample_size}
    """

    df_density_sample = linker._sql_to_splink_dataframe_checking_cache(
        sql, "__splink__density_sample"
    )

    return [r["cluster_id"] for r in df_density_sample.as_record_dict()]

In [None]:
# Three least dense clusters among those with at least three nodes
result = _get_cluster_id_by_density(
    linker,
    df_cluster_metrics,
    sample_size=3,
    min_nodes=3,
)
result

In [None]:
# The linker and the fixture are built inside the test, so it does not depend on
# whichever `linker` / `df_cluster_metrics` globals happen to exist in the kernel.

def test_density_sample():
    """Check that the three least dense clusters with >= 3 nodes come back in order."""
    settings = {"link_type": "dedupe_only", "unique_id_column_name": "person_id"}
    linker = DuckDBLinker(pd.DataFrame({"person_id": [1, 2, 3, 4, 5]}), settings)

    # Every qualifying cluster has a distinct density, so the expected order cannot
    # depend on how the query breaks ties. (With 2 edges for both A and D the two
    # clusters tie at 2/3, which made the expected third id ambiguous.)
    cluster_id = ["A", "B", "C", "D", "E"]
    n_nodes = [3, 2, 10, 3, 19]
    n_edges = [2, 1, 5, 3, 25]
    density = [
        (edge_count * 2) / (node_count * (node_count - 1))
        for node_count, edge_count in zip(n_nodes, n_edges)
    ]
    df_metrics = pd.DataFrame(
        {
            "cluster_id": cluster_id,
            "n_nodes": n_nodes,
            "n_edges": n_edges,
            "density": density,
        }
    )
    df_cluster_metrics = linker.register_table(
        df_metrics, "df_cluster_metrics", overwrite=True
    )

    result = _get_cluster_id_by_density(
        linker, df_cluster_metrics, sample_size=3, min_nodes=3
    )
    # B is excluded by min_nodes; densities: C ~0.11 < E ~0.15 < A ~0.67 < D 1.0
    expected = ["C", "E", "A"]
    assert result == expected

test_density_sample()

In [None]:
def test_size_density():
    """Check cluster size and density metrics on a small hand-built fixture.

    The fixture (five person ids, five scored edges, two clusters) is built inside
    the test so it does not rely on notebook globals.
    """
    # Linker with basic settings
    settings = {"link_type": "dedupe_only", "unique_id_column_name": "person_id"}
    person_ids = [1, 2, 3, 4, 5]
    linker = DuckDBLinker(pd.DataFrame({"person_id": person_ids}), settings)

    # Four edges at 0.99 and one at 0.95; ids 10-13 belong to no cluster
    edges = pd.DataFrame(
        {
            "match_probability": [0.99, 0.99, 0.99, 0.99, 0.95],
            "person_id_l": [1, 1, 4, 10, 12],
            "person_id_r": [2, 3, 5, 11, 13],
        }
    )
    clusters = pd.DataFrame(
        {"cluster_id": ["A", "A", "A", "B", "B"], "person_id": person_ids}
    )

    # Register as Splink dataframes
    df_predict = linker.register_table(edges, "df_predict", overwrite=True)
    df_clustered = linker.register_table(clusters, "df_clustered", overwrite=True)

    df_cluster_metrics = linker._compute_cluster_metrics(
        df_predict, df_clustered, threshold_match_probability=0.99
    )
    # Row order of the query result is not relied on: sort before comparing
    df_cluster_metrics = (
        df_cluster_metrics.as_pandas_dataframe()
        .sort_values("cluster_id")
        .reset_index(drop=True)
    )

    # Expected values mirror the metrics output recorded for this fixture earlier in
    # the notebook (A: 3 nodes / 2 edges, B: 2 nodes / 1 edge).
    # NOTE(review): assumes edges exactly at the threshold (0.99) are kept - confirm.
    df_expected = pd.DataFrame(
        {
            "cluster_id": ["A", "B"],
            "n_nodes": [3, 2],
            "n_edges": [2.0, 1.0],
            "density": [2 / 3, 1.0],
        }
    )

    # check_dtype=False: only the values matter here, not the integer/float widths
    # the backend happens to return.
    pd.testing.assert_frame_equal(df_cluster_metrics, df_expected, check_dtype=False)