## Splink visualise clusters

This demo shows how to create an interactive visualisation of clusters.

The first few steps use the model trained in the deduplication quickstart example

In [None]:
import altair as alt
import pandas as pd

In [2]:
from utility_functions.demo_utils import get_spark
spark = get_spark()  # See utility_functions/demo_utils.py for how to set up Spark
df = spark.read.parquet("data/fake_1000.parquet")

In [3]:
from splink import Splink

settings = {
    "link_type": "dedupe_only",
    "blocking_rules": ["l.surname = r.surname",
                       "l.first_name = r.first_name",
                       "l.dob = r.dob",
                       "l.email = r.email",
                        ],
    "comparison_columns": [
        {
            "col_name": "first_name",
            "num_levels": 3,
            "term_frequency_adjustments": True,
            "m_probabilities": [
                0.3941434323787689,
                0.14060422778129578,
                0.4652523398399353,
            ],
            "u_probabilities": [
                0.9941955208778381,
                0.0028420439921319485,
                0.002962463302537799,
            ],
        },
        {
            "col_name": "surname",
            "num_levels": 3,
            "term_frequency_adjustments": True,
            "m_probabilities": [
                0.3971782326698303,
                0.11397389322519302,
                0.48884785175323486,
            ],
            "u_probabilities": [
                0.9930331110954285,
                0.00222682929597795,
                0.004740049596875906,
            ],
        },
        {
            "col_name": "dob",
            "m_probabilities": [0.38818904757499695, 0.6118109226226807],
            "u_probabilities": [0.9997655749320984, 0.00023440067889168859],
        },
        {
            "col_name": "city",
          "case_expression": "case\n    when city_l is null or city_r is null then -1\n    when city_l = city_r then 1\n    else 0 end as gamma_city",
            "m_probabilities": [0.29216697812080383, 0.7078329920768738],
            "u_probabilities": [0.9105007648468018, 0.08949924260377884],
        },
        {
            "col_name": "email",
           "m_probabilities": [0.32461094856262207, 0.6753890514373779],
            "u_probabilities": [0.999818742275238, 0.00018127892690245062],
        },
    ],
    "additional_columns_to_retain": ["group"],
    "proportion_of_matches": 0.005672720726579428,
    }



linker = Splink(settings, df, spark)
df_e = linker.manually_apply_fellegi_sunter_weights()

In [4]:
# Cluster at a probability threshold of 50%
from utility_functions.cluster_utils import clusters_at_thresholds
nodes_with_clusters = clusters_at_thresholds(df, df_e, [0.75], ["cluster_low"], spark)
nodes_with_clusters.limit(2).toPandas()

Unnamed: 0,cluster_low,unique_id,first_name,surname,dob,city,email,group
0,0,2,Julia,Taylor,2016-01-27,London,hannah88@powers.com,0
1,4,12,Noah,,2008-03-23,Blotn,,1


In [5]:
from splink_visualise_clusters.render_template import render_vis_template



from splink_visualise_clusters.graph import (
    cluster_basic_stats,
    cluster_main_stats,
    weight_from_prob,
)


In [6]:
# Get a few of the largest clusters
nodes_with_clusters.createOrReplaceTempView("nodes_with_clusters")
sql = """
select count(*) as count, cluster_low
from nodes_with_clusters
group by cluster_low
order by count(*) desc
limit 10

"""
largest_clusters = spark.sql(sql).toPandas().head(10)
display(largest_clusters.head(3))
cluster_ids = list(largest_clusters["cluster_low"])

Unnamed: 0,count,cluster_low
0,16,394
1,14,858
2,10,976


The visualisation needs a list of edges and nodes.  `splink_visualise_clusters` contains functions to create and format these tables ready for input into the vis


In [7]:
from splink_visualise_clusters.sql import (
    get_edges_corresponding_to_clusters_spark,
    get_nodes_corresponding_to_clusters_spark,
)

nodes_for_vis = get_nodes_corresponding_to_clusters_spark(
    nodes_with_clusters, "cluster_low", cluster_ids
)
edges_for_vis = get_edges_corresponding_to_clusters_spark(
    nodes_with_clusters, df_e, "cluster_low", cluster_ids
)

nodes_for_vis_pd = nodes_for_vis.toPandas()
edges_for_vis_pd = edges_for_vis.toPandas()

Optionally, we can compute graph metrics, which will then be displayed in the vis.  

If we have ground truth clusters, this information will also be displayed in the vis

Naming conventions are important here:
- `ground truth cluster` contains the name of the real cluster
- `is_false_positive` designates false positive nodes
- `match_weight` contains the log2 Bayes Factor representation of the `match_probability` or `tf_adjusted_match_prob`
- 

In [8]:
from splink_visualise_clusters.graph import (
    edge_betweenness,
    eigen_centrality,
    is_bridge,
    weight_from_prob,
)


# Edges
edges_for_vis_pd = weight_from_prob(edges_for_vis_pd, "match_probability")
edges_for_vis_pd = edge_betweenness(edges_for_vis_pd, "cluster_low", "match_weight")
edges_for_vis_pd = is_bridge(edges_for_vis_pd, "cluster_low", "match_weight")


nodes_for_vis_pd = eigen_centrality(
    nodes_for_vis_pd, edges_for_vis_pd, "cluster_low", "match_weight"
)

edges_for_vis_pd["is_false_positive"] = edges_for_vis_pd["group_l"] != edges_for_vis_pd["group_r"]

nodes_for_vis_pd = nodes_for_vis_pd.rename(columns={"group": "ground_truth_cluster"})



Optionally, you can derived a `df_cluster_stats`, containing graph metrics for each of the clusters

In [9]:
df_cluster_stats = cluster_basic_stats(edges_for_vis_pd, "cluster_low")
df_cluster_stats2 = cluster_main_stats(edges_for_vis_pd, "cluster_low", "match_weight")
df_cluster_stats = df_cluster_stats.merge(
    df_cluster_stats2, left_on="cluster_low", right_on="cluster_low", how="left"
)
df_cluster_stats.head(2)

Unnamed: 0,cluster_low,num_nodes,num_edges,density,diameter,transitivity,tri_clustcoeff,sq_clustcoeff,node_conn,edge_conn,cluster_eb_modularity
0,105,10,36,0.8,3,0.942857,0.85,0.874074,10,1,0.024306
1,194,10,45,1.0,1,1.0,1.0,1.0,10,9,-0.02


The vis is rendered to a file, which you can load in your browser or dislplay in an iframe in Jupyter

In [10]:
splink_settings_dict = linker.model.current_settings_obj.settings_dict
render_vis_template(
    nodes_for_vis_pd, edges_for_vis_pd, splink_settings_dict, "interactive_clusters.html", "cluster_low", overwrite=True, df_cluster_metrics=df_cluster_stats,)


In [11]:
# Show outputted html file in iframe in Juptyer
from IPython.display import IFrame

IFrame(
    src="./interactive_clusters.html", width=1400, height=1200
)  # Show outputted html file in iframe in Juptyer