In [1]:
import pandas as pd 
# Raise pandas' display limits so wide or long frames are not truncated in cell output.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 100)

In [2]:
import logging 
# Configure the root logger with the default settings so that log messages
# are printed in Jupyter Lab cell output.
logging.basicConfig()

In [3]:
from utility_functions.demo_utils import get_spark
# Obtain the Spark session used by every cell below.
# See utility_functions/demo_utils.py for how Spark is set up.
spark = get_spark()

In [4]:
## splink_graph functionality
import splink_graph
from splink_graph.splink_graph import subgraph_stats
from splink_graph.splink_graph import _graphharmoniser
from pyspark.sql import functions as f
import pyspark
import os


In [5]:
# Configure Arrow IPC compatibility according to the installed Spark major version.
#   * Spark 3 and later reject the legacy Arrow IPC format, so the flag must be "0".
#   * Spark 2.x combined with pyarrow >= 0.15 needs the legacy format, i.e. "1".
# The major version is parsed numerically so that releases after 3.x are not
# mistaken for Spark 2.x, and the 2.x branch now really sets the value it reports.
# NOTE(review): this only changes the environment of the current Python process;
# Spark worker processes started earlier may need the variable set before the
# session is created (e.g. via spark-env.sh) -- confirm for your deployment.
spark_major_version = int(pyspark.__version__.split(".")[0])

if spark_major_version >= 3:
    os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "0"
    print(f"Spark {spark_major_version}.x detected. ARROW_PRE_0_15_IPC_FORMAT is set to ",os.environ["ARROW_PRE_0_15_IPC_FORMAT"])
else:
    os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"
    print("Spark 2.x detected. ARROW_PRE_0_15_IPC_FORMAT is set to",os.environ["ARROW_PRE_0_15_IPC_FORMAT"])

Spark 2.x detected. ARROW_PRE_0_15_IPC_FORMAT is set to 1


In [6]:
# Display the version of the running Spark session.
spark.version

'2.4.5'

In [7]:
# Read the three parquet inputs in one pass: the scored pairwise comparisons
# (df_e), the node -> component table (cc) and the edge list (edges).
df_e, cc, edges = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "data/graph/df_e.parquet",
        "data/graph/cc.parquet",
        "data/graph/edges.parquet",
    )
)

In [8]:

# Show the column names and types of the edge list and of the component table.
edges.printSchema()
cc.printSchema()


root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- tf_adjusted_match_prob: double (nullable = true)

root
 |-- id: string (nullable = true)
 |-- component: long (nullable = true)



In [9]:
# Bundle the left/right record attributes of every comparison into a single
# JSON string column ("info"), keep the two scores, and rename the id columns
# to the src/dst names used by the graph functions.
edgesinfo = (
    df_e
    .withColumn(
        "info",
        f.to_json(
            f.struct(
                "surname_l", "dob_l", "city_l", "email_l", "group_l",
                "surname_r", "dob_r", "city_r", "email_r", "group_r",
            )
        ),
    )
    .select("tf_adjusted_match_prob", "match_probability", "unique_id_l", "unique_id_r", "info")
    .withColumnRenamed("unique_id_l", "src")
    .withColumnRenamed("unique_id_r", "dst")
)

# Run both edge tables through splink_graph's _graphharmoniser on (src, dst).
# NOTE(review): presumably this puts src/dst into a consistent order so the
# two tables can be joined on them later -- confirm against splink_graph.
edgesinfo = _graphharmoniser(edgesinfo, "src", "dst")
edges = _graphharmoniser(edges, "src", "dst")


In [10]:
#edge_df = edges.join(edgesinfo,  (f.col("source") == f.col("unique_id_l") ) &
#                                   (f.col("target") == f.col("unique_id_r"))

In [11]:
# Tag every edge with the component of its source node (edges.src == cc.id),
# then derive a distance from the match probability. Subtracting from 1.01
# rather than 1 keeps the distance strictly positive even for a probability of 1.0.
edge_df = (
    edges.alias("a")
    .join(cc.alias("b"), f.col("a.src") == f.col("b.id"))
    .drop("id")
    .withColumn("distance", 1.01 - f.col("tf_adjusted_match_prob"))
)

In [12]:
# Per-component summary statistics (the output below shows nodes, nodecount,
# edgecount and density for each component).
sgs = subgraph_stats(
    edge_df,
    "component",
    "tf_adjusted_match_prob",
    src="src",
    dst="dst",
)

In [13]:
# Preview the first two rows of the per-component statistics.
sgs.show(2)

+---------+---------------+---------+---------+-------+
|component|          nodes|nodecount|edgecount|density|
+---------+---------------+---------+---------+-------+
|       12|[226, 227, 228]|        3|        3|    1.0|
|       28|     [409, 411]|        2|        1|    1.0|
+---------+---------------+---------+---------+-------+
only showing top 2 rows



In [14]:
# Troubleshooting: if this cell fails with
#   RuntimeError: Arrow legacy IPC format is not supported in PySpark, please unset ARROW_PRE_0_15_IPC_FORMAT
# uncomment the line below and execute the cell again.
#os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "0"

from splink_graph.vectorised import diameter_radius_transitivity
# Per-component graph metrics; the result shown below has the columns component,
# diameter, radius, transitivity, tri_clustcoeff, sq_clustcoeff and graphhash.
drt = diameter_radius_transitivity(edge_df,"src", "dst")


In [15]:
# Print the per-component diameter/radius/transitivity table.
drt.show()

+-----------+--------+------+------------+--------------+-------------+--------------------+
|  component|diameter|radius|transitivity|tri_clustcoeff|sq_clustcoeff|           graphhash|
+-----------+--------+------+------------+--------------+-------------+--------------------+
|         12|       1|     1|         1.0|           1.0|          0.0|7d2c307dbd866960f...|
|         28|       1|     1|         0.0|           0.0|          0.0|2148f1da1ac29711e...|
|         29|       1|     1|         1.0|           1.0|          0.0|7d2c307dbd866960f...|
|         30|       2|     1|         0.6|         0.583|          0.0|0db442538bb6dc81d...|
|         33|       1|     1|         0.0|           0.0|          0.0|2148f1da1ac29711e...|
|         42|       1|     1|         0.0|           0.0|          0.0|2148f1da1ac29711e...|
|         67|       1|     1|         1.0|           1.0|          0.0|7d2c307dbd866960f...|
| 8589934628|       1|     1|         1.0|           1.0|          0.0

In [16]:
# Combine both per-component tables on the component id.
graphstats = sgs.join(drt, on="component")

# Preview the largest components first (ties broken by lowest transitivity),
# hiding the bulky columns.
(
    graphstats
    .sort(graphstats.nodecount.desc(), graphstats.transitivity.asc())
    .drop("graphhash", "nodes", "sq_clustcoeff")
    .show(5)
)



+-----------+---------+---------+-------+--------+------+------------+--------------+
|  component|nodecount|edgecount|density|diameter|radius|transitivity|tri_clustcoeff|
+-----------+---------+---------+-------+--------+------+------------+--------------+
|         72|        6|       11|  0.733|       2|     1|        0.75|         0.867|
|         61|        5|        4|    0.4|       2|     1|         0.0|           0.0|
|17179869220|        5|        5|    0.5|       3|     2|       0.429|         0.333|
|25769803812|        5|        7|    0.7|       2|     1|         0.6|           0.8|
|         38|        5|        8|    0.8|       2|     1|       0.789|         0.867|
+-----------+---------+---------+-------+--------+------+------------+--------------+
only showing top 5 rows



In [17]:
from splink_graph.vectorised import edgebetweeness

# Edge betweenness for every edge, passed through _graphharmoniser on (src, dst)
# in the same way as the other edge tables so it can be joined back to them.
ebdf = _graphharmoniser(
    edgebetweeness(edge_df, src="src", dst="dst"),
    "src",
    "dst",
)
ebdf.show(10)




+----------+---------+---+---+
|        eb|component|src|dst|
+----------+---------+---+---+
|0.33333334|       12|226|227|
|0.33333334|       12|226|228|
|0.33333334|       12|227|228|
|       1.0|       28|409|411|
|0.33333334|       29| 37| 39|
|0.33333334|       29| 37| 43|
|0.33333334|       29| 39| 43|
|0.33333334|       30|434|435|
|0.16666667|       30|433|434|
|0.33333334|       30|433|435|
+----------+---------+---+---+
only showing top 10 rows



In [18]:
# Attach the edge-betweenness score ("eb") to each edge of edge_df.
edge_eb_df = edge_df.join(ebdf, on=["src", "dst", "component"])

In [19]:
# Preview the joined edges; the row count is the cell's last expression, so
# it is displayed after the table.
edge_eb_df.show(5)
edge_eb_df.count()

+---+---+-----------+----------------------+--------------------+----------+
|src|dst|  component|tf_adjusted_match_prob|            distance|        eb|
+---+---+-----------+----------------------+--------------------+----------+
|101|105|          1|    0.9999999999999999| 0.01000000000000012|0.33333334|
|110|112|          3|    0.9999735636657863| 0.01002643633421374|       0.1|
|129|130| 8589934594|    0.9999999999961988| 0.01000000000380119|0.33333334|
|157|158|          6|    0.9999999383933627|0.010000061606637356|       1.0|
|161|163|17179869185|    0.9999677183140464|0.010032281685953603|0.33333334|
+---+---+-----------+----------------------+--------------------+----------+
only showing top 5 rows



398

In [31]:
# network visualisation prep

In [None]:
# prepare edges

In [21]:
# Add the JSON "info" payload to every edge and rename the columns to the
# source / target / value names used for the link data of the visualisation.
edge_df_for_viz = (
    edge_eb_df
    .join(edgesinfo, ["src", "dst", "tf_adjusted_match_prob"])
    .withColumnRenamed("src", "source")
    .withColumnRenamed("dst", "target")
    .withColumnRenamed("eb", "value")
)
edge_df_for_viz.show(1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 source                 | 101                                                                                                                                                                                                                                                     
 target                 | 105                                                                                                                                                                                                                                                     
 tf_adjusted_match_prob | 0.9999999999999999                                                                                                                                   

In [22]:
# prepare nodes for visualisation
from splink_graph.vectorised import eigencentrality
# Eigenvector centrality per node (result columns: node, eigen_centrality).
node_eigen_centrality = eigencentrality(
    edge_df,
    component="component",
    distance="distance",
    src="src",
    dst="dst",
)

node_eigen_centrality.show(5)

+----+------------------+
|node|  eigen_centrality|
+----+------------------+
| 226|0.5773502691896258|
| 227|0.5773502691896258|
| 228|0.5773502691896258|
| 409|0.7071067811865476|
| 411|0.7071067811865476|
+----+------------------+
only showing top 5 rows



In [23]:
# Node table for the visualisation: id -> index, component -> group, plus a
# "name" column that duplicates the index.
df_nodes = (
    cc
    .withColumnRenamed("id", "index")
    .withColumnRenamed("component", "group")
    .withColumn("name", f.col("index"))
)

# Attach each node's eigenvector centrality; the join key "node" duplicates
# "index" and is dropped afterwards.
node_df_for_viz = df_nodes.join(
    node_eigen_centrality,
    df_nodes.index == node_eigen_centrality.node,
).drop("node")
node_df_for_viz.show(2)

+-----+-----+----+------------------+
|index|group|name|  eigen_centrality|
+-----+-----+----+------------------+
|  226|   12| 226|0.5773502691896258|
|  227|   12| 227|0.5773502691896258|
+-----+-----+----+------------------+
only showing top 2 rows



In [24]:
# choose components to visualise


# Ids of the five largest components (ties broken by lowest transitivity).
comp_list = (
    graphstats
    .sort(graphstats.nodecount.desc(), graphstats.transitivity.asc())
    .limit(5)
    .select("component")
    .rdd.flatMap(list)
    .collect()
)

# Render the ids as a parenthesised, comma-separated list for use in a SQL IN clause.
comp_list_str = "(" + ",".join(map(str, comp_list)) + ")"
comp_list_str

'(72,61,17179869220,25769803812,38)'

In [25]:
# Columns that are exported into the visualisation spec.
node_fields = ["name", "group", "index"]
edge_fields = ["source", "target", "value"]

# Restrict nodes and edges to the selected components and collect them to pandas.
selected_nodes = node_df_for_viz.filter(f" group IN {comp_list_str} ")
selected_edges = edge_df_for_viz.filter(f" component IN {comp_list_str} ")
df_nodes_pd = selected_nodes.toPandas()
df_edges_pd = selected_edges.toPandas()

In [26]:

import json
# Load the force-layout spec template from disk.
# The file handle is deliberately NOT called `f`: `f` is this notebook's alias
# for pyspark.sql.functions, and a `with ... as f` target rebinds that name for
# the rest of the kernel session, which breaks any re-run of the earlier cells
# that call f.col / f.to_json / f.struct.
with open("data/graph/force_template.vg.json") as template_file:
    vl = json.load(template_file)



In [27]:
# Replace the first two datasets of the template with the selected nodes and
# links, each exported as a list of per-row dicts.
vl["data"][0] = dict(
    name="node-data",
    values=df_nodes_pd[node_fields].to_dict(orient="records"),
)

vl["data"][1] = dict(
    name="link-data",
    values=df_edges_pd[edge_fields].to_dict(orient="records"),
)

# Overall chart dimensions.
vl.update(width=1200, height=600)

In [28]:

%%javascript
    // Load Vega first and vega-embed second from the CDN.
    // Script elements inserted from JavaScript execute asynchronously by
    // default, i.e. in whatever order their downloads finish. vega-embed
    // depends on Vega, so async is switched off to keep insertion order.
    [
        '//cdn.jsdelivr.net/npm/vega@5',
        '//cdn.jsdelivr.net/npm/vega-embed@6'
    ].forEach(function (url) {
        var script = document.createElement('script');
        script.type = 'text/javascript';
        script.async = false;
        script.src = url;
        document.head.appendChild(script);
    });



<IPython.core.display.Javascript object>

In [29]:


from IPython.display import Javascript
# JavaScript that renders the spec into this cell's output element.
# The output of json.dumps is embedded directly as a JavaScript object literal.
# Wrapping it in a template literal and calling JSON.parse (as before) is
# fragile: inside backticks JavaScript processes backslash escapes, backticks
# and "${", so any string in the spec containing an escaped quote or a
# backslash was altered before parsing. The leftover `debugger;` statement,
# which pauses execution whenever browser dev tools are open, is removed too.
script = f"""
var spec = {json.dumps(vl)};
vegaEmbed(element, spec).then(function(result) {{
  }}).catch(console.error);
"""



In [30]:


# Returning the Javascript object as the cell's last expression makes the
# notebook front end execute the script built above.
Javascript(script)



<IPython.core.display.Javascript object>