In [1]:
import pandas as pd 
# Widen the pandas display limits so large frames are not truncated when rendered.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 100)

In [2]:
import logging 
# Configure logging with its defaults so that log output is printed in Jupyter Lab.
logging.basicConfig()

In [3]:
from utility_functions.demo_utils import get_spark
# Obtain the Spark session used throughout this notebook.
# See utility_functions/demo_utils.py for how to set up Spark.
spark = get_spark()

In [4]:
## splink_graph functionality
import splink_graph
from splink_graph.splink_graph import subgraph_stats
from splink_graph.splink_graph import _graphharmoniser
from pyspark.sql import functions as f
import os


In [26]:
# When running on Spark 3.x, set the ARROW_PRE_0_15_IPC_FORMAT environment variable to "0"
# and echo the value back so the setting is visible in the notebook output.
# NOTE(review): presumably this disables the legacy Arrow IPC format that older Spark
# versions needed; the variable is set after the session already exists -- confirm that
# this is early enough for it to take effect.
if spark.version.startswith("3"):
    os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "0"
    print(
        "Spark 3.x detected. ARROW_PRE_0_15_IPC_FORMAT is set to ",
        os.environ["ARROW_PRE_0_15_IPC_FORMAT"],
    )

Spark 3.x detected. ARROW_PRE_0_15_IPC_FORMAT is set to  0


In [27]:
# Display the version string of the active Spark session.
spark.version

'3.0.2'

In [28]:
# Load the three parquet inputs used by the rest of the notebook.
# df_e: pairwise comparison rows (the *_l / *_r columns are read from it further down).
df_e = spark.read.parquet("data/graph/df_e.parquet")
# cc: maps each node id to its component.
cc = spark.read.parquet("data/graph/cc.parquet")
# edges: src/dst pairs with their tf_adjusted_match_prob.
edges = spark.read.parquet("data/graph/edges.parquet")

In [29]:

# Print the column names and types of the edge list and of the id -> component table.
edges.printSchema()
cc.printSchema()


root
 |-- src: string (nullable = true)
 |-- dst: string (nullable = true)
 |-- tf_adjusted_match_prob: double (nullable = true)

root
 |-- id: string (nullable = true)
 |-- component: long (nullable = true)



In [30]:
# Build an edge table that carries, for every compared pair, a JSON "info" payload
# holding the left- and right-hand record attributes, keyed by src/dst.
edgesinfo = (
    df_e
    .withColumn(
        "info",
        f.to_json(
            f.struct(
                "surname_l", "dob_l", "city_l", "email_l", "group_l",
                "surname_r", "dob_r", "city_r", "email_r", "group_r",
            )
        ),
    )
    .select(
        "tf_adjusted_match_prob",
        "match_probability",
        "unique_id_l",
        "unique_id_r",
        "info",
    )
    .withColumnRenamed("unique_id_l", "src")
    .withColumnRenamed("unique_id_r", "dst")
)

# Pass both edge tables through the splink_graph harmoniser on their src/dst columns.
# NOTE(review): presumably this normalises the src/dst ordering of each edge -- confirm
# against the splink_graph implementation.
edgesinfo = _graphharmoniser(edgesinfo, "src", "dst")
edges = _graphharmoniser(edges, "src", "dst")


In [31]:
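# Disabled draft, kept for reference only: join the edge list onto edgesinfo.
# NOTE: the column names used here ("source", "target", "unique_id_l", "unique_id_r")
# do not match the src/dst columns that both frames carry after the previous cell.
#edge_df = edges.join(edgesinfo,  (f.col("source") == f.col("unique_id_l") ) &
#                                   (f.col("target") == f.col("unique_id_r")))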

In [32]:
# Attach the component of each edge's source node, then derive a "distance" column
# from the match probability.
# NOTE(review): the 1.01 offset presumably keeps the distance strictly positive even
# when the probability is 1.0 -- confirm the intent.
edge_df = (
    edges.alias("a")
    .join(cc.alias("b"), f.col("a.src") == f.col("b.id"))
    .drop("id")
    .withColumn("distance", 1.01 - f.col("tf_adjusted_match_prob"))
)

In [33]:
# Compute the subgraph statistics for edge_df, using the "component" column and the
# tf_adjusted_match_prob column alongside the src/dst edge endpoints.
sgs = subgraph_stats(
    edge_df,
    "component",
    "tf_adjusted_match_prob",
    src="src",
    dst="dst",
)

In [34]:
# Preview the first two rows of the subgraph statistics.
sgs.show(2)

+---------+---------------+---------+---------+-------+
|component|          nodes|nodecount|edgecount|density|
+---------+---------------+---------+---------+-------+
|       12|[226, 227, 228]|        3|        3|    1.0|
|       28|     [409, 411]|        2|        1|    1.0|
+---------+---------------+---------+---------+-------+
only showing top 2 rows



In [35]:

from splink_graph.vectorised import diameter_radius_transitivity

# Compute the diameter / radius / transitivity metrics from the edge list,
# passing the names of the source and destination columns.
drt = diameter_radius_transitivity(
    edge_df,
    "src",
    "dst",
)


In [36]:
# Print the leading rows of the diameter / radius / transitivity results.
drt.show()

+-----------+--------+------+------------+--------------+-------------+--------------------+
|  component|diameter|radius|transitivity|tri_clustcoeff|sq_clustcoeff|           graphhash|
+-----------+--------+------+------------+--------------+-------------+--------------------+
|         12|       1|     1|         1.0|           1.0|          0.0|7d2c307dbd866960f...|
|         28|       1|     1|         0.0|           0.0|          0.0|2148f1da1ac29711e...|
|         29|       1|     1|         1.0|           1.0|          0.0|7d2c307dbd866960f...|
|         30|       2|     1|         0.6|         0.583|          0.0|0db442538bb6dc81d...|
|         33|       1|     1|         0.0|           0.0|          0.0|2148f1da1ac29711e...|
|         42|       1|     1|         0.0|           0.0|          0.0|2148f1da1ac29711e...|
|         67|       1|     1|         1.0|           1.0|          0.0|7d2c307dbd866960f...|
| 8589934628|       1|     1|         1.0|           1.0|          0.0

In [37]:
# Combine both sets of statistics on the shared "component" key.
graphstats = sgs.join(drt, on="component")

# Show the largest components first (ties broken by lowest transitivity), leaving out
# the graphhash, nodes and sq_clustcoeff columns from the printed table.
(
    graphstats
    .sort(graphstats["nodecount"].desc(), graphstats["transitivity"].asc())
    .drop("graphhash", "nodes", "sq_clustcoeff")
    .show()
)



+-----------+---------+---------+-------+--------+------+------------+--------------+
|  component|nodecount|edgecount|density|diameter|radius|transitivity|tri_clustcoeff|
+-----------+---------+---------+-------+--------+------+------------+--------------+
|         72|        6|       11|  0.733|       2|     1|        0.75|         0.867|
|         61|        5|        4|    0.4|       2|     1|         0.0|           0.0|
|17179869220|        5|        5|    0.5|       3|     2|       0.429|         0.333|
|25769803812|        5|        7|    0.7|       2|     1|         0.6|           0.8|
|         38|        5|        8|    0.8|       2|     1|       0.789|         0.867|
| 8589934671|        5|        9|    0.9|       2|     1|       0.875|           0.9|
| 8589934629|        5|        9|    0.9|       2|     1|       0.875|           0.9|
| 8589934616|        5|       10|    1.0|       1|     1|         1.0|           1.0|
|         51|        5|       10|    1.0|       1|    

In [38]:
from splink_graph.vectorised import edgebetweeness

# Compute edge betweenness for edge_df and pass the result through the splink_graph
# harmoniser on its src/dst columns, then print the leading rows.
ebdf = _graphharmoniser(
    edgebetweeness(edge_df, src="src", dst="dst"),
    "src",
    "dst",
)
ebdf.show()




+----------+----------+---+---+
|        eb| component|src|dst|
+----------+----------+---+---+
|0.33333334|        12|226|227|
|0.33333334|        12|226|228|
|0.33333334|        12|227|228|
|       1.0|        28|409|411|
|0.33333334|        29| 37| 39|
|0.33333334|        29| 37| 43|
|0.33333334|        29| 39| 43|
|0.33333334|        30|434|435|
|0.16666667|        30|433|434|
|0.33333334|        30|433|435|
|       0.5|        30|432|435|
|       1.0|        33|467|468|
|       1.0|        42|511|513|
|0.33333334|        67|895|897|
|0.33333334|        67|894|895|
|0.33333334|        67|894|897|
|0.33333334|8589934628|521|523|
|0.33333334|8589934628|521|525|
|0.33333334|8589934628|523|525|
|0.33333334|8589934658|853|854|
+----------+----------+---+---+
only showing top 20 rows



In [39]:
from splink_graph.vectorised import eigencentrality

# Compute eigenvector centrality from edge_df, naming the component, distance and
# src/dst columns explicitly.
node_eigen_centrality = eigencentrality(
    edge_df,
    component="component",
    distance="distance",
    src="src",
    dst="dst",
)

# Print the leading rows of the centrality table.
node_eigen_centrality.show()

+----+-------------------+
|node|   eigen_centrality|
+----+-------------------+
| 226| 0.5773502691896258|
| 227| 0.5773502691896258|
| 228| 0.5773502691896258|
| 409| 0.7071067811865476|
| 411| 0.7071067811865476|
|  37| 0.5773502691896258|
|  39| 0.5773502691896258|
|  43| 0.5773502691896258|
| 434| 0.5227204550943347|
| 435| 0.6116286437343044|
| 433| 0.5227204550943347|
| 432|0.28184579793865727|
| 467| 0.7071067811865476|
| 468| 0.7071067811865476|
| 511| 0.7071067811865476|
| 513| 0.7071067811865476|
| 895| 0.5773502691896258|
| 897| 0.5773502691896258|
| 894| 0.5773502691896258|
| 521| 0.5773502691896258|
+----+-------------------+
only showing top 20 rows

