In [1]:
import pandas as pd 

## Node-level (person-level) information

In [2]:
# Per-node graph statistics. The four columns are relabelled purely by
# position, so this relies on the parquet file storing them in this order.
nodestats = pd.read_parquet("data/temp/nodestats.parquet")
nodestats.columns = ["index", "cluster_id", "node_id", "eigen_centrality"]
nodestats = nodestats.drop(columns="index")

# Person-level records. `unique_id` is cast to str before it is used as the
# join key against `node_id` (presumably stored as a string -- TODO confirm).
nodes = pd.read_csv("data/fake_1000.csv")
nodes["unique_id"] = nodes["unique_id"].astype(str)

# Default (inner) merge: only nodes with a matching person record are kept.
nodestats = nodestats.merge(nodes, left_on="node_id", right_on="unique_id")
# `unique_id` duplicates `node_id` after the join; `group` is not needed here.
nodestats = nodestats.drop(columns=["unique_id", "group"])

# Replace missing values with empty strings. The frame is filled once here,
# so the preview below does not need to fill it again.
nodestats = nodestats.fillna("")
nodestats.head()

Unnamed: 0,cluster_id,node_id,eigen_centrality,first_name,surname,dob,city,email
0,12,226,0.57735,Julia,Smith,2014-05-24,Luton,julia.smith@english.org
1,12,227,0.57735,Julia,Smith,2004-04-27,Luton,
2,12,228,0.57735,Julia,Smith,2004-04-26,Luton,julia.smith@english.org
3,28,409,0.707107,Emily,Atkinson,2017-05-03,London,emilya@roberts.com
4,28,411,0.707107,Emily,Atkinson,2008-05-05,,emilya@roberts.com


## Edge-level (pairwise-comparison level) information

In [3]:
import numpy as np
# Pairwise comparison records and the per-edge graph statistics.
df_e = pd.read_parquet('data/graph/df_e.parquet')
edges = pd.read_parquet("data/temp/edgestats.parquet")

# Build the edge table in one pass:
#   1. drop `tf_adjusted_match_prob` from the comparison records before the
#      join, so the column of that name in the result is the one from `edges`;
#   2. inner-join edge statistics to comparisons on the (src, dst) id pair;
#   3. rename the graph columns to the names used in the rest of the notebook;
#   4. add `match_score`, the base-2 log-odds of the adjusted match probability.
# NOTE: a probability of exactly 0 or 1 gives -inf / +inf for `match_score`.
df_e = (
    edges
    .merge(
        df_e.drop(columns="tf_adjusted_match_prob"),
        left_on=['src', 'dst'],
        right_on=['unique_id_l', 'unique_id_r'],
    )
    .rename(columns={'component': 'cluster_id', 'eb': 'edge_betweenness'})
    .assign(
        match_score=lambda frame: np.log2(
            frame["tf_adjusted_match_prob"] / (1 - frame["tf_adjusted_match_prob"])
        )
    )
)
df_e.head(5)

Unnamed: 0,src,dst,cluster_id,tf_adjusted_match_prob,distance,edge_betweenness,match_probability,unique_id_l,unique_id_r,surname_l,...,gamma_dob,city_l,city_r,gamma_city,email_l,email_r,gamma_email,group_l,group_r,match_score
0,101,105,1,1.0,0.01,0.333333,1.0,101,105,Griffiths,...,1,Plymouth,Plymouth,1,a.griffiths@garner-bridges.com,a.griffiths@garner-bridges.com,1,29,29,53.0
1,110,112,3,0.999974,0.010026,0.1,0.999983,110,112,Atkinnos,...,0,London,London,1,oliver.atkinson@moran-smith.com,oliver.atkinson@moran-smith.com,1,31,31,15.20708
2,129,130,8589934594,1.0,0.01,0.333333,1.0,129,130,Barker,...,1,Reading,,-1,m.b@bell-brown.com,m.b@bell-brown.com,1,36,36,37.936689
3,157,158,6,1.0,0.01,1.0,1.0,157,158,Wright,...,1,,,-1,a.wright42@estrada.org,a.wright42@estrada.org,1,44,44,23.952339
4,161,163,17179869185,0.999968,0.010032,0.333333,0.999968,161,163,,...,1,London,,-1,hollythomson3@levine-jones.com,hollythomson3@levine-jones.com,1,45,45,14.918878


## Subgraph-level (cluster-level) information

In [4]:
# Load the per-cluster summary table and rename two columns to the names used
# in this notebook: `component` -> `cluster_id`, `nodes` -> `node_ids`.
graphstats = (
    pd.read_parquet("data/temp/graphstats.parquet")
    .rename(columns={"component": "cluster_id", "nodes": "node_ids"})
)
graphstats.head(2)

Unnamed: 0,cluster_id,node_ids,nodecount,edgecount,density,diameter,radius,transitivity,tri_clustcoeff,sq_clustcoeff,graphhash
0,12,"[226, 227, 228]",3,3,1.0,1,1,1.0,1.0,0.0,7d2c307dbd866960fae5a905cc5447de
1,28,"[409, 411]",2,1,1.0,1,1,0.0,0.0,0.0,2148f1da1ac29711e1273e364d4127c4


## Visualisation

In [16]:
from utility_functions.vis_utils import get_interface
import pandas as pd 
# Raise the pandas display limit so wide frames are shown with up to 1000 columns.
pd.set_option("display.max_columns", 1000)

# Hand the edge-, node- and cluster-level tables to the project's
# visualisation helper.
get_interface(df_e, nodestats, graphstats)

Dropdown(description='Cluster:', index=20, options=(1, 3, 4, 6, 7, 12, 13, 18, 20, 22, 24, 27, 28, 29, 30, 32,…

RadioButtons(description='Edge metric', options=('edge_betweenness', 'match_score'), value='edge_betweenness')

Output()