In [40]:
import pandas as pd 

## Node-level (person-level) information

In [60]:
# Load the per-node statistics and relabel the columns by position.
nodestats = pd.read_parquet("data/temp/nodestats.parquet").set_axis(
    ["index", "cluster_id", "node_id", "eigen_centrality"],
    axis=1,
)
nodestats

Unnamed: 0,index,cluster_id,node_id,eigen_centrality
0,226,12,226,0.577350
1,227,12,227,0.577350
2,228,12,228,0.577350
3,409,28,409,0.707107
4,411,28,411,0.707107
...,...,...,...,...
421,194,34359738372,194,0.707107
422,879,34359738441,879,0.707107
423,880,34359738441,880,0.707107
424,954,42949673030,954,0.707107


## Edge-level (pairwise-comparison level) information

In [80]:
# Pairwise comparison rows, minus the column that edgestats also supplies.
df_e = pd.read_parquet('data/graph/df_e.parquet').drop(columns="tf_adjusted_match_prob")
edges = pd.read_parquet("data/temp/edgestats.parquet")

# Attach the comparison columns to each graph edge, then use the notebook's
# naming for the cluster id and edge-betweenness columns.
df_e = (
    edges
    .merge(
        df_e,
        left_on=['src', 'dst'],
        right_on=['unique_id_l', 'unique_id_r'],
    )
    .rename(columns={'component': 'cluster_id', 'eb': 'edge_betweenness'})
)
df_e.head(5)

Unnamed: 0,src,dst,cluster_id,tf_adjusted_match_prob,distance,edge_betweenness,match_probability,unique_id_l,unique_id_r,surname_l,...,dob_r,gamma_dob,city_l,city_r,gamma_city,email_l,email_r,gamma_email,group_l,group_r
0,101,105,1,1.0,0.01,0.333333,1.0,101,105,Griffiths,...,2008-05-07,1,Plymouth,Plymouth,1,a.griffiths@garner-bridges.com,a.griffiths@garner-bridges.com,1,29,29
1,110,112,3,0.999974,0.010026,0.1,0.999983,110,112,Atkinnos,...,2010-01-20,0,London,London,1,oliver.atkinson@moran-smith.com,oliver.atkinson@moran-smith.com,1,31,31
2,129,130,8589934594,1.0,0.01,0.333333,1.0,129,130,Barker,...,1990-03-08,1,Reading,,-1,m.b@bell-brown.com,m.b@bell-brown.com,1,36,36
3,157,158,6,1.0,0.01,1.0,1.0,157,158,Wright,...,1993-03-27,1,,,-1,a.wright42@estrada.org,a.wright42@estrada.org,1,44,44
4,161,163,17179869185,0.999968,0.010032,0.333333,0.999968,161,163,,...,1985-05-21,1,London,,-1,hollythomson3@levine-jones.com,hollythomson3@levine-jones.com,1,45,45


## Subgraph-level (cluster-level) information

In [81]:
# Load the per-cluster statistics and use the notebook's column naming.
graphstats = (
    pd.read_parquet("data/temp/graphstats.parquet")
    .rename(columns={'component': 'cluster_id', 'nodes': 'node_ids'})
)
graphstats.head(2)

Unnamed: 0,cluster_id,node_ids,nodecount,edgecount,density,diameter,radius,transitivity,tri_clustcoeff,sq_clustcoeff,graphhash
0,12,"[226, 227, 228]",3,3,1.0,1,1,1.0,1.0,0.0,7d2c307dbd866960fae5a905cc5447de
1,28,"[409, 411]",2,1,1.0,1,1,0.0,0.0,0.0,2148f1da1ac29711e1273e364d4127c4


## Visualisation

We need information in the format:
    
```
{
    "name": "node-data",
    "values": [
        {
            "id": "798",
            "group": 61,
            "tooltip": {
                "eigen_centrality": 0.7071069290249942
            }
        }
    ]
},
{
    "name": "link-data",
    "values": [
        {
            "source": "798",
            "target": "802",
            "value": 0.4000000059604645,
            "tooltip": {
                "dob_l": "2002-08-08"
            }
        }
    ]
}
```

#### Node and link data

In [130]:
def link_data_with_tooltip(df, source_field = 'src', target_field='dst', cols_to_retain=[], cols_to_drop_from_tooltip=[], cluster_id = None, cluster_field='cluster_id'):
    """Convert an edge DataFrame into a list of "link-data" records.

    Each row of ``df`` becomes one dict containing:

    * every column named in ``cols_to_retain``,
    * ``source`` and ``target``, copied from ``source_field`` / ``target_field``,
    * ``tooltip``, a nested dict of all columns not listed in
      ``cols_to_drop_from_tooltip``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per edge.
    source_field, target_field : str
        Columns holding the two endpoint ids of the edge.
    cols_to_retain : list of str
        Columns copied to the top level of each record. Never mutated here.
    cols_to_drop_from_tooltip : list of str
        Columns left out of the ``tooltip`` dict. Never mutated here.
    cluster_id : optional
        When not ``None``, only rows whose ``cluster_field`` equals this value
        are converted.
    cluster_field : str
        Column compared against ``cluster_id``.

    Returns
    -------
    list of dict
    """
    # Compare against None rather than relying on truthiness, so that a
    # legitimate cluster id of 0 still filters the frame.
    if cluster_id is not None:
        df = df[df[cluster_field] == cluster_id]

    new_recs = []
    for r in df.to_dict(orient='records'):
        new_row = {c: r[c] for c in cols_to_retain}

        # Always emit the endpoints, even when cols_to_retain is empty. They are
        # written after the retained columns so that source_field/target_field
        # win if a retained column happens to be called 'source' or 'target'.
        new_row['source'] = r[source_field]
        new_row['target'] = r[target_field]

        new_row['tooltip'] = {
            c: v for c, v in r.items() if c not in cols_to_drop_from_tooltip
        }
        new_recs.append(new_row)
    return new_recs

def node_data_with_tooltip(df, cols_to_retain=['cluster_id'], cols_to_drop_from_tooltip=[], cluster_id=None, cluster_field='cluster_id' ):
    """Convert a node DataFrame into a list of "node-data" records.

    Each row of ``df`` becomes one dict containing every column named in
    ``cols_to_retain`` plus ``tooltip``, a nested dict of all columns not
    listed in ``cols_to_drop_from_tooltip``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per node.
    cols_to_retain : list of str
        Columns copied to the top level of each record. Never mutated here.
    cols_to_drop_from_tooltip : list of str
        Columns left out of the ``tooltip`` dict. Never mutated here.
    cluster_id : optional
        When not ``None``, only rows whose ``cluster_field`` equals this value
        are converted.
    cluster_field : str
        Column compared against ``cluster_id``.

    Returns
    -------
    list of dict
    """
    # Compare against None rather than relying on truthiness, so that a
    # legitimate cluster id of 0 still filters the frame.
    if cluster_id is not None:
        df = df[df[cluster_field] == cluster_id]

    new_recs = []
    for r in df.to_dict(orient='records'):
        new_row = {c: r[c] for c in cols_to_retain}
        new_row['tooltip'] = {
            c: v for c, v in r.items() if c not in cols_to_drop_from_tooltip
        }
        new_recs.append(new_row)
    return new_recs

# Cluster to visualise; both record lists below are restricted to it.
cluster_id = 8589934594

link_data = link_data_with_tooltip(
    df_e,
    cols_to_retain=['match_probability', 'edge_betweenness'],
    cols_to_drop_from_tooltip=['cluster_id', 'src', 'dst', 'group_l', 'group_r'],
    cluster_id=cluster_id,
)

node_data = node_data_with_tooltip(
    nodestats,
    cols_to_retain=['cluster_id', 'eigen_centrality', 'node_id'],
    cluster_id=cluster_id,
)



In [134]:
import json

from IPython.display import Javascript, display

# Label each edge with both endpoint ids. astype(str) keeps the concatenation
# valid even if the ids are stored as integers rather than strings.
df_e['edge_id'] = df_e['unique_id_l'].astype(str) + ", " + df_e['unique_id_r'].astype(str)

# Use the `cluster_id` variable (also used for the table displayed below)
# instead of repeating the literal, so chart and table cannot drift apart.
link_data = link_data_with_tooltip(
    df_e,
    cluster_id=cluster_id,
    cols_to_retain=['src', 'dst', 'match_probability', 'edge_betweenness', 'edge_id'],
    cols_to_drop_from_tooltip=['cluster_id', 'src', 'dst', 'group_l', 'group_r'],
)

node_data = node_data_with_tooltip(
    nodestats,
    cols_to_retain=['cluster_id', 'eigen_centrality', 'node_id'],
    cluster_id=cluster_id,
)

# Load the Vega force-layout template and replace its first two data sets
# with the records built above.
with open('data/graph/force_template.vg.json') as f:
    vl = json.load(f)

vl['data'][0] = {
    "name": "node-data",
    "values": node_data,
}

vl['data'][1] = {
    "name": "link-data",
    "values": link_data,
}

vl['width'] = 400
vl['height'] = 400

# Two deliberate choices in the JavaScript below:
# * The spec is emitted as a JavaScript object literal rather than pasted into
#   a template literal and re-parsed. Inside a template literal the JSON
#   escapes (\" and \\), backticks and "${" are re-interpreted, and JSON.parse
#   rejects the NaN that json.dumps writes for missing floats.
# * The libraries are loaded one after another and vegaEmbed is only called
#   once both have finished loading; appending <script> tags is asynchronous,
#   so calling vegaEmbed straight away races the download.
script = f"""
    var spec = {json.dumps(vl)};

    function loadScript(src) {{
        return new Promise(function (resolve, reject) {{
            var tag = document.createElement('script');
            tag.type = 'text/javascript';
            tag.src = src;
            tag.onload = resolve;
            tag.onerror = reject;
            document.head.appendChild(tag);
        }});
    }}

    var ready = window.vegaEmbed
        ? Promise.resolve()
        : loadScript('//cdn.jsdelivr.net/npm/vega@5').then(function () {{
              return loadScript('//cdn.jsdelivr.net/npm/vega-embed@6');
          }});

    ready.then(function () {{
        return vegaEmbed(element, spec);
    }}).catch(console.error);
"""
display(df_e[df_e['cluster_id'] == cluster_id])
Javascript(script)

Unnamed: 0,src,dst,cluster_id,tf_adjusted_match_prob,distance,edge_betweenness,match_probability,unique_id_l,unique_id_r,surname_l,...,gamma_dob,city_l,city_r,gamma_city,email_l,email_r,gamma_email,group_l,group_r,edge_id
2,129,130,8589934594,1.0,0.01,0.333333,1.0,129,130,Barker,...,1,Reading,,-1,m.b@bell-brown.com,m.b@bell-brown.com,1,36,36,"129, 130"
89,130,132,8589934594,1.0,0.01,0.166667,1.0,130,132,Barker,...,1,,Rading,-1,m.b@bell-brown.com,m.b@bell-brown.com,1,36,36,"130, 132"
137,129,132,8589934594,0.999999,0.010001,0.333333,0.999999,129,132,Barker,...,1,Reading,Rading,0,m.b@bell-brown.com,m.b@bell-brown.com,1,36,36,"129, 132"
347,129,131,8589934594,0.99922,0.01078,0.5,0.998773,129,131,Barker,...,0,Reading,Reading,1,m.b@bell-brown.com,m.b@bell-brown.com,1,36,36,"129, 131"


<IPython.core.display.Javascript object>

In [116]:
# Inspect the node records that are passed to the Vega spec as "node-data".
node_data

[{'cluster_id': 8589934594,
  'eigen_centrality': 0.5227204550943347,
  'node_id': '130',
  'tooltip': {'index': '130',
   'cluster_id': 8589934594,
   'node_id': '130',
   'eigen_centrality': 0.5227204550943347}},
 {'cluster_id': 8589934594,
  'eigen_centrality': 0.5227204550943347,
  'node_id': '132',
  'tooltip': {'index': '132',
   'cluster_id': 8589934594,
   'node_id': '132',
   'eigen_centrality': 0.5227204550943347}},
 {'cluster_id': 8589934594,
  'eigen_centrality': 0.6116286437343044,
  'node_id': '129',
  'tooltip': {'index': '129',
   'cluster_id': 8589934594,
   'node_id': '129',
   'eigen_centrality': 0.6116286437343044}},
 {'cluster_id': 8589934594,
  'eigen_centrality': 0.28184579793865727,
  'node_id': '131',
  'tooltip': {'index': '131',
   'cluster_id': 8589934594,
   'node_id': '131',
   'eigen_centrality': 0.28184579793865727}}]

In [117]:
# Inspect the edge records that are passed to the Vega spec as "link-data".
link_data

[{'src': '129',
  'dst': '130',
  'match_probability': 0.9999997520213268,
  'edge_betweenness': 0.3333333432674408,
  'edge_id': '129, 130',
  'tooltip': {'tf_adjusted_match_prob': 0.9999999999961988,
   'distance': 0.01000000000380119,
   'edge_betweenness': 0.3333333432674408,
   'match_probability': 0.9999997520213268,
   'unique_id_l': '129',
   'unique_id_r': '130',
   'surname_l': 'Barker',
   'surname_r': 'Barker',
   'gamma_surname': 2,
   'dob_l': '1990-03-08',
   'dob_r': '1990-03-08',
   'gamma_dob': 1,
   'city_l': 'Reading',
   'city_r': None,
   'gamma_city': -1,
   'email_l': 'm.b@bell-brown.com',
   'email_r': 'm.b@bell-brown.com',
   'gamma_email': 1,
   'edge_id': '129, 130'}},
 {'src': '130',
  'dst': '132',
  'match_probability': 0.9999995549406339,
  'edge_betweenness': 0.1666666716337204,
  'edge_id': '130, 132',
  'tooltip': {'tf_adjusted_match_prob': 0.9999995549406339,
   'distance': 0.010000445059366148,
   'edge_betweenness': 0.1666666716337204,
   'match_pr