In [1]:
import pandas as pd 

## Node-level (person-level) information

In [2]:
# Per-node graph statistics; the parquet columns are renamed positionally.
nodestats = pd.read_parquet("data/temp/nodestats.parquet")
nodestats.columns = ["index", "cluster_id", "node_id", "eigen_centrality"]

# Person-level records; unique_id is cast to str before it is used as the join key.
nodes = pd.read_csv("data/fake_1000.csv").assign(
    unique_id=lambda frame: frame["unique_id"].astype(str)
)

# Attach the person attributes to each node, drop the helper columns and
# replace missing values with empty strings.
nodestats = (
    nodestats
    .drop(columns="index")
    .merge(nodes, left_on="node_id", right_on="unique_id")
    .drop(columns=["unique_id", "group"])
    .fillna("")
)
nodestats.head()

Unnamed: 0,cluster_id,node_id,eigen_centrality,first_name,surname,dob,city,email
0,12,226,0.57735,Julia,Smith,2014-05-24,Luton,julia.smith@english.org
1,12,227,0.57735,Julia,Smith,2004-04-27,Luton,
2,12,228,0.57735,Julia,Smith,2004-04-26,Luton,julia.smith@english.org
3,28,409,0.707107,Emily,Atkinson,2017-05-03,London,emilya@roberts.com
4,28,411,0.707107,Emily,Atkinson,2008-05-05,,emilya@roberts.com


## Edge-level (pairwise-comparison level) information

In [3]:
import numpy as np
# Pairwise comparisons; tf_adjusted_match_prob is dropped here so that the
# copy coming from the edge statistics is the only one left after the merge.
df_e = pd.read_parquet('data/graph/df_e.parquet').drop(columns="tf_adjusted_match_prob")
edges = pd.read_parquet("data/temp/edgestats.parquet")

df_e = (
    edges
    .merge(df_e, left_on=['src', 'dst'], right_on=['unique_id_l', 'unique_id_r'])
    .rename(columns={'component': 'cluster_id', 'eb': 'edge_betweenness'})
    # match_score is the base-2 log-odds of the adjusted match probability.
    .assign(
        match_score=lambda frame: np.log2(
            frame["tf_adjusted_match_prob"] / (1 - frame["tf_adjusted_match_prob"])
        )
    )
)
df_e.head(5)

Unnamed: 0,src,dst,cluster_id,tf_adjusted_match_prob,distance,edge_betweenness,match_probability,unique_id_l,unique_id_r,surname_l,...,gamma_dob,city_l,city_r,gamma_city,email_l,email_r,gamma_email,group_l,group_r,match_score
0,101,105,1,1.0,0.01,0.333333,1.0,101,105,Griffiths,...,1,Plymouth,Plymouth,1,a.griffiths@garner-bridges.com,a.griffiths@garner-bridges.com,1,29,29,53.0
1,110,112,3,0.999974,0.010026,0.1,0.999983,110,112,Atkinnos,...,0,London,London,1,oliver.atkinson@moran-smith.com,oliver.atkinson@moran-smith.com,1,31,31,15.20708
2,129,130,8589934594,1.0,0.01,0.333333,1.0,129,130,Barker,...,1,Reading,,-1,m.b@bell-brown.com,m.b@bell-brown.com,1,36,36,37.936689
3,157,158,6,1.0,0.01,1.0,1.0,157,158,Wright,...,1,,,-1,a.wright42@estrada.org,a.wright42@estrada.org,1,44,44,23.952339
4,161,163,17179869185,0.999968,0.010032,0.333333,0.999968,161,163,,...,1,London,,-1,hollythomson3@levine-jones.com,hollythomson3@levine-jones.com,1,45,45,14.918878


## Subgraph-level (cluster-level) information

In [4]:
# Cluster-level statistics, with columns renamed to match the node and edge frames.
graphstats = pd.read_parquet("data/temp/graphstats.parquet").rename(
    columns={'component': 'cluster_id', 'nodes': 'node_ids'}
)
graphstats.head(2)

Unnamed: 0,cluster_id,node_ids,nodecount,edgecount,density,diameter,radius,transitivity,tri_clustcoeff,sq_clustcoeff,graphhash
0,12,"[226, 227, 228]",3,3,1.0,1,1,1.0,1.0,0.0,7d2c307dbd866960fae5a905cc5447de
1,28,"[409, 411]",2,1,1.0,1,1,0.0,0.0,0.0,2148f1da1ac29711e1273e364d4127c4


## Visualisation

#### Node data

In [5]:
def link_data_with_tooltip(df, source_field = 'src', target_field='dst', cols_to_retain=(), cols_to_drop_from_tooltip=(), cluster_id = None, cluster_field='cluster_id'):
    """Convert an edge DataFrame into a list of link records with tooltips.

    Args:
        df: DataFrame with one row per edge.
        source_field: Column copied into each record's ``'source'`` key.
        target_field: Column copied into each record's ``'target'`` key.
        cols_to_retain: Columns copied unchanged onto the top level of each record.
        cols_to_drop_from_tooltip: Columns left out of the ``'tooltip'`` dict.
        cluster_id: If not ``None``, only rows whose ``cluster_field`` equals
            this value are kept. A cluster id of ``0`` is a valid filter.
        cluster_field: Column compared against ``cluster_id``.

    Returns:
        A list of dicts, one per row, holding the retained columns,
        ``'source'``, ``'target'`` and a ``'tooltip'`` dict of the remaining
        column values.
    """
    # Compare against None explicitly so that a falsy id such as 0 still filters.
    if cluster_id is not None:
        df = df[df[cluster_field] == cluster_id]

    hidden = set(cols_to_drop_from_tooltip)
    new_recs = []
    for r in df.to_dict(orient='records'):
        new_row = {c: r[c] for c in cols_to_retain}
        # source/target are always emitted, even when no columns are retained.
        new_row['source'] = r[source_field]
        new_row['target'] = r[target_field]
        new_row['tooltip'] = {c: v for c, v in r.items() if c not in hidden}
        new_recs.append(new_row)
    return new_recs

def node_data_with_tooltip(df, cols_to_retain=('cluster_id',), cols_to_drop_from_tooltip=(), cluster_id=None, cluster_field='cluster_id' ):
    """Convert a node DataFrame into a list of node records with tooltips.

    Args:
        df: DataFrame with one row per node.
        cols_to_retain: Columns copied unchanged onto the top level of each record.
        cols_to_drop_from_tooltip: Columns left out of the ``'tooltip'`` dict.
        cluster_id: If not ``None``, only rows whose ``cluster_field`` equals
            this value are kept. A cluster id of ``0`` is a valid filter.
        cluster_field: Column compared against ``cluster_id``.

    Returns:
        A list of dicts, one per row, holding the retained columns and a
        ``'tooltip'`` dict of the remaining column values.
    """
    # Compare against None explicitly so that a falsy id such as 0 still filters.
    if cluster_id is not None:
        df = df[df[cluster_field] == cluster_id]

    hidden = set(cols_to_drop_from_tooltip)
    new_recs = []
    for r in df.to_dict(orient='records'):
        new_row = {c: r[c] for c in cols_to_retain}
        new_row['tooltip'] = {c: v for c, v in r.items() if c not in hidden}
        new_recs.append(new_row)
    return new_recs


In [6]:
def get_html_cluster_metrics(graphstats_row_as_df):
    """Render the metrics of one cluster as an HTML heading plus table.

    Args:
        graphstats_row_as_df: DataFrame whose first row supplies the values.
            It must provide the columns ``cluster_id``, ``diameter``,
            ``radius``, ``density``, ``transitivity``, ``tri_clustcoeff`` and
            ``sq_clustcoeff``; any further columns are ignored.

    Returns:
        The HTML fragment as a string.

    Raises:
        IndexError: If the DataFrame has no rows.
    """
    records = graphstats_row_as_df.to_dict(orient="records")
    if not records:
        # Same exception type as indexing the empty list, but with a message
        # that says what is actually missing.
        raise IndexError("graphstats_row_as_df is empty: no cluster metrics row to render")
    c = records[0]
    table = """
    <h2>Cluster {cluster_id}</h2>
    <table>
      <tr>
        <th style="text-align:left">Metric</th>
        <th>Value</th>
        <th style="text-align:left">Description</th>
      </tr>
      <tbody>
      
      <tr>
        <td style="text-align:left">Diameter</td>
        <td>{diameter}</td>
        <td style="text-align:left">The <b>diameter</b> of a graph is the longest path between any two nodes.</td>
      </tr>
      
    
      <tr>
        <td style="text-align:left">Radius</td>
        <td>{radius}</td>
                <td style="text-align:left">The <b>radius</b> is the largest distance from node at the centre of a graph to the edge of the graph.  </td>
      </tr>
      
      <tr>
        <td style="text-align:left">Density</td>
        <td>{density:,.3f}</td>
                <td style="text-align:left">The <a href="https://bookdown.org/omarlizardo/_main/2-9-density.html"><b>density</b></a> is a measure of interconnectedness. It is defined by the number of edges divided by the number of possible edges.</td>
      </tr>
      
      <tr>
        <td style="text-align:left">Transitivity</td>
        <td>{transitivity:,.3f}</td>
                <td style="text-align:left">The <b>transitivity</b> is the overall probability for the network to have adjacent nodes interconnected. Higher transitivity indicates more tightly connected groups of nodes.</td>
      </tr>
      
      <tr>
        <td style="text-align:left">Triangle cluster coefficient</td>
        <td>{tri_clustcoeff:,.3f}</td>
                <td style="text-align:left"></td>
      </tr>
      
      <tr>
        <td style="text-align:left">Square cluster coefficient</td>
        <td>{sq_clustcoeff:,.3f}</td>
                <td style="text-align:left"></td>
      </tr>
      

      </tbody>
    </table>
    """
    return table.format(**c)
    

In [7]:
import json
from IPython.display import display, clear_output,Javascript, Markdown, HTML
import ipywidgets as widgets

# Raise pandas' display limits so wide and long frames are shown in full.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)



def _vega_embed_script(spec):
    """Return JavaScript that loads Vega / Vega-Embed and renders ``spec`` into ``element``."""
    # The serialised spec is emitted as a JavaScript literal rather than being
    # wrapped in a template string and passed to JSON.parse: a template string
    # consumes the backslash escapes json.dumps produces, and JSON.parse rejects
    # the NaN / Infinity tokens json.dumps writes for non-finite floats.
    spec_literal = json.dumps(spec)
    return """
        (function() {
          function loadScript(src) {
            return new Promise(function(resolve, reject) {
              var script = document.createElement('script');
              script.type = 'text/javascript';
              script.src = src;
              script.onload = resolve;
              script.onerror = reject;
              document.head.appendChild(script);
            });
          }

          var spec = """ + spec_literal + """;

          // Script tags load asynchronously, so vegaEmbed is only called once
          // both libraries have finished loading (or are already present).
          var ready = (typeof vegaEmbed !== 'undefined')
            ? Promise.resolve()
            : loadScript('//cdn.jsdelivr.net/npm/vega@5').then(function() {
                return loadScript('//cdn.jsdelivr.net/npm/vega-embed@6');
              });

          ready.then(function() {
            return vegaEmbed(element, spec);
          }).catch(console.error);
        })();
    """


def display_outputs(cluster_id, df_e, nodestats, graphstats,edge_metric="edge_betweenness"):
    """Display the metrics table, force-directed graph and raw rows for one cluster.

    Args:
        cluster_id: Cluster to show; used to filter all three frames.
        df_e: Edge-level DataFrame (needs ``cluster_id``, ``src``, ``dst``,
            ``match_probability``, ``edge_betweenness`` and ``match_score``).
        nodestats: Node-level DataFrame (needs ``cluster_id``,
            ``eigen_centrality`` and ``node_id``).
        graphstats: Cluster-level DataFrame passed to
            ``get_html_cluster_metrics`` after filtering on ``cluster_id``.
        edge_metric: When ``"match_score"``, the spec is rewired so that edge
            colour and length are driven by ``match_score``. Any other value
            leaves the template untouched (presumably it is set up for
            ``edge_betweenness`` - confirm against the template file).
    """
    link_data = link_data_with_tooltip(
        df_e,
        cluster_id=cluster_id,
        cols_to_retain=['match_probability', 'edge_betweenness', 'match_score'],
        cols_to_drop_from_tooltip=['cluster_id', 'src', 'dst', 'group_l', 'group_r'],
    )
    node_data = node_data_with_tooltip(
        nodestats,
        cols_to_retain=['cluster_id', 'eigen_centrality', 'node_id'],
        cluster_id=cluster_id,
    )

    with open('data/graph/force_template.vg.json') as f:
        vl = json.load(f)

    # The first two data entries of the template are replaced with inline values.
    vl['data'][0] = {"name": "node-data", "values": node_data}
    vl['data'][1] = {"name": "link-data", "values": link_data}

    vl['width'] = 400
    vl['height'] = 400

    if edge_metric == "match_score":
        vl["scales"][1]["domain"]["field"] = "match_score"
        vl["scales"][1]["reverse"] = False
        vl["marks"][1]["encode"]["update"]["stroke"]["field"] = "match_score"

        # Scale 3 is edge_length_scale
        vl["scales"][3]["domain"] = {"data": "link-data", "field": "match_score"}
        vl["scales"][3]["reverse"] = True

        vl["marks"][0]["transform"][0]["forces"][3]["distance"]["expr"] = "scale('edge_length_scale',datum.match_score)*linkDistance"

    script = _vega_embed_script(vl)

    html = get_html_cluster_metrics(graphstats[graphstats['cluster_id'] == cluster_id])
    display(HTML(html))

    display(Javascript(script))
    display(nodestats[nodestats['cluster_id'] == cluster_id])
    display(df_e[df_e['cluster_id'] == cluster_id])

#     display(print(json.dumps(vl,indent=4)))
    
def on_change(change):
    """Widget callback: redraw the output area for the current selections.

    ``change`` is accepted because observers are called with it, but it is not
    used; the selected values are read from the widgets directly.
    """
    output.clear_output()
    selected_cluster = dd_cluster_id.value
    selected_metric = dd_edge_metric.value

    # Everything displayed inside this context is captured by the Output widget.
    with output:
        display_outputs(
            selected_cluster,
            df_e,
            nodestats,
            graphstats,
            edge_metric=selected_metric,
        )


output = widgets.Output(layout={'overflow': 'auto'})

dd_values = sorted(list(nodestats["cluster_id"].unique()))

# Start on cluster 39 when it exists; otherwise fall back to the first
# available cluster so the dropdown can still be built for other data.
if 39 in dd_values:
    default_cluster_id = 39
else:
    default_cluster_id = dd_values[0] if dd_values else None

dd_cluster_id = widgets.Dropdown(
    options=dd_values,
    value=default_cluster_id,
    description='Cluster:',
)

dd_edge_metric = widgets.RadioButtons(description='Edge metric', options=['edge_betweenness','match_score'])

# Register the callback once per widget, after both widgets exist, because
# on_change reads the value of each of them.
dd_cluster_id.observe(on_change, names='value')
dd_edge_metric.observe(on_change, names='value')

display(dd_cluster_id)
display(dd_edge_metric)
display(output)

# Render the initial selection without waiting for a user interaction.
on_change(None)


Dropdown(description='Cluster:', index=20, options=(1, 3, 4, 6, 7, 12, 13, 18, 20, 22, 24, 27, 28, 29, 30, 32,…

RadioButtons(description='Edge metric', options=('edge_betweenness', 'match_score'), value='edge_betweenness')

Output(layout=Layout(overflow='auto'))