In [1]:
import pandas as pd 

## Node-level (person-level) information

In [2]:
# Load the node-level statistics table from the intermediate parquet file.
nodestats = pd.read_parquet("data/temp/nodestats.parquet")
# Relabel the columns positionally with the names the rest of the notebook uses.
# NOTE(review): this assumes the file holds exactly these four columns in this
# order — confirm against whatever writes nodestats.parquet.
nodestats.columns = ["index", "cluster_id", "node_id", "eigen_centrality"]
nodestats

Unnamed: 0,index,cluster_id,node_id,eigen_centrality
0,226,12,226,0.577350
1,227,12,227,0.577350
2,228,12,228,0.577350
3,409,28,409,0.707107
4,411,28,411,0.707107
...,...,...,...,...
421,194,34359738372,194,0.707107
422,879,34359738441,879,0.707107
423,880,34359738441,880,0.707107
424,954,42949673030,954,0.707107


## Edge-level (pairwise-comparison level) information

In [None]:
# Pairwise comparisons, minus the term-frequency-adjusted probability column,
# joined onto the per-edge graph statistics and renamed to the notebook's
# vocabulary (cluster_id / edge_betweenness).
comparisons = pd.read_parquet('data/graph/df_e.parquet')
edges = pd.read_parquet("data/temp/edgestats.parquet")
df_e = (
    edges
    .merge(
        comparisons.drop(columns=["tf_adjusted_match_prob"]),
        left_on=['src', 'dst'],
        right_on=['unique_id_l', 'unique_id_r'],
    )
    .rename(columns={'component': 'cluster_id', 'eb': 'edge_betweenness'})
)
df_e.head(5)

## Subgraph-level (cluster-level) information

In [None]:
# Cluster-level statistics, with columns renamed to match the node and edge tables.
graphstats = (
    pd.read_parquet("data/temp/graphstats.parquet")
    .rename(columns={'component': 'cluster_id', 'nodes': 'node_ids'})
)
graphstats.head(2)

## Visualisation

#### Node data

In [323]:
def link_data_with_tooltip(df, source_field='src', target_field='dst', cols_to_retain=(), cols_to_drop_from_tooltip=(), cluster_id=None, cluster_field='cluster_id'):
    """Convert an edge DataFrame into a list of link records with tooltips.

    Args:
        df: DataFrame with one row per edge.
        source_field: Column holding the edge's source node id.
        target_field: Column holding the edge's target node id.
        cols_to_retain: Columns copied to the top level of each record.
        cols_to_drop_from_tooltip: Columns left out of the nested tooltip.
        cluster_id: If not None, keep only rows whose ``cluster_field`` equals
            this value. A cluster id of 0 is honoured.
        cluster_field: Column used for the cluster filter.

    Returns:
        A list of dicts, one per edge. Each dict has the retained columns,
        ``'source'``, ``'target'`` and a ``'tooltip'`` dict of the remaining
        columns.
    """
    # Compare against None explicitly so that a cluster id of 0 still filters.
    if cluster_id is not None:
        df = df[df[cluster_field] == cluster_id]

    excluded = set(cols_to_drop_from_tooltip)
    new_recs = []
    for r in df.to_dict(orient='records'):
        new_row = {c: r[c] for c in cols_to_retain}
        # Always emit the endpoints, even when no extra columns are retained.
        # Assigned after the retained columns so they take precedence over a
        # retained column that happens to be called 'source' or 'target'.
        new_row['source'] = r[source_field]
        new_row['target'] = r[target_field]
        new_row['tooltip'] = {c: v for c, v in r.items() if c not in excluded}
        new_recs.append(new_row)
    return new_recs

def node_data_with_tooltip(df, cols_to_retain=('cluster_id',), cols_to_drop_from_tooltip=(), cluster_id=None, cluster_field='cluster_id'):
    """Convert a node DataFrame into a list of node records with tooltips.

    Args:
        df: DataFrame with one row per node.
        cols_to_retain: Columns copied to the top level of each record.
        cols_to_drop_from_tooltip: Columns left out of the nested tooltip.
        cluster_id: If not None, keep only rows whose ``cluster_field`` equals
            this value. A cluster id of 0 is honoured.
        cluster_field: Column used for the cluster filter.

    Returns:
        A list of dicts, one per node. Each dict has the retained columns and
        a ``'tooltip'`` dict of the remaining columns.
    """
    # Compare against None explicitly so that a cluster id of 0 still filters.
    if cluster_id is not None:
        df = df[df[cluster_field] == cluster_id]

    excluded = set(cols_to_drop_from_tooltip)
    new_recs = []
    for r in df.to_dict(orient='records'):
        new_row = {c: r[c] for c in cols_to_retain}
        new_row['tooltip'] = {c: v for c, v in r.items() if c not in excluded}
        new_recs.append(new_row)
    return new_recs

def get_markdown(graphstats_row_as_df):
    """Render the first row of a cluster-statistics DataFrame as Markdown.

    The row must provide the keys cluster_id, nodecount, edgecount, density,
    diameter, radius, transitivity, tri_clustcoeff and sq_clustcoeff.

    Returns:
        A string of paragraphs separated by blank lines.
    """
    stats = graphstats_row_as_df.to_dict(orient="records")[0]
    paragraphs = [
        f"## Cluster ID: {stats['cluster_id']}",
        f"Nodes: {stats['nodecount']}, Edges: {stats['edgecount']}",
        f"[Density](https://bookdown.org/omarlizardo/_main/2-9-density.html): {stats['density']}",
        f"[Diameter](https://mathworld.wolfram.com/GraphDiameter.html#:~:text=is%20a%20graph%20distance.,loop%20are%20excluded%20from%20consideration.): {stats['diameter']}",
        f"Radius: {stats['radius']}, transitivity: {stats['transitivity']}",
        f"Triangle cluster coefficient: {stats['tri_clustcoeff']}, Square cluster coefficient: {stats['sq_clustcoeff']}",
    ]
    return "\n\n".join(paragraphs)

In [324]:
import json
from IPython.display import display, clear_output
import ipywidgets as widgets

def display_outputs(cluster_id, df_e, nodestats, graphstats,edge_metric="edge_betweenness"):
    """Display the summary, edge table and force-directed graph for one cluster.

    Args:
        cluster_id: Cluster to display; used to filter all three DataFrames.
        df_e: Edge-level DataFrame (one row per pairwise comparison).
        nodestats: Node-level DataFrame.
        graphstats: Cluster-level DataFrame passed to ``get_markdown``.
        edge_metric: ``"edge_betweenness"`` leaves the template untouched;
            ``"match_probability"`` rewires the scales, stroke and link
            distance to use the match probability instead.
    """
    # Imported here so the function works on a fresh kernel; Markdown was
    # previously used without being imported anywhere in the notebook.
    from IPython.display import Javascript, Markdown

    link_data = link_data_with_tooltip(
        df_e,
        cluster_id=cluster_id,
        cols_to_retain=['match_probability', 'edge_betweenness'],
        cols_to_drop_from_tooltip=['cluster_id', 'src', 'dst', 'group_l', 'group_r'],
    )
    node_data = node_data_with_tooltip(
        nodestats,
        cols_to_retain=['cluster_id', 'eigen_centrality', 'node_id'],
        cluster_id=cluster_id,
    )

    with open('data/graph/force_template.vg.json') as f:
        vl = json.load(f)

    # Replace the template's first two datasets with this cluster's data.
    vl['data'][0] = {"name": "node-data", "values": node_data}
    vl['data'][1] = {"name": "link-data", "values": link_data}

    vl['width'] = 400
    vl['height'] = 400

    if edge_metric == "match_probability":
        # NOTE(review): the numeric indices below depend on the layout of
        # force_template.vg.json — confirm them if the template changes.
        vl["scales"][1]["domain"]["field"] = "match_probability"
        vl["scales"][3]["domain"] = {"data": "link-data","field": "match_probability"}
        vl["scales"][1]["range"]["reverse"] = False
        vl["marks"][1]["encode"]["update"]["stroke"]["field"] = "match_probability"
        vl["marks"][0]["transform"][0]["forces"][3]["distance"]["expr"] = "scale('edge_length_scale',datum.match_probability)*linkDistance"

    # The spec is emitted directly as a JavaScript object literal. Wrapping the
    # JSON in a template literal and re-parsing it corrupts backslash escapes
    # and breaks on backticks or "${" inside the data.
    script = f"""
       var script = document.createElement('script');
        script.type = 'text/javascript';
        script.src = '//cdn.jsdelivr.net/npm/vega@5';
        document.head.appendChild(script);

        var script = document.createElement('script');
        script.type = 'text/javascript';
        script.src = '//cdn.jsdelivr.net/npm/vega-embed@6';
        document.head.appendChild(script);

        var spec = {json.dumps(vl)};
        vegaEmbed(element, spec).then(function(result) {{
          }}).catch(console.error);  
    """
    md = get_markdown(graphstats[graphstats['cluster_id'] == cluster_id])
    display(Markdown(md))
    display(df_e[df_e['cluster_id'] == cluster_id])
    display(Javascript(script))
    
def on_change(change):
    """Widget callback: redraw the output area for the current selections.

    ``change`` is ignored; the current values are read from the module-level
    ``dd_cluster_id`` and ``dd_edge_metric`` widgets.
    """
    output.clear_output()
    selected_cluster = dd_cluster_id.value
    selected_metric = dd_edge_metric.value

    # Everything displayed inside this context is captured by the Output widget.
    with output:
        display_outputs(
            selected_cluster,
            df_e,
            nodestats,
            graphstats,
            edge_metric=selected_metric,
        )


# Output area that on_change clears and redraws into.
output = widgets.Output()

# All cluster ids, in ascending order, for the dropdown.
dd_values = sorted(nodestats["cluster_id"].unique())

# Preselect cluster 39 when it exists. Otherwise use the first available
# cluster, so the dropdown does not raise on data that lacks cluster 39.
if 39 in dd_values:
    default_cluster_id = 39
else:
    default_cluster_id = dd_values[0] if dd_values else None

dd_cluster_id = widgets.Dropdown(
    options=dd_values,
    value=default_cluster_id,
    description='Cluster:',
)

dd_edge_metric = widgets.RadioButtons(description='Edge metric', options=['edge_betweenness','match_probability'])

# Register each observer once, after both widgets exist, because on_change
# reads both of them.
dd_cluster_id.observe(on_change, names='value')
dd_edge_metric.observe(on_change, names='value')

display(dd_cluster_id)
display(dd_edge_metric)
display(output)

# Draw the initial view for the default selections.
on_change(None)


Dropdown(description='Cluster:', index=20, options=(1, 3, 4, 6, 7, 12, 13, 18, 20, 22, 24, 27, 28, 29, 30, 32,…

RadioButtons(description='Edge metric', options=('edge_betweenness', 'match_probability'), value='edge_between…

Output()

## Cluster ID: 39

Nodes: 4, Edges: 5

[Density](https://bookdown.org/omarlizardo/_main/2-9-density.html): 0.833

[Diameter](https://mathworld.wolfram.com/GraphDiameter.html#:~:text=is%20a%20graph%20distance.,loop%20are%20excluded%20from%20consideration.): 2