# Sample queries exploring Graph structure in GEMD database


Note: the graph visualization requires the `yfiles_jupyter_graphs` package.
Because it uses JavaScript code, it may also have to be installed in the 'base' environment. <br/>
In a terminal:
<pre>
> pip install yfiles_jupyter_graphs
> conda activate base
> pip install yfiles_jupyter_graphs
</pre>
Then restart the kernel; it may also be necessary to restart the container.

In [None]:
import SciServer.CasJobs as cj
from pprint import pprint
import json
import pandas
from yfiles_jupyter_graphs import GraphWidget
# use networkx to create the graph object that is to be used by yfiles
import networkx as nx   

In [None]:
# Name of the CasJobs database context that the queries below run against.
DATABASE = 'GEMD'

Count, for each node, how many distinct nodes can be reached from it.

In [None]:
%%time
# For every GEMD object, count how many distinct nodes are reachable from it.
# The recursive CTE `gr` seeds one level-0 row per GEMDObject and then follows
# GEMDEdge rows (from_uid -> to_uid) while the current level is below 16, which
# bounds the recursion depth. The seed rows carry a NULL endpoint_uid, which
# count(distinct ...) ignores, so a node without outgoing edges gets a count of 0.
# The result is ordered with the largest counts first.
sql="""
with gr as (
select c.uid as root_uid
,      c.gemd_type as root_type
,      0 as level
,      cast(NULL as varchar(64)) as endpoint_uid
,      c.uid as from_uid, cast(NULL as bigint) as edge_id, cast(NULL as varchar(64)) as gemd_ref
,      cast(gemd_type+c.uid as varchar(max)) as [path]
  from GEMDObject c
 union all
select gr.root_uid, gr.root_type, gr.level+1, e.to_uid
,      e.to_uid, e.id, e.gemd_ref
,      gr.path+'==>'+e.gemd_ref+':'+e.to_uid
  from gr
  join GEMDEdge e on e.from_uid=gr.from_uid
where gr.level < 16
)
select root_uid, root_type, count(distinct endpoint_uid) as num_out_nodes
  from gr
group by root_type, root_uid
 order by num_out_nodes desc
"""
# Run the query in the database context named by DATABASE and keep the result.
nodes_out=cj.executeQuery(sql, DATABASE)

In [None]:
# Preview the ten roots with the largest number of reachable nodes.
nodes_out.head(n=10)

Count, for each node, how many distinct nodes can reach it.

In [None]:
%%time
sql="""
with gr as (
select c.uid as root_uid
,      c.gemd_type as root_type
,      0 as level
,      cast(NULL as varchar(64)) as endpoint_uid
,      c.uid as from_uid, cast(NULL as bigint) as edge_id, cast(NULL as varchar(64)) as gemd_ref
,      cast(gemd_type+c.uid as varchar(max)) as [path]
  from GEMDObject c
 union all
select gr.root_uid, gr.root_type, gr.level+1, e.to_uid
,      e.to_uid, e.id, e.gemd_ref
,      gr.path+'==>'+e.gemd_ref+':'+e.to_uid
  from gr
  join GEMDEdge e on e.from_uid=gr.from_uid
where gr.level < 16
)
select endpoint_uid, count(distinct root_uid) as num_in_nodes
  from gr
group by endpoint_uid
 order by num_in_nodes desc
"""
nodes_in=cj.executeQuery(sql, DATABASE)

## visualize

Take a node from `nodes_out` and find the graph reachable from it.

In [None]:
# nodes_out is ordered by num_out_nodes descending, so its first row is the
# root with the most reachable nodes; use that root as the starting node.
UID = nodes_out["root_uid"].iloc[0]
# Alternative: pin a specific node instead, e.g.
# UID = '01b44457-6ebe-43bc-b316-66fcff35957e'
UID

The following query gets the graph reachable from the specified node. It also retrieves the attributes of the nodes, to be added to the graph.

In [None]:
# Fetch the subgraph reachable from one start node, with one result row per node:
# node_uid, node_type, attributes, and out_edges (a JSON array of
# {target_uid, gemd_ref} objects produced by "for json path").
#
# NOTE(review): this hard-coded UID replaces any UID chosen earlier in the
# notebook; remove the assignment to visualize the node selected from nodes_out.
UID = 'adaa778f-62d8-476f-a774-4a3877a53d05'
# The recursive CTE `gr` starts at the chosen object (level 0) and follows
# GEMDEdge rows while the current level is below 16. `edges` keeps the rows
# that have a source, and `nodes` is the union of all edge sources and targets.
sql=f"""
declare @uid varchar(40) = '{UID}'
;
with gr as (
select cast(NULL as varchar(64)) as Source_uid
,      cast(NULL as varchar(32)) as source_type
,      c.uid Target_uid
,      c.gemd_type as Target_type
,      cast(NULL as bigint) as edge_id
,      cast(NULL as varchar(64)) as gemd_ref 
,      0 as level
  from GEMDObject c
  where uid=@UID
 union all
select gr.Target_uid as Source_uid
,      gr.target_type as source_type
,      c.uid as Target_uid
,      c.gemd_type as target_type
,      e.id as edge_id
,      e.gemd_ref
,      gr.level+1 as level
  from gr
  join GEMDEdge e on e.from_uid=gr.Target_uid
  join GEMDObject c on c.uid=e.to_uid
where gr.level < 16
), edges as (
select Source_uid,Target_uid, source_type, target_type,gemd_ref, level
  from gr
 where Source_uid is not null
)
, nodes as (
select source_uid as node_uid, source_type as node_type
  from edges 
union
select Target_uid , target_type
from edges
)
select n.node_uid, n.node_type, max(a.attributes ) as attributes
,     (select e.target_uid , e.gemd_ref
          from edges e
         where e.source_uid=n.node_uid
           for json path) as out_edges
  from nodes n
  join GEMDObjectAttributes a on a.uid=n.node_uid
group by node_uid,node_type
"""
# Use the shared DATABASE constant (as the other queries do) rather than
# repeating the database name as a literal.
df = cj.executeQuery(sql, DATABASE, format='pandas')

In [None]:
# Turn the query result into node tuples plus lookup tables:
#   nodes   : list of (label, attribute-dict) tuples
#   nodes_d : node uid -> its (label, attribute-dict) tuple
#   edges   : node uid -> raw out_edges value of that row (JSON text or missing)
nodes = []
edges = {}
nodes_d = {}
for row in df.itertuples():
    attrs = {"node_type": row.node_type}
    # A missing attributes value can be None or a float NaN, depending on how
    # the frame was built, so only real, non-blank strings are parsed.
    raw_attrs = row.attributes
    if isinstance(raw_attrs, str) and raw_attrs.strip():
        attrs.update(json.loads(raw_attrs))
    # The label combines type and uid so it is unique and readable in the graph.
    label = f"{row.node_type}:{row.node_uid}"
    node = (label, attrs)
    nodes.append(node)
    nodes_d[row.node_uid] = node
    edges[row.node_uid] = row.out_edges

In [None]:
# Build (source_label, target_label) pairs from the JSON out_edges text
# stored per node uid in `edges`.
all_edges = []
for uid, (source_label, _attrs) in nodes_d.items():
    raw_edges = edges.get(uid)
    # Nodes without outgoing edges have no JSON text (None or NaN): skip them.
    if not isinstance(raw_edges, str) or not raw_edges.strip():
        continue
    try:
        out_edges = json.loads(raw_edges)
    except json.JSONDecodeError:
        # Stay best-effort: one malformed edge list must not abort the rest.
        continue
    for edge in out_edges:
        # Skip only the edge whose target is unknown, instead of discarding
        # every edge of this source node.
        target = nodes_d.get(edge.get('target_uid'))
        if target is not None:
            all_edges.append((source_label, target[0]))

In [None]:
# Assemble a directed graph from `nodes` ((label, attribute-dict) tuples) and
# `all_edges` ((source_label, target_label) pairs) built in the cells above.
G = nx.DiGraph()
G.add_nodes_from(nodes)
G.add_edges_from(all_edges)

In [None]:
# Create the yFiles graph widget for G; as the last expression of the cell it
# is what the notebook displays.
GraphWidget(graph=G)