# Sample queries exploring Graph structure in GEMD database


In [None]:
import SciServer.CasJobs as cj
from pprint import pprint
import json
import pandas

In [None]:
# Database name passed as the second argument to every cj.executeQuery call in this notebook.
DATABASE='GEMD'

This query finds pairs of nodes (starting from `material_run` nodes) with more than one path from one to the other. \
Turns out there are none.

In [None]:
%%time
# Find (root, endpoint) pairs that are connected by more than one path.
# The recursive CTE `gr` seeds one level-0 row per material_run node and then
# follows GEMDEdge rows from from_uid to to_uid; `gr.level < 16` bounds the
# recursion, so paths are at most 16 hops long. The final SELECT groups the
# walked paths by (root, endpoint) and keeps only groups with count(*) > 1.
# The seed path is written as 'gemd_type:uid' so that it uses the same ':'
# separator as the hop segments, and both references to the context table use
# the same spelling (GEMDContext) so the query also works on a case-sensitive
# collation.
sql="""
with gr as (
select c.uid as root_uid
,      c.gemd_type as root_type
,      0 as level
,      cast(NULL as varchar(64)) as endpoint_uid
,      cast(NULL as varchar(32)) as endpoint_type
,      c.uid as from_uid, cast(NULL as bigint) as edge_id, cast(NULL as varchar(64)) as gemd_ref
,      cast(gemd_type+':'+c.uid as varchar(max)) as [path]
  from GEMDContext c where gemd_type='material_run'
 union all
select gr.root_uid, gr.root_type, gr.level+1, e.to_uid, c.gemd_type
,      e.to_uid, e.id, e.gemd_ref
,      gr.path+'==>'+e.gemd_ref+':'+e.to_uid
  from gr
  join GEMDEdge e on e.from_uid=gr.from_uid
  join GEMDContext c on c.uid=e.to_uid
where gr.level < 16
)
select root_uid, root_type, endpoint_uid, endpoint_type
,      min(path) as path, min(level) as min_level, max(level) as max_level
,      count(*) as num_paths
  from gr
group by root_type, root_uid, endpoint_uid,endpoint_type having count(*) > 1  -- if you want to find multiple paths between nodes
 order by root_type,root_uid, endpoint_uid,path
"""
# Last expression of the cell: the query result is displayed, not stored.
cj.executeQuery(sql, DATABASE)

The following query returns all paths between all nodes. This can be compared with the graph functionality inside MS SQL itself, but is much faster.

In [None]:
%%time
# Enumerate every path in the graph. The recursive CTE `gr` seeds one level-0
# row per GEMDContext node (path = 'gemd_type:uid') and extends it along
# GEMDEdge rows (from_uid -> to_uid), appending '==>gemd_ref:to_uid' per hop.
# Rows are only extended while gr.level < 16, which bounds the recursion.
# Result: one row per walked path, where level is the number of edges walked;
# level-0 rows are the roots themselves and have NULL endpoint/edge columns.
sql="""
with gr as (
select c.uid as root_uid
,      c.gemd_type as root_type
,      0 as level
,      cast(NULL as varchar(64)) as endpoint_uid
,      c.uid as from_uid, cast(NULL as bigint) as edge_id, cast(NULL as varchar(64)) as gemd_ref
,      cast(gemd_type+':'+c.uid as varchar(max)) as [path]
  from GEMDContext c
 union all
select gr.root_uid, gr.root_type, gr.level+1, e.to_uid
,      e.to_uid, e.id, e.gemd_ref
,      gr.path+'==>'+e.gemd_ref+':'+e.to_uid
  from gr
  join GEMDEdge e on e.from_uid=gr.from_uid
where gr.level < 16
)
select root_uid, root_type, endpoint_uid
,      edge_id,gemd_ref
,      path, level
  from gr
 order by root_type,root_uid, path
"""
# Keep the result (a table of all paths) for the cells below.
paths=cj.executeQuery(sql, DATABASE)

In [None]:
# Keep only the rows that walked at least one edge; the level-0 rows are the
# root seeds of the recursive query and carry no edge.
edges = paths.loc[paths['level'] > 0]
edges

Count, for each node, how many distinct nodes can be reached from it.

In [None]:
%%time
# For every node, count the distinct nodes reachable from it by following
# GEMDEdge rows from from_uid to to_uid (recursion bounded by gr.level < 16).
# The level-0 seed rows carry a NULL endpoint_uid; count(distinct ...) does not
# count NULLs, so a node without outgoing edges gets num_out_nodes = 0.
sql="""
with gr as (
select c.uid as root_uid
,      c.gemd_type as root_type
,      0 as level
,      cast(NULL as varchar(64)) as endpoint_uid
,      c.uid as from_uid, cast(NULL as bigint) as edge_id, cast(NULL as varchar(64)) as gemd_ref
,      cast(gemd_type+c.uid as varchar(max)) as [path]
  from GEMDContext c
 union all
select gr.root_uid, gr.root_type, gr.level+1, e.to_uid
,      e.to_uid, e.id, e.gemd_ref
,      gr.path+'==>'+e.gemd_ref+':'+e.to_uid
  from gr
  join GEMDEdge e on e.from_uid=gr.from_uid
where gr.level < 16
)
select root_uid, root_type, count(distinct endpoint_uid) as num_out_nodes
  from gr
group by root_type, root_uid
 order by num_out_nodes desc
"""
# One row per root node, largest reachable set first.
nodes_out=cj.executeQuery(sql, DATABASE)

Count, for each node, how many distinct nodes can reach it.

In [None]:
%%time
# For every node that is reached by at least one edge walk, count the distinct
# root nodes it can be reached from. The recursive CTE `gr` seeds one level-0
# row per GEMDContext node and follows GEMDEdge rows from from_uid to to_uid
# (recursion bounded by gr.level < 16).
# The level-0 seed rows have a NULL endpoint_uid; without the `level > 0`
# filter they would collapse into a single NULL group whose count is the total
# number of nodes and which would sort first in the result.
# The seed path uses the same 'gemd_type:uid' form as the all-paths query.
sql="""
with gr as (
select c.uid as root_uid
,      c.gemd_type as root_type
,      0 as level
,      cast(NULL as varchar(64)) as endpoint_uid
,      c.uid as from_uid, cast(NULL as bigint) as edge_id, cast(NULL as varchar(64)) as gemd_ref
,      cast(gemd_type+':'+c.uid as varchar(max)) as [path]
  from GEMDContext c
 union all
select gr.root_uid, gr.root_type, gr.level+1, e.to_uid
,      e.to_uid, e.id, e.gemd_ref
,      gr.path+'==>'+e.gemd_ref+':'+e.to_uid
  from gr
  join GEMDEdge e on e.from_uid=gr.from_uid
where gr.level < 16
)
select endpoint_uid, count(distinct root_uid) as num_in_nodes
  from gr
 where level > 0  -- skip the level-0 seed rows, whose endpoint_uid is NULL
group by endpoint_uid
 order by num_in_nodes desc
"""
# One row per reached node, most widely reachable first.
nodes_in=cj.executeQuery(sql, DATABASE)

In [None]:
# Preview the first rows of the out-degree table (sorted by num_out_nodes desc in the query).
nodes_out.head()

## visualize
The goal is to get a visualization capability à la the one supported by
<a href="http://www.thebrain.com" target="_blank">http://www.thebrain.com</a>. \
This allows one to trace edges from a starting node, restricting views to only that node and nodes 1 or 2 steps away. \
Needs code development!

In [None]:
import networkx as nx
# Start from an empty graph object; G is reassigned from the edge table in a later cell.
G = nx.Graph()

In [None]:
%%time
# load all nodes into memory: uid, gemd_type and the name extracted from the JSON context
# (only '$.name' is pulled out of the context column, not the full JSON document)
sql="""
select uid,gemd_type,JSON_VALUE(context,'$.name') as name from GEMDContext
"""
nodes=cj.executeQuery(sql,DATABASE)
# number of nodes loaded
len(nodes)

In [None]:
%%time
# load all edges (every column of GEMDEdge) into memory, ordered by from_uid, then gemd_ref
# NOTE: this rebinds `edges`, replacing any earlier value of that name
sql="""
select * from GEMDEdge
order by from_uid,gemd_ref
"""
edges=cj.executeQuery(sql,DATABASE)
# number of edges loaded
len(edges)

In [None]:
# Preview the first rows of the edge table.
edges.head()

In [None]:
# Build the graph from the first 100 edge rows only, using from_uid/to_uid as
# the two endpoints of each edge; edge_attr=True attaches the remaining
# columns of each row to the edge as attributes.
G = nx.from_pandas_edgelist(
    edges[:100],
    source='from_uid',
    target='to_uid',
    edge_attr=True,
)

In [None]:
# Draw G using networkx's spring layout.
nx.draw_spring(G)

In [None]:
# Preview the first rows of the node table (uid, gemd_type, name).
nodes.head()