**Constructing Node and Edge Lists**

This code takes JSON files containing the node and edge lists for each of the 92 glioblastoma-related subgraphs extracted from the NCATS GARD Knowledge Graph, converts them to CSVs, and combines them into a single node list and a single edge list. It also removes all nodes labeled "TRANSIENT" and the edges connected to them.

In [None]:
import csv
import json
import os

import numpy as np
import pandas as pd
from google.colab import files

Converting node list JSON files (exported from the NCATS GARD Knowledge Database) to CSV files:

In [None]:
#Read the node list for each subgraph
counter = 0
for filename in os.listdir('/content/GARD_Node_JSONs/'):
  # load data using Python JSON module
  counter = counter + 1
  with open(filename,'r') as f:
      data = json.loads(f.read(), strict=False)
  # Flatten data
  df_flat_node = pd.json_normalize(data, record_path =['collect(DISTINCT b)'])
  # Save to CSV in Node lists folder
  df_flat_node.to_csv('/content/GARD_Node_CSVs/' + filename[25:-5] + '.csv')

In [None]:
# load data using Python JSON module (repeat for each subgraph)
with open('/content/GARD0008570_nodes.json','r') as f:
    data = json.loads(f.read())
# Flatten data
df_flat_node = pd.json_normalize(data, record_path =['collect(DISTINCT b)'])

In [None]:
# Write the flattened node table to a CSV in the current working directory
node_csv_name = 'GARD0008570_nodes.csv'
df_flat_node.to_csv(node_csv_name)

In [None]:
# Recursively zip the edge CSV folder into a single archive under /content
!zip -r /content/GARD_Edge_CSVs.zip /content/GARD_Edge_CSVs

In [None]:
# Hand the zipped edge CSV archive to Colab's file download helper
edge_zip_path = '/content/GARD_Edge_CSVs.zip'
files.download(edge_zip_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Converting edge list JSON files (exported from the NCATS GARD Knowledge Database) to CSV files:

In [None]:
# Read the edge list for each subgraph and convert it to a CSV.
# os.listdir() returns bare file names, so each name is joined back onto the
# input folder before it is opened.
edge_json_dir = '/content/GARD_Edge_JSONs/'
edge_csv_dir = '/content/GARD_Edge_CSVs/'
counter = 0
for filename in os.listdir(edge_json_dir):
  # Skip anything that is not a JSON export (e.g. checkpoint folders)
  if not filename.lower().endswith('.json'):
    continue
  counter = counter + 1
  # load data using Python JSON module; strict=False tolerates control
  # characters inside JSON strings
  with open(os.path.join(edge_json_dir, filename), 'r') as f:
    data = json.load(f, strict=False)
  # Flatten the records stored under the 'collect(DISTINCT f)' key
  df_flat_edge = pd.json_normalize(data, record_path=['collect(DISTINCT f)'])
  # Save to CSV in Edge lists folder, reusing the JSON file's base name
  csv_name = os.path.splitext(filename)[0] + '.csv'
  df_flat_edge.to_csv(os.path.join(edge_csv_dir, csv_name))

In [None]:
# Parse one subgraph's edge-list JSON export (repeat for each subgraph)
edge_json_path = '/content/GARD0008214_edges.json'
with open(edge_json_path, 'r') as f:
    data = json.load(f)
# Flatten the records stored under the 'collect(DISTINCT f)' key into a table
df_flat_edge = pd.json_normalize(data, record_path=['collect(DISTINCT f)'])

In [None]:
# Write the flattened edge table to a CSV in the current working directory
edge_csv_name = 'GARD0008214_edges.csv'
df_flat_edge.to_csv(edge_csv_name)

Concatenate Node Lists:

In [None]:
# Build the combined node list: start from the seed subgraph's CSV, then add
# every per-subgraph CSV found in the node CSV folder.
node_csv_dir = '/content/GARD_Node_CSVs'
node_frames = [pd.read_csv('/content/GARD0000017_nodes.csv')]

for filename in os.listdir(node_csv_dir):
  # Skip anything that is not a CSV (e.g. checkpoint folders)
  if not filename.lower().endswith('.csv'):
    continue
  #Read each CSV
  current_df = pd.read_csv(os.path.join(node_csv_dir, filename))
  node_frames.append(current_df)

#Concatenate once (in order as blocks) and remove rows containing duplicate nodes
#(i.e. nodes with same identity); the first occurrence of each identity is kept.
#A single concat gives the same rows as concatenating inside the loop, without
#re-copying the growing frame on every iteration.
concat_df = (
    pd.concat(node_frames, sort=False)
    .drop_duplicates(subset=['identity'])
    .reset_index(drop=True)
)

#Display
display(concat_df)

In [None]:
# Write the combined, de-duplicated node table to the current working directory
node_list_name = 'node_list.csv'
concat_df.to_csv(node_list_name)

In [None]:
# Download the combined node list from where the previous cell wrote it
# (the working directory), matching how the edge list is downloaded.
files.download('node_list.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Concatenate Edge Lists:

In [None]:
# Build the combined edge list: start from the seed subgraph's CSV, then add
# every per-subgraph CSV found in the edge CSV folder.
edge_csv_dir = '/content/GARD_Edge_CSVs'
edge_frames = [pd.read_csv('/content/GARD0000017_edges.csv')]

for filename in os.listdir(edge_csv_dir):
  # Skip anything that is not a CSV (e.g. checkpoint folders)
  if not filename.lower().endswith('.csv'):
    continue
  #Read each CSV
  current_df = pd.read_csv(os.path.join(edge_csv_dir, filename))
  edge_frames.append(current_df)

#Concatenate once (in order as blocks) instead of re-copying the growing frame
#on every iteration.
combined_edges = pd.concat(edge_frames, sort=False)

#Remove rows containing duplicate edges. The per-subgraph CSVs were saved with
#their row index, which read_csv brings back as an 'Unnamed: ...' column; that
#column is a row position, not edge data, so it is left out of the comparison.
#Otherwise the same edge at different positions in two files is never removed.
edge_data_cols = [col for col in combined_edges.columns
                  if not str(col).startswith('Unnamed')]
concat_edge_df = (
    combined_edges
    .drop_duplicates(subset=edge_data_cols or None)
    .reset_index(drop=True)
)

#Display
display(concat_edge_df)

In [None]:
# Write the combined, de-duplicated edge table to the current working directory
edge_list_name = 'edge_list.csv'
concat_edge_df.to_csv(edge_list_name)

In [None]:
# Hand the combined edge list to Colab's file download helper
edge_list_name = 'edge_list.csv'
files.download(edge_list_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Remove nodes labeled "TRANSIENT" (and the edges connected to them):

In [None]:
#Read node list CSV
node_list_df = pd.read_csv('/content/node_list.csv') 
display(node_list_df)

#Create list of the identity #s of each transient node (so we can remove the edges connected to them later)
transient_node_ids = []
for index, row in node_list_df.iterrows():
  if 'TRANSIENT' in row['labels']:
    transient_node_ids.append(row['identity'])

#should be 202
print(len(transient_node_ids))
print(transient_node_ids)

#Drop TRANSIENT nodes (should be 4789 left)
for index, row in node_list_df.iterrows():
  if 'TRANSIENT' in row['labels']:
    node_list_df = node_list_df.drop([index])

display(node_list_df)

In [None]:
# Write the node table with TRANSIENT nodes removed to the working directory
filtered_nodes_name = 'node_list_no_transient.csv'
node_list_df.to_csv(filtered_nodes_name)

In [None]:
# Hand the filtered node list to Colab's file download helper
filtered_nodes_path = '/content/node_list_no_transient.csv'
files.download(filtered_nodes_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Hard-coded identity values used by the edge-filtering cell to decide which edges to drop.
# NOTE(review): presumably a pasted copy of the list printed by the node-filtering cell, kept so
# the edge step can run without re-running it — confirm it still matches the current node list.
transient_node_ids = [5764141, 5811875, 5821818, 5816977, 5672797, 5726512, 5985031, 5688846, 6090352, 5944228, 5899779, 6071778, 5985530, 5684855, 6124134, 5732474, 5740431, 5896648, 5699254, 5790224, 5820708, 6039767, 5909892, 5907191, 6117100, 6128387, 5775920, 5994886, 5931844, 5873301, 5923343, 5871294, 5768288, 5996425, 5955790, 6075890, 6139142, 5946522, 6061565, 5781226, 5844538, 6038548, 5941250, 6107393, 5931821, 5708259, 6036119, 6092650, 6030233, 5810328, 5990559, 6158459, 6106569, 5939101, 5693351, 5987827, 5977650, 5823958, 5828600, 6070501, 6092597, 5803087, 5949599, 5850521, 6070310, 5906051, 6011026, 5762669, 5831259, 6027185, 5866990, 6015091, 6064396, 6101090, 6138780, 5912201, 5718299, 5948660, 5730233, 5900158, 5850055, 5695667, 5895924, 5931876, 6092564, 5932686, 5860252, 5910430, 5702117, 5819397, 5946391, 5729017, 6065664, 5676856, 5721708, 5673299, 5911859, 6106721, 5808987, 5774402, 5879078, 5989124, 5778150, 6085935, 5850007, 5889490, 6037306, 5959612, 5997029, 5846418, 6038670, 5874253, 6038173, 6102979, 6059283, 5875889, 6039510, 6050144, 6051656, 5918313, 5869059, 5826250, 5914847, 6072021, 5821828, 5734580, 6138685, 5949630, 6040505, 5789223, 5735314, 5909563, 5872770, 6095144, 5911995, 5928878, 6050140, 6152175, 5920535, 6047730, 5853726, 6000444, 5721115, 5971563, 6016052, 6046334, 5721252, 5929111, 6050160, 5821758, 5986991, 6022687, 5949055, 6025528, 6073673, 5782675, 5948545, 5680603, 5971339, 6009443, 5996614, 5833261, 5982022, 5896138, 5840246, 5870177, 6094025, 5895678, 5952198, 5683249, 5942850, 5931912, 6160195, 6036123, 6051625, 6007887, 5762089, 6100854, 5744196, 5938338, 5988766, 6125571, 5845745, 5980314, 6062679, 6011060, 6026305, 6048682, 6105732, 6090493, 6070450, 5951490, 6157075, 5998936, 6155243, 5824803, 5674292, 5739597, 5820383, 6024178, 5742849, 6030935]

In [None]:
#Read edge list CSV
edge_list_df = pd.read_csv('/content/edge_list_WITH_TRANSIENT.csv') 
display(edge_list_df)

#Drop edges connected to TRANSIENT nodes 
for index, row in edge_list_df.iterrows():
  if row['start'] in transient_node_ids:
    edge_list_df = edge_list_df.drop([index])
  if row['end'] in transient_node_ids:
    edge_list_df = edge_list_df.drop([index])

display(edge_list_df)

In [None]:
# Write the edge table with TRANSIENT-connected edges removed to the working directory
filtered_edges_name = 'edge_list_no_transient.csv'
edge_list_df.to_csv(filtered_edges_name)

In [None]:
# Hand the filtered edge list to Colab's file download helper
filtered_edges_path = '/content/edge_list_no_transient.csv'
files.download(filtered_edges_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>