# Convert .json.bz2 graph files to neo4j import .csv format

Simple code to quickly convert a saved JSON graph into the CSV format expected by neo4j import, which is the format used by this package.

In [1]:
import bz2
import json
import pandas as pd
from tqdm import tqdm

In [2]:
def get_edge_abbrev(edge, kind_to_abbrev):
    """Return the abbreviation string for a single graph edge.

    The result is the concatenation of the abbreviation of the source
    endpoint, the abbreviation of the edge kind, and the abbreviation of
    the target endpoint. A ``'<'`` is placed before the edge-kind
    abbreviation when the edge direction is ``'backward'`` and a ``'>'``
    after it when the direction is ``'forward'``; any other direction
    value adds no marker.

    Parameters
    ----------
    edge : mapping
        Must provide ``'source_id'``, ``'target_id'``, ``'kind'`` and
        ``'direction'``. The first element of ``source_id`` / ``target_id``
        is what gets abbreviated (presumably the node kind, i.e. ids look
        like ``[kind, identifier]`` -- confirm against the graph format).
    kind_to_abbrev : mapping
        Maps node kinds and edge kinds to their abbreviation strings.

    Returns
    -------
    str
        The assembled abbreviation, e.g. ``'Gr>G'``.
    """
    source_id = edge['source_id']
    target_id = edge['target_id']

    # Pieces are listed in output order; an empty string stands in for
    # "no direction marker" so a single join assembles the result.
    pieces = [
        kind_to_abbrev[source_id[0]],
        '<' if edge['direction'] == 'backward' else '',
        kind_to_abbrev[edge['kind']],
        '>' if edge['direction'] == 'forward' else '',
        kind_to_abbrev[target_id[0]],
    ]
    return ''.join(pieces)

In [3]:
def convert_to_neo_edge(json_graph):
    """Build the edge table (``:START_ID``, ``:END_ID``, ``:TYPE``) of a graph.

    Each entry of ``json_graph['edges']`` becomes one row. The start and
    end ids are the second element of the edge's ``source_id`` and
    ``target_id``, converted to ``str``. The type is the edge kind, an
    underscore, and the abbreviation produced by ``get_edge_abbrev`` using
    ``json_graph['kind_to_abbrev']``.

    Parameters
    ----------
    json_graph : mapping
        Loaded graph with an ``'edges'`` sequence and a
        ``'kind_to_abbrev'`` mapping.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by type, then start id, then end id (ids compare as
        strings), with a fresh 0..n-1 index and columns ordered
        ``:START_ID``, ``:END_ID``, ``:TYPE``.
    """
    column_order = [':START_ID', ':END_ID', ':TYPE']
    start_ids, end_ids, edge_types = [], [], []

    for edge in json_graph['edges']:
        start_ids.append(str(edge['source_id'][1]))
        end_ids.append(str(edge['target_id'][1]))
        edge_types.append(
            edge['kind'] + '_' + get_edge_abbrev(edge, json_graph['kind_to_abbrev'])
        )

    frame = pd.DataFrame(
        {':START_ID': start_ids, ':END_ID': end_ids, ':TYPE': edge_types}
    )
    ordered = frame.sort_values([':TYPE', ':START_ID', ':END_ID'])
    return ordered.reset_index(drop=True)[column_order]

In [4]:
def convert_to_neo_node(json_graph):
    """Build the node table (``:ID``, ``name``, ``:LABEL``) of a graph.

    Each entry of ``json_graph['nodes']`` becomes one row holding the
    node's ``identifier``, ``name`` and ``kind``, each converted to
    ``str``. More columns could be added, but this package does not use
    them for now.

    Parameters
    ----------
    json_graph : mapping
        Loaded graph with a ``'nodes'`` sequence of mappings.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``:ID`` (compared as strings) with a fresh
        0..n-1 index.
    """
    records = [
        (str(node['identifier']), str(node['name']), str(node['kind']))
        for node in json_graph['nodes']
    ]
    table = pd.DataFrame(records, columns=[':ID', 'name', ':LABEL'])
    table = table.sort_values(':ID')
    return table.reset_index(drop=True)

In [5]:
# Convert permuted graphs 1 through 5: read each bz2-compressed JSON graph
# and write its edge table next to it as a CSV (without the index column).
for i in tqdm(range(1, 6)):
    json_path = 'hetnet_perm-{}.json.bz2'.format(i)
    csv_path = 'hetnet_perm-{}.csv'.format(i)

    # 'rt' makes bz2 decompress to text so json.load can read it directly.
    with bz2.open(json_path, 'rt') as fin:
        graph = json.load(fin)

    neo_graph = convert_to_neo_edge(graph)
    neo_graph.to_csv(csv_path, index=False)

100%|██████████| 5/5 [03:04<00:00, 37.20s/it]
