# GNNs and Neo4j

![Neo4j version](https://img.shields.io/badge/Neo4j->=4.4.9-brightgreen)
![GDS version](https://img.shields.io/badge/GDS-2.3-brightgreen)
![GDS Python Client version](https://img.shields.io/badge/GDS_Python_Client-1.6-brightgreen)


In [1]:
import torch
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
from ogb.nodeproppred import NodePropPredDataset
from graphdatascience import GraphDataScience

## Prepare Data
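Load the `ogbn-mag` source data from OGB, stage its nodes and relationships as pandas DataFrames, and load them into Neo4j GDS via graph construct (`gds.alpha.graph.construct`).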

In [2]:
# Load the ogbn-mag node property prediction dataset.
dataset = NodePropPredDataset(name='ogbn-mag')

# Split indices, keyed by "train" / "valid" / "test".
split_idx = dataset.get_idx_split()
train_idx = split_idx["train"]
valid_idx = split_idx["valid"]
test_idx = split_idx["test"]

# The first dataset entry is a (graph, label) pair; graph is a library-agnostic graph object.
graph, label = dataset[0]



Downloading http://snap.stanford.edu/ogb/data/nodeproppred/mag.zip


Downloaded 0.40 GB: 100%|██████████| 413/413 [00:31<00:00, 13.20it/s]


Extracting dataset/mag.zip
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 27776.85it/s]

Saving...





In [16]:
def camel_case(s):
    """Convert a snake_case string into an upper camel case identifier.

    The string is split on underscores, every piece is title-cased with
    ``str.title`` and the pieces are joined back together, e.g.
    ``'field_of_study'`` becomes ``'FieldOfStudy'``.
    """
    return ''.join(map(str.title, s.split('_')))

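# Node ids: stage one DataFrame per node label that maps each OGB id (ogbId) to a Neo4j nodeId; the default 'offset' strategy shifts every label's ids by a running offset.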
def make_node_id_arrays(ogb_graph):
    """Collect the unique OGB node ids seen for every node label.

    Walks ``ogb_graph['edge_index_dict']``, whose keys are triples and whose
    values are edge index arrays. Ids in row 0 are attributed to the node
    type at ``triple[0]`` and ids in row 1 to the node type at ``triple[2]``.
    Only ids that occur in at least one edge index are collected.

    Returns:
        dict mapping the camel-cased node label to an array of unique ids.
    """
    ids_by_label = {}
    for triple, edge_index in ogb_graph['edge_index_dict'].items():
        # (edge_index row, position of the matching node type in the triple)
        for row, type_pos in ((0, 0), (1, 2)):
            node_label = camel_case(triple[type_pos])
            endpoint_ids = edge_index[row, :]
            if node_label in ids_by_label:
                ids_by_label[node_label] = np.union1d(ids_by_label[node_label], endpoint_ids)
            else:
                ids_by_label[node_label] = np.unique(endpoint_ids)
    return ids_by_label

def make_staged_node_dfs(node_id_arr_index):
    """Build one staging DataFrame per node label.

    Args:
        node_id_arr_index: dict mapping a node label to an array of OGB ids.

    Returns:
        dict mapping each node label to a DataFrame with an ``ogbId`` column,
        a constant ``labels`` column holding the node label, and ``ogbId``
        set as the index while also being kept as a column.
    """
    return {
        node_label: (
            pd.DataFrame(node_arr, columns=['ogbId'])
            .assign(labels=node_label)
            .set_index('ogbId', drop=False)
        )
        for node_label, node_arr in node_id_arr_index.items()
    }

def generate_offset_neo4j_node_ids(node_dfs, reserve_block_size=0):
    """Add a ``nodeId`` column to every node frame, in place.

    Frames are processed in dict order. Each frame's ``nodeId`` is its
    ``ogbId`` plus a running offset; after a frame is processed the offset
    moves to one past that frame's largest ``nodeId``, plus
    ``reserve_block_size`` spare ids.

    Args:
        node_dfs: dict of node label -> DataFrame with an ``ogbId`` column.
            The frames are mutated.
        reserve_block_size: number of unused ids to leave after each
            non-empty frame.

    Returns:
        None. The result is the new ``nodeId`` column on each frame.
    """
    offset = 0
    for node_df in node_dfs.values():
        node_df['nodeId'] = node_df['ogbId'] + offset
        if node_df.empty:
            # An empty frame has no maximum and consumes no ids: leave the offset as is.
            continue
        # Vectorised Series.max() instead of the builtin max(), which walks the
        # Series element by element and raises on an empty sequence.
        offset = int(node_df['nodeId'].max()) + 1 + reserve_block_size


def make_node_id_dfs(ogb_graph, node_id_strategy='offset', node_id_reserve_block_size=0):
    """Stage one node DataFrame per node label and assign Neo4j node ids.

    Args:
        ogb_graph: graph dict with an ``'edge_index_dict'`` entry.
        node_id_strategy: ``'as_is'`` copies ``ogbId`` into ``nodeId``
            unchanged; ``'offset'`` shifts each label's ids by a running
            offset via ``generate_offset_neo4j_node_ids``.
        node_id_reserve_block_size: passed through to the offset strategy as
            the number of spare ids to leave after each label's frame.

    Returns:
        dict mapping node label -> DataFrame with ``ogbId``, ``labels`` and
        ``nodeId`` columns.

    Raises:
        ValueError: if ``node_id_strategy`` is not ``'as_is'`` or ``'offset'``.
    """
    # Validate first so a bad strategy fails before the id collection work runs.
    if node_id_strategy not in ('as_is', 'offset'):
        raise ValueError(f'node_id_strategy was "{node_id_strategy}" but must be one of "as_is" or "offset"')

    arr_dict = make_node_id_arrays(ogb_graph)
    res = make_staged_node_dfs(arr_dict)

    if node_id_strategy == 'as_is':
        for node_df in res.values():
            node_df['nodeId'] = node_df['ogbId']
    else:
        generate_offset_neo4j_node_ids(res, node_id_reserve_block_size)
    return res

# Relationships: build one DataFrame per relationship triple, optionally adding reversed copies when undirected=True.
def format_triple(triple):
    """Format a triple for Neo4j naming.

    The first and third elements are camel-cased as node labels and the
    second element is upper-cased as the relationship type.
    """
    source_label = camel_case(triple[0])
    rel_type = triple[1].upper()
    target_label = camel_case(triple[2])
    return source_label, rel_type, target_label

def map_neo4j_node_ids(triple, rel_df, node_dfs):
    """Translate the OGB endpoint ids of a relationship frame to Neo4j node ids.

    Args:
        triple: formatted triple; ``triple[0]`` and ``triple[2]`` are the keys
            of the source and target node frames in ``node_dfs``.
        rel_df: DataFrame with ``sourceOgbId`` and ``targetOgbId`` columns.
        node_dfs: dict of node label -> DataFrame with a ``nodeId`` column;
            the frame's index is matched against the OGB id columns.

    Returns:
        A new DataFrame with ``sourceNodeId`` / ``targetNodeId`` in place of
        the OGB id columns and a fresh 0..n-1 index. The merges use the
        pandas default join, so rows without a match in the node frames are
        not kept.
    """
    source_ids = node_dfs[triple[0]][['nodeId']].rename(columns={'nodeId': 'sourceNodeId'})
    target_ids = node_dfs[triple[2]][['nodeId']].rename(columns={'nodeId': 'targetNodeId'})
    return (
        rel_df
        .merge(source_ids, left_on='sourceOgbId', right_index=True)
        .merge(target_ids, left_on='targetOgbId', right_index=True)
        .drop(columns=['sourceOgbId', 'targetOgbId'])
        .reset_index(drop=True)
    )

def make_rel_dfs(ogb_graph, node_dfs, undirected=False):
    """Build one relationship DataFrame per triple in the graph.

    Args:
        ogb_graph: graph dict with an ``'edge_index_dict'`` entry mapping
            triples to edge index arrays (row 0 source ids, row 1 target ids).
        node_dfs: dict of node label -> node DataFrame used to translate OGB
            ids to Neo4j node ids.
        undirected: when True, every relationship is emitted twice: once as
            given (``orientation`` 0) and once with source and target
            swapped (``orientation`` 1).

    Returns:
        dict mapping the formatted triple to a DataFrame with
        ``relationshipType``, ``sourceNodeId`` and ``targetNodeId`` columns
        (plus ``orientation`` when ``undirected`` is True).
    """
    res = {}
    for triple, edge_index in ogb_graph['edge_index_dict'].items():
        # stage relationship df: one row per edge
        tdf = pd.DataFrame(edge_index.T, columns=['sourceOgbId', 'targetOgbId'])
        # format triple and assign relationship type
        formatted_triple = format_triple(triple)
        tdf['relationshipType'] = formatted_triple[1]
        # map ogb to neo4j node ids
        tdf = map_neo4j_node_ids(formatted_triple, tdf, node_dfs)
        if undirected:
            tdf['orientation'] = 0
            # swapping the column names reverses every edge; concat aligns on column name
            rev_tdf = tdf.copy().rename(columns={'sourceNodeId':'targetNodeId', 'targetNodeId':'sourceNodeId'})
            rev_tdf['orientation'] = 1
            # ignore_index gives the combined frame unique row labels, matching
            # the fresh index the directed path gets from map_neo4j_node_ids
            tdf = pd.concat([tdf, rev_tdf], ignore_index=True)
        # set in results
        res[formatted_triple] = tdf
    return res

In [17]:
# Stage one DataFrame per node label, using the 'offset' node id strategy.
node_dfs = make_node_id_dfs(graph, node_id_strategy='offset')

In [18]:
# Attach the paper features as a per-row 'wordEmbedding' value: one feature row per Paper row.
# NOTE(review): assumes the Paper frame is aligned row-for-row with the feature matrix - confirm.
node_dfs['Paper']['wordEmbedding'] = [feature_row for feature_row in graph['node_feat_dict']['paper']]

In [19]:
# include paper subjects (numeric classes)
# np.squeeze removes the length-1 axes of label['paper'] before it is stored as a column.
# NOTE(review): presumably label['paper'] has one row per Paper node, in ogbId order - confirm.
node_dfs['Paper']['subjectId'] = np.squeeze(label['paper'])

In [20]:
# Show the staged node frames, keyed by node label.
node_dfs

{'Author':            ogbId  labels   nodeId
 ogbId                            
 0              0  Author        0
 1              1  Author        1
 2              2  Author        2
 3              3  Author        3
 4              4  Author        4
 ...          ...     ...      ...
 1134644  1134644  Author  1134644
 1134645  1134645  Author  1134645
 1134646  1134646  Author  1134646
 1134647  1134647  Author  1134647
 1134648  1134648  Author  1134648
 
 [1134649 rows x 3 columns],
 'Institution':        ogbId       labels   nodeId
 ogbId                             
 0          0  Institution  1134649
 1          1  Institution  1134650
 2          2  Institution  1134651
 3          3  Institution  1134652
 4          4  Institution  1134653
 ...      ...          ...      ...
 8735    8735  Institution  1143384
 8736    8736  Institution  1143385
 8737    8737  Institution  1143386
 8738    8738  Institution  1143387
 8739    8739  Institution  1143388
 
 [8740 rows x 3 col

In [23]:
# Stage one relationship DataFrame per triple; no reversed copies are added here.
rel_dfs = make_rel_dfs(graph, node_dfs, undirected=False)
rel_dfs

[{('Author',
   'AFFILIATED_WITH',
   'Institution'):         relationshipType  sourceNodeId  targetNodeId
  0        AFFILIATED_WITH             0       1135494
  1        AFFILIATED_WITH          3806       1135494
  2        AFFILIATED_WITH          4444       1135494
  3        AFFILIATED_WITH          6991       1135494
  4        AFFILIATED_WITH          9344       1135494
  ...                  ...           ...           ...
  1043993  AFFILIATED_WITH       1123654       1137491
  1043994  AFFILIATED_WITH       1124610       1143250
  1043995  AFFILIATED_WITH       1127137       1138383
  1043996  AFFILIATED_WITH       1131701       1143006
  1043997  AFFILIATED_WITH       1133265       1141017
  
  [1043998 rows x 3 columns],
  ('Author',
   'WRITES',
   'Paper'):         relationshipType  sourceNodeId  targetNodeId
  0                 WRITES             0       1163092
  1                 WRITES         41685       1163092
  2                 WRITES         84368       116309

In [30]:
# Graph size: row counts summed over every staged node / relationship frame.
print(f'total number of nodes: {sum(len(df) for df in node_dfs.values()):,}')
print(f'total number of relationships: {sum(len(df) for df in rel_dfs.values()):,}')

total number of nodes: 1,939,743
total number of relationships: 21,111,007


In [24]:
# Load the connection settings from the env file into the process environment.
load_dotenv('aura-credentials-fixed.env', override=True)

# Connect with the Neo4j URI and credentials read from the environment.
gds = GraphDataScience(
    os.environ.get('NEO4J_URI'),
    auth=(os.environ.get('NEO4J_USERNAME'), os.environ.get('NEO4J_PASSWORD')),
    aura_ds=True,
)

# Necessary if you enabled Arrow on the db - this is true for AuraDS
gds.set_database("neo4j")

[1043998, 7145660, 5416271, 7505078]

In [None]:
%%time
g = gds.alpha.graph.construct(
    "ogbn-mag3",
    list(node_dfs.values()),
    list(rel_dfs.values()),
    undirected_relationship_types = [k[1] for k in rel_dfs.keys()]
)