# GNNs and Neo4j

![Neo4j version](https://img.shields.io/badge/Neo4j->=4.4.9-brightgreen)
![GDS version](https://img.shields.io/badge/GDS-2.3-brightgreen)
![GDS Python Client version](https://img.shields.io/badge/GDS_Python_Client-1.6-brightgreen)


In [2]:
# Install the Neo4j Graph Data Science Python client and python-dotenv
# (python-dotenv is used further down to load the database credentials file).
%pip install graphdatascience python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-0.21.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.21.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
# Full PyTorch build string (e.g. "1.13.1+cu117" in the recorded output).
# The PyG wheel index URL used by the following install cell has to match it.
TORCH_CUDA = torch.__version__
print(TORCH_CUDA)

1.13.1+cu117


In [4]:
%pip install pyg_lib torch_scatter torch_sparse -f https://data.pyg.org/whl/torch-1.13.1+cu117.html
%pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-1.13.1+cu117.html
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
import torch
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
from torch_geometric.datasets import OGB_MAG
from graphdatascience import GraphDataScience

## Prepare Data
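Load the OGB-MAG dataset with PyTorch Geometric, convert it into node and relationship DataFrames, and load those into GDS via graph construct.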

In [6]:
# OGB_MAG is already imported in the notebook's import cell, so it is not
# re-imported here.
# Load the dataset from ./data with the 'metapath2vec' preprocessing option.
# NOTE(review): per the PyG docs this option adds structural features to node
# types that ship without features - the helpers below read `.x` for every type.
dataset = OGB_MAG(root='./data', preprocess='metapath2vec')
# Take the dataset's first (index 0) graph object.
data = dataset[0]

In [7]:
def camel_case(s):
    """Convert an underscore-separated name into a single capitalised word.

    The string is split on '_' and every piece is passed through
    ``str.title`` before the pieces are concatenated, so
    ``'field_of_study'`` becomes ``'FieldOfStudy'``.

    Args:
        s: Underscore-separated string.

    Returns:
        The concatenation of the title-cased pieces.
    """
    return ''.join(map(str.title, s.split('_')))

def make_node_df(node_tensor, node_label, id_offset=0):
    """Build the node DataFrame for a single node label.

    Args:
        node_tensor: Feature tensor whose first dimension indexes the nodes;
            it must expose ``.shape`` and ``.numpy()``.
        node_label: Label written into the ``labels`` column of every row.
        id_offset: Value added to each row position to obtain ``nodeId``.

    Returns:
        A DataFrame with the columns ``orgInd`` (row position in the tensor),
        ``nodeId`` (``orgInd + id_offset``), ``labels`` and ``wordEmbedding``
        (one feature row per node).
    """
    row_count = node_tensor.shape[0]
    embeddings = list(node_tensor.numpy())
    base = pd.DataFrame(range(row_count), columns=['orgInd'])
    return base.assign(
        nodeId=lambda frame: frame['orgInd'] + id_offset,
        labels=node_label,
        wordEmbedding=embeddings,
    )

def make_node_dfs_with_id_offset(graph_data):
    """Build one node DataFrame covering every node type of ``graph_data``.

    Node types are visited in the order of ``graph_data.node_types``. Each
    type gets a contiguous ``nodeId`` range that starts where the previous
    type's range ended.

    Args:
        graph_data: Object exposing ``node_types`` and, per node type, a
            store with an ``x`` feature tensor.

    Returns:
        A tuple ``(nodes, offsets)`` where ``nodes`` is the concatenation of
        the per-type DataFrames (the per-type row indices are kept, so index
        values repeat across types) and ``offsets`` maps each node label to
        the first ``nodeId`` of its range.
    """
    offsets_by_label = {}
    frames = []
    next_id = 0
    for type_name in graph_data.node_types:
        label = camel_case(type_name)
        offsets_by_label[label] = next_id
        frame = make_node_df(graph_data[type_name].x, label, next_id)
        frames.append(frame)
        # The next label starts right after the rows just produced.
        next_id += len(frame)
    return pd.concat(frames), offsets_by_label

def make_rel_df(rel_tensor, rel_type, source_id_offset, target_id_offset):
    """Build the relationship DataFrame for a single relationship type.

    Args:
        rel_tensor: Edge store exposing ``edge_index``; its transpose is read
            as one (source, target) pair per row.
        rel_type: Value written into the ``relationshipType`` column.
        source_id_offset: Value added to every source index.
        target_id_offset: Value added to every target index.

    Returns:
        A DataFrame with the columns ``sourceNodeId``, ``targetNodeId`` and
        ``relationshipType``.
    """
    endpoint_pairs = rel_tensor.edge_index.T.numpy()
    edges = pd.DataFrame(endpoint_pairs, columns=['sourceNodeId', 'targetNodeId'])
    return edges.assign(
        sourceNodeId=lambda frame: frame['sourceNodeId'] + source_id_offset,
        targetNodeId=lambda frame: frame['targetNodeId'] + target_id_offset,
        relationshipType=rel_type,
    )

def make_rel_dfs(graph_data, id_offset_map):
    """Build one relationship DataFrame covering every edge type.

    For each entry of ``graph_data.edge_types`` the first element is treated
    as the source node type, the second as the relationship name (written
    upper-cased) and the third as the target node type. Endpoint indices are
    shifted by the offsets registered for the corresponding node labels.

    Args:
        graph_data: Object exposing ``edge_types`` and, per edge type, a
            store accepted by ``make_rel_df``.
        id_offset_map: Mapping from node label to its ``nodeId`` offset.

    Returns:
        The concatenation of all per-type relationship DataFrames (per-type
        row indices are kept).
    """
    frames = []
    for edge_type in graph_data.edge_types:
        source_label = camel_case(edge_type[0])
        target_label = camel_case(edge_type[2])
        frames.append(
            make_rel_df(
                graph_data[edge_type],
                edge_type[1].upper(),
                id_offset_map[source_label],
                id_offset_map[target_label],
            )
        )
    return pd.concat(frames)

In [8]:
# Build the combined node table and the per-label nodeId offsets; the offsets
# are needed afterwards to remap relationship endpoints onto the same id space.
node_df, id_offset_map = make_node_dfs_with_id_offset(data)

In [9]:
# Preview the combined node DataFrame.
node_df

Unnamed: 0,orgInd,nodeId,labels,wordEmbedding
0,0,0,Paper,"[-0.095379, 0.040758, -0.210948, -0.064362, -0..."
1,1,1,Paper,"[-0.151047, -0.107315, -0.221964, -0.034725, 0..."
2,2,2,Paper,"[-0.114799, -0.175982, -0.260556, 0.01192, -0...."
3,3,3,Paper,"[0.004506, 0.042368, -0.178465, -0.138479, -0...."
4,4,4,Paper,"[-0.094474, -0.080044, -0.222468, -0.158952, -..."
...,...,...,...,...
59960,59960,1939738,FieldOfStudy,"[-0.6320879, 0.13672572, -0.3656389, 0.9023778..."
59961,59961,1939739,FieldOfStudy,"[-0.31150684, -0.38850185, 0.0430072, 0.111707..."
59962,59962,1939740,FieldOfStudy,"[-0.31353936, 0.8047326, 0.094272844, -0.21214..."
59963,59963,1939741,FieldOfStudy,"[-0.5500094, 0.17063577, -0.37380353, -0.29022..."


In [10]:
# Build the combined relationship table, shifting endpoint indices by the
# per-label offsets computed together with the node table.
rel_df = make_rel_dfs(data, id_offset_map)

In [11]:
# Preview the combined relationship DataFrame.
rel_df

Unnamed: 0,sourceNodeId,targetNodeId,relationshipType
0,736389,1871883,AFFILIATED_WITH
1,736390,1872034,AFFILIATED_WITH
2,736391,1874235,AFFILIATED_WITH
3,736391,1877171,AFFILIATED_WITH
4,736391,1877782,AFFILIATED_WITH
...,...,...,...
7505073,736388,1893417,HAS_TOPIC
7505074,736388,1893833,HAS_TOPIC
7505075,736388,1901236,HAS_TOPIC
7505076,736388,1902061,HAS_TOPIC


In [13]:
# Load connection settings from the local env file; values in the file take
# precedence over variables already present in the environment.
load_dotenv('db-credentials.env', override=True)

# Parse the AURA_DS flag explicitly instead of eval()-ing the raw environment
# value: eval would execute arbitrary text from the env file and fails with an
# AttributeError when the variable is not set at all. A missing or
# unrecognised value is treated as "not AuraDS".
aura_ds_enabled = os.getenv('AURA_DS', 'false').strip().lower() in ('1', 'true', 'yes')

# Use Neo4j URI and credentials according to our setup
gds = GraphDataScience(
    os.getenv('NEO4J_URI'),
    auth=(os.getenv('NEO4J_USERNAME'),
          os.getenv('NEO4J_PASSWORD')),
    aura_ds=aura_ds_enabled)

# Necessary if you enabled Arrow on the db - this is true for AuraDS
gds.set_database("neo4j")

In [18]:
# Distinct relationship type names, in order of first appearance in rel_df.
all_rel_types = rel_df['relationshipType'].unique().tolist()

In [19]:
%%time
# Send the node and relationship DataFrames to GDS under the graph name
# "ogbn-mag". Every relationship type found in rel_df is passed as undirected.
# NOTE(review): re-running this cell presumably fails while a graph with the
# same name still exists in the catalog - confirm and drop it first if needed.
g = gds.alpha.graph.construct(
    "ogbn-mag",
    node_df,
    rel_df,
    undirected_relationship_types = all_rel_types
)

Uploading Nodes:   0%|          | 0/1939743 [00:00<?, ?Records/s]

Uploading Relationships:   0%|          | 0/21111007 [00:00<?, ?Records/s]

CPU times: user 5.16 s, sys: 1.43 s, total: 6.58 s
Wall time: 17.9 s


In [16]:
# Show the relationship types passed to graph construct; reuse the list that
# was already computed instead of scanning rel_df a second time.
all_rel_types

['AFFILIATED_WITH', 'WRITES', 'CITES', 'HAS_TOPIC']