# Neo4j Graph Sampling for PyG GNN 
__An example using the [`ogbn-arxiv`](https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv) dataset__

![Neo4j version](https://img.shields.io/badge/Neo4j-5-brightgreen)
![GDS version](https://img.shields.io/badge/GDS-2.3-brightgreen)
![GDS Python Client version](https://img.shields.io/badge/GDS_Python_Client-1.6-brightgreen)

## Setup

In [1]:
%pip install graphdatascience python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from graphdatascience import GraphDataScience
from dotenv import load_dotenv
import os
from numpy.typing import ArrayLike
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.transforms import RandomNodeSplit
import random
import numpy as np

In [3]:
load_dotenv('db-credentials.env', override=True)

# Use Neo4j URI and credentials according to our setup
gds = GraphDataScience(
    os.getenv('NEO4J_URI'),
    auth=(os.getenv('NEO4J_USERNAME'),
          os.getenv('NEO4J_PASSWORD')),
    aura_ds=eval(os.getenv('AURA_DS').title()))

# Necessary if you enabled Arrow on the db - this is true for AuraDS
gds.set_database("neo4j")
PROJ_NAME = 'proj'

In [4]:
gds.version()

'2.3.1'

In [5]:
RANDOM_SEED = 7474
VALID_YEAR = 2018
SAMPLE_RATIO = 0.34

## Sampl Neo4j Graph

In [6]:
if gds.graph.exists(PROJ_NAME)['exists']:
    gds.graph.get(PROJ_NAME).drop()

In [7]:
%%time
g, _ = gds.graph.project(PROJ_NAME, ['Train', 'Valid', 'Test'], ['CITES'], nodeProperties =['textEmbedding', 'subjectId', 'year'])

Loading:   0%|          | 0/100 [00:00<?, ?%/s]

CPU times: user 30.3 ms, sys: 326 µs, total: 30.7 ms
Wall time: 1.64 s


In [8]:
print(f"Number of nodes in our graph: {g.node_count():,}")
print(f"Number of relationships in our graph: {g.relationship_count():,}")

Number of nodes in our graph: 169,343
Number of relationships in our graph: 1,166,243


In [9]:
SAMPLE_PROJ_NAME = PROJ_NAME + '_sample'
if gds.graph.exists(SAMPLE_PROJ_NAME)['exists']:
    gds.graph.get(SAMPLE_PROJ_NAME).drop()

In [10]:
%%time
g_sample, _ = gds.alpha.graph.sample.rwr(SAMPLE_PROJ_NAME, g, samplingRatio=SAMPLE_RATIO,
                                         restartProbability=0.05, nodeLabelStratification=True, concurrency=1, randomSeed=RANDOM_SEED)

Random walk with restarts sampling:   0%|          | 0/100 [00:00<?, ?%/s]

CPU times: user 406 ms, sys: 61.7 ms, total: 468 ms
Wall time: 1min 1s


In [11]:
# If our SAMPLE_RATIO is 0.34 we should have somewhere around 0.34 * 269,343 ~ 55,883 nodes in our sample
print(f"Number of nodes in our sample: {g_sample.node_count():,}")

# And let's see how many relationships we have
print(f"Number of relationships in our sample: {g_sample.relationship_count():,}")

Number of nodes in our sample: 57,577
Number of relationships in our sample: 459,928


In [12]:
# We should also see similar degree distributions
print(f"Degree distribution in our full graph:")
print(g.degree_distribution())

# And let's see how many relationships we got
print(f"\n\nDegree distribution in our sample:")
print(g_sample.degree_distribution())

Degree distribution in our full graph:
p99      37.000000
min       0.000000
max     436.000000
mean      6.886869
p90      18.000000
p50       4.000000
p999     71.000000
p95      24.000000
p75       9.000000
dtype: float64


Degree distribution in our sample:
p99      40.000000
min       0.000000
max     403.000000
mean      7.988051
p90      20.000000
p50       5.000000
p999     82.000000
p95      26.000000
p75      11.000000
dtype: float64


## Export Sampled Graph to Pandas DataFrames

In [13]:
raw_sample_topology_df = gds.beta.graph.relationships.stream(g_sample)

In [14]:
raw_sample_node_df = gds.graph.nodeProperties.stream(
    g_sample,
    ['subjectId', 'year', 'textEmbedding'],
    separate_property_columns=True,
)

## Re-Index Graph Data for PyG
PyG needs an ordered index

In [15]:
sample_node_df = raw_sample_node_df.reset_index().rename(columns={'nodeId':'neo4jNodeId'}).rename(columns={'index':'nodeId'})
sample_node_df

Unnamed: 0,nodeId,neo4jNodeId,subjectId,year,textEmbedding
0,0,112370,10,2012,"[-0.03758800029754639, -0.019776999950408936, ..."
1,1,112371,10,2013,"[-0.09249299764633179, 0.06776200234889984, -0..."
2,2,112375,10,2011,"[-0.05762699991464615, -0.02996399998664856, -..."
3,3,112376,10,2013,"[-0.010092999786138535, -0.06655500084161758, ..."
4,4,112380,10,2000,"[-0.0767190009355545, -0.13565599918365479, -0..."
...,...,...,...,...,...
57572,57572,52393,24,2019,"[-0.04432599991559982, -0.05412999913096428, -..."
57573,57573,52398,24,2019,"[-0.17306800186634064, 0.027883000671863556, -..."
57574,57574,52404,24,2019,"[-0.13642600178718567, -0.03896800056099892, -..."
57575,57575,52405,24,2019,"[-0.33581000566482544, 0.2053699940443039, -0...."


In [16]:
sample_topology_df = (raw_sample_topology_df
    .merge(sample_node_df[['neo4jNodeId','nodeId']], how='left', left_on='sourceNodeId', right_on='neo4jNodeId')
    .drop(columns=['sourceNodeId', 'neo4jNodeId'])
    .rename(columns={'nodeId':'sourceNodeId'})
    .merge(sample_node_df[['neo4jNodeId','nodeId']], how='left', left_on='targetNodeId', right_on='neo4jNodeId')
    .drop(columns=['targetNodeId', 'neo4jNodeId'])
    .rename(columns={'nodeId':'targetNodeId'})
)
sample_topology_df

Unnamed: 0,relationshipType,sourceNodeId,targetNodeId
0,CITES,20000,38218
1,CITES,20002,39028
2,CITES,20002,39224
3,CITES,20002,47274
4,CITES,20005,20030
...,...,...,...
459923,CITES,57575,40806
459924,CITES,57576,22136
459925,CITES,57576,26918
459926,CITES,57576,26977


## Construct Inputs for Training

In [17]:
# By using `by_rel_type` we get the topology in a format that can be used as input to several GNN frameworks:
# {"rel_type": [[source_nodes], [target_nodes]]}
sample_topology = sample_topology_df.by_rel_type()

In [18]:
edge_index = torch.tensor(sample_topology['CITES'], dtype=torch.long)
edge_index

tensor([[20000, 20002, 20002,  ..., 57576, 57576, 57576],
        [38218, 39028, 39224,  ..., 26918, 26977,  6490]])

In [19]:
x = torch.tensor(np.stack(sample_node_df['textEmbedding']), dtype=torch.float)

In [20]:
y = torch.tensor(np.stack(sample_node_df['subjectId']), dtype=torch.long)

In [21]:
data = Data(x=x, y=y, edge_index=edge_index)
data.train_mask = torch.tensor(np.stack(sample_node_df.year < VALID_YEAR))
data.val_mask = torch.tensor(np.stack(sample_node_df.year == VALID_YEAR))
data.test_mask = torch.tensor(np.stack(sample_node_df.year > VALID_YEAR))
print(data)

Data(x=[57577, 128], edge_index=[2, 459928], y=[57577], train_mask=[57577], val_mask=[57577], test_mask=[57577])


In [22]:
num_classes = y.unique().shape[0]
print(f'there are {num_classes} possible target classes')

there are 40 possible target classes


## Define Convolutional Nueral Network and Other Configurations for Training

In [23]:
# Define the GCN architecture
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(data.num_node_features, 72)
        self.conv2 = GCNConv(72, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        # We use log_softmax and nll_loss instead of softmax output and cross entropy loss
        # for reasons for performance and numerical stability.
        # They are mathematically equivalent
        return F.log_softmax(x, dim=1)

In [24]:
# Prepare training by setting up for the chosen device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Let's see what device was chosen
print(device)

cuda


In [25]:
# In standard PyTorch fashion we instantiate our model, and transfer it to the memory of the chosen device
model = GCN().to(device)

# Let's inspect our model architecture
print(model)

GCN(
  (conv1): GCNConv(128, 72)
  (conv2): GCNConv(72, 40)
)


In [26]:
# Pass our input data to the chosen device too
data = data.to(device)

In [27]:
# Since hyperparameter tuning is out of scope for this small example, we initialize an
# Adam optimizer with some fixed learning rate and weight decay
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [28]:
# Train the GCN using the CORA sample represented by `data` using the standard PyTorch training loop
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    train_loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    valid_loss = F.nll_loss(out[data.val_mask], data.y[data.val_mask])
    test_loss = F.nll_loss(out[data.test_mask], data.y[data.test_mask])
    print(f'Epoch: {epoch:03d}, '
      f'Train: {train_loss:.4f}, '
      f'Valid: {valid_loss:.4f}, '
      f'Test: {test_loss:.4f}')
    valid_loss.backward()
    optimizer.step()

Epoch: 000, Train: 3.6372, Valid: 3.6192, Test: 3.6436
Epoch: 001, Train: 3.3100, Valid: 3.0667, Test: 3.1962
Epoch: 002, Train: 3.3278, Valid: 2.8460, Test: 2.9109
Epoch: 003, Train: 3.3234, Valid: 2.7527, Test: 2.7664
Epoch: 004, Train: 3.1973, Valid: 2.6637, Test: 2.7154
Epoch: 005, Train: 3.1076, Valid: 2.6448, Test: 2.7097
Epoch: 006, Train: 3.0156, Valid: 2.5776, Test: 2.6657
Epoch: 007, Train: 2.9657, Valid: 2.5028, Test: 2.6078
Epoch: 008, Train: 2.9486, Valid: 2.4663, Test: 2.5776
Epoch: 009, Train: 2.9336, Valid: 2.4291, Test: 2.5490
Epoch: 010, Train: 2.8895, Valid: 2.3839, Test: 2.5330
Epoch: 011, Train: 2.8353, Valid: 2.3449, Test: 2.4954
Epoch: 012, Train: 2.7676, Valid: 2.2982, Test: 2.4792
Epoch: 013, Train: 2.7135, Valid: 2.2692, Test: 2.4552
Epoch: 014, Train: 2.6766, Valid: 2.2263, Test: 2.4250
Epoch: 015, Train: 2.6482, Valid: 2.2006, Test: 2.3736
Epoch: 016, Train: 2.6276, Valid: 2.1663, Test: 2.3400
Epoch: 017, Train: 2.5844, Valid: 2.1317, Test: 2.3052
Epoch: 018

In [29]:
# Evaluate the trained GCN model on our test set
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())

print(f"Test Set Accuracy: {acc:.4f}")   

Test Set Accuracy: 0.5795


## Attempt with Full Graph

In [34]:
raw_topology_df = gds.beta.graph.relationships.stream(g)

raw_node_df = gds.graph.nodeProperties.stream(
    g,
    ['subjectId', 'year', 'textEmbedding'],
    separate_property_columns=True,
)

node_df = raw_node_df.reset_index().rename(columns={'nodeId':'neo4jNodeId'}).rename(columns={'index':'nodeId'})

topology_df = (raw_topology_df
    .merge(node_df[['neo4jNodeId','nodeId']], how='left', left_on='sourceNodeId', right_on='neo4jNodeId')
    .drop(columns=['sourceNodeId', 'neo4jNodeId'])
    .rename(columns={'nodeId':'sourceNodeId'})
    .merge(node_df[['neo4jNodeId','nodeId']], how='left', left_on='targetNodeId', right_on='neo4jNodeId')
    .drop(columns=['targetNodeId', 'neo4jNodeId'])
    .rename(columns={'nodeId':'targetNodeId'})
)

topology = topology_df.by_rel_type()

edge_index = torch.tensor(topology['CITES'], dtype=torch.long)

x = torch.tensor(np.stack(node_df['textEmbedding']), dtype=torch.float)
y = torch.tensor(np.stack(node_df['subjectId']), dtype=torch.long)

full_data = Data(x=x, y=y, edge_index=edge_index)
full_data.train_mask = torch.tensor(np.stack(node_df.year < VALID_YEAR))
full_data.val_mask = torch.tensor(np.stack(node_df.year == VALID_YEAR))
full_data.test_mask = torch.tensor(np.stack(node_df.year > VALID_YEAR))
print(full_data)

num_classes = y.unique().shape[0]
print(f'there are {num_classes} possible target classes')

full_model = GCN().to(device)
print(full_model)

full_data = full_data.to(device)

optimizer = torch.optim.Adam(full_model.parameters(), lr=0.01, weight_decay=5e-4)

full_model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = full_model(full_data)
    train_loss = F.nll_loss(out[full_data.train_mask], full_data.y[full_data.train_mask])
    valid_loss = F.nll_loss(out[full_data.val_mask], full_data.y[full_data.val_mask])
    test_loss = F.nll_loss(out[full_data.test_mask], full_data.y[full_data.test_mask])
    #print(f'Epoch: {epoch:03d}, '
    #  f'Train: {train_loss:.4f}, '
    #  f'Valid: {valid_loss:.4f}, '
    #  f'Test: {test_loss:.4f}')
    valid_loss.backward()
    optimizer.step()

Data(x=[169343, 128], edge_index=[2, 1166243], y=[169343], train_mask=[169343], val_mask=[169343], test_mask=[169343])
there are 40 possible target classes
GCN(
  (conv1): GCNConv(128, 72)
  (conv2): GCNConv(72, 40)
)


In [35]:
# Evaluate the full data trained GCN model on our test set
full_model.eval()
pred = full_model(full_data).argmax(dim=1)
correct = (pred[full_data.test_mask] == full_data.y[full_data.test_mask]).sum()
acc = int(correct) / int(full_data.test_mask.sum())

print(f"Test Set Accuracy: {acc:.4f}")  

Test Set Accuracy: 0.5597


## Cleanup
Remove Neo4j graph projections to free up memory

In [36]:
g_sample.drop()
g.drop()

graphName                                                             proj
database                                                             neo4j
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                           169343
relationshipCount                                                  1166243
configuration                                                           {}
density                                                           0.000041
creationTime                           2023-03-12T23:45:44.421389770+00:00
modificationTime                       2023-03-12T23:45:45.729357467+00:00
schema                   {'graphProperties': {}, 'relationships': {'CIT...
schemaWithOrientation    {'graphProperties': {}, 'relationships': {'CIT...
Name: 0, dtype: object