## Setup

In [None]:
%pip install graphdatascience python-dotenv

In [None]:
!pip install torch_geometric

In [None]:
!pip install neo4j

In [None]:
import pandas as pd
import numpy as np
from graphdatascience import GraphDataScience
from dotenv import load_dotenv
import os
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

In [None]:
from neo4j import GraphDatabase

# Connection settings come from the environment (optionally populated from
# `db-credentials.env`) so that no credentials are stored in the notebook.
#   NEO4J_URI       optional, defaults to the local Bolt endpoint
#   NEO4J_USERNAME  required
#   NEO4J_PASSWORD  required
# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
load_dotenv('db-credentials.env', override=True)
URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
# os.environ[...] raises KeyError naming the missing variable if a credential is absent.
AUTH = (os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])

# Fail fast if the database cannot be reached with these settings; the `with`
# block closes the driver again once the check is done.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

In [None]:
# Load connection settings from the env file; values in the file win over
# variables that are already set in the process environment.
load_dotenv('db-credentials.env', override=True)

# Use Neo4j URI and credentials according to our setup.
# `os.getenv` / `os.environ` take the NAME of an environment variable, so the
# connection values themselves live in the environment, never in the notebook:
#   NEO4J_URI       optional, defaults to the local Bolt endpoint
#   NEO4J_USERNAME  required (KeyError if missing)
#   NEO4J_PASSWORD  required (KeyError if missing)
#   AURA_DS         optional flag, "true"/"false" in any casing; defaults to false
# The flag is parsed by string comparison instead of `eval`, so arbitrary text in
# the environment is never executed and an unset variable no longer crashes.
gds = GraphDataScience(
    os.getenv('NEO4J_URI', 'bolt://localhost:7687'),
    auth=(os.environ['NEO4J_USERNAME'],
          os.environ['NEO4J_PASSWORD']),
    aura_ds=os.getenv('AURA_DS', 'false').strip().lower() in ('1', 'true', 'yes'))

# Necessary if you enabled Arrow on the db - this is true for AuraDS
gds.set_database("neo4j")

# Name under which the full graph projection is stored in the GDS graph catalog
PROJ_NAME = 'proj'

In [None]:
# Show the version reported by the `gds` client. This is the first use of `gds`,
# so it raises NameError if the client was not created successfully beforehand.
gds.version()

NameError: ignored

In [None]:
# Seed handed to the graph sampling call as `randomSeed`; intended to make the
# sampled subgraph repeatable between runs.
RANDOM_SEED = 7474

## Sample Neo4j Graph

In [None]:
# Start from a clean slate: if a projection named PROJ_NAME is still in the
# graph catalog (e.g. from an earlier run), drop it before projecting again.
existing_projection = gds.graph.exists(PROJ_NAME)
if existing_projection['exists']:
    gds.graph.get(PROJ_NAME).drop()

In [None]:
%%time
g, _ = gds.graph.project(PROJ_NAME, ['Train', 'Valid', 'Test'], ['Employees'],
                         nodeProperties =['YOE', 'Designation'])

Loading:   0%|          | 0/100 [00:00<?, ?%/s]

CPU times: user 28.3 ms, sys: 8.72 ms, total: 37 ms
Wall time: 2.12 s


In [None]:
# Report the size of the projected graph `g`; the `:,` format spec adds
# thousands separators to the counts.
print(f"Number of nodes in our graph: {g.node_count():,}")
print(f"Number of relationships in our graph: {g.relationship_count():,}")

In [None]:
# Catalog name of the sampled projection, derived from the full projection's name.
SAMPLE_PROJ_NAME = PROJ_NAME + '_sample'

# Remove a sample left over from an earlier run so the name is free again.
existing_sample = gds.graph.exists(SAMPLE_PROJ_NAME)
if existing_sample['exists']:
    gds.graph.get(SAMPLE_PROJ_NAME).drop()

In [None]:
%%time
g_sample, _ = gds.alpha.graph.sample.rwr(SAMPLE_PROJ_NAME, g,
                                         restartProbability=0.05, nodeLabelStratification=True,
                                         concurrency=1, randomSeed=RANDOM_SEED)

Random walk with restarts sampling:   0%|          | 0/100 [00:00<?, ?%/s]

CPU times: user 448 ms, sys: 34.4 ms, total: 483 ms
Wall time: 1min


## Export Sampled Graph to Pandas DataFrames

In [None]:
# Stream the relationships of the sampled graph into a DataFrame. Later cells
# read its `sourceNodeId` and `targetNodeId` columns, which hold Neo4j node ids.
raw_topology_df = gds.beta.graph.relationships.stream(g_sample)

In [None]:
# Stream the node properties of the sampled graph, one column per property.
# The node table has to come from the node-property stream: the relationship
# frame has no `nodeId` column and no property columns, so it cannot provide
# the id mapping or the features/labels used further down.
# The property names are the ones loaded when the graph was projected.
raw_sample_node_df = gds.graph.nodeProperties.stream(
    g_sample,
    ['YOE', 'Designation'],
    separate_property_columns=True,
)

# Re-assign node ids: keep the streamed id as `neo4jNodeId` and expose the fresh
# 0..n-1 row index as `nodeId`. The two renames must run in this order so the
# old `nodeId` column is moved out of the way before `index` takes its name.
sample_node_df = (
    raw_sample_node_df
    .reset_index()
    .rename(columns={'nodeId': 'neo4jNodeId'})
    .rename(columns={'index': 'nodeId'})
)
sample_node_df

In [None]:
# Lookup table from the original Neo4j id to the re-assigned consecutive id.
id_lookup = sample_node_df[['neo4jNodeId', 'nodeId']]

# Step 1: swap every source id for its re-assigned id.
sources_remapped = (
    raw_topology_df
    .merge(id_lookup, how='left', left_on='sourceNodeId', right_on='neo4jNodeId')
    .drop(columns=['sourceNodeId', 'neo4jNodeId'])
    .rename(columns={'nodeId': 'sourceNodeId'})
)

# Step 2: do the same for every target id.
sample_topology_df = (
    sources_remapped
    .merge(id_lookup, how='left', left_on='targetNodeId', right_on='neo4jNodeId')
    .drop(columns=['targetNodeId', 'neo4jNodeId'])
    .rename(columns={'nodeId': 'targetNodeId'})
)
sample_topology_df

## Construct Inputs for Training
Now that we have re-assigned node ids, we can easily transform our `sample_topology_df` and `sample_node_df` into the `edge_index`, features (`x`), and target (`y`) tensors for PyG. We will also use the years-of-experience column of `sample_node_df` for data splitting.

In [None]:
# `by_rel_type` regroups the topology by relationship type, which gives a format
# that can be used as input to several GNN frameworks:
# {"rel_type": [[source_nodes], [target_nodes]]}
sample_topology = sample_topology_df.by_rel_type()

In [None]:
# Turn the [[source_nodes], [target_nodes]] lists of one relationship type into
# a two-row integer tensor of edges.
# NOTE(review): the projection only loads the 'Employees' relationship type, so
# the 'Skills' key looks like it would be missing here — confirm the intended type.
edge_index = torch.tensor(sample_topology['Skills'], dtype=torch.long)
edge_index

tensor([[    0,     0,     0,  ..., 57576, 57576, 57576],
        [48832, 50197, 50560,  ..., 36918, 36977, 26490]])

In [None]:
# Node features: stack the per-node values of the `employees` column into a
# single float tensor with one row per node.
# NOTE(review): 'employees' is not among the node properties loaded by the
# projection ('YOE', 'Designation') — confirm which column holds the features.
x = torch.tensor(np.stack(sample_node_df['employees']), dtype=torch.float)

In [None]:
# Target class: one integer label per node.
# 'Designation' is the label column: it is a node property loaded by the
# projection and it is the column the full-graph run uses as its target.
# ('Skills' is the key used to look up edges, not a node column.)
y = torch.tensor(np.stack(sample_node_df['Designation']), dtype=torch.long)

In [None]:
# data objects and masks for data splitting
data = Data(x=x, y=y, edge_index=edge_index)

# Split nodes on years of experience ('YOE' is the property name used when the
# graph was projected). All three masks read the same column.
yoe = sample_node_df['YOE']

# A chained comparison (`3 < series < 7`) is evaluated as `(3 < s) and (s < 7)`,
# which raises ValueError on a pandas Series; combine element-wise with `&`.
# NOTE(review): nodes with YOE exactly 3 or 7 fall into no split — confirm that
# the strict bounds are intended.
data.train_mask = torch.tensor(((yoe > 3) & (yoe < 7)).to_numpy(dtype=bool))
data.val_mask = torch.tensor((yoe < 3).to_numpy(dtype=bool))
data.test_mask = torch.tensor((yoe > 7).to_numpy(dtype=bool))
print(data)

In [None]:
# Number of distinct label values present in `y`.
num_classes = torch.unique(y).numel()
print(f'there are {num_classes} possible target classes')

## Define Graph Convolutional Network and Other Configurations for Training

In [None]:
# Define the GCN architecture
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        in_channels: Number of input features per node. When omitted, it is read
            from the notebook-level ``data.num_node_features`` at construction time.
        hidden_channels: Width of the hidden layer (72 by default).
        out_channels: Number of output classes. When omitted, it is read from the
            notebook-level ``num_classes`` at construction time.

    Calling ``GCN()`` without arguments therefore builds exactly the same layers
    as before; passing the sizes explicitly removes the dependency on globals.
    """

    def __init__(self, in_channels=None, hidden_channels=72, out_channels=None):
        super().__init__()
        # `data` here is the notebook-level object (not the `forward` argument).
        if in_channels is None:
            in_channels = data.num_node_features
        if out_channels is None:
            out_channels = num_classes
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return per-node log-probabilities over the classes.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tensor of log-softmax scores, normalised along dimension 1.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        # We use log_softmax and nll_loss instead of a softmax output and cross
        # entropy loss for reasons of performance and numerical stability.
        # They are mathematically equivalent.
        return F.log_softmax(x, dim=1)

In [None]:
# Pick the GPU when CUDA is usable, otherwise stay on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Show which device was picked
print(device)

cuda


In [None]:
# Build the network first, then move its parameters to the memory of the chosen
# device (`Module.to` returns the module itself, so re-binding keeps the same object)
model = GCN()
model = model.to(device)

# Print the layer summary to inspect the architecture
print(model)

GCN(
  (conv1): GCNConv(128, 72)
  (conv2): GCNConv(72, 40)
)


In [None]:
# Pass our input data to the chosen device too, so that the model and the
# tensors it consumes live on the same device
data = data.to(device)

In [None]:
# Hyperparameter tuning is out of scope for this small example, so the Adam
# optimizer gets a fixed learning rate and weight decay
LEARNING_RATE = 0.01
WEIGHT_DECAY = 5e-4
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

## Train and Evaluate GNN

In [None]:
# Train the GCN on the sampled graph represented by `data` using the standard PyTorch training loop
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    train_loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])

    # Validation and test losses are reported for monitoring only, so they are
    # computed without tracking gradients.
    with torch.no_grad():
        valid_loss = F.nll_loss(out[data.val_mask], data.y[data.val_mask])
        test_loss = F.nll_loss(out[data.test_mask], data.y[data.test_mask])

    print(f'Epoch: {epoch:03d}, '
      f'Train: {train_loss:.4f}, '
      f'Valid: {valid_loss:.4f}, '
      f'Test: {test_loss:.4f}')

    # Only the training loss may drive the parameter update; back-propagating
    # the validation loss would fit the model to the validation nodes and make
    # the reported validation numbers meaningless.
    train_loss.backward()
    optimizer.step()

Epoch: 000, Train: 3.8124, Valid: 3.7650, Test: 3.7201
Epoch: 001, Train: 3.3731, Valid: 3.2336, Test: 3.3666
Epoch: 002, Train: 3.2592, Valid: 2.8746, Test: 3.0503
Epoch: 003, Train: 3.3073, Valid: 2.7021, Test: 2.7979
Epoch: 004, Train: 3.4405, Valid: 2.7065, Test: 2.6882
Epoch: 005, Train: 3.3579, Valid: 2.6410, Test: 2.6505
Epoch: 006, Train: 3.2403, Valid: 2.5980, Test: 2.6353
Epoch: 007, Train: 3.1113, Valid: 2.5501, Test: 2.6210
Epoch: 008, Train: 2.9964, Valid: 2.5046, Test: 2.6095
Epoch: 009, Train: 2.9466, Valid: 2.4629, Test: 2.5795
Epoch: 010, Train: 2.9036, Valid: 2.4082, Test: 2.5354
Epoch: 011, Train: 2.8694, Valid: 2.3436, Test: 2.5014
Epoch: 012, Train: 2.8564, Valid: 2.3194, Test: 2.4805
Epoch: 013, Train: 2.8281, Valid: 2.2840, Test: 2.4312
Epoch: 014, Train: 2.7816, Valid: 2.2448, Test: 2.4202
Epoch: 015, Train: 2.7344, Valid: 2.2156, Test: 2.3927
Epoch: 016, Train: 2.6814, Valid: 2.1843, Test: 2.3644
Epoch: 017, Train: 2.6233, Valid: 2.1484, Test: 2.3403
Epoch: 018

In [None]:
# Evaluate the trained GCN model on our test set
model.eval()

# Inference needs no gradients; disabling tracking avoids building the autograd
# graph for the forward pass. The predictions themselves are unchanged.
with torch.no_grad():
    pred = model(data).argmax(dim=1)

# Accuracy = correctly predicted test nodes / number of test nodes
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())

print(f"Test Set Accuracy: {acc:.4f}")

Test Set Accuracy: 0.5820


In [None]:
# Repeat the export -> tensors -> training pipeline on the full projection `g`.
# Note that this cell re-binds `raw_topology_df`, `edge_index`, `x`, `y`,
# `num_classes` and `optimizer`, which previously referred to the sampled graph.
raw_topology_df = gds.beta.graph.relationships.stream(g)

# Property names must match the ones loaded when the graph was projected
# ('YOE', 'Designation'); they also become the column names used below.
raw_node_df = gds.graph.nodeProperties.stream(
    g,
    ['Designation', 'YOE'],
    separate_property_columns=True,
)

# Keep the streamed id as `neo4jNodeId`; the fresh 0..n-1 row index becomes `nodeId`.
node_df = raw_node_df.reset_index().rename(columns={'nodeId':'neo4jNodeId'}).rename(columns={'index':'nodeId'})

# Replace Neo4j ids in the topology with the re-assigned ids, sources first, then targets.
topology_df = (raw_topology_df
    .merge(node_df[['neo4jNodeId','nodeId']], how='left', left_on='sourceNodeId',
           right_on='neo4jNodeId')
    .drop(columns=['sourceNodeId', 'neo4jNodeId'])
    .rename(columns={'nodeId':'sourceNodeId'})
    .merge(node_df[['neo4jNodeId','nodeId']], how='left', left_on='targetNodeId',
           right_on='neo4jNodeId')
    .drop(columns=['targetNodeId', 'neo4jNodeId'])
    .rename(columns={'nodeId':'targetNodeId'})
)

topology = topology_df.by_rel_type()

# NOTE(review): the projection only loads the 'Employees' relationship type, so
# the 'Skills' key looks like it would be missing here — confirm the intended type.
edge_index = torch.tensor(topology['Skills'], dtype=torch.long)

# NOTE(review): 'Employees' is not among the node properties streamed above —
# confirm which column holds the feature vectors.
x = torch.tensor(np.stack(node_df['Employees']), dtype=torch.float)
y = torch.tensor(np.stack(node_df['Designation']), dtype=torch.long)

full_data = Data(x=x, y=y, edge_index=edge_index)

# The masks must be built from the FULL node table so they have one entry per
# node of `full_data`; masks taken from the sampled table have the wrong length.
# `3 < series < 7` raises ValueError on a pandas Series, so the range test is
# written element-wise with `&`.
# NOTE(review): nodes with YOE exactly 3 or 7 fall into no split — confirm.
yoe = node_df['YOE']
full_data.train_mask = torch.tensor(((yoe > 3) & (yoe < 7)).to_numpy(dtype=bool))
full_data.val_mask = torch.tensor((yoe < 3).to_numpy(dtype=bool))
full_data.test_mask = torch.tensor((yoe > 7).to_numpy(dtype=bool))

num_classes = y.unique().shape[0]

# `GCN()` sizes its layers from the notebook-level `data` and `num_classes`, so
# the full graph is assumed to have the same feature width as the sample.
full_model = GCN().to(device)
print(full_model)

full_data = full_data.to(device)

optimizer = torch.optim.Adam(full_model.parameters(), lr=0.01, weight_decay=5e-4)

full_model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = full_model(full_data)
    train_loss = F.nll_loss(out[full_data.train_mask], full_data.y[full_data.train_mask])
    # Optimise on the training nodes only; validation and test nodes must not
    # drive the parameter update.
    train_loss.backward()
    optimizer.step()

Data(x=[169343, 128], edge_index=[2, 1166243], y=[169343], train_mask=[169343], val_mask=[169343], test_mask=[169343])
there are 40 possible target classes
GCN(
  (conv1): GCNConv(128, 72)
  (conv2): GCNConv(72, 40)
)


In [None]:
# Evaluate the full data trained GCN model on our test set
full_model.eval()

# Inference needs no gradients; disabling tracking avoids building the autograd
# graph for the forward pass over the full graph. Predictions are unchanged.
with torch.no_grad():
    full_pred = full_model(full_data).argmax(dim=1)

# Accuracy = correctly predicted test nodes / number of test nodes
full_correct = (full_pred[full_data.test_mask] == full_data.y[full_data.test_mask]).sum()
full_acc = int(full_correct) / int(full_data.test_mask.sum())

print(f'Test set accuracy for full dataset: {full_acc:.4f}')
# `acc` is the test accuracy of the model trained on the sampled graph
print(f'This is a difference of {100*(full_acc-acc):.2f} percentage points from the sampled dataset')

Test set accuracy for full dataset: 0.5583
This is a difference of -2.37 percentage points from the sampled dataset
