#### Installation

In [None]:
import torch

def format_pytorch_version(version):
  """Return the version string without its local build suffix.

  Everything from the first '+' onward is dropped, e.g. '1.10.0+cu111'
  becomes '1.10.0'. A string without '+' is returned unchanged.
  """
  base, _, _ = version.partition('+')
  return base

# Installed PyTorch version: raw string, and the same string with any '+...'
# suffix removed (TORCH is interpolated into the wheel index URLs).
TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
  """Map a CUDA version string to the suffix used in the wheel index URLs.

  '11.3' becomes 'cu113'. CPU-only PyTorch builds report
  ``torch.version.cuda`` as None; in that case 'cpu' is returned instead of
  failing with AttributeError on ``None.replace``.
  """
  if version is None:
    return 'cpu'
  return 'cu' + version.replace('.', '')

# CUDA version this PyTorch build was compiled against (None on CPU-only
# builds) and the matching suffix interpolated into the wheel index URLs.
CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

!pip install ogb
!pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric


#### Imports

In [None]:
import torch
from ogb.linkproppred import PygLinkPropPredDataset
import networkx as nx
import numpy as np
from torch.utils.data import DataLoader
from torch_geometric.utils import negative_sampling, to_networkx
from ogb.linkproppred import Evaluator
import torch.nn.functional as F

import pandas as pd

from helpers import *

#### Load and configure data

In [None]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = 'cpu'

# OGB link-property-prediction dataset 'ogbl-ddi'.
dataset = PygLinkPropPredDataset(name='ogbl-ddi')

# Edge splits supplied by the dataset; the "edge" entry of each split is kept
# under its own name.
split_edge = dataset.get_edge_split()
train_edge = split_edge["train"]["edge"]
valid_edge = split_edge["valid"]["edge"]
test_edge = split_edge["test"]["edge"]

# Graph at index 0 of the dataset: its edge index is moved to the compute
# device, and an undirected NetworkX copy of the graph is built.
graph = dataset[0]
edge_index = graph.edge_index.to(device)
nx_graph = to_networkx(graph, to_undirected=True)

#### Generating SPD features

In [None]:
# SPD features: a matrix of shortest distances from all nodes to a set of
# anchor nodes, as produced by get_spd_matrix (presumably from helpers).
k = 4267  # number of anchor nodes to draw
np.random.seed(0)  # fixed seed so the same anchors are drawn on every run
node_subset = np.random.choice(
    nx_graph.number_of_nodes(),
    size=k,
    replace=False,  # anchors are distinct node ids
)
# max_spd=5 is forwarded to the helper; presumably it caps the reported
# distance -- confirm against helpers.get_spd_matrix.
spd = torch.Tensor(
    get_spd_matrix(G=nx_graph, S=node_subset, max_spd=5)
).to(device)
node_features = spd

#### Model setup

In [None]:
# Predictor params (passed positionally to LinkPredictor when the model is built)
in_channels = k  # one input feature per anchor node; assumes node_features has k columns -- TODO confirm
hidden_channels = 256
out_channels = 1
num_layers = 5 + 1  # NOTE(review): presumably 5 hidden layers + 1 output layer -- confirm against LinkPredictor
dropout = 0.3

# Learning params
lr = 0.003  # learning rate given to the Adam optimizer
batch_size = 64 * 1024  # edges per batch, used for both training and evaluation
num_epochs = 200
eval_steps = 1  # NOTE(review): not referenced in the visible cells; evaluation runs every epoch
run = 0  # run index used when logging results (printed as run + 1)

In [None]:
def train_NN(predictor, node_features, edge_index, optimizer, batch_size):
    """Train the link predictor for one epoch and return the mean loss.

    Positive examples are read from the module-level ``pos_train_edge``
    tensor (a global, not a parameter), so it must be defined before this
    function is called. For each positive batch, the same number of negative
    node pairs is requested from ``negative_sampling``.

    Args:
        predictor: Model called as ``predictor(feat_a, feat_b)``. Its output is
            fed straight into ``log(p)`` / ``log(1 - p)``, so it is presumably
            a probability in [0, 1] -- confirm against the model definition.
        node_features: Tensor indexed by node id along dim 0.
        edge_index: Edge index handed to ``negative_sampling``.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        batch_size: Number of positive edges per batch.

    Returns:
        The batch losses averaged with each batch weighted by its number of
        positive predictions.
    """
    predictor.train()

    loss_sum = 0
    seen = 0
    batches = DataLoader(range(pos_train_edge.size(0)), batch_size, shuffle=True)
    for batch_idx in batches:
        optimizer.zero_grad()

        # Positive term: log-likelihood of the observed edges.
        pos_pairs = pos_train_edge[batch_idx].t()
        pos_out = predictor(node_features[pos_pairs[0], :],
                            node_features[pos_pairs[1], :])
        pos_loss = -torch.log(pos_out + 1e-15).mean()

        # Negative term: as many sampled pairs as there are positives.
        neg_pairs = negative_sampling(
            edge_index,
            num_nodes=node_features.size(0),
            num_neg_samples=batch_idx.size(0),
            method='dense',
        )
        neg_out = predictor(node_features[neg_pairs[0], :],
                            node_features[neg_pairs[1], :])
        neg_loss = -torch.log(1 - neg_out + 1e-15).mean()

        loss = pos_loss + neg_loss
        loss.backward()

        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(predictor.parameters(), 1.0)
        optimizer.step()

        # Accumulate the loss weighted by the number of positive predictions.
        n_pos = pos_out.size(0)
        loss_sum += loss.item() * n_pos
        seen += n_pos

    return loss_sum / seen

@torch.no_grad()
def test_NN(predictor, node_features, split_edge, batch_size, evaluator, device):
    predictor.eval()

    pos_valid_edge = split_edge['valid']['edge'].to(device)
    neg_valid_edge = split_edge['valid']['edge_neg'].to(device)
    pos_test_edge = split_edge['test']['edge'].to(device)
    neg_test_edge = split_edge['test']['edge_neg'].to(device)

    pos_valid_preds = []
    for perm in DataLoader(range(pos_valid_edge.size(0)), batch_size):
        edge = pos_valid_edge[perm].t()
        feature_0 = node_features[edge[0], :]
        feature_1 = node_features[edge[1], :]
        pos_valid_preds += [predictor(feature_0, feature_1).squeeze().cpu()]
    pos_valid_pred = torch.cat(pos_valid_preds, dim=0)

    neg_valid_preds = []
    for perm in DataLoader(range(neg_valid_edge.size(0)), batch_size):
        edge = neg_valid_edge[perm].t()
        feature_0 = node_features[edge[0], :]
        feature_1 = node_features[edge[1], :]
        neg_valid_preds += [predictor(feature_0, feature_1).squeeze().cpu()]
    neg_valid_pred = torch.cat(neg_valid_preds, dim=0)

    pos_test_preds = []
    for perm in DataLoader(range(pos_test_edge.size(0)), batch_size):
        edge = pos_test_edge[perm].t()
        feature_0 = node_features[edge[0], :]
        feature_1 = node_features[edge[1], :]
        pos_test_preds += [predictor(feature_0, feature_1).squeeze().cpu()]
    pos_test_pred = torch.cat(pos_test_preds, dim=0)

    neg_test_preds = []
    for perm in DataLoader(range(neg_test_edge.size(0)), batch_size):
        edge = neg_test_edge[perm].t()
        feature_0 = node_features[edge[0], :]
        feature_1 = node_features[edge[1], :]
        neg_test_preds += [predictor(feature_0, feature_1).squeeze().cpu()]
    neg_test_pred = torch.cat(neg_test_preds, dim=0)

    results = {}
    for K in [20, 50, 100,1000,10000]:
        evaluator.K = K
        valid_hits = evaluator.eval({
            'y_pred_pos': pos_valid_pred,
            'y_pred_neg': neg_valid_pred,
        })[f'hits@{K}']
        test_hits = evaluator.eval({
            'y_pred_pos': pos_test_pred,
            'y_pred_neg': neg_test_pred,
        })[f'hits@{K}']

        results[f'Hits@{K}'] = (valid_hits, test_hits)

    return results

#### Train model

In [None]:
# Evaluator for 'ogbl-ddi' and one Logger per Hits@K cutoff, keyed by the
# same 'Hits@{K}' strings that test_NN uses in its result dict.
# Logger(1): the argument is presumably the number of runs -- confirm in helpers.
evaluator = Evaluator(name='ogbl-ddi')
loggers = {
    f'Hits@{cutoff}': Logger(1)
    for cutoff in (20, 50, 100, 1000, 10000)
}

In [None]:
# Seed PyTorch's global RNG (NumPy is seeded separately for anchor sampling)
# so torch-driven randomness such as DataLoader shuffling is repeatable after
# a kernel restart.
torch.manual_seed(0)

# Init model
predictor = LinkPredictor(in_channels, hidden_channels, out_channels,
                          num_layers, dropout).to(device)
predictor.reset_parameters()
optimizer = torch.optim.Adam(predictor.parameters(), lr=lr)

# Train
# train_NN reads pos_train_edge as a module-level global, so it has to be
# defined before the first call.
pos_train_edge = train_edge.to(device)

for epoch in range(1, num_epochs + 1):
    loss = train_NN(predictor, node_features, edge_index, optimizer, batch_size)

    # Test (validation and test splits are evaluated after every epoch)
    results = test_NN(predictor, node_features, split_edge, batch_size, evaluator, device)

    # Logging and prints
    for key, result in results.items():
        loggers[key].add_result(run, result)
    for key, result in results.items():
        valid_hits, test_hits = result
        print(key)
        print(f'Run: {run + 1:02d}, '
              f'Epoch: {epoch:02d}, '
              f'Loss: {loss:.4f}, '
              f'Valid: {100 * valid_hits:.2f}%, '
              f'Test: {100 * test_hits:.2f}%')
    print('---')

Hits@20
Run: 01, Epoch: 01, Loss: 2.3068, Valid: 3.16%, Test: 3.94%
Hits@50
Run: 01, Epoch: 01, Loss: 2.3068, Valid: 4.47%, Test: 5.34%
Hits@100
Run: 01, Epoch: 01, Loss: 2.3068, Valid: 5.70%, Test: 6.54%
Hits@1000
Run: 01, Epoch: 01, Loss: 2.3068, Valid: 15.55%, Test: 15.37%
Hits@10000
Run: 01, Epoch: 01, Loss: 2.3068, Valid: 45.92%, Test: 47.09%
---
Hits@20
Run: 01, Epoch: 02, Loss: 1.2308, Valid: 0.84%, Test: 0.48%
Hits@50
Run: 01, Epoch: 02, Loss: 1.2308, Valid: 2.60%, Test: 1.85%
Hits@100
Run: 01, Epoch: 02, Loss: 1.2308, Valid: 4.14%, Test: 3.91%
Hits@1000
Run: 01, Epoch: 02, Loss: 1.2308, Valid: 14.12%, Test: 12.94%
Hits@10000
Run: 01, Epoch: 02, Loss: 1.2308, Valid: 44.06%, Test: 45.41%
---
Hits@20
Run: 01, Epoch: 03, Loss: 1.1204, Valid: 5.64%, Test: 5.05%
Hits@50
Run: 01, Epoch: 03, Loss: 1.1204, Valid: 7.54%, Test: 6.88%
Hits@100
Run: 01, Epoch: 03, Loss: 1.1204, Valid: 9.11%, Test: 8.83%
Hits@1000
Run: 01, Epoch: 03, Loss: 1.1204, Valid: 22.44%, Test: 20.86%
Hits@10000
Run:

In [None]:
# Hits@K cutoffs, in the order the CSV files are written.
cutoffs = (10000, 1000, 100, 50, 20)

# Logged results stored under index 0 of each logger.
hits_10000, hits_1000, hits_100, hits_50, hits_20 = [
    loggers[f"Hits@{cutoff}"].results[0] for cutoff in cutoffs
]

# One DataFrame per cutoff.
df_10000, df_1000, df_100, df_50, df_20 = [
    pd.DataFrame(hits)
    for hits in (hits_10000, hits_1000, hits_100, hits_50, hits_20)
]

# Write each frame to its own CSV file (paths are relative to the working directory).
for cutoff, frame in zip(cutoffs, (df_10000, df_1000, df_100, df_50, df_20)):
    frame.to_csv(f"NN_4267_SPD_features_hits_{cutoff}.csv")