#### Installation of prereqs

In [None]:
# torch is imported before anything is installed because its version strings are
# read further down in this cell to build the wheel index URLs.
import torch

# Install the OGB package (provides the `ogb.linkproppred` module imported later).
!pip install ogb

def format_pytorch_version(version):
    """Return the part of a PyTorch version string that precedes the first '+'.

    A string without '+' is returned unchanged.
    """
    release, _, _ = version.partition('+')
    return release

# Full version string reported by the installed torch, and its release part
# (used below to pick the matching wheel index page).
TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
    """Turn a CUDA version string into the tag used in wheel index URLs.

    Args:
        version: CUDA version such as '11.1', or None when the installed
            torch build reports no CUDA version.

    Returns:
        'cu' followed by the version with dots removed (e.g. 'cu111'),
        or 'cpu' when `version` is None.
    """
    # CPU-only torch builds report torch.version.cuda as None; calling
    # .replace on it would raise AttributeError, so map it to the CPU wheel tag.
    if version is None:
        return 'cpu'
    return 'cu' + version.replace('.', '')

CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

# Install the PyTorch Geometric extension packages from the wheel index page that
# matches the local torch release ({TORCH}) and CUDA tag ({CUDA}); both values are
# interpolated from the Python variables defined above.
!pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
# torch-geometric itself is installed from the default package index.
!pip install torch-geometric

#### Imports

In [None]:
import random
import networkx as nx

import torch
from torch.utils.data import DataLoader
from torch_geometric.utils import negative_sampling

from ogb.linkproppred import PygLinkPropPredDataset, Evaluator

import pandas as pd

from helpers import *

#### Functions for train and test

In [None]:
def train(model, predictor, edge_attr, x, emb_ea, adj_t, split_edge, optimizer, batch_size):
    """Run one epoch of link-prediction training and return the average loss.

    Args:
        model: Module called as ``model(x, adj_t, edge_attr, emb_ea)`` to produce
            one representation per node.
        predictor: Module called as ``predictor(h_src, h_dst)``; its output is
            treated as a probability (it is passed through ``log`` directly).
        edge_attr: Forwarded unchanged to ``model``.
        x: Node input tensor; ``x.size(0)`` is used as the number of nodes and
            its gradient is clipped together with the module parameters.
        emb_ea: Forwarded unchanged to ``model``.
        adj_t: Graph connectivity, passed to ``model`` and to
            ``negative_sampling`` as the set of existing edges.
        split_edge: Mapping whose ``['train']['edge']`` entry holds the positive
            training edges (presumably one (source, target) pair per row).
        optimizer: Optimizer stepped once per mini-batch.
        batch_size: Number of positive edges per mini-batch.

    Returns:
        Sum of (batch loss * number of positive predictions) divided by the
        total number of positive predictions.
    """
    model.train()
    predictor.train()

    train_pos = split_edge['train']['edge'].to(x.device)
    num_nodes = x.size(0)

    loss_sum, seen = 0, 0
    batches = DataLoader(range(train_pos.size(0)), batch_size, shuffle=True)
    for batch_idx in batches:
        optimizer.zero_grad()

        # Node representations are recomputed for every batch because the
        # parameters change after each optimizer step.
        node_repr = model(x, adj_t, edge_attr, emb_ea)

        # Positive edges: push predicted probability towards 1.
        src, dst = train_pos[batch_idx].t()
        pos_out = predictor(node_repr[src], node_repr[dst])
        pos_loss = -torch.log(pos_out + 1e-15).mean()

        # Negative edges: sample as many as there are positives in this batch
        # and push predicted probability towards 0.
        neg_src, neg_dst = negative_sampling(adj_t, num_nodes=num_nodes,
                                             num_neg_samples=batch_idx.size(0),
                                             method='dense')
        neg_out = predictor(node_repr[neg_src], node_repr[neg_dst])
        neg_loss = -torch.log(1 - neg_out + 1e-15).mean()

        batch_loss = pos_loss + neg_loss
        batch_loss.backward()

        # Each group is clipped separately to an L2 norm of at most 1.
        for params in (x, model.parameters(), predictor.parameters()):
            torch.nn.utils.clip_grad_norm_(params, 1.0)

        optimizer.step()

        n_pos = pos_out.size(0)
        loss_sum += batch_loss.item() * n_pos
        seen += n_pos

    return loss_sum / seen


@torch.no_grad()
def test(model, predictor, edge_attr, x, emb_ea, adj_t, split_edge, evaluator, batch_size):
    model.eval()
    predictor.eval()

    h = model(x, adj_t, edge_attr, emb_ea)

    pos_valid_edge = split_edge['valid']['edge'].to(x.device)
    neg_valid_edge = split_edge['valid']['edge_neg'].to(x.device)
    pos_test_edge = split_edge['test']['edge'].to(x.device)
    neg_test_edge = split_edge['test']['edge_neg'].to(x.device)

    pos_valid_preds = []
    for perm in DataLoader(range(pos_valid_edge.size(0)), batch_size):
        edge = pos_valid_edge[perm].t()
        pos_valid_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()]
    pos_valid_pred = torch.cat(pos_valid_preds, dim=0)

    neg_valid_preds = []
    for perm in DataLoader(range(neg_valid_edge.size(0)), batch_size):
        edge = neg_valid_edge[perm].t()
        neg_valid_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()]
    neg_valid_pred = torch.cat(neg_valid_preds, dim=0)

    pos_test_preds = []
    for perm in DataLoader(range(pos_test_edge.size(0)), batch_size):
        edge = pos_test_edge[perm].t()
        pos_test_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()]
    pos_test_pred = torch.cat(pos_test_preds, dim=0)

    neg_test_preds = []
    for perm in DataLoader(range(neg_test_edge.size(0)), batch_size):
        edge = neg_test_edge[perm].t()
        neg_test_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()]
    neg_test_pred = torch.cat(neg_test_preds, dim=0)

    results = {}
    for K in [20, 50, 100]:
        evaluator.K = K
        valid_hits = evaluator.eval({
            'y_pred_pos': pos_valid_pred,
            'y_pred_neg': neg_valid_pred,
        })[f'hits@{K}']
        test_hits = evaluator.eval({
            'y_pred_pos': pos_test_pred,
            'y_pred_neg': neg_test_pred,
        })[f'hits@{K}']

        results[f'Hits@{K}'] = (valid_hits, test_hits)

    return results

#### Set up model

In [None]:
# Hyper-parameters shared by the model construction and training cells.
args = dict(
    # Index of the CUDA device to use when a GPU is available
    device=0,
    # Architecture
    num_layers=2,
    num_samples=100,
    node_emb=256,
    hidden_channels=256,
    dropout=0.3,
    # Optimisation
    batch_size=64 * 1024,
    lr=0.003,
    epochs=200,
    # Reporting cadence (in epochs) and number of repeated runs
    log_steps=1,
    eval_steps=1,
    runs=1,
)

In [None]:
# Set device
# Use the CUDA device with index args["device"] when a GPU is available, else the CPU.
device = f'cuda:{args["device"]}' if torch.cuda.is_available() else 'cpu'

# Load OGB Dataset
# The first (and here only used) graph of the ogbl-ddi dataset; its edge_index is
# moved to the device, while the train/valid/test edge split is fetched separately.
dataset = PygLinkPropPredDataset(name='ogbl-ddi')
data = dataset[0]
edge_index = data.edge_index.to(device)
split_edge = dataset.get_edge_split()

# Load model
# GraphSAGE and LinkPredictor are not defined in this notebook; presumably they
# come from `helpers` via the star import — verify there for their exact signatures.
model = GraphSAGE(args["node_emb"], args["hidden_channels"], args["hidden_channels"],
                  args["num_layers"], args["dropout"]).to(device)
# Learnable input embedding with one row per node.
emb = torch.nn.Embedding(data["num_nodes"], args["node_emb"]).to(device)
# Second learnable embedding table with args["num_samples"] rows; its weight is
# passed to the model as `emb_ea` during training and evaluation.
emb_ea = torch.nn.Embedding(args["num_samples"], args["node_emb"]).to(device)
predictor = LinkPredictor(args["hidden_channels"], args["hidden_channels"], 1,
                          args["num_layers"]+1, args["dropout"]).to(device)

# Total number of scalar parameters across the model, predictor and both embeddings.
print('Number of parameters:',
      sum(p.numel() for p in list(model.parameters()) +
      list(predictor.parameters()) + list(emb.parameters()) + list(emb_ea.parameters())))

Number of parameters: 1512449


#### Generate Laplacian features

In [None]:
# Set dimensionality k of our Laplacian feature vectors
# NOTE(review): emb_ea is created with args["num_samples"] rows; presumably k has to
# equal that value for the model to combine the two — confirm against helpers.
k = 100

# Get edge list
train_edges = dataset[0]["edge_index"]
edge_list = train_edges.transpose(0, 1).tolist()

# Make graph and get adjacency.
# Every node id is registered explicitly and the matrices are built with a fixed
# nodelist, so that row i always belongs to node id i. Without this, NetworkX orders
# nodes by first appearance in the edge list and drops nodes that have no edge, and
# the rows would no longer line up with the indices stored in edge_index.
node_order = list(range(int(data["num_nodes"])))
G = nx.Graph()
G.add_nodes_from(node_order)
G.add_edges_from(edge_list)
# nx.to_numpy_matrix was removed in NetworkX 3.0; to_numpy_array is its replacement.
A = nx.to_numpy_array(G, nodelist=node_order)
L = nx.normalized_laplacian_matrix(G, nodelist=node_order)

# Get eigen decomposition.
# The normalized Laplacian of an undirected graph is symmetric, so eigh is the right
# routine: it returns real eigenvalues in ascending order and the matching
# eigenvectors as the COLUMNS of U (no complex output, no separate sort needed).
L_dense = L.toarray()
L_tens = torch.tensor(L_dense)
L_cud = L_tens.to(device)
lamb, U = torch.linalg.eigh(L_cud)

# Eigenvalues sorted in ascending order, eigenvector columns in the same order
lamb_real_sorted, U_sorted = lamb, U

# Node features: for every node (row) keep its entries in the k eigenvectors that
# belong to the k smallest eigenvalues. The tensor already lives on `device`.
eigen_features = U_sorted[:, :k].float()

# edge_attr: For all edges in the entire graph, collect the laplacian node eigenfeatures for each node in each pair
edge_attr = eigen_features[edge_index, :]

# edge_attr: We now create an edge feature averaging over the node eigenfeatures for each edge (Two nodes per edge)
edge_attr = edge_attr.mean(0)

# edge_attr: Min max normalize each feature column over all edges
# (the small constant avoids division by zero for constant columns)
a_max = torch.max(edge_attr, dim=0, keepdim=True)[0]
a_min = torch.min(edge_attr, dim=0, keepdim=True)[0]
edge_attr = (edge_attr - a_min) / (a_max - a_min + 1e-6)

#### Set up for training

In [None]:
# Evaluator for the ogbl-ddi link prediction task
evaluator = Evaluator(name='ogbl-ddi')

# One Logger per reported metric, each constructed with the number of runs and the args dict
loggers = {
    metric: Logger(args["runs"], args)
    for metric in ('Hits@20', 'Hits@50', 'Hits@100')
}

In [None]:
# Initialize node embeddings
# The run index doubles as the random seed for both Python's `random` and torch,
# so the statement order below determines which random draws each tensor gets.
run = 0
random.seed(run)
torch.manual_seed(run)
# Re-initialise both embedding tables in place with Xavier-uniform values.
torch.nn.init.xavier_uniform_(emb.weight)
torch.nn.init.xavier_uniform_(emb_ea.weight)
# reset_parameters is defined on the helper modules, not in this notebook;
# presumably it re-initialises their weights — verify in helpers.
model.reset_parameters()
predictor.reset_parameters()
# A single Adam optimizer updates the model, both embedding tables and the predictor.
optimizer = torch.optim.Adam(
    list(model.parameters()) + list(emb.parameters()) +
    list(emb_ea.parameters()) + list(predictor.parameters()), lr=args["lr"])

#### Train

In [None]:
# Add to device
# Make sure the graph, both embedding tables and both modules live on `device`.
edge_index = edge_index.to(device)
emb = emb.to(device)
emb_ea = emb_ea.to(device)
model = model.to(device)
predictor = predictor.to(device)

# Train
# Epochs are numbered from 1 to args["epochs"] inclusive.
for epoch in range(1, 1 + args["epochs"]):
    # One pass over the training edges; the embedding weights are passed as the
    # node inputs (x) and as emb_ea, and edge_index as the graph connectivity.
    loss = train(model, predictor, edge_attr, emb.weight, emb_ea.weight, edge_index, split_edge,
                  optimizer, args["batch_size"])
    
    # Test
    # Evaluate every args["eval_steps"] epochs.
    if epoch % args["eval_steps"] == 0:
        results = test(model, predictor, edge_attr, emb.weight, emb_ea.weight, edge_index, split_edge,
                        evaluator, args["batch_size"])

        # Logging and prints
        # Every evaluation is recorded in the logger of its metric; `result` is the
        # (valid_hits, test_hits) tuple returned by test().
        for key, result in results.items():
            loggers[key].add_result(run, result)
        # Printing only happens on evaluation epochs that are also multiples of
        # args["log_steps"], because this check is nested inside the eval branch.
        if epoch % args["log_steps"] == 0:
            for key, result in results.items():
                valid_hits, test_hits = result
                print(key)
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Valid: {100 * valid_hits:.2f}%, '
                      f'Test: {100 * test_hits:.2f}%')
            print('---')
# After the last epoch, print each metric's summary for this run
# (the summary format is defined by Logger.print_statistics in helpers).
for key in loggers.keys():
    print(key)
    loggers[key].print_statistics(run)

Hits@20
Run: 01, Epoch: 01, Loss: 1.3083, Valid: 0.98%, Test: 1.18%
Hits@50
Run: 01, Epoch: 01, Loss: 1.3083, Valid: 1.51%, Test: 1.81%
Hits@100
Run: 01, Epoch: 01, Loss: 1.3083, Valid: 2.61%, Test: 3.07%
---
Hits@20
Run: 01, Epoch: 02, Loss: 0.9302, Valid: 1.67%, Test: 2.57%
Hits@50
Run: 01, Epoch: 02, Loss: 0.9302, Valid: 2.77%, Test: 3.97%
Hits@100
Run: 01, Epoch: 02, Loss: 0.9302, Valid: 4.21%, Test: 5.93%
---
Hits@20
Run: 01, Epoch: 03, Loss: 0.8228, Valid: 2.05%, Test: 1.87%
Hits@50
Run: 01, Epoch: 03, Loss: 0.8228, Valid: 5.10%, Test: 6.12%
Hits@100
Run: 01, Epoch: 03, Loss: 0.8228, Valid: 8.23%, Test: 8.68%
---
Hits@20
Run: 01, Epoch: 04, Loss: 0.7273, Valid: 3.78%, Test: 5.36%
Hits@50
Run: 01, Epoch: 04, Loss: 0.7273, Valid: 6.66%, Test: 7.65%
Hits@100
Run: 01, Epoch: 04, Loss: 0.7273, Valid: 9.49%, Test: 10.42%
---
Hits@20
Run: 01, Epoch: 05, Loss: 0.6703, Valid: 9.94%, Test: 6.66%
Hits@50
Run: 01, Epoch: 05, Loss: 0.6703, Valid: 13.16%, Test: 11.49%
Hits@100
Run: 01, Epoch: 

In [None]:
# Results recorded for the first run (index 0) of each metric's logger
hits_100, hits_50, hits_20 = (
    loggers[metric].results[0] for metric in ("Hits@100", "Hits@50", "Hits@20")
)

# One DataFrame per metric
df_100, df_50, df_20 = (pd.DataFrame(hits) for hits in (hits_100, hits_50, hits_20))

# Write each metric's table to its own CSV file in the working directory
for frame, csv_path in (
    (df_100, "laplacian_edge_features_hits_100.csv"),
    (df_50, "laplacian_edge_features_hits_50.csv"),
    (df_20, "laplacian_edge_features_hits_20.csv"),
):
    frame.to_csv(csv_path)