# Imports

In [53]:
import torch 
import numpy 
import matplotlib
import matplotlib.pyplot as plt
from ogb.linkproppred import PygLinkPropPredDataset, Evaluator
import torch_geometric 
import myutils
import models
import networkx as nx
import random
#from models import SAGE,DotProductLinkPredictor
from torch_geometric.nn import GCNConv, SAGEConv
from torch_geometric.utils import negative_sampling,convert
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
dataset_name='ogbl-ddi'
dataset=PygLinkPropPredDataset(name=dataset_name)
data=dataset[0]
adj_t=PygLinkPropPredDataset(name=dataset_name,transform=torch_geometric.transforms.ToSparseTensor('coo'))[0].adj_t.to(device)
#use to_undirected to get some metrics
G=convert.to_networkx(data,to_undirected=True)



cpu


In [42]:
initial_embeddings=torch.ones(data.num_nodes, 1).to(device=device)
split_edge=dataset.get_edge_split()

In [43]:
# Initialize our model and LinkPredictor
hidden_dimension = 256
model = models.SAGE(1, hidden_dimension, hidden_dimension, 7, 0.5).to(device)
predictor = models.DotProductLinkPredictor().to(device)

# Run our initial "node features" through the GNN to get node embeddings
model.eval()
predictor.eval()
h = model(initial_embeddings, adj_t)

# Randomly sample some training edges and pass them through our basic predictor
torch.manual_seed(1955)
idx = torch.randperm(split_edge['train']['edge'].size(0))[:10]
edges = split_edge['train']['edge'][idx].t()
predictor(h[edges[0]], h[edges[1]])

tensor([0.7311, 0.7311, 0.7311, 0.7311, 0.7311, 0.7311, 0.7310, 0.7311, 0.7311,
        0.7311], grad_fn=<SigmoidBackward0>)

In [44]:
def create_train_batch(all_pos_train_edges,perm,edge_index):
    pos_edges=all_pos_train_edges[perm].t().to(device)

    #produce as many negative edges as positive edges
    neg_edges=negative_sampling(edge_index, num_neg_samples=perm.shape[0], method='dense').to(device)
    training_edges=torch.cat([pos_edges, neg_edges], dim=1)

    pos_labels=torch.ones(pos_edges.shape[1], dtype=torch.float, device=device)
    neg_labels=torch.zeros(neg_edges.shape[1], dtype=torch.float, device=device)

    training_labels=torch.cat([pos_labels, neg_labels], dim=0).to(device)

    return training_edges, training_labels

Example of a training batch of size 64 produces a training batch with size 128--> produces 64 real edges and 64 fake edges (total 64 training examples)
src_edges= [1 X 128]-->64 real and 64 fake edges of source
dest_edges=[1 X 128]--> 64 real and 64 fake destinations

training edges=[src_edges,
                dest_edges]= [ 2 X 
                                [1 X 128]
                                ]

training_labels=[1 X 128] 64 ones and 64 zeroes

In [45]:
batch_size=64
kk,kl=create_train_batch(split_edge['train']['edge'],torch.randperm(n=split_edge['train']['edge'].size(0))[:batch_size],data.edge_index)

for(src_node,dst_node) in list(zip(kk[0],kk[1])):

    preds = nx.jaccard_coefficient(G, [(src_node.item(), dst_node.item())])
    print(list(*preds)[2]) 


0.4797570850202429
0.5159165751920965
0.11585760517799353
0.3251088534107402
0.4250886524822695
0.4798792756539235
0.280359820089955
0.24956672443674177
0.3878787878787879
0.2648648648648649
0.23351063829787233
0.6697530864197531
0.36221498371335503
0.6227951153324288
0.33683749452474815
0.061630218687872766
0.18518518518518517
0.3333333333333333
0.14728148657949072
0.5658914728682171
0.4575342465753425
0.28593272171253825
0.5910326086956522
0.31241655540720964
0.24108416547788872
0.26693629929221435
0.13542688910696762
0.18377088305489261
0.14265734265734265
0.19063360881542699
0.35687453042824946
0.3344343517753922
0.28254847645429365
0.13083497698882315
0.3620689655172414
0.2061114439784302
0.5133950316609839
0.4028203556100552
0.28233351678591084
0.7471698113207547
0.4275109170305677
0.18896321070234115
0.6666666666666666
0.11558854718981973
0.3962025316455696
0.3932788374205268
0.3033075299085151
0.10325318246110325
0.23142250530785563
0.18181818181818182
0.3749235474006116
0.4088

In [46]:

preds = nx.jaccard_coefficient(G, [(kk[0][0].item(), kk[0][1].item())])
list(*preds)[2]

0.1391170431211499

In [59]:
def train(model, predictor, x, adj_t, split_edge, loss_fn, optimizer, batch_size, num_epochs,edge_model=False, spd=None):
  # adj_t isn't used everywhere in PyG yet, so we switch back to edge_index for negative sampling
  # row, col, edge_attr = adj_t.t().coo()
  # edge_index = torch.stack([row, col], dim=0)

  edge_index=PygLinkPropPredDataset(name='ogbl-ddi')[0].edge_index.to(device)
  model.train()
  predictor.train()

  model.reset_parameters()
  predictor.reset_parameters()
  loss_per_epoch = []
  all_pos_train_edges = split_edge['train']['edge']
  for epoch in range(num_epochs):
    epoch_total_loss = 0
    for perm in DataLoader(range(all_pos_train_edges.shape[0]), batch_size,
                           shuffle=True):
      optimizer.zero_grad()

      train_edge, train_label = create_train_batch(all_pos_train_edges, perm, edge_index)

      if edge_model:
        h=model(x,edge_index,spd)
      else:
        h = model(x, adj_t)

      # Get predictions for our batch and compute the loss
      preds = predictor(h[train_edge[0]], h[train_edge[1]])
      loss = loss_fn(preds, train_label)

      epoch_total_loss += loss.item()

      # Update our parameters
      # pass the loss of the current training batch backwards
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      torch.nn.utils.clip_grad_norm_(predictor.parameters(), 1.0)
      optimizer.step()
    loss_per_epoch.append(epoch_total_loss)
    myutils.draw_metric_per_epoch(loss_per_epoch, "Loss per epoch", "Loss", "Epoch", "loss_per_epoch")
    print(f'Epoch {epoch} has loss {round(epoch_total_loss, 4)}')

In [61]:

#turn off gradient tracking for test
@torch.no_grad()
def test(model, predictor, x, adj_t, split_edge, evaluator, batch_size, edge_model=False, spd=None):
    model.eval()
    predictor.eval()

    if edge_model:
        edge_index = PygLinkPropPredDataset(name='ogbl-ddi')[0].edge_index.to(device)
        h = model(x, edge_index, spd)
    else:
        h = model(x, adj_t)

    pos_eval_edge = split_edge['edge'].to(device)
    neg_eval_edge = split_edge['edge_neg'].to(device)

    pos_eval_preds = []
    for perm in DataLoader(range(pos_eval_edge.shape[0]), batch_size):
        edge = pos_eval_edge[perm].t()
        pos_eval_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()]
    pos_eval_pred = torch.cat(pos_eval_preds, dim=0)

    neg_eval_preds = []
    for perm in DataLoader(range(neg_eval_edge.size(0)), batch_size):
        edge = neg_eval_edge[perm].t()
        neg_eval_preds += [predictor(h[edge[0]], h[edge[1]]).squeeze().cpu()]
    neg_eval_pred = torch.cat(neg_eval_preds, dim=0)

    total_preds = torch.cat((pos_eval_pred, neg_eval_pred), dim=0)
    labels = torch.cat((torch.ones_like(pos_eval_pred), torch.zeros_like(neg_eval_pred)), dim=0)
    acc = models.BinaryAccuracy(total_preds, labels)

    results = {}
    for K in [10, 20, 30, 40, 50]:
        evaluator.K = K
        valid_hits = evaluator.eval({
            'y_pred_pos': pos_eval_pred,
            'y_pred_neg': neg_eval_pred,
        })[f'hits@{K}']
        results[f'Hits@{K}'] = (valid_hits)
    results['Accuracy'] = acc

    return results
eval = Evaluator(name='ogbl-ddi')
# ogb Evaluators can be invoked to get their expected format
print(eval.expected_input_format) 

==== Expected input format of Evaluator for ogbl-ddi
{'y_pred_pos': y_pred_pos, 'y_pred_neg': y_pred_neg}
- y_pred_pos: numpy ndarray or torch tensor of shape (num_edges, ). Torch tensor on GPU is recommended for efficiency.
- y_pred_neg: numpy ndarray or torch tensor of shape (num_edges, ). Torch tensor on GPU is recommended for efficiency.
y_pred_pos is the predicted scores for positive edges.
y_pred_neg is the predicted scores for negative edges.
Note: As the evaluation metric is ranking-based, the predicted scores need to be different for different edges.


In [None]:
# SAGE AND CONCAT NEURAL LINK PREDICTOR
# Initialize our model and LinkPredictor
hidden_dimension = 256
model = models.SAGE(1, hidden_dimension, hidden_dimension, 7, 0.5).to(device)
predictor = models.ConcatNeuralLinkPredictor(hidden_dimension,hidden_dimension,1,4,0.5).to(device)

# Run our initial "node features" through the GNN to get node embeddings
model.eval()
predictor.eval()
h = model(initial_embeddings, adj_t)


# SAGE AND neural LINK PREDICTOR
optimizer = torch.optim.Adam(
            list(model.parameters())  +
            list(predictor.parameters()), lr=0.01)
train(model, predictor, initial_embeddings, adj_t, split_edge, torch.nn.BCELoss(), 
      optimizer, 64*1024, 10)
test(model, predictor, initial_embeddings, adj_t, split_edge["valid"], Evaluator(name='ogbl-ddi'), 64*1024)

In [None]:
# SAGE AND NEURAL LINK PREDICTOR
# Initialize our model and LinkPredictor
hidden_dimension = 256
model = models.SAGE(1, hidden_dimension, hidden_dimension, 7, 0.5).to(device)
predictor = models.NeuralLinkPredictor(hidden_dimension,hidden_dimension,1,4,0.5).to(device)

# Run our initial "node features" through the GNN to get node embeddings
model.eval()
predictor.eval()
h = model(initial_embeddings, adj_t)


# SAGE AND neural LINK PREDICTOR
optimizer = torch.optim.Adam(
            list(model.parameters())  +
            list(predictor.parameters()), lr=0.01)
train(model, predictor, initial_embeddings, adj_t, split_edge, torch.nn.BCELoss(), 
      optimizer, 64*1024, 10)
test(model, predictor, initial_embeddings, adj_t, split_edge["valid"], Evaluator(name='ogbl-ddi'), 64*1024)


In [None]:
# SAGE AND DOT LINK PREDICTOR
optimizer = torch.optim.Adam(
            list(model.parameters())  +
            list(predictor.parameters()), lr=0.01)
train(model, predictor, initial_embeddings, adj_t, split_edge, torch.nn.BCELoss(), 
      optimizer, 64*1024, 5)
test(model, predictor, initial_embeddings, adj_t, split_edge["valid"], Evaluator(name='ogbl-ddi'), 64*1024)

# Why our basic SAGE model performs poorly
The node embeddings that are produced do not have any information regarding other measures such as centrality. We will try and enhance our initial embeddings with other metrics before passing them to our SAGE model


In [None]:
# create embedding with some metrics
# change the function to get training batches to generate additional measures
# for the embeddings

def train_w_metrics(model, predictor, x, adj_t, split_edge, loss_fn, optimizer, batch_size, num_epochs):
  # adj_t isn't used everywhere in PyG yet, so we switch back to edge_index for negative sampling
  # row, col, edge_attr = adj_t.t().coo()
  # edge_index = torch.stack([row, col], dim=0)

  edge_index=PygLinkPropPredDataset(name='ogbl-ddi')[0].edge_index.to(device)
  model.train()
  predictor.train()

  model.reset_parameters()
  predictor.reset_parameters()
  loss_per_epoch = []
  all_pos_train_edges = split_edge['train']['edge']
  for epoch in range(num_epochs):
    epoch_total_loss = 0
    for perm in DataLoader(range(all_pos_train_edges.shape[0]), batch_size,
                           shuffle=True):
      optimizer.zero_grad()

      train_edge, train_label = create_train_batch(all_pos_train_edges, perm, edge_index)
      
      # calculate embeddings and metrics for all the nodes in the training batch
      
      for(src_node,dst_node) in list(zip(train_edge[0],train_edge[1])):

        preds = nx.jaccard_coefficient(G, [(src_node.item(), dst_node.item())])
        score=list(*preds)[2] 
        x[src_node][5]=score 
        x[dst_node][5]=score
        #x[_][5]=nx.resource_allocation_index(G,[(train_edge[0][_],train_edge[1][_])])
        #x[_][6]=nx.adamic_adar_index(G,[(train_edge[0][_],train_edge[1][_])])
        #x[_][7]=nx.preferential_attachment(G,[(train_edge[0][_],train_edge[1][_])])
        


      #pass the augmented embeddings into sage to transform them
      h = model(x, adj_t)

      # Get predictions for our batch and compute the loss
      preds = predictor(h[train_edge[0]], h[train_edge[1]])
      loss = loss_fn(preds, train_label)

      epoch_total_loss += loss.item()

      # Update our parameters
      # pass the loss of the current training batch backwards
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      torch.nn.utils.clip_grad_norm_(predictor.parameters(), 1.0)
      optimizer.step()
    loss_per_epoch.append(epoch_total_loss)
    myutils.draw_metric_per_epoch(loss_per_epoch, "Loss per epoch", "Loss", "Epoch", "loss_per_epoch")
    print(f'Epoch {epoch} has loss {round(epoch_total_loss, 4)}')

In [50]:
#the saved txt are 0-based to be consistent with the node ids in the graph

augmented_embeddings = torch.ones((len(G.nodes)), 5, dtype=torch.float64).to(device)
clustering_coef_dict=myutils.load_data_from_txt("clustering_coef")
betweenness_dict=myutils.load_data_from_txt("betweeness_centrality")
pagerank_dict=myutils.load_data_from_txt("pagerank")
for i in range(G.number_of_nodes()):
    augmented_embeddings[i][0]=clustering_coef_dict[i]
    augmented_embeddings[i][1]=betweenness_dict[i]
    augmented_embeddings[i][2]=pagerank_dict[i]
    augmented_embeddings[i][3]=G.degree[i]
    augmented_embeddings[i][4]=1.0
augmented_embeddings=augmented_embeddings.float()


In [None]:
# SAGE AND NEURAL LINK PREDICTOR and augmented embeddings
# Initialize our model and LinkPredictor
hidden_dimension = 16
model = models.SAGE(5, hidden_dimension, hidden_dimension, 5, 0.5).to(device)
predictor = models.NeuralLinkPredictor(hidden_dimension,hidden_dimension,1,4,0.5).to(device)

# Run our initial "node features" through the GNN to get node embeddings
model.eval()
predictor.eval()
augmented_init_embeddings=augmented_embeddings
#h = model(augmented_init_embeddings, adj_t)


# SAGE AND neural LINK PREDICTOR
optimizer = torch.optim.Adam(
            list(model.parameters())  +
            list(predictor.parameters()), lr=0.005)
train(model, predictor, augmented_init_embeddings, adj_t, split_edge, torch.nn.BCELoss(), 
      optimizer, 64*1024, 100)
test(model, predictor, augmented_init_embeddings, adj_t, split_edge["valid"], Evaluator(name='ogbl-ddi'), 64*1024)

In [None]:
# Sage for embeddings and LinkPredictorEdgeInfo for prediction
# Take the embeddings from the SAGE model and use spatial information from anchor nodes
#before making a prediction
# get the ShortestPathDistance for each node to the anchor nodes
K = 200
sampled_nodes = sorted(random.sample(G.nodes, K))
num_nodes = G.number_of_nodes()
spd = torch.ones(num_nodes, K, dtype=torch.float64).to(device)
for k in range(K):
  distance_from_sample_k_to_all_nodes = nx.shortest_path_length(G, source=sampled_nodes[k])
  for node in distance_from_sample_k_to_all_nodes:
    spd[node][k] = distance_from_sample_k_to_all_nodes[node]
spd = spd.float()
spd

In [None]:
hidden_dimension = 16
model = models.EdgeSAGE(5, hidden_dimension, hidden_dimension, 5, 0.5).to(device)
predictor = models.NeuralLinkPredictor(hidden_dimension,hidden_dimension,1,4,0.5).to(device)
predictor=models.NeuralLinkPredictor(hidden_dimension,hidden_dimension,1,4,0.5).to(device)
# Run our initial "node features" through the GNN to get node embeddings
model.eval()
predictor.eval()
augmented_init_embeddings=augmented_embeddings
#h = model(augmented_init_embeddings, adj_t)


# SAGE AND neural LINK PREDICTOR
optimizer = torch.optim.Adam(
            list(model.parameters())  +
            list(predictor.parameters()), lr=0.005)
train(model, predictor, augmented_init_embeddings, adj_t, split_edge, torch.nn.BCELoss(), 
      optimizer, 64*1024, 100,edge_model=True,spd=spd)
test(model, predictor, augmented_init_embeddings, adj_t, split_edge["valid"], Evaluator(name='ogbl-ddi'), 64*1024,edge_model=True,spd=spd)