In [1]:
#!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv torch_geometric -f https://data.pyg.org/whl/torch-1.13.0+cpu.html

In [2]:
import torch

## Protein-Protein Interaction (PPI) network from SNAP

This is a protein-protein interaction network that contains physical interactions between proteins that are experimentally documented in humans, such as metabolic enzyme-coupled interactions and signaling interactions. Nodes represent human proteins and edges represent physical interactions between proteins in a human cell.


More information about the dataset can be found at [https://snap.stanford.edu/biodata/datasets/10000/10000-PP-Pathways.html](https://snap.stanford.edu/biodata/datasets/10000/10000-PP-Pathways.html)

In [3]:
import networkx as nx
# Read the comma-separated edge list into a directed graph with integer node ids.
# The context manager closes the file handle as soon as parsing is done
# (the original left it open for the lifetime of the kernel).
# NOTE(review): the edges are loaded as directed although the text above describes
# undirected physical interactions -- confirm that nx.DiGraph is intended.
Graphtype = nx.DiGraph()

with open('data/PP-Pathways_ppi.edges', "r") as Data:
    # next(Data, None)  # uncomment to skip a header line, if the file has one
    G = nx.parse_edgelist(Data, delimiter=',', create_using=Graphtype,
                          nodetype=int)

In [4]:
from torch_geometric.utils import from_networkx

# Convert the networkx graph into a PyG Data object; no node attributes are
# grouped into a feature matrix (group_node_attrs is left at its default, None).
data = from_networkx(G)

In [5]:
# The graph has no node attributes, so give every node the same constant
# 1-dimensional feature. torch.ones allocates the [num_nodes, 1] float tensor
# directly instead of going through a Python list of lists and the legacy
# torch.Tensor constructor.
data.x = torch.ones((data.num_nodes, 1))

### Link Prediction setting

#### Link split and negative sampling

In [6]:
#Perform random link split
from torch_geometric.transforms import RandomLinkSplit

# Seed the Python, NumPy and PyTorch RNGs so that the random edge split and the
# negatively sampled edges are identical on every Restart & Run All.
from torch_geometric import seed_everything
seed_everything(1234567)  # same value the model uses for its own seed

# Hold out 25% of the edges for testing; no validation edges are held out (num_val=0.0).
link_split = RandomLinkSplit(num_val=0.0, num_test=0.25)
train_link, val_link, test_link = link_split(data)

In [7]:
# Display the training split returned by RandomLinkSplit to inspect its fields.
train_link

Data(edge_index=[2, 256765], num_nodes=21557, x=[21557, 1], edge_label=[513530], edge_label_index=[2, 513530])

In [8]:
# edge_label: 1 for an existing (positive) link, 0 for a sampled non-existing (negative) link
# edge_label_index: the positive edges plus the negatively sampled edges that edge_label refers to

### GNN Model

In [9]:
from sklearn.metrics import roc_auc_score

We can re-use the same GNN model!

<img src="../img/CoraGCNLinkPre.png" alt="linkpred_gcn" width="500"/>

In [10]:
from torch_geometric.nn import GCNConv, Linear
import torch.nn.functional as F

In [11]:
class GCN4PPI(torch.nn.Module):
    """Node encoder for link prediction: two linear layers followed by two GCN layers.

    The forward pass maps the node feature matrix to 2-dimensional node
    embeddings; scoring node pairs is left to the caller.

    Args:
        in_channels: Size of the input node features. When ``None`` (default)
            it is read from the notebook-level ``data.num_features``, which is
            what the original hard-coded.
        dropout: Dropout probability applied after each of the first three
            layers (default 0.25, the value previously repeated inline).
    """

    def __init__(self, in_channels=None, dropout=0.25):
        super().__init__()
        torch.manual_seed(1234567)  # manual seed for reproducibility

        if in_channels is None:
            # Backward-compatible default: fall back to the global dataset.
            in_channels = data.num_features
        self.dropout = dropout

        # Layers are created in the same order as before so that the seeded
        # parameter initialisation is unchanged.
        self.lin1 = Linear(in_channels, 64)
        self.lin2 = Linear(64, 32)
        self.conv1 = GCNConv(32, 16)
        self.conv2 = GCNConv(16, 2)

    def reset_parameters(self):
        """Re-initialise the parameters of every layer."""
        self.lin1.reset_parameters()
        self.lin2.reset_parameters()
        self.conv1.reset_parameters()
        self.conv2.reset_parameters()

    def forward(self, x, edge_index):
        """Return the node embeddings.

        Args:
            x: Node feature matrix.
            edge_index: Structure of the graph (edges used for message passing).
        """
        x = self.lin1(x)
        x = x.relu()
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.lin2(x)
        x = x.relu()
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=self.dropout, training=self.training)

        # No activation or dropout on the last layer: its output is the embedding.
        x = self.conv2(x, edge_index)

        return x

In [12]:
# Build the encoder and re-initialise its weights.
model = GCN4PPI()
model.reset_parameters()

# Adam with L2 regularisation (weight decay) over all model parameters.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

# Link prediction is a binary task on raw scores, hence BCE-with-logits.
criterion = torch.nn.BCEWithLogitsLoss()

def train_linkpre():
    """Run one full-batch optimisation step on ``train_link`` and return the loss.

    Uses the notebook-level ``model``, ``optimizer`` and ``criterion``.
    """
    model.train()
    optimizer.zero_grad()  # start from clean gradients

    # Encoder: one forward pass over the training graph gives the node embeddings.
    node_emb = model(train_link.x, train_link.edge_index)

    # Endpoints of every labelled (positive and negative) edge.
    src_idx = train_link.edge_label_index[0]
    dst_idx = train_link.edge_label_index[1]

    # Decoder, option 1: dot product of the two endpoint embeddings.
    logits = (node_emb[src_idx] * node_emb[dst_idx]).sum(dim=-1)

    # Decoder, option 2 (disabled): concatenate the endpoint embeddings instead.
    # pair_emb = torch.cat([node_emb[src_idx], node_emb[dst_idx]], dim=-1)
    # logits = torch.sum(pair_emb, dim=-1)

    # The loss is computed on the labelled training edges.
    targets = train_link.edge_label.type_as(logits)
    loss = criterion(logits, targets)

    loss.backward()   # derive gradients
    optimizer.step()  # update parameters
    return loss

def test_linkpre(test_link):
    """Return the AUROC of the dot-product link predictor on one split.

    Args:
        test_link: A split produced by the link split; its ``x``, ``edge_index``,
            ``edge_label_index`` and ``edge_label`` fields are read.

    Returns:
        The ROC AUC score computed by ``roc_auc_score`` between the edge labels
        and the predicted link probabilities.
    """
    model.eval()

    # Evaluation needs no gradients: skip building the autograd graph for the
    # forward pass and the per-edge scores.
    with torch.no_grad():
        out = model(test_link.x, test_link.edge_index)

        ### LINK PREDICTION ACTS HERE ###
        out_src = out[test_link.edge_label_index[0]]  # embeddings of source nodes
        out_dst = out[test_link.edge_label_index[1]]  # embeddings of destination nodes

        # LINK EMBEDDING #
        # 1 - Dot product
        h = torch.sum(out_src * out_dst, dim=-1)

        # 2 - Concatenation (disabled alternative)
        # out_sim = torch.cat([out_src, out_dst], dim=-1)
        # h = torch.sum(out_sim, dim=-1)

        # Turn the raw scores into probabilities for the metric.
        pred_cont = torch.sigmoid(h).cpu().numpy()

    # EVALUATION
    test_label = test_link.edge_label.cpu().detach().numpy()  # labels of this split
    test_roc_score = roc_auc_score(test_label, pred_cont)  # AUROC for this split

    return test_roc_score


# Train for 100 epochs; each call to train_linkpre() performs one optimisation
# step and returns the loss, which is printed per epoch.
for epoch in range(1, 101):
    loss = train_linkpre()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

Epoch: 001, Loss: 0.6545
Epoch: 002, Loss: 0.8919
Epoch: 003, Loss: 0.6087
Epoch: 004, Loss: 0.6539
Epoch: 005, Loss: 0.6717
Epoch: 006, Loss: 0.6777
Epoch: 007, Loss: 0.6802
Epoch: 008, Loss: 0.6769
Epoch: 009, Loss: 0.6684
Epoch: 010, Loss: 0.6569
Epoch: 011, Loss: 0.6367
Epoch: 012, Loss: 0.6173
Epoch: 013, Loss: 0.6144
Epoch: 014, Loss: 0.6295
Epoch: 015, Loss: 0.6310
Epoch: 016, Loss: 0.6190
Epoch: 017, Loss: 0.6131
Epoch: 018, Loss: 0.6101
Epoch: 019, Loss: 0.6138
Epoch: 020, Loss: 0.6155
Epoch: 021, Loss: 0.6153
Epoch: 022, Loss: 0.6119
Epoch: 023, Loss: 0.6092
Epoch: 024, Loss: 0.6084
Epoch: 025, Loss: 0.6046
Epoch: 026, Loss: 0.6057
Epoch: 027, Loss: 0.6044
Epoch: 028, Loss: 0.6060
Epoch: 029, Loss: 0.6083
Epoch: 030, Loss: 0.6050
Epoch: 031, Loss: 0.6013
Epoch: 032, Loss: 0.6014
Epoch: 033, Loss: 0.6015
Epoch: 034, Loss: 0.6009
Epoch: 035, Loss: 0.6000
Epoch: 036, Loss: 0.6000
Epoch: 037, Loss: 0.5974
Epoch: 038, Loss: 0.5963
Epoch: 039, Loss: 0.5953
Epoch: 040, Loss: 0.5963


In [13]:
# Score the trained model on both splits with the same evaluation function,
# so the train and test AUROC values can be compared side by side.
roc_train = test_linkpre(train_link)
roc_test = test_linkpre(test_link)
print(f'Train AUROC: {roc_train:.4f}\nTest AUROC: {roc_test:.4f}')

Train AUROC: 0.8578
Test AUROC: 0.8570


### Exercise 3 - HadamardMLP
Actually, there are two popular solutions to perform the decoder step of link prediction tasks: the dot product and the HadamardMLP architecture. After theoretical and empirical analysis, [Wang et al. (2022)](https://arxiv.org/abs/2209.10100) find that the HadamardMLP decoders are generally more effective for LP.  

Implement the HadamardMLP decoder into the CoraGCN architecture. The HadamardMLP consists of two steps:
- Compute the Hadamard (element-wise) product between the embeddings of the source nodes and the embeddings of the destination nodes.
- Use an MLP layer to post-process the Hadamard product.  

Note that, for this task, you need to create a CoraGCNLP class whose forward method returns the link prediction scores. Only then can the parameters of the MLP layer be updated during backpropagation.