<a href="https://colab.research.google.com/github/milagjurovska/PPI-link-prediction-with-optimized-gcn/blob/main/ppi_link_prediction_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from torch_geometric.data import Data
import torch
from torch_geometric.transforms import RandomLinkSplit
from sklearn.metrics import roc_auc_score, average_precision_score
from torch_geometric.nn import GCNConv
from torch_geometric.utils import from_networkx, to_networkx
from sklearn.preprocessing import LabelEncoder
import torch.optim as optim
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [3]:
import os

from google.colab import files

# Colab does not overwrite an existing file on upload: a repeated upload is
# stored under a suffixed name (e.g. "PP-Pathways_ppi (4).csv"), while the
# loading cell keeps reading "PP-Pathways_ppi.csv". Only prompt for an upload
# when the dataset is not already in the working directory, so re-running the
# notebook neither piles up copies nor silently reads a stale one.
# To replace the dataset, delete the existing file first and re-run this cell.
if os.path.exists("PP-Pathways_ppi.csv"):
    uploaded = {}
else:
    uploaded = files.upload()

Saving PP-Pathways_ppi.csv to PP-Pathways_ppi (4).csv


## Creating the graph

In [4]:
# Load the raw edge list: each row holds the two endpoint identifiers of one edge.
# NOTE(review): header=None treats the first row as data - confirm the CSV has
# no header row.
df = pd.read_csv("PP-Pathways_ppi.csv", sep=',', header=None, names=['source', 'target'])

# Discard rows with a missing endpoint (rebinding rather than mutating in place).
df = df.dropna(subset=['source', 'target'])

# Map the node identifiers onto contiguous integer ids 0..N-1. The encoder is
# fitted on both columns so an identifier gets the same id on either side.
encoder = LabelEncoder()
all_nodes = pd.concat([df['source'], df['target']], axis=0)
encoder.fit(all_nodes)
df = df.assign(
    source=encoder.transform(df['source']),
    target=encoder.transform(df['target']),
)

# from_networkx numbers nodes by their position in G.nodes(), i.e. by insertion
# order. Building the graph straight from the edge list would insert nodes in
# order of first appearance, so PyG node index i would NOT be encoder id i.
# Inserting every id in ascending order first keeps the two aligned, which
# makes encoder.inverse_transform usable on PyG node indices.
G = nx.Graph()
G.add_nodes_from(range(len(encoder.classes_)))
G.add_edges_from(zip(df['source'].tolist(), df['target'].tolist()))
print(f"Nodes: {G.number_of_nodes()}, Edges: {G.number_of_edges()}")

# Convert to a PyG Data object; num_nodes is set explicitly because the graph
# carries no node features at this point.
G_data = from_networkx(G)
G_data.num_nodes = G.number_of_nodes()

Nodes: 21557, Edges: 342353


## Splitting the data into training, validation, and test sets

In [5]:
import random

# Both the node features and the edge split below are random. Seed every RNG
# that may be involved so that Restart & Run All reproduces the same numbers.
# NOTE(review): which RNG negative sampling draws from depends on the installed
# PyG version - Python, NumPy and torch are all seeded to be safe.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Node features: one randomly initialised embed_size-dimensional vector per node.
embed_size = 5
X = torch.nn.Embedding(G.number_of_nodes(), embed_size).weight
# NOTE(review): X requires grad, but the optimizer created in this notebook is
# built from model.parameters() only, so these features keep their random
# initial values during training. Add X to the optimizer if they are meant to
# be learned - confirm intent.
X = X.requires_grad_(True)
G_data.x = X

# Hold out 10% of the edges for validation and 20% for testing. One negative
# (non-existent) edge is sampled per positive edge, for the training split too.
transform = RandomLinkSplit(
  num_val=0.1,
  num_test=0.2,
  is_undirected=True,
  disjoint_train_ratio=0.0,
  neg_sampling_ratio=1.0,
  add_negative_train_samples=True
)
train_data, val_data, test_data = transform(G_data)

## Creating the GCN model

In [6]:
class GCNLinkPredictor(torch.nn.Module):
    """Two-layer GCN encoder paired with a dot-product edge decoder.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output size of both graph convolution layers, and
            therefore the size of the node embeddings.
    """

    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)

    def encode(self, x, edge_index):
        """Return node embeddings: conv1, then ReLU, then conv2 over ``edge_index``."""
        hidden = torch.relu(self.conv1(x, edge_index))
        embeddings = self.conv2(hidden, edge_index)
        return embeddings

    def decode(self, z, edge_label_index):
        """Score each candidate edge as the dot product of its endpoint embeddings.

        Row 0 of ``edge_label_index`` indexes the source nodes and row 1 the
        destination nodes; the result holds one raw (pre-sigmoid) score per column.
        """
        source_rows = edge_label_index[0]
        target_rows = edge_label_index[1]
        pairwise = z[source_rows] * z[target_rows]
        return torch.sum(pairwise, dim=-1)


In [7]:
# Take the input width from the feature tensor itself instead of repeating the
# literal 5, so changing the embedding size cannot cause a shape mismatch in
# the first convolution.
model = GCNLinkPredictor(in_channels=train_data.x.size(-1), hidden_channels=256)
# Adam over the GCN weights only (the node feature tensor is not included here).
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [8]:
def train():
  """Run one full-batch optimisation step on the training split.

  Encodes all nodes with the global ``model``, scores the positive and negative
  supervision edges of the global ``train_data``, and minimises the sum of the
  mean binary cross-entropy over the positive edges and over the negative edges.

  Returns:
      float: The total loss (positive + negative term) before the weight update.
  """
  model.train()
  # Clear gradients left over from the previous step before building the graph.
  optimizer.zero_grad()

  z = model.encode(train_data.x, train_data.edge_index)

  edge_index = train_data.edge_label_index
  pos_mask = train_data.edge_label == 1

  pos_out = model.decode(z, edge_index[:, pos_mask])
  neg_out = model.decode(z, edge_index[:, ~pos_mask])

  # -log(sigmoid(x)) and -log(1 - sigmoid(x)) == -log(sigmoid(-x)), computed
  # with logsigmoid. The naive log(sigmoid(x) + eps) form saturates in float32:
  # for a badly wrong logit sigmoid rounds to exactly 0 or 1, the loss is
  # clamped at -log(eps) and its gradient vanishes, so the worst predictions
  # stop contributing to learning.
  log_sigmoid = torch.nn.functional.logsigmoid
  pos_loss = -log_sigmoid(pos_out).mean()
  neg_loss = -log_sigmoid(-neg_out).mean()
  total_loss = pos_loss + neg_loss

  total_loss.backward()
  optimizer.step()

  return total_loss.item()

In [9]:
@torch.no_grad()
def test(data):
  model.eval()
  z = model.encode(data.x, data.edge_index)
  scores = model.decode(z, data.edge_label_index)
  probs = torch.sigmoid(scores)
  return probs.cpu().numpy()

In [10]:
# Baseline: predictions of the untrained model on the test split.
print("\n=== Before training ===")
test_probs = test(test_data)
print(f"Sample test predictions (first 5):\n{test_probs[:5]}")

# Number of full-batch training steps to run.
NUM_EPOCHS = 6
# The ground-truth labels do not change between epochs, so convert them once.
true_labels = test_data.edge_label.cpu().numpy()

# NOTE(review): metrics are reported on the test split after every epoch and
# val_data is never used in this cell - consider monitoring on val_data and
# reporting the test split once at the end.
print("\n=== After training ===")
for epoch in range(1, NUM_EPOCHS + 1):
  loss = train()
  test_probs = test(test_data)
  auc = roc_auc_score(true_labels, test_probs)
  # NOTE(review): with one negative per positive edge, an F1 of ~0.667 is what
  # labelling every edge positive yields - the fixed 0.5 threshold may be badly
  # calibrated for these scores; confirm.
  test_preds = (test_probs >= 0.5).astype(int)
  f1 = f1_score(true_labels, test_preds)
  print(f"Epoch {epoch}, Loss: {loss:.4f}, AUC score: {auc:.4f}, F1 score: {f1:.4f}")
  print(f"Sample predictions: {test_probs[:5].tolist()}...")



=== Before training ===
Sample test predictions (first 5):
[0.5223207  0.5043248  0.51440567 0.50635105 0.52099824]

=== After training ===
Epoch 1, Loss: 1.3796, AUC score: 0.7335, F1 score: 0.6667
Sample predictions: [0.6776979565620422, 0.5590129494667053, 0.6165188550949097, 0.5706403255462646, 0.6880089044570923]...
Epoch 2, Loss: 1.4315, AUC score: 0.7695, F1 score: 0.6715
Sample predictions: [0.542576253414154, 0.512789249420166, 0.5313093066215515, 0.5176675915718079, 0.5499387383460999]...
Epoch 3, Loss: 1.3576, AUC score: 0.7799, F1 score: 0.6667
Sample predictions: [0.8036202788352966, 0.6052180528640747, 0.7256286144256592, 0.6090024709701538, 0.8410631418228149]...
Epoch 4, Loss: 1.5019, AUC score: 0.7885, F1 score: 0.6667
Sample predictions: [0.6257538199424744, 0.5383560061454773, 0.5903383493423462, 0.5425963401794434, 0.6510991454124451]...
Epoch 5, Loss: 1.3381, AUC score: 0.8284, F1 score: 0.6707
Sample predictions: [0.5530480742454529, 0.5156046748161316, 0.5413348