<a href="https://colab.research.google.com/github/milagjurovska/PPI-link-prediction-with-optimized-gan/blob/main/ppi_link_prediction_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import LabelEncoder
from torch.nn import Linear
from torch.nn import Module
from torch_geometric import seed_everything
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.utils import from_networkx, to_networkx

In [3]:
# Upload the edge list only when it is not already in the working directory.
# Re-uploading an existing file makes Colab store it under a suffixed name
# (e.g. "PP-Pathways_ppi (2).csv"), while the loading cell reads the unsuffixed
# "PP-Pathways_ppi.csv" -- so a fresh upload would silently be ignored.
DATA_FILE = "PP-Pathways_ppi.csv"
if os.path.exists(DATA_FILE):
    # Nothing to upload; keep `uploaded` defined for any code that inspects it.
    uploaded = {}
else:
    # Colab-only helper, imported lazily so the cell also runs outside Colab
    # whenever the CSV is already present.
    from google.colab import files
    uploaded = files.upload()

Saving PP-Pathways_ppi.csv to PP-Pathways_ppi (2).csv


## Creating the graph

In [4]:
# Load the edge list: a header-less CSV with one edge per row.
edge_columns = ['source', 'target']
df = pd.read_csv("PP-Pathways_ppi.csv", header=None, names=edge_columns)

# Discard rows where either endpoint is missing.
df = df.dropna(subset=edge_columns)

# Map every node identifier that appears as an endpoint to a contiguous
# integer id, fitting the encoder on both columns at once.
all_nodes = pd.concat([df['source'], df['target']], axis=0)
le = LabelEncoder().fit(all_nodes)
df = df.assign(
    source=le.transform(df['source']),
    target=le.transform(df['target']),
)

# Build the undirected NetworkX graph and report its size.
G = nx.from_pandas_edgelist(df, 'source', 'target')
node_count = G.number_of_nodes()
print(f"Nodes: {node_count}, Edges: {G.number_of_edges()}")

# Convert to a PyTorch Geometric `Data` object and record the node count explicitly.
G_data = from_networkx(G)
G_data.num_nodes = node_count

Nodes: 21557, Edges: 342353


## Splitting the data into training, validation, and test sets

In [6]:
# Seed the Python, NumPy and PyTorch RNGs before the first stochastic step so
# that the random node features and the random edge split below are repeatable
# under "Restart & Run All".
SEED = 42
seed_everything(SEED)

# Randomly initialised node features: one `embed_size`-dimensional vector per node.
# NOTE(review): X requires grad, but the optimizer created later in this
# notebook only receives `model.parameters()`, so these features are never
# updated -- confirm whether they were meant to be trained.
embed_size = 5
X = torch.nn.Embedding(G.number_of_nodes(), embed_size).weight
X = X.requires_grad_(True)
G_data.x = X

# Hold out 10% of the edges for validation and 20% for testing, and attach one
# sampled negative edge per positive edge (also for the training split).
transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.2,
    is_undirected=True,
    disjoint_train_ratio=0.0,
    neg_sampling_ratio=1.0,
    add_negative_train_samples=True
)
train_data, val_data, test_data = transform(G_data)
print(f"Original edges: {G_data.edge_index.shape[1]}")
print(f"Train edges: {train_data.edge_index.shape[1]}")
print(f"Train labels: {train_data.edge_label_index.shape[1]}")

Original edges: 680989
Train edges: 479296
Train labels: 479296


## Creating the GCN model

In [7]:
class GCNLinkPredictor(torch.nn.Module):
    """Two-layer GCN encoder paired with a dot-product edge decoder."""

    def __init__(self, in_channels, hidden_channels):
        """Create the two graph-convolution layers.

        Args:
            in_channels: Size of each input node feature vector.
            hidden_channels: Output size of both convolution layers.
        """
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)

    def encode(self, x, edge_index):
        """Return node embeddings: ``conv1`` -> ReLU -> ``conv2``.

        Args:
            x: Node feature tensor passed to the first convolution.
            edge_index: Graph connectivity passed to both convolutions.
        """
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

    def decode(self, z, edge_label_index):
        """Score candidate edges by the dot product of their endpoint embeddings.

        Args:
            z: Node embeddings, indexed along the first dimension by node id.
            edge_label_index: Index tensor whose first row holds one endpoint
                of every candidate edge and whose second row holds the other.

        Returns:
            One raw (pre-sigmoid) score per candidate edge.
        """
        row, col = edge_label_index[0], edge_label_index[1]
        return torch.sum(z[row] * z[col], dim=-1)


In [8]:
# Size the first GCN layer from `embed_size` (the node-feature dimension set
# when the features were created) instead of repeating the literal 5, so that
# changing the feature size cannot silently break the model.
model = GCNLinkPredictor(in_channels=embed_size, hidden_channels=256)
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [9]:
def train():
    """Run one full-batch optimisation step on ``train_data``.

    Encodes the training graph, scores the labelled edges in
    ``train_data.edge_label_index`` and minimises the sum of the mean binary
    cross-entropy over positive edges (label 1) and over all remaining
    (negative) edges.

    Uses the module-level ``model``, ``optimizer`` and ``train_data``.

    Returns:
        float: The total loss of this step.
    """
    model.train()
    optimizer.zero_grad()

    z = model.encode(train_data.x, train_data.edge_index)

    edge_index = train_data.edge_label_index
    pos_mask = train_data.edge_label == 1

    pos_out = model.decode(z, edge_index[:, pos_mask])
    neg_out = model.decode(z, edge_index[:, ~pos_mask])

    # Compute the loss directly from the logits. This equals
    # -log(sigmoid(x)) / -log(1 - sigmoid(x)) averaged over the edges, but
    # stays numerically stable and keeps a useful gradient when the sigmoid
    # saturates, which a manual `log(sigmoid(x) + eps)` does not.
    pos_loss = F.binary_cross_entropy_with_logits(pos_out, torch.ones_like(pos_out))
    neg_loss = F.binary_cross_entropy_with_logits(neg_out, torch.zeros_like(neg_out))
    total_loss = pos_loss + neg_loss

    total_loss.backward()
    optimizer.step()

    return total_loss.item()

In [10]:
def test(data):
    """Return predicted link probabilities for the labelled edges of ``data``.

    Puts the module-level ``model`` in evaluation mode, encodes the graph in
    ``data`` and scores ``data.edge_label_index`` without tracking gradients.

    Returns:
        numpy.ndarray: Sigmoid of the raw edge scores, one value per edge.
    """
    model.eval()
    with torch.no_grad():
        embeddings = model.encode(data.x, data.edge_index)
        logits = model.decode(embeddings, data.edge_label_index)
        return logits.sigmoid().cpu().numpy()

In [11]:
# Number of optimisation steps to run and how many predictions to preview.
NUM_EPOCHS = 6
NUM_SAMPLE_PREDS = 7

# Baseline: predictions of the untrained model on the test edges.
print("\n=== Before training ===")
test_probs = test(test_data)
# The label is derived from the constant so it always matches the slice length
# (it previously claimed "first 5" while seven values were printed).
print(f"Sample test predictions (first {NUM_SAMPLE_PREDS}):\n{test_probs[:NUM_SAMPLE_PREDS]}")

for epoch in range(1, NUM_EPOCHS + 1):
    loss = train()
    print(f"Epoch {epoch}, Loss: {loss:.4f}")
    # Preview the same leading test edges after every step.
    test_probs = test(test_data)
    print(f"Sample predictions: {test_probs[:NUM_SAMPLE_PREDS].tolist()}...")


=== Before training ===
Sample test predictions (first 5):
[0.530382   0.52972996 0.50987554 0.5071415  0.51766694 0.51650465
 0.55038464]
Epoch 1, Loss: 1.3797
Sample predictions: [0.681304931640625, 0.663168728351593, 0.5835955142974854, 0.56560218334198, 0.6317839622497559, 0.5940040946006775, 0.7804179787635803]...
Epoch 2, Loss: 1.4358
Sample predictions: [0.5435724854469299, 0.5380550026893616, 0.5182057023048401, 0.5127427577972412, 0.5226582288742065, 0.5166904926300049, 0.5658537745475769]...
Epoch 3, Loss: 1.3608
Sample predictions: [0.8385013937950134, 0.8025381565093994, 0.6362372040748596, 0.6009599566459656, 0.6945065855979919, 0.6121864318847656, 0.9179043769836426]...
Epoch 4, Loss: 1.4658
Sample predictions: [0.6765326857566833, 0.6509907841682434, 0.5616182088851929, 0.5455015897750854, 0.5877314805984497, 0.5516918897628784, 0.7435969114303589]...
Epoch 5, Loss: 1.3428
Sample predictions: [0.5872396230697632, 0.5701577663421631, 0.52924644947052, 0.5214332938194275,