# Exercise 4
Due:  Tue November 19, 8:00am

## Node2Vec
1. Implement custom dataset that samples pq-walks
    - Use the utility function from torch_cluster that actually performs the walks
2. Implement Node2Vec module and training
	- Node2Vec essentially consists of a torch.Embedding module and a loss function
3. Evaluate node classification performance on Cora
4. Evaluate on Link Prediction: Cora, PPI
    - use different ways to combine the node two embeddings for link prediction

Bonus Question: are the predictions stable wrt to the random seeds of the walks?

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch_geometric as pyg
from tqdm import tqdm
from torch.utils.data import IterableDataset, DataLoader, Dataset
from torch_cluster import random_walk
import sklearn
import torch_scatter 
import torch_sparse

In [3]:
# find device
if torch.cuda.is_available(): # NVIDIA
    device = torch.device('cuda')
elif torch.backends.mps.is_available(): # apple M1/M2
    device = torch.device('mps') 
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [4]:
dataset = pyg.datasets.Planetoid(root='./dataset/cora', name='Cora')
cora = dataset[0]
dataset = pyg.datasets.PPI(root='./dataset/ppi')
ppi = dataset[0]

  if osp.exists(f) and torch.load(f) != _repr(self.pre_transform):
  if osp.exists(f) and torch.load(f) != _repr(self.pre_filter):
  return torch.load(f, map_location)


## node2vec embedding training
Here the main training and everything on the graph level is happening.

It might be a good idea to create a dataset of walks (fixed for the whole training process) first to get the whole training process running before attempting to create a train_loader that on-demand samples those walks on-demand.

In [None]:
class Node2VecDataset(Dataset):
    def __init__(self, data, p=1.0, q=1.0, walk_length=10, num_walks=10):
        self.data = data
        self.edge_index = data.edge_index
        self.num_nodes = data.num_nodes
        self.p = p
        self.q = q
        self.walk_length = walk_length
        self.num_walks = num_walks

        row, col = self.edge_index
        
        # Precompute the walks
        self.walks = []
        for _ in range(self.num_walks):
            start_nodes = torch.arange(self.num_nodes)
            walks = random_walk(row, col, start_nodes, walk_length=self.walk_length)
            self.walks.append(walks)
        self.walks = torch.cat(self.walks, dim=0)

    def __len__(self):
        return len(self.walks)

    def __getitem__(self, idx):
        return self.walks[idx]

In [15]:
cora_num_classes = cora.y.unique().shape[0]

In [118]:
train_dataloader = DataLoader(Node2VecDataset(cora), shuffle=True, batch_size=64)

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Node2Vec(nn.Module):
    def __init__(self, num_nodes, embedding_dim):
        super(Node2Vec, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_nodes = num_nodes

        # Embedding for nodes (input)
        self.embedding = nn.Embedding(num_nodes, embedding_dim)
        nn.init.xavier_uniform_(self.embedding.weight)

        # Embedding for nodes (output)
        self.context_embedding = nn.Embedding(num_nodes, embedding_dim)
        nn.init.xavier_uniform_(self.context_embedding.weight)

    def forward(self, nodes):
        node_embeds = self.embedding(nodes)
        return node_embeds


In [7]:
def negative_sampling_loss(center_embeddings, context_embeddings, negative_embeddings):
    # Positive score: dot product between center and context embeddings
    pos_score = torch.sum(center_embeddings * context_embeddings, dim=-1)
    pos_loss = F.logsigmoid(pos_score).squeeze()

    # Negative score: dot product between center and negative embeddings
    neg_score = torch.bmm(negative_embeddings, center_embeddings.unsqueeze(2)).squeeze()
    neg_loss = F.logsigmoid(-neg_score).sum(1)

    loss = -(pos_loss + neg_loss).mean()
    return loss


In [8]:
import random

def train_node2vec(model, data_loader, num_nodes, epochs=10, window_size=5, negative_samples=5, lr=0.01, device='cpu'):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.to(device)

    for epoch in range(epochs):
        total_loss = 0
        for walks in data_loader:
            walks = walks.to(device)
            batch_size, walk_length = walks.shape

            # For each position in the walk
            for pos in range(walk_length):
                center_nodes = walks[:, pos]

                # Determine context window
                start = max(0, pos - window_size)
                end = min(walk_length, pos + window_size + 1)
                context_positions = list(range(start, pos)) + list(range(pos + 1, end))
                context_nodes = walks[:, context_positions]

                # Flatten context nodes
                context_nodes = context_nodes.reshape(-1)

                # Get embeddings
                center_embeddings = model.embedding(center_nodes)
                context_embeddings = model.context_embedding(context_nodes)

                # Negative sampling
                negative_nodes = torch.randint(0, num_nodes, (batch_size * len(context_positions), negative_samples), device=device)
                negative_embeddings = model.context_embedding(negative_nodes)

                # Compute loss
                loss = negative_sampling_loss(center_embeddings.repeat_interleave(len(context_positions), dim=0),
                                              context_embeddings,
                                              negative_embeddings)

                # Backpropagation
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

        avg_loss = total_loss / len(data_loader)
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}')


In [10]:
# Parameters
embedding_dim = 128
window_size = 5
negative_samples = 5
epochs = 10
lr = 0.01
batch_size = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
num_nodes = cora.num_nodes
model = Node2Vec(num_nodes, embedding_dim).to(device)

In [13]:
model.load_state_dict(torch.load('./node2vec_model.pth'))

  model.load_state_dict(torch.load('./node2vec_model.pth'))


<All keys matched successfully>

In [None]:
node2vec_dataset = Node2VecDataset(cora, p=1.0, q=1.0, walk_length=80, num_walks=10)
data_loader = DataLoader(node2vec_dataset, batch_size=batch_size, shuffle=True)

train_node2vec(model, data_loader, num_nodes, epochs=epochs, window_size=window_size,
               negative_samples=negative_samples, lr=lr, device=device)


Epoch 1/10, Loss: 60.0469
Epoch 2/10, Loss: 55.0850
Epoch 3/10, Loss: 54.4130
Epoch 4/10, Loss: 54.1472
Epoch 5/10, Loss: 53.9153
Epoch 6/10, Loss: 53.7944
Epoch 7/10, Loss: 53.7160
Epoch 8/10, Loss: 53.6526
Epoch 9/10, Loss: 53.6061
Epoch 10/10, Loss: 53.5482


In [135]:
# (model)
torch.save(model.state_dict(), 'node2vec_model.pth')

## Node classification performance
just a small MLP or even linear layer on the embeddings to predict node classes. Accuracy should be above 60%. Please compare your results to those you achieved with GNNs.

In [62]:
# as the simple MLP is pretty straightforward
cls_model = torch.nn.Sequential(
    torch.nn.Linear(embedding_dim, 256), # Input layer
    torch.nn.ReLU(),
    torch.nn.Linear(256, 128), # Hidden layer 2
    torch.nn.ReLU(),
    torch.nn.Linear(128, cora_num_classes), # Output layer
)

cls_model = cls_model.to(device)

In [63]:
from sklearn.metrics import accuracy_score

In [64]:
optimizer = torch.optim.Adam(cls_model.parameters(), lr=0.001)  # define an optimizer
criterion = torch.nn.CrossEntropyLoss()  # define loss function

node2vec_embeddings = model.embedding.weight.to(device)
cora = cora.to(device)

for epoch in range(200):  # 100 epochs
    cls_model.train()
    optimizer.zero_grad()
    out = cls_model(node2vec_embeddings[cora.train_mask])  # forward pass
    loss = criterion(out, cora.y[cora.train_mask]) 
    loss.backward()  
    optimizer.step()

    # print out loss info
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.3e}")

def get_accuracy(cls_model, embeddings, y, mask):
    out = cls_model(embeddings[mask])
    pred = out.argmax(dim=1)
    acc = accuracy_score(y[mask].cpu().numpy(), pred.cpu().detach().numpy())
    return acc

train_acc = get_accuracy(cls_model, node2vec_embeddings, cora.y, cora.train_mask)
val_acc = get_accuracy(cls_model, node2vec_embeddings, cora.y, cora.val_mask)
test_acc = get_accuracy(cls_model, node2vec_embeddings, cora.y, cora.test_mask)
    
print(f"node classification accuracy for cora: {test_acc:.2f} (train: {train_acc:.2f}, val: {val_acc:.2f})")

Epoch 10, Loss: 1.724e+00
Epoch 20, Loss: 1.224e+00
Epoch 30, Loss: 5.340e-01
Epoch 40, Loss: 1.311e-01
Epoch 50, Loss: 2.840e-02
Epoch 60, Loss: 9.375e-03
Epoch 70, Loss: 4.949e-03
Epoch 80, Loss: 3.428e-03
Epoch 90, Loss: 2.718e-03
Epoch 100, Loss: 2.299e-03
Epoch 110, Loss: 2.007e-03
Epoch 120, Loss: 1.782e-03
Epoch 130, Loss: 1.598e-03
Epoch 140, Loss: 1.443e-03
Epoch 150, Loss: 1.310e-03
Epoch 160, Loss: 1.194e-03
Epoch 170, Loss: 1.094e-03
Epoch 180, Loss: 1.005e-03
Epoch 190, Loss: 9.264e-04
Epoch 200, Loss: 8.567e-04
node classification accuracy for cora: 0.58 (train: 1.00, val: 0.58)


## link prediction on trained embeddings
this should only train simple MLPs.

Note: for link prediction to be worthwhile, one needs to train the embeddings on a subset of the graph (less edges, same nodes) instead of the whole graph.

In [32]:
# for link prediction, do something like the following
link_splitter = pyg.transforms.RandomLinkSplit(is_undirected=True)
train_data, val_data, test_data = link_splitter(cora)
train_data
# the positive and negative edges are in "edge_label_index" with "edge_label" 
# indicating whether an edge is a true edge or not.

Data(x=[2708, 1433], edge_index=[2, 7392], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_label=[7392], edge_label_index=[2, 7392])

In [28]:
test_data

Data(x=[2708, 1433], edge_index=[2, 8446], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_label=[2110], edge_label_index=[2, 2110])

In [None]:
# retrain node2vec on train_data

In [33]:
# use those (new) embeddings for link prediction