# Exercise 2
Due:  Tue November 5, 8:00am

In [1]:
import torch
import torch_geometric as pyg
from torch_geometric.data import DataLoader
from ogb.graphproppred import PygGraphPropPredDataset
from ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder
import torch_scatter
import matplotlib.pyplot as plt
import networkx as nx

# Determine the computational device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# GCN Layer definition
class GCNLayer(torch.nn.Module):
    def __init__(self, in_features, out_features, activation=torch.nn.functional.relu):
        super(GCNLayer, self).__init__()
        self.linear = torch.nn.Linear(in_features, out_features)
        self.activation = activation

    def forward(self, H, edge_index, edge_weight=None):
        edge_index, _ = pyg.utils.add_self_loops(edge_index, num_nodes=H.size(0))
        row, col = edge_index
        deg = pyg.utils.degree(row, H.size(0), dtype=H.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col] * edge_weight

        H = self.linear(H)
        H = H[col] * norm.unsqueeze(-1)
        H = torch_scatter.scatter_add(H, row, dim=0)

        if self.activation:
            H = self.activation(H)
        return H

# GraphNet definition using AtomEncoder and BondEncoder
class GraphNet(torch.nn.Module):
    def __init__(self, num_node_types, num_edge_types, out_features, hidden_features=32, activation=torch.nn.functional.relu, dropout=0.1):
        super(GraphNet, self).__init__()
        self.embedding = AtomEncoder(hidden_features)
        self.edge_embedding = BondEncoder(hidden_features)
        self.gcn1 = GCNLayer(hidden_features, hidden_features, activation)
        self.gcn2 = GCNLayer(hidden_features, hidden_features, activation)
        self.dropout = torch.nn.Dropout(dropout)
        self.linear = torch.nn.Linear(hidden_features, out_features)

    def forward(self, x, edge_index, edge_attr, batch=None):
        H = self.embedding(x)
        edge_weights = self.edge_embedding(edge_attr)
        H = self.gcn1(H, edge_index, edge_weights)
        H = self.gcn2(H, edge_index, edge_weights)
        H = self.dropout(H)
        if batch is not None:
            H = pyg.nn.global_mean_pool(H, batch)
        out = self.linear(H)
        return out.squeeze()

# Load ZINC dataset
print("Loading ZINC dataset...")
dataset = PygGraphPropPredDataset(name="ogbg-molhiv", root="dataset/")
split_idx = dataset.get_idx_split()
train_loader = DataLoader(dataset[split_idx["train"]], batch_size=32, shuffle=True)
val_loader = DataLoader(dataset[split_idx["valid"]], batch_size=32, shuffle=False)
test_loader = DataLoader(dataset[split_idx["test"]], batch_size=32, shuffle=False)
print("ZINC dataset loaded.")

# Initialize the GraphNet model
model = GraphNet(num_node_types=28, num_edge_types=3, out_features=1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

# Training loop
def train():
    model.train()
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data.x, data.edge_index, data.edge_attr, data.batch)
        loss = criterion(output, data.y)
        loss.backward()
        optimizer.step()

def evaluate(loader):
    model.eval()
    correct = 0
    for data in loader:
        data = data.to(device)
        with torch.no_grad():
            pred = model(data.x, data.edge_index, data.edge_attr, data.batch)
            correct += ((pred > 0) == data.y).sum().item()
    return correct / len(loader.dataset)

# Training and evaluation
for epoch in range(10):
    train()
    val_acc = evaluate(val_loader)
    print(f'Epoch: {epoch+1}, Validation Accuracy: {val_acc:.4f}')

# Testing the model
test_acc = evaluate(test_loader)
print(f'Test Accuracy: {test_acc:.4f}')


Using device: cuda
Loading ZINC dataset...
ZINC dataset loaded.


  self.data, self.slices = torch.load(self.processed_paths[0])


RuntimeError: The size of tensor a (2523) must match the size of tensor b (32) at non-singleton dimension 1

In this exercise, we use sparse message passing to make our networks scale to larger graphs. 


1) In this exercise we are working with the node-classification dataset Cora and the graph regression dataset ZINC. When working with a new dataset, it makes sense to at least quickly look into the data and some statistics for it. So for Cora: which is the second-biggest label class and what does it stand for? And for ZINC: how many HCO molecules (i.e. molecules consisting only of Hydrogen, Carbon, and Oxygen) are in the train set?

1) When working on the Cora dataset your model should at least reach an accuracy of 0.6 (an accuracy of 0.7-0.8 is well within reach).
Cora is a node classification dataset, so there is only one graph and we perform message passing on the whole graph (but evaluate the loss only on the nodes selected by cora_graph.train_mask).
The dataset is mostly balanced, so we evaluate the accuracy.
When implementing the message passing step, keep in mind that the graph does not contain self-loops (so one needs to somehow treat the "old" state).
Since Cora is small enough to be run with dense tensors too, you can verify your implementation this way.

2) ZINC is a small molecular regression dataset. Please compare the difference in performance between the (trainable) Atomencoder provided by ogb and the one-hot encoding you implemented in the first exercise.
Note that since you need batches, you need to modify the pooling layer to respect the batches.


In [2]:
# find device
if torch.cuda.is_available(): # NVIDIA
    device = torch.device('cuda')
elif torch.backends.mps.is_available(): # apple M1/M2
    device = torch.device('mps') 
else:
    device = torch.device('cpu')
device

device(type='mps')

## Cora

In [194]:
cora = pyg.datasets.Planetoid(root = "dataset/cora", name="Cora")
cora_graph = cora[0]
cora_dense_adj = pyg.utils.to_dense_adj(cora_graph.edge_index).to(device)
# cora_graph.x = cora_graph.x.unsqueeze(0) # Add an empty batch dimension. I needed that for compatibility with MolHIV later.
cora_graph = cora_graph.to(device)

In [180]:
def get_accuracy(model, cora, mask):
    model.eval()
    with torch.no_grad():
        outputs = model(cora_graph.x, cora_graph.edge_index)
    correct = (outputs[mask].argmax(-1) == cora_graph.y[mask]).sum()
    return int(correct) / int(mask.sum())

In [179]:
class GCNLayer(torch.nn.Module):
    def __init__(self, in_features: int, out_features: int, activation=torch.nn.functional.relu):
        super(GCNLayer, self).__init__()
        raise NotImplementedError

    def forward(self, H: torch.Tensor, edge_index: torch.Tensor):
        raise NotImplementedError

In [202]:
class GraphNet(torch.nn.Module):
    def __init__(self, in_features:int, out_features:int, hidden_features:int, activation=torch.nn.functional.relu, dropout=0.1):
        super(GraphNet, self).__init__()
        raise NotImplementedError

    def forward(self, H: torch.Tensor, edge_index: torch.Tensor):
        raise NotImplementedError

        

In [1]:
# Training loop goes here

## ZINC

In [3]:
# Load the dataset
dataset = pyg.datasets.ZINC(root='dataset/ZINC', split='train', subset=True)
dataset_val = pyg.datasets.ZINC(root='dataset/ZINC', split='val', subset=True)
dataset_test = pyg.datasets.ZINC(root='dataset/ZINC', split='test', subset=True)

# Create data loaders
batch_size=128
num_workers = 8
train_loader = pyg.loader.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_loader = pyg.loader.DataLoader(dataset_val, batch_size=batch_size, shuffle=False, num_workers=num_workers)
test_loader = pyg.loader.DataLoader(dataset_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)

In [2]:
# your implementation goes here