# Exercise 2
Due:  Tue November 5, 8:00am

In [28]:
import torch
import torch_geometric as pyg
import numpy as np
import scipy
from ogb.graphproppred import PygGraphPropPredDataset
# from ogb.graphproppred.mol_encoder import AtomEncoder
import ogb
import torch_scatter
import copy

import time
import random
from tqdm import tqdm

In this exercise, we use sparse message passing to make our networks scale to larger graphs. 


1) In this exercise we work with the node-classification dataset Cora and the graph-regression dataset ZINC. When working with a new dataset, it is worth taking at least a quick look at the data and some basic statistics. For Cora: which label class is the second largest, and what does it stand for? For ZINC: how many HCO molecules (i.e., molecules consisting only of hydrogen, carbon, and oxygen) are in the train set?

2) When working on the Cora dataset, your model should reach an accuracy of at least 0.6 (an accuracy of 0.7-0.8 is well within reach).
Cora is a node classification dataset, so there is only one graph and we perform message passing on the whole graph (but evaluate the loss only on the nodes selected by `cora_graph.train_mask`).
The dataset is mostly balanced, so we use accuracy as the evaluation metric.
When implementing the message passing step, keep in mind that the graph does not contain self-loops (so the "old" state of each node has to be handled explicitly).
Since Cora is small enough to be processed with dense tensors as well, you can verify your sparse implementation against a dense one.

3) ZINC is a small molecular regression dataset. Please compare the performance of the (trainable) AtomEncoder provided by ogb with that of the one-hot encoding you implemented in the first exercise.
Note that since you now work with batches of graphs, you need to modify the pooling layer so that it pools each graph in a batch separately.


In [2]:
# Select the compute device: CUDA (NVIDIA) first, then MPS (Apple M1/M2), else CPU.
# The MPS check is only evaluated when CUDA is unavailable.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)
device

device(type='mps')

## Cora

In [194]:
# Load Cora through the Planetoid wrapper and take its first graph.
cora = pyg.datasets.Planetoid(root = "dataset/cora", name="Cora")
cora_graph = cora[0]
# Dense adjacency, useful for cross-checking the sparse implementation.
# The node count is passed explicitly: by default to_dense_adj infers it from the
# largest index in edge_index, so trailing isolated nodes would be dropped and the
# adjacency would no longer line up with the rows of cora_graph.x.
cora_dense_adj = pyg.utils.to_dense_adj(
    cora_graph.edge_index, max_num_nodes=cora_graph.num_nodes
).to(device)
# cora_graph.x = cora_graph.x.unsqueeze(0) # Add an empty batch dimension. I needed that for compatibility with MolHIV later.
cora_graph = cora_graph.to(device)

In [180]:
def get_accuracy(model, cora, mask):
    """Return the accuracy of ``model`` on the nodes of ``cora_graph`` selected by ``mask``.

    The model is evaluated in eval mode without gradient tracking, and its
    previous train/eval mode is restored before returning.

    Args:
        model: ``torch.nn.Module`` called as ``model(x, edge_index)``. Its output is
            indexed with ``mask`` and arg-maxed over the last dimension to obtain
            the predicted labels.
        cora: Currently unused. The function always evaluates on the module-level
            ``cora_graph``; the parameter is kept so existing call sites keep working.
        mask: Node mask used to index both the model output and ``cora_graph.y``;
            its sum is used as the number of evaluated nodes (presumably a boolean
            mask such as ``cora_graph.train_mask``).

    Returns:
        float: number of correctly predicted masked nodes divided by ``mask.sum()``.

    Raises:
        ZeroDivisionError: if ``mask`` selects no nodes.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(cora_graph.x, cora_graph.edge_index)
    finally:
        # Restore the caller's mode. Leaving the model in eval mode would silently
        # disable dropout for the remainder of a training loop.
        model.train(was_training)
    correct = (outputs[mask].argmax(-1) == cora_graph.y[mask]).sum()
    return int(correct) / int(mask.sum())

In [179]:
class GCNLayer(torch.nn.Module):
    """Graph-convolution layer working on an edge list (exercise stub).

    Both ``__init__`` and ``forward`` are placeholders: they raise
    ``NotImplementedError`` until the exercise is implemented.
    """

    def __init__(self, in_features: int, out_features: int, activation=torch.nn.functional.relu):
        """Initialise the layer (not implemented yet).

        Args:
            in_features: presumably the size of each input node feature vector.
            out_features: presumably the size of each output node feature vector.
            activation: callable, ReLU by default; presumably applied to the
                layer output.

        Raises:
            NotImplementedError: always, after the base ``Module`` is initialised.
        """
        super().__init__()
        raise NotImplementedError()

    def forward(self, H: torch.Tensor, edge_index: torch.Tensor):
        """Run one message-passing step (not implemented yet).

        Args:
            H: node feature tensor (assumed one row per node; TODO confirm).
            edge_index: graph connectivity (assumed shape ``(2, num_edges)`` as
                produced by PyTorch Geometric; TODO confirm).

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()

In [202]:
class GraphNet(torch.nn.Module):
    """Graph neural network taking node features and an edge list (exercise stub).

    Both ``__init__`` and ``forward`` are placeholders: they raise
    ``NotImplementedError`` until the exercise is implemented.
    """

    def __init__(self, in_features:int, out_features:int, hidden_features:int, activation=torch.nn.functional.relu, dropout=0.1):
        """Initialise the network (not implemented yet).

        Args:
            in_features: presumably the size of each input node feature vector.
            out_features: presumably the size of the per-node output
                (e.g. the number of classes; TODO confirm).
            hidden_features: presumably the width of the hidden layers.
            activation: callable, ReLU by default.
            dropout: defaults to 0.1; presumably the dropout probability.

        Raises:
            NotImplementedError: always, after the base ``Module`` is initialised.
        """
        super().__init__()
        raise NotImplementedError()

    def forward(self, H: torch.Tensor, edge_index: torch.Tensor):
        """Compute the network output for a graph (not implemented yet).

        Args:
            H: node feature tensor (assumed one row per node; TODO confirm).
            edge_index: graph connectivity (assumed shape ``(2, num_edges)`` as
                produced by PyTorch Geometric; TODO confirm).

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()

        

In [1]:
# Training loop goes here

## ZINC

In [3]:
# ZINC subset, one dataset object per split (constructed in train, val, test order).
dataset, dataset_val, dataset_test = (
    pyg.datasets.ZINC(root='dataset/ZINC', split=split_name, subset=True)
    for split_name in ('train', 'val', 'test')
)

# Mini-batch loaders; only the training loader shuffles.
batch_size=128
num_workers = 8
train_loader, val_loader, test_loader = (
    pyg.loader.DataLoader(split_data, batch_size=batch_size, shuffle=do_shuffle, num_workers=num_workers)
    for split_data, do_shuffle in (
        (dataset, True),
        (dataset_val, False),
        (dataset_test, False),
    )
)

In [2]:
# your implementation goes here