# Dense GNN implementation

In this exercise we are implementing a GNN from scratch using dense matrices.
Note that as the memory requirement of a dense matrix scales quadratically with the number of nodes in a graph, this limits us to datasets with only small graphs. 

We will use the following dataset molHIV.

For the network we need a message-passing layer and pooling function.

1. Describe the datasets in your own words. Also talk about its features and statistical properties of the graphs and labels.
1. Implement the class GCNLayer to perform one round of message passing. You may use any variant of message passing here.
1. Implement a pooling layer like MeanPooling or SumPooling (or both).
1. Implement a one-hot-encoding of the atom type (this will positively affect classification performance)
1. Implement the model class GraphGCN that builds upon your GCNLayer and Pooling layer.
1. Create and train a GraphGCN model on MolHIV. As MOlHIV is highly imbalanced, it will make sense to adapt class weights in your loss function.

For the dataset molHIV we aim to reach something like 0.64 ROC (or higher). Note that for me the training was quite unstable, so several runs got stuck at 0.5.

Note: In this exercise, we use PyG only for utilities and not to build models. Feel free to edit/ignore any of the provided code as you see fit.

In [109]:
import torch
import torch_geometric as pyg
import numpy as np
from ogb.graphproppred import PygGraphPropPredDataset,Evaluator

from tqdm import tqdm

In [110]:
# find device
if torch.cuda.is_available(): # NVIDIA
    device = torch.device('cuda')
elif torch.backends.mps.is_available(): # apple silicon
    device = torch.device('mps') 
else:
    device = torch.device('cpu') # fallback
device

device(type='cuda')

In [111]:
class GCNLayer(torch.nn.Module):
    def __init__(self, in_features: int, out_features: int, activation=torch.nn.functional.relu):
        super(GCNLayer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.activation = activation
        self.linear = torch.nn.Linear(in_features, out_features, bias=False)

    def forward(self, H: torch.Tensor, adj: torch.Tensor):
        # Normalize adjacency matrix
        adj = adj + torch.eye(adj.size(0)).to(adj.device)  # Add self-loops
        D = torch.diag(torch.sum(adj, dim=1))  # Degree matrix
        D_inv_sqrt = torch.pow(D, -0.5)
        D_inv_sqrt[torch.isinf(D_inv_sqrt)] = 0.0
        adj_norm = D_inv_sqrt @ adj @ D_inv_sqrt

        # Apply GCN transformation
        H = self.linear(H)
        H = adj_norm @ H
        if self.activation:
            H = self.activation(H)
        return H


In [112]:
class MeanPooling(torch.nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, H: torch.Tensor):
        # Since we're not provided with batch indices, we assume H contains nodes from a single graph
        graph_embedding = H.mean(dim=0)
        return graph_embedding


In [113]:
class SumPooling(torch.nn.Module):
    def __init__(self):
        super(SumPooling, self).__init__()

    def forward(self, H: torch.Tensor):
        graph_embedding = H.sum(dim=0)
        return graph_embedding


In [114]:
class GraphGCN(torch.nn.Module):
    def __init__(self, in_features: int, hidden_features: int, num_classes: int, activation=torch.nn.functional.relu):
        super(GraphGCN, self).__init__()
        self.gcn1 = GCNLayer(in_features, hidden_features, activation)
        self.gcn2 = GCNLayer(hidden_features, hidden_features, activation)
        self.pool = MeanPooling()  # You can choose SumPooling() instead
        self.classifier = torch.nn.Linear(hidden_features, num_classes)

    def forward(self, H_in: torch.Tensor, adj: torch.Tensor):
        H = self.gcn1(H_in, adj)
        H = self.gcn2(H, adj)
        graph_embedding = self.pool(H)
        out = self.classifier(graph_embedding)
        return out


## MolHIV

Pytorch Geometric stores its graphs in a sparse format using the variable edge_index.
We will thus need to create our own (torch) dataloader and extract the graphs into dense adjacency matrices.

In terms of model accuracy, it really helped me to add an "Atom encoding", i.e. a one-hot-encoding of the atoms instead of just having the atomic numbers appear in the first column of the node features.

In [115]:
class GraphDataset(torch.utils.data.Dataset):
    def __init__(self, adjacencies, features, targets):
        self.adjacencies = torch.tensor(adjacencies, dtype=torch.float32)
        self.features = torch.tensor(features, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.long)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return self.adjacencies[idx], self.features[idx], self.targets[idx]

    def num_features(self):
        return self.features.shape[-1]

    def compute_class_weights(self):
        """
        Computes class weights inversely proportional to the class frequencies.
        
        Returns:
            torch.Tensor: A tensor containing the weight for each class.
        """
        # Count the number of samples per class
        class_counts = torch.bincount(self.targets)
        num_classes = len(class_counts)

        # Handle cases where a class might be missing in the dataset
        if num_classes < 2:
            raise ValueError("The dataset must have at least two classes for classification.")

        total_samples = self.targets.size(0)

        # Compute class weights: weight = total_samples / (num_classes * class_count)
        class_weights = total_samples / (num_classes * class_counts.float())

        # Normalize class weights to have a mean of 1.0 (optional)
        class_weights = class_weights * (num_classes / class_weights.sum())

        return class_weights


In [116]:
def extract_graphs_and_features(dataset):
    adjacencies = []
    features = []
    targets = []
    atoms_to_index = {}  # Optional mapping if needed

    # Determine the total number of unique atom types for one-hot encoding
    all_atom_types = []
    for data in dataset:
        atom_types = data.x[:, 0].long()  # Adjust index if atom type is in a different column
        all_atom_types.append(atom_types)
    all_atom_types = torch.cat(all_atom_types)
    num_atom_types = all_atom_types.max().item() + 1  # Assuming atom types are zero-indexed

    for data in dataset:
        # Extract node features and target labels
        x = data.x  # Node features tensor of shape [num_nodes, num_node_features]
        y = data.y  # Target label tensor

        # One-hot encode the atom types
        atom_types = x[:, 0].long()  # Adjust index if necessary
        one_hot_atom_types = torch.zeros((atom_types.size(0), num_atom_types))
        one_hot_atom_types[torch.arange(atom_types.size(0)), atom_types] = 1.0

        # Optionally include other node features
        # For simplicity, we'll use only the one-hot encoded atom types
        features_combined = one_hot_atom_types

        # Convert edge_index to a dense adjacency matrix
        edge_index = data.edge_index  # Edge indices tensor of shape [2, num_edges]
        num_nodes = x.size(0)
        adj = torch.zeros((num_nodes, num_nodes), dtype=torch.float32)
        adj[edge_index[0], edge_index[1]] = 1.0

        # Append to the lists
        adjacencies.append(adj)
        features.append(features_combined)
        targets.append(y.squeeze())  # Ensure targets are scalar

    return adjacencies, features, targets, atoms_to_index


### Create Data Loaders for MolHIV

In [117]:
batch_size = 32

molHIV = PygGraphPropPredDataset(name = "ogbg-molhiv") 
split_idx = molHIV.get_idx_split() 
all_adjacencies, all_features, all_targets, atoms_to_index = extract_graphs_and_features(molHIV)
all_targets = all_targets.to(torch.int64)

# Create datasets using split_idx indices
graph_dataset = GraphDataset(all_adjacencies, all_features, all_targets)
train_dataset = torch.utils.data.Subset(graph_dataset, split_idx["train"])
val_dataset = torch.utils.data.Subset(graph_dataset, split_idx["valid"])
test_dataset = torch.utils.data.Subset(graph_dataset, split_idx["test"]) 

# Create DataLoaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


  self.data, self.slices = torch.load(self.processed_paths[0])


AttributeError: 'list' object has no attribute 'to'

### Model and Training for MolHIV

The evaluation of MolHIV (and all other datasets from ogb) should happen through an Evaluator. You can also try playing around with learning rate schedulers.

In [None]:
evaluator = Evaluator(name='ogbg-molhiv')

def evaluate(model, loader):
    model.eval()

    y_true = list()
    y_pred = list()

    for adjacencies, features, targets in loader:
        adjacencies, features = adjacencies.to(device), features.to(device)

        with torch.no_grad():
            pred = model(features, adjacencies)
        y_pred.append(pred.argmax(dim=-1, keepdims=True))
        y_true.append(targets)

    y_true = torch.cat(y_true, dim=0).detach().cpu()
    y_pred = torch.cat(y_pred, dim=0).detach().cpu()

    input_dict = {"y_true": y_true, "y_pred": y_pred}

    return evaluator.eval(input_dict)['rocauc']

In [None]:
# Model definition and Training loop
raise NotImplementedError