### Homework 2: Graph Neural Networks

- Author: Ludek Cizinsky (`ludek.cizinsky@epfl.ch`)

### Notebook setup

In [18]:
# Reload imported modules automatically so that edits to the local scripts/ modules
# are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
# Hugging face util to download dataset
from datasets import load_dataset

# Scikit-learn
from sklearn.model_selection import train_test_split

# PyTorch
import torch
from torch import nn
# - Dataloader
from torch.utils.data import Dataset, DataLoader
# - Optimization
from torch.optim import Adam, SGD
# - Loss
from torch.nn import BCELoss, BCEWithLogitsLoss

# Custom scripts
from scripts.dataset import GraphDataset
from scripts.layers import (
    # Normal
    GraphConv,
    # Graph Sage
    GraphSAGEConv,
    MeanAggregation,
    SumAggregation,
    SqrtDegAggregation,
    MaxPoolAggregation,
    # Graph Attention
    GraphAttentionConv,
    # Pooling
    MeanPool,
    MaxPool,
)

from scripts.architecture import (
    GNN
)

from scripts.utils import (
    train,
    evaluate,
)

### Load the dataset

Some important notes:

- The dataset is a collection of chemical compounds represented as graphs (details are specified below), i.e., each sample is a graph with a corresponding ground-truth label indicating whether the compound is mutagenic or not.
- At a lower level, each node has an associated embedding (a one-hot encoding) indicating its type, and the same goes for the edges; **the dimensions of these two embeddings are different**.

#### Download the dataset from Hugging Face (HF)

In [20]:
# Fetch MUTAG from the Hugging Face Hub, requesting its 'train' split directly
dataset_hf = load_dataset("graphs-datasets/MUTAG", split="train")

#### Train, validation and test split

In [21]:
# Every sample (a dict describing one graph) is an input; its 'y' entry is a
# list of labels that gets flattened into a single list of targets
X = [sample for sample in dataset_hf]
y = [label for sample in X for label in sample['y']]

# First hold out 20% of all samples as the test set
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Then hold out 20% of the remaining samples as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=42
)

#### Load it using a custom dataloader

In [22]:
# Mini-batch size and shuffling flag consumed by the DataLoader definitions below
batch_size, shuffle = 30, True

# Define custom stacking behavior for the dataloader
# def collate_fn(batch):
#     return list(zip(*batch))

# Define custom stacking behavior for the dataloader
def collate_fn(batch):
    """Collate (input, label) pairs into a list of inputs and a label tensor.

    Args:
        batch: sequence of 2-item pairs ``(input, label)``.

    Returns:
        A tuple ``(inputs, labels)`` where ``inputs`` is a plain Python list
        holding the first element of every pair (left un-stacked) and
        ``labels`` is a flat, 1D ``torch.FloatTensor`` built from the second
        elements.
    """
    # Inputs are kept as a list rather than stacked into a tensor
    inputs = [sample for sample, _ in batch]
    targets = [target for _, target in batch]

    # Flatten the labels to 1D and cast them to float
    labels = torch.tensor(targets).view(-1).type(torch.FloatTensor)

    return inputs, labels

# Wrap each split in the project's GraphDataset
train_dataset = GraphDataset(X_train, y_train)
val_dataset = GraphDataset(X_val, y_val)
test_dataset = GraphDataset(X_test, y_test)

# Training loader: the only one that follows the `shuffle` hyperparameter
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=shuffle,
    collate_fn=collate_fn
)

# Evaluation loaders: never shuffled, so validation/test batches are formed
# in the same order on every pass and evaluation is repeatable
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

### Implementation of different graph convolution and pooling layers

See the [layers.py file](scripts/layers.py) for the implementation details.

TODOs:
- [ ] Implement the attention conv

### Custom Network Design with Node Features

In [33]:
# Seed PyTorch's global RNG so that torch-based randomness in this cell
# (e.g. parameter initialisation, DataLoader shuffling) repeats across runs.
# Same seed value as the random_state used for the dataset splits.
torch.manual_seed(42)

# Define the model's architecture
# - input width = length of one node's feature vector in the first training graph
num_features = len(X_train[0]['node_feat'][0])
# - `activation` and `aggregation` are only consumed by the alternative
#   (commented-out) architectures below
activation = nn.LeakyReLU()
pooling = MeanPool()
aggregation = SumAggregation()
dropout = 0.0

# Alternative 1: GraphSAGE layers
# architecture = [
    # [GraphSAGEConv, {'in_features': num_features, 'out_features': 256, 'aggregation': aggregation, 'activation': activation}],
    # [GraphSAGEConv, {'in_features': 256, 'out_features': 128, 'aggregation': aggregation, 'activation': activation}],
# ]
# Alternative 2: plain graph convolution layers
# architecture = [
    # [GraphConv, {'in_features': num_features, 'out_features': 64, 'activation': activation}],
    # [GraphConv, {'in_features': 64, 'out_features': 32, 'activation': activation}],
# ]

# Active choice: graph attention layers, each entry is [layer class, constructor kwargs]
# NOTE(review): the last recorded run of this cell ended with a RuntimeError
# (expand of a [64, 1] tensor to size [64]); where it is raised is not visible
# from this notebook - presumably inside GraphAttentionConv, confirm in scripts/layers.py
architecture = [
    [GraphAttentionConv, {'in_features': num_features, 'out_features': 64}],
    [GraphAttentionConv, {'in_features': 64, 'out_features': 32}],
]

# Define the model
model = GNN(architecture, pooling, dropout=dropout)

# Define hyperparameters for the model
lr = 8e-1
epochs = 5
# The loss takes raw logits; positive-class terms are weighted by 0.7
criterion = BCEWithLogitsLoss(pos_weight=torch.tensor(0.7))
optimizer = SGD(model.parameters(), lr=lr)
# optimizer = Adam(model.parameters(), lr=lr)

# Train the model
train(model, train_loader, criterion, optimizer, num_epochs=epochs)

# Evaluate the model on validation dataset
evaluate(model, val_loader, criterion)

torch.Size([64])


RuntimeError: expand(torch.FloatTensor{[64, 1]}, size=[64]): the number of sizes provided (1) must be greater or equal to the number of dimensions in the tensor (2)