In [6]:
# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from torch_geometric.data import DataLoader

# OGB
from ogb.graphproppred import PygGraphPropPredDataset

# Utils
import tqdm

In [7]:
# useful stuff
# https://colab.research.google.com/drive/1I8a0DfQ3fI7Njc62__mVXUlcAleUclnb#scrollTo=HvhgQoO8Svw4
# https://ogb.stanford.edu/docs/graphprop/#ogbg-mol

### Data loading

In [8]:
BATCH_SIZE = 32

dataset = PygGraphPropPredDataset(name = "ogbg-molhiv")

split_idx = dataset.get_idx_split() 
train_loader = DataLoader(dataset[split_idx["train"]], batch_size=32, shuffle=True)
valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=32, shuffle=False)
test_loader = DataLoader(dataset[split_idx["test"]], batch_size=32, shuffle=False)

Downloading http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/hiv.zip


Downloaded 0.00 GB: 100%|██████████| 3/3 [00:01<00:00,  1.79it/s]
Processing...


Extracting dataset/hiv.zip
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 41127/41127 [00:00<00:00, 223580.89it/s]


Converting graphs into PyG objects...


100%|██████████| 41127/41127 [00:00<00:00, 53306.90it/s]


Saving...


Done!


In [9]:
# Example of molecule graph and target label
print(dataset[0])
print(dataset.num_node_features)
print(dataset.y)
print(f"Number of graphs: {len(dataset)}")
print(f"Class balance: {dataset.y.sum()}")

Data(edge_index=[2, 40], edge_attr=[40, 3], x=[19, 9], y=[1, 1], num_nodes=19)
9
tensor([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]])
Number of graphs: 41127
Class balance: 1443


### GCN model

In [10]:
class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.lin = nn.Linear(hidden_channels, dataset.num_classes )

    def forward(self, x, edge_index, batch):
        # Conv layers
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        # Readout layer
        x = global_mean_pool(x, batch)
        x = self.lin(x)
        return x

In [11]:
def train(model, optimizer, loss_fun):
    for data in train_loader:
        out = model(data.x.float(), data.edge_index, data.batch)
        loss = loss_fun(out, torch.reshape(data.y, (-1,)))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [12]:
def validate(loader, model):
    model.eval()
    correct = 0
    for data in loader:
        out = model(data.x.float(), data.edge_index, data.batch)  
        pred = out.argmax(dim=1)
        correct += int((pred == torch.reshape(data.y, (-1,))).sum())
    acc = correct / len(loader.dataset)
    return acc

In [13]:
model = GCN(4)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fun = nn.CrossEntropyLoss()

model.train()
for epoch in tqdm.tqdm(range(20)):
    train(model, optimizer, loss_fun)

print(f"Epoch: {epoch}, Train ACC: {validate(train_loader, model)}, Val ACC: {validate(valid_loader, model)}")
    

100%|██████████| 20/20 [00:34<00:00,  1.74s/it]


Epoch: 19, Train ACC: 0.9625543296556336, Val ACC: 0.9803063457330415


In [14]:
print(f"Test ACC: {validate(test_loader, model)}")

Test ACC: 0.9683929005592026


In [15]:
# calculating AUC 

from sklearn.metrics import roc_auc_score


def calc_auc(loader, model):
    model.eval()
    y_true = []
    y_pred = []
    for data in loader:
        out = model(data.x.float(), data.edge_index, data.batch)  
        
        y_true.append(data.y)
        
        # softmax to get probabilities
        y_pred.append(F.softmax(out, dim=1)[:, 1])

    return roc_auc_score(torch.cat(y_true).detach().numpy(), torch.cat(y_pred).detach().numpy())

In [18]:
## Hyperparam training 
hidden_channels = [2,8,16,32]

for hidden_channel in hidden_channels:
    model = GCN(hidden_channel)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    loss_fun = nn.CrossEntropyLoss()

    model.train()
    for epoch in tqdm.tqdm(range(20)):
        train(model, optimizer, loss_fun)

    print(f"hidden_channel: {hidden_channel}")
    print(f"Epoch: {epoch}, Train AUC: {calc_auc(train_loader, model)}, Val AUC: {calc_auc(valid_loader, model)}")
    #print(f"Test ACC: {validate(test_loader, model)}")
    print(f"Test AUC: {calc_auc(test_loader,model)}")

100%|██████████| 20/20 [00:29<00:00,  1.49s/it]


hidden_channel: 2
Epoch: 19, Train AUC: 0.5828548612412564, Val AUC: 0.6083324147560258
Test AUC: 0.6282102782981518


100%|██████████| 20/20 [00:33<00:00,  1.66s/it]


hidden_channel: 8
Epoch: 19, Train AUC: 0.6275752374500362, Val AUC: 0.5756846462864981
Test AUC: 0.6153575773962419


100%|██████████| 20/20 [00:49<00:00,  2.49s/it]


hidden_channel: 16
Epoch: 19, Train AUC: 0.5829023671393181, Val AUC: 0.6076526675485009
Test AUC: 0.6221518376175669


100%|██████████| 20/20 [00:56<00:00,  2.83s/it]


hidden_channel: 32
Epoch: 19, Train AUC: 0.5647432856622048, Val AUC: 0.6231919336664706
Test AUC: 0.5130332760385485


In [26]:
# class unbalance 
print(f'Percent of 1 class : {dataset.y.detach().numpy().sum()/dataset.y.detach().numpy().shape[0]}%')

Percent of 1 class : 0.03508643956524911%
