In [39]:
%load_ext autoreload
%autoreload 2

from torch_geometric.datasets import QM9
import torch_geometric.transforms as T
import torch
from torch_geometric.loader import DataLoader
from data_utils import SelectQM9TargetProperties, create_qm9_data_split, SelectQM9NodeFeatures
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

transform = T.Compose([
    SelectQM9TargetProperties(properties=["homo", "lumo"]),
    SelectQM9NodeFeatures(features=["atom_type"]),
    T.ToDevice(device=device)
])

dataset = QM9(root="./data", transform=transform)

train_dataset, val_dataset, test_dataset = create_qm9_data_split(dataset=dataset)

print(f"Training dataset size = {len(train_dataset)}")
print(f"Validation dataset size = {len(val_dataset)}")
print(f"Test dataset size = {len(test_dataset)}")

num_node_features = dataset.num_node_features
num_targets = dataset.num_classes
num_edge_features = dataset.num_edge_features

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
cuda
Training dataset size = 104665
Validation dataset size = 13083
Test dataset size = 13083


## Create Dataloaders

In [40]:
from typing import List
import math

batch_size = 128

dataloaders = {
    "train_single": DataLoader(train_dataset[:1], batch_size=batch_size, shuffle=True),
    "train_tiny": DataLoader(train_dataset[:16], batch_size=batch_size, shuffle=True),
    "train_small": DataLoader(train_dataset[:4096], batch_size=batch_size, shuffle=True),
    "train": DataLoader(train_dataset, batch_size=batch_size, shuffle=True),

    "val_small": DataLoader(val_dataset[:512], batch_size=batch_size, shuffle=False),
    "val": DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
}

def create_validation_subset_loaders(validation_dataset, subset_count):
    """ Create random subsets of the validation set for fast validation. """
    validation_subsets = []
    generator = torch.manual_seed(420)
    validation_indices = torch.randperm(len(validation_dataset), generator=generator).tolist()
    subset_size = math.ceil(len(validation_dataset) / subset_count)
    for i in range(subset_count):
        start_index = subset_size * i
        end_index = min(subset_size * (i + 1), len(validation_dataset))
        val_subset = torch.utils.data.Subset(validation_dataset, validation_indices[start_index:end_index])
        validation_subsets.append(DataLoader(val_subset, batch_size=batch_size, shuffle=False))
    return validation_subsets

val_subset_count = 32
dataloaders["val_subsets"] = create_validation_subset_loaders(validation_dataset=val_dataset, subset_count=32)

## Baseline model (mean prediction)

### Training

In [41]:
import torch.nn as nn

train_loader = dataloaders["train"]
homo_mean_list = []
lumo_mean_list = []
for batch in tqdm(train_loader):
    batch_mean = torch.mean(batch.y, dim=0)
    homo_mean_list.append(batch_mean[0])
    lumo_mean_list.append(batch_mean[1])

homo_mean_pred = torch.tensor(homo_mean_list).mean()
lumo_mean_pred = torch.tensor(lumo_mean_list).mean()
print(f"HOMO Mean = {homo_mean_pred}")
print(f"LUMO Mean = {lumo_mean_pred}")

class MeanPredictor(nn.Module):
    def __init__(self, property_mean_values: List[float]):
        super().__init__()
        # create mean prediction and add batch dimension
        self.register_buffer('mean_prediction', torch.tensor(property_mean_values).unsqueeze(0))

    def forward(self, x):
        batch_size = x.y.shape[0]
        return self.mean_prediction.expand(batch_size, -1)

baseline_model = MeanPredictor(property_mean_values=[homo_mean_pred, lumo_mean_pred]).to(device)

  0%|          | 0/818 [00:00<?, ?it/s]

100%|██████████| 818/818 [00:42<00:00, 19.25it/s]


HOMO Mean = -6.53601598739624
LUMO Mean = 0.3205300271511078


### Validation

In [42]:
def evaluate_model_performance(validation_loader, model):
    mae_sum = 0
    for batch in tqdm(validation_loader):
        prediction = model(batch)
        mae_sum += torch.mean(torch.abs(prediction - batch.y), dim=0)

    mean_absolute_error = mae_sum / len(validation_loader)

    print(f"HOMO MAE (mean prediction) = {mean_absolute_error[0]}")
    print(f"LUMO MAE (mean prediction) = {mean_absolute_error[1]}")

val_loader = dataloaders["val"]
evaluate_model_performance(validation_loader=val_loader, model=baseline_model)

  0%|          | 0/103 [00:00<?, ?it/s]

100%|██████████| 103/103 [00:07<00:00, 14.52it/s]

HOMO MAE (mean prediction) = 0.44092249870300293
LUMO MAE (mean prediction) = 1.0540993213653564





## Graph Property Predictor

In [63]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool, NNConv

class GraphPropertyPredictor(nn.Module):
    def __init__(self, num_node_features: int, num_edge_features: int, num_targets: int):
        super().__init__()
        
        conv_features = 16
        self.nn_conv = NNConv(
            in_channels=num_node_features,
            out_channels=conv_features,
            nn=nn.Sequential(
                nn.Linear(num_edge_features, num_node_features * conv_features),
            )
        )
        self.gcn_conv1 = GCNConv(conv_features, conv_features)
        self.gcn_conv2 = GCNConv(conv_features, conv_features)
        self.fc1 = nn.Linear(conv_features, conv_features)
        self.fc2 = nn.Linear(conv_features, num_targets)

    def forward(self, data):
        x, edge_index, batch, edge_attr = data.x, data.edge_index, data.batch, data.edge_attr

        x = self.nn_conv(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.gcn_conv1(x, edge_index)
        x = F.relu(x)
        x = self.gcn_conv2(x, edge_index)
        x = global_mean_pool(x, batch)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x

In [66]:
from data_utils import create_tensorboard_writer
from tqdm import tqdm
import itertools

model = GraphPropertyPredictor(
    num_node_features=num_node_features,
    num_edge_features=num_edge_features,
    num_targets=num_targets
).to(device=device)
print(f"Number of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

learning_rate = 2e-3
epochs = 32

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.MSELoss()

writer = create_tensorboard_writer(experiment_name="property-predictor")

train_loader = dataloaders["train"]
val_subset_loader_iterator = itertools.cycle(dataloaders["val_subsets"])

# After how many iterations to validate
validation_interval = 128

for epoch in range(epochs):
    # Training
    model.train()
    for batch_index, train_batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch + 1} Training")):
        optimizer.zero_grad()
        train_prediction = model(train_batch)
        train_loss = loss_function(train_prediction, train_batch.y)
        train_loss.backward()
        optimizer.step()

        iteration = len(train_loader) * epoch + batch_index
        writer.add_scalars("Loss", {"Training": train_loss.item()}, iteration)

        # Validation
        if iteration % validation_interval == 0:
            model.eval()
            val_loss_sum = 0
            mae_sum = 0

            # Get the next subset of the validation set
            val_loader = next(val_subset_loader_iterator)
            with torch.no_grad():
                for val_batch in val_loader:
                    val_prediction = model(val_batch)
                    val_loss_sum += loss_function(val_prediction, val_batch.y)
                    mae_sum += torch.mean(torch.abs(val_prediction - val_batch.y), dim=0)
            
            val_loss = val_loss_sum / len(val_loader)
            writer.add_scalars("Loss", {"Validation": val_loss.item()}, iteration)

            mean_absolute_error = mae_sum / len(val_loader)
            writer.add_scalar("MAE (HOMO)", mean_absolute_error[0], iteration)
            writer.add_scalar("MAE (LUMO)", mean_absolute_error[1], iteration)
            
            model.train()

Number of parameters: 1346


Epoch 1 Training:   0%|          | 0/818 [00:00<?, ?it/s]

Epoch 1 Training: 100%|██████████| 818/818 [00:32<00:00, 25.53it/s]
Epoch 2 Training: 100%|██████████| 818/818 [00:32<00:00, 25.55it/s]
Epoch 3 Training: 100%|██████████| 818/818 [00:34<00:00, 23.81it/s]
Epoch 4 Training: 100%|██████████| 818/818 [00:32<00:00, 25.35it/s]
Epoch 5 Training: 100%|██████████| 818/818 [00:32<00:00, 25.52it/s]
Epoch 6 Training: 100%|██████████| 818/818 [00:32<00:00, 25.46it/s]
Epoch 7 Training: 100%|██████████| 818/818 [00:32<00:00, 25.47it/s]
Epoch 8 Training: 100%|██████████| 818/818 [00:34<00:00, 24.05it/s]
Epoch 9 Training: 100%|██████████| 818/818 [00:32<00:00, 25.41it/s]
Epoch 10 Training: 100%|██████████| 818/818 [00:31<00:00, 25.62it/s]
Epoch 11 Training: 100%|██████████| 818/818 [00:32<00:00, 25.48it/s]
Epoch 12 Training: 100%|██████████| 818/818 [00:32<00:00, 25.48it/s]
Epoch 13 Training: 100%|██████████| 818/818 [00:33<00:00, 24.06it/s]
Epoch 14 Training: 100%|██████████| 818/818 [00:32<00:00, 25.34it/s]
Epoch 15 Training: 100%|██████████| 818/818

In [67]:
evaluate_model_performance(validation_loader=dataloaders["val"], model=model)

  0%|          | 0/103 [00:00<?, ?it/s]

100%|██████████| 103/103 [00:03<00:00, 28.80it/s]

HOMO MAE (mean prediction) = 0.2153443694114685
LUMO MAE (mean prediction) = 0.2144031971693039



