In [2]:
%load_ext autoreload
%autoreload 2

from torch_geometric.datasets import QM9
import torch_geometric.transforms as T
import torch
from torch_geometric.loader import DataLoader
from data_utils import SelectQM9TargetProperties, create_qm9_data_split, SelectQM9NodeFeatures
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

transform = T.Compose([
    SelectQM9TargetProperties(properties=["homo", "lumo"]),
    SelectQM9NodeFeatures(features=["atom_type"]),
    T.ToDevice(device=device)
])

dataset = QM9(root="./data", transform=transform)

train_dataset, val_dataset, test_dataset = create_qm9_data_split(dataset=dataset)

print(f"Training dataset size = {len(train_dataset)}")
print(f"Validation dataset size = {len(val_dataset)}")
print(f"Test dataset size = {len(test_dataset)}")

num_node_feature = dataset.num_node_features
num_targets = dataset.num_classes

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
cuda
Training dataset size = 104665
Validation dataset size = 13083
Test dataset size = 13083


## Create Dataloaders

In [4]:
from typing import List

batch_size = 128

dataloaders = {
    "train_single": DataLoader(train_dataset[:1], batch_size=batch_size, shuffle=True),
    "train_tiny": DataLoader(train_dataset[:16], batch_size=batch_size, shuffle=True),
    "train_small": DataLoader(train_dataset[:4096], batch_size=batch_size, shuffle=True),
    "train": DataLoader(train_dataset, batch_size=batch_size, shuffle=True),

    "val_small": DataLoader(val_dataset[:512], batch_size=batch_size, shuffle=False),
    "val": DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
}

# subset_count = 64
# validation_subsets = []
# validation_indices = torch.randperm(len(val_dataset)).tolist()

# def get_validation_subset()

## Baseline model (mean prediction)

### Training

In [50]:
import torch.nn as nn

train_loader = dataloaders["train"]
homo_mean_list = []
lumo_mean_list = []
for batch in tqdm(train_loader):
    batch_mean = torch.mean(batch.y, dim=0)
    homo_mean_list.append(batch_mean[0])
    lumo_mean_list.append(batch_mean[1])

homo_mean_pred = torch.tensor(homo_mean_list).mean()
lumo_mean_pred = torch.tensor(lumo_mean_list).mean()
print(f"HOMO Mean = {homo_mean_pred}")
print(f"LUMO Mean = {lumo_mean_pred}")

class MeanPredictor(nn.Module):
    def __init__(self, property_mean_values: List[float]):
        super().__init__()
        # create mean prediction and add batch dimension
        self.register_buffer('mean_prediction', torch.tensor(property_mean_values).unsqueeze(0))

    def forward(self, x):
        batch_size = x.y.shape[0]
        return self.mean_prediction.expand(batch_size, -1)

baseline_model = MeanPredictor(property_mean_values=[homo_mean_pred, lumo_mean_pred]).to(device)

  0%|          | 0/818 [00:00<?, ?it/s]

100%|██████████| 818/818 [00:27<00:00, 30.19it/s]

HOMO Mean = -6.535999774932861
LUMO Mean = 0.32042407989501953





### Validation

In [91]:
def evaluate_model_performance(validation_loader, model):
    mae_sum = 0
    for batch in tqdm(validation_loader):
        prediction = model(batch)
        mae_sum += torch.mean(torch.abs(prediction - batch.y), dim=0)

    mean_absolute_error = mae_sum / len(validation_loader)

    print(f"HOMO MAE (mean prediction) = {mean_absolute_error[0]}")
    print(f"LUMO MAE (mean prediction) = {mean_absolute_error[1]}")

val_loader = dataloaders["val"]
evaluate_model_performance(validation_loader=val_loader, model=baseline_model)

  0%|          | 0/103 [00:00<?, ?it/s]

100%|██████████| 103/103 [00:03<00:00, 30.08it/s]

HOMO MAE (mean prediction) = 0.440923273563385
LUMO MAE (mean prediction) = 1.054100751876831





## Graph Property Predictor

In [92]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool

class GCN(torch.nn.Module):
    def __init__(self, num_node_features: int, num_targets: int):
        super().__init__()
        
        conv_features = 16

        self.conv1 = GCNConv(num_node_features, conv_features)
        self.conv2 = GCNConv(conv_features, conv_features)
        self.conv3 = GCNConv(conv_features, conv_features)
        self.fc1 = nn.Linear(conv_features, conv_features)
        self.fc2 = nn.Linear(conv_features, num_targets)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        x = global_mean_pool(x, batch)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        return x

In [95]:
from data_utils import create_tensorboard_writer
from tqdm import tqdm

model = GCN(num_node_features=num_node_feature, num_targets=num_targets).to(device=device)
print(f"Number of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

learning_rate = 2e-2
epochs = 16

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.MSELoss()

writer = create_tensorboard_writer(experiment_name="property-predictor")

train_loader = dataloaders["train"]
val_loader = dataloaders["val_small"]

# After how many iterations to validate
validation_interval = 128

for epoch in range(epochs):
    # Training
    model.train()
    for batch_index, train_batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch + 1} Training")):
        optimizer.zero_grad()
        train_prediction = model(train_batch)
        train_loss = loss_function(train_prediction, train_batch.y)
        train_loss.backward()
        optimizer.step()

        iteration = len(train_loader) * epoch + batch_index
        writer.add_scalars("Loss", {"Training": train_loss.item()}, iteration)

        # Validation
        if iteration % validation_interval == 0:
            model.eval()
            val_loss_sum = 0
            mae_sum = 0

            # TODO: Validate only with a random subset (create 64 subsets)
            with torch.no_grad():
                for val_batch in val_loader:
                    val_prediction = model(val_batch)
                    val_loss_sum += loss_function(val_prediction, val_batch.y)
                    mae_sum = torch.mean(torch.abs(val_prediction - val_batch.y), dim=0)
                
            val_loss = val_loss_sum / len(val_loader)
            writer.add_scalars("Loss", {"Validation": val_loss.item()}, iteration)

            mean_absolute_error = mae_sum / len(val_loader)
            writer.add_scalar("MAE (HOMO)", mean_absolute_error[0], iteration)
            writer.add_scalar("MAE (LUMO)", mean_absolute_error[1], iteration)
            
            model.train()

Number of parameters: 946


Epoch 1 Training: 100%|██████████| 818/818 [00:32<00:00, 25.29it/s]
Epoch 2 Training: 100%|██████████| 818/818 [00:32<00:00, 25.54it/s]
Epoch 3 Training: 100%|██████████| 818/818 [00:32<00:00, 24.92it/s]
Epoch 4 Training: 100%|██████████| 818/818 [00:32<00:00, 25.41it/s]
Epoch 5 Training: 100%|██████████| 818/818 [00:31<00:00, 25.64it/s]
Epoch 6 Training: 100%|██████████| 818/818 [00:32<00:00, 24.88it/s]
Epoch 7 Training: 100%|██████████| 818/818 [00:32<00:00, 25.07it/s]
Epoch 8 Training: 100%|██████████| 818/818 [00:32<00:00, 25.47it/s]
Epoch 9 Training: 100%|██████████| 818/818 [00:32<00:00, 25.04it/s]
Epoch 10 Training: 100%|██████████| 818/818 [00:32<00:00, 25.48it/s]
Epoch 11 Training: 100%|██████████| 818/818 [00:32<00:00, 25.52it/s]
Epoch 12 Training: 100%|██████████| 818/818 [00:32<00:00, 25.16it/s]
Epoch 13 Training: 100%|██████████| 818/818 [00:32<00:00, 25.26it/s]
Epoch 14 Training: 100%|██████████| 818/818 [00:32<00:00, 25.46it/s]
Epoch 15 Training: 100%|██████████| 818/818

In [98]:
evaluate_model_performance(validation_loader=dataloaders["val"], model=model)

  0%|          | 0/103 [00:00<?, ?it/s]

100%|██████████| 103/103 [00:03<00:00, 28.25it/s]

HOMO MAE (mean prediction) = 0.2609836757183075
LUMO MAE (mean prediction) = 0.2891755700111389



