In [4]:
%load_ext autoreload
%autoreload 2

from typing import Dict, Any
import torch.nn as nn
from torch_geometric.datasets import QM9
import torch_geometric.transforms as T
import torch
from torch_geometric.loader import DataLoader
from data_utils import *
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
include_hydrogen = False
properties=["homo", "lumo"]

transform_list = [
    SelectQM9TargetProperties(properties=properties),
    SelectQM9NodeFeatures(features=["atom_type"]),
]
if not include_hydrogen:
    transform_list.append(DropQM9Hydrogen())

max_num_nodes = 29 if include_hydrogen else 9
transform_list += [
    AddAdjacencyMatrix(max_num_nodes=max_num_nodes),
    AddNodeAttributeMatrix(max_num_nodes=max_num_nodes),
    AddEdgeAttributeMatrix(max_num_nodes=max_num_nodes),
]

pre_transform = T.Compose(transform_list)
transform = T.Compose([
    #RandomPermutation(max_num_nodes=max_num_nodes),
    T.ToDevice(device=device)
])

dataset = QM9(root="./data", pre_transform=pre_transform, pre_filter=qm9_pre_filter, transform=transform)

train_dataset, val_dataset, test_dataset = create_qm9_data_split(dataset=dataset)

print(f"Training dataset size = {len(train_dataset)}")
print(f"Validation dataset size = {len(val_dataset)}")
print(f"Test dataset size = {len(test_dataset)}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Training dataset size = 102445
Validation dataset size = 12806
Test dataset size = 12805


In [5]:
from typing import List
from data_utils import create_validation_subset_loaders

batch_size = 128

dataloaders = {
    "train_single": DataLoader(train_dataset[:1], batch_size=batch_size, shuffle=True),
    "train_tiny": DataLoader(train_dataset[:16], batch_size=batch_size, shuffle=True),
    "train_small": DataLoader(train_dataset[:4096], batch_size=batch_size, shuffle=True),
    "train": DataLoader(train_dataset, batch_size=batch_size, shuffle=True),

    "val_small": DataLoader(val_dataset[:512], batch_size=batch_size, shuffle=False),
    "val": DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
}

val_subset_count = 32
dataloaders["val_subsets"] = create_validation_subset_loaders(validation_dataset=val_dataset, subset_count=32, batch_size=batch_size)

In [6]:
def evaluate_model_performance(validation_loader, model):
    mae_sum = 0
    for batch in tqdm(validation_loader):
        prediction = model(batch)
        mae_sum += torch.mean(torch.abs(prediction - batch.y), dim=0)

    mean_absolute_error = mae_sum / len(validation_loader)

    print(f"HOMO MAE = {mean_absolute_error[0]}")
    print(f"LUMO MAE = {mean_absolute_error[1]}")

val_loader = dataloaders["val"]

In [24]:
from graph_vae.encoder import Encoder

hparams = {
    "max_num_nodes": 29 if include_hydrogen else 9,
    "adam_beta_1": 0.5,
    "num_node_features": dataset.num_node_features,
    "num_edge_features": dataset.num_edge_features,
    "latent_dim": 64,
    "include_hydrogen": include_hydrogen,
    "properties": properties,
}

class PropertyPredictor(nn.Module):

    def __init__(self, hparams: Dict[str, Any]) -> None:
        super().__init__()
        self.graph_encoder = Encoder(hparams=hparams)
        property_count = len(hparams["properties"])
        dim = hparams["latent_dim"] * 2
        self.fc = nn.Sequential(
            nn.BatchNorm1d(dim),
            nn.ReLU(),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.ReLU(),
            nn.Linear(dim, property_count)
        )

    def forward(self, x: Data):
        z = self.graph_encoder(x)
        # combine tuple (mu, log_sigma) into single latent
        z = torch.cat(list(z), dim=1)
        return self.fc(z)


In [31]:
from data_utils import create_tensorboard_writer
from tqdm import tqdm
import itertools
from graph_vae.vae import GraphVAE


def train_property_predictor(
        model: PropertyPredictor,
        train_loader: DataLoader,
        val_subset_loaders: List[DataLoader],
        epochs: int,
        tb_writer: SummaryWriter,
    ):
    
    learning_rate = 4e-4
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss_function = nn.MSELoss()

    # After how many iterations to validate
    validation_interval = 10

    val_subset_loader_iterator = itertools.cycle(val_subset_loaders)

    for epoch in range(epochs):
        # Training
        model.train()
        for batch_index, train_batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch + 1} Training")):
            optimizer.zero_grad()
            train_prediction = model(train_batch)
            train_loss = loss_function(train_prediction, train_batch.y)
            train_loss.backward()
            optimizer.step()

            iteration = len(train_loader) * epoch + batch_index
            tb_writer.add_scalars("Loss", {"Training": train_loss.item()}, iteration)

            # Validation
            if iteration % validation_interval == 0:
                model.eval()
                val_loss_sum = 0
                mae_sum = 0

                # Get the next subset of the validation set
                val_loader = next(val_subset_loader_iterator)
                with torch.no_grad():
                    for val_batch in val_loader:
                        val_prediction = model(val_batch)
                        val_loss_sum += loss_function(val_prediction, val_batch.y)
                        mae_sum += torch.mean(torch.abs(val_prediction - val_batch.y), dim=0)
                
                val_loss = val_loss_sum / len(val_loader)
                tb_writer.add_scalars("Loss", {"Validation": val_loss.item()}, iteration)

                mean_absolute_error = mae_sum / len(val_loader)
                tb_writer.add_scalar("MAE (HOMO)", mean_absolute_error[0], iteration)
                tb_writer.add_scalar("MAE (LUMO)", mean_absolute_error[1], iteration)
                
                model.train()

In [32]:
writer = create_tensorboard_writer(experiment_name="property-predictor-2")

train_loader = dataloaders["train"]
val_subset_loaders = dataloaders["val_subsets"]

model = PropertyPredictor(hparams=hparams).to(device)

train_property_predictor(
    model=model,
    train_loader=train_loader, 
    val_subset_loaders=val_subset_loaders,
    epochs=10,
    tb_writer=writer,
)

Epoch 1 Training: 100%|██████████| 801/801 [04:08<00:00,  3.22it/s]
Epoch 2 Training: 100%|██████████| 801/801 [05:16<00:00,  2.53it/s]
Epoch 3 Training: 100%|██████████| 801/801 [05:51<00:00,  2.28it/s]
Epoch 4 Training: 100%|██████████| 801/801 [05:18<00:00,  2.51it/s]
Epoch 5 Training: 100%|██████████| 801/801 [04:53<00:00,  2.73it/s]
Epoch 6 Training: 100%|██████████| 801/801 [04:49<00:00,  2.77it/s]
Epoch 7 Training: 100%|██████████| 801/801 [04:53<00:00,  2.73it/s]
Epoch 8 Training: 100%|██████████| 801/801 [04:49<00:00,  2.77it/s]
Epoch 9 Training: 100%|██████████| 801/801 [04:56<00:00,  2.70it/s]
Epoch 10 Training: 100%|██████████| 801/801 [05:10<00:00,  2.58it/s]


In [33]:
evaluate_model_performance(val_loader, model)

100%|██████████| 101/101 [00:24<00:00,  4.05it/s]

HOMO MAE = 0.1802791953086853
LUMO MAE = 0.2032887041568756



