In [1]:
%load_ext autoreload
%autoreload 2

from typing import Dict, Any
import torch.nn as nn
from torch_geometric.datasets import QM9
import torch_geometric.transforms as T
import torch
from torch_geometric.loader import DataLoader
from data_utils import *
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
include_hydrogen = False
properties=["homo", "lumo", "r2"]

transform_list = [
    SelectQM9TargetProperties(properties=properties),
    SelectQM9NodeFeatures(features=["atom_type"]),
]
if not include_hydrogen:
    transform_list.append(DropQM9Hydrogen())

max_num_nodes = 29 if include_hydrogen else 9
transform_list += [
    AddAdjacencyMatrix(max_num_nodes=max_num_nodes),
    AddNodeAttributeMatrix(max_num_nodes=max_num_nodes),
    AddEdgeAttributeMatrix(max_num_nodes=max_num_nodes),
]

pre_transform = T.Compose(transform_list)
transform = T.Compose([
    T.ToDevice(device=device)
])

dataset = QM9(root="./data/property_prediction", pre_transform=pre_transform, pre_filter=qm9_pre_filter, transform=transform)

train_dataset, val_dataset, test_dataset = create_qm9_data_split(dataset=dataset)

print(f"Training dataset size = {len(train_dataset)}")
print(f"Validation dataset size = {len(val_dataset)}")
print(f"Test dataset size = {len(test_dataset)}")

batch_size = 1024
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

Training dataset size = 102443
Validation dataset size = 12805
Test dataset size = 12805


In [3]:
def evaluate_model_performance(validation_loader, model):
    mae_sum = 0
    for batch in tqdm(validation_loader):
        prediction = model(batch)
        mae_sum += torch.mean(torch.abs(prediction - batch.y), dim=0)

    mean_absolute_error = mae_sum / len(validation_loader)

    print(f"HOMO MAE = {mean_absolute_error[0]}")
    print(f"LUMO MAE = {mean_absolute_error[1]}")
    print(f"R2 MAE = {mean_absolute_error[2]}")

In [4]:
homo_mean_list = []
lumo_mean_list = []
r2_mean_list = []
for batch in tqdm(train_loader):
    batch_mean = torch.mean(batch.y, dim=0)
    homo_mean_list.append(batch_mean[0])
    lumo_mean_list.append(batch_mean[1])
    r2_mean_list.append(batch_mean[2])

homo_mean_pred = torch.tensor(homo_mean_list).mean()
lumo_mean_pred = torch.tensor(lumo_mean_list).mean()
r2_mean_pred = torch.tensor(r2_mean_list).mean()
print(f"HOMO Mean = {homo_mean_pred}")
print(f"LUMO Mean = {lumo_mean_pred}")
print(f"R2 Mean = {r2_mean_pred}")

class MeanPredictor(nn.Module):
    def __init__(self, property_mean_values: List[float]):
        super().__init__()
        # create mean prediction and add batch dimension
        self.register_buffer('mean_prediction', torch.tensor(property_mean_values).unsqueeze(0))

    def forward(self, x):
        batch_size = x.y.shape[0]
        return self.mean_prediction.expand(batch_size, -1)

mean_baseline_model = MeanPredictor(property_mean_values=[homo_mean_pred, lumo_mean_pred, r2_mean_pred]).to(device)

evaluate_model_performance(val_loader, mean_baseline_model)

  0%|          | 0/101 [00:00<?, ?it/s]

100%|██████████| 101/101 [00:48<00:00,  2.08it/s]


HOMO Mean = -6.5462236404418945
LUMO Mean = 0.33497878909111023
R2 Mean = 1191.4244384765625


100%|██████████| 13/13 [00:05<00:00,  2.18it/s]

HOMO MAE = 0.4391568899154663
LUMO MAE = 1.0499714612960815
R2 MAE = 198.45355224609375



