# Training a GNN with NAGL

## Imports

In [1]:
import os
import pickle
from pathlib import Path

from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar

from openff.toolkit import Molecule

from openff.nagl import GNNModel
from openff.nagl.features import atoms, bonds
from openff.nagl.nn.dataset import DGLMoleculeLightningDataModule


  from .autonotebook import tqdm as notebook_tqdm


## Configuration

In [2]:
# Root directory handed to the TensorBoard logger; the checkpoint paths below
# are nested underneath it.
output_directory = Path("output")
checkpoint_directory = output_directory.joinpath("checkpoints")
# Checkpoint that `trainer.fit` is asked to resume from.
checkpoint_file = checkpoint_directory.joinpath("checkpoint")

# Number of training epochs (used as both `min_epochs` and `max_epochs`).
n_epochs = 200

# Number of GPUs requested from the Trainer.
n_gpus = 1

# Partial charge method label: passed to the data module and used to build
# the model's readout name ("<method>-charges").
partial_charge_method = "am1"

# Dataset files; the same list is used for the training, validation and test sets.
dataset_paths = [Path("alkanes.sqlite")]


## Create the model

In [3]:
# Atom featurizers shared by the model and the data module: element identity,
# connectivity, average formal charge, hybridization, and one ring-membership
# feature per ring size 3-6.
atom_features = (
    atoms.AtomicElement(["C", "O", "H", "N", "S", "F", "Br", "Cl", "I", "P"]),
    atoms.AtomConnectivity(),
    atoms.AtomAverageFormalCharge(),
    atoms.AtomHybridization(),
    *(atoms.AtomInRingOfSize(size) for size in (3, 4, 5, 6)),
)

# Bond featurizers: one ring-membership feature per ring size 3-6.
bond_features = tuple(bonds.BondInRingOfSize(size) for size in (3, 4, 5, 6))

In [4]:

# Build the network. The readout is named after the configured partial charge
# method, and the model receives the same feature tuples defined above.
model = GNNModel(
    # Input featurization
    atom_features=atom_features,
    bond_features=bond_features,
    # Convolution stack
    convolution_architecture="SAGEConv",
    n_convolution_layers=3,
    n_convolution_hidden_features=128,
    # Readout stack
    readout_name=f"{partial_charge_method}-charges",
    n_readout_layers=4,
    n_readout_hidden_features=128,
    postprocess_layer="compute_partial_charges",
    # Shared training settings
    activation_function="ReLU",
    learning_rate=0.001,
)

## Specify the training, validation and test data

In [5]:
# The data module is given the same featurizers and charge method as the model.
# NOTE: the training, validation and test splits all point at the same
# `dataset_paths` here, with a batch size of 1000 for each.
data_module = DGLMoleculeLightningDataModule(
    # Featurization / labels
    atom_features=atom_features,
    bond_features=bond_features,
    partial_charge_method=partial_charge_method,
    # Where each split is read from
    training_set_paths=dataset_paths,
    validation_set_paths=dataset_paths,
    test_set_paths=dataset_paths,
    # Batch size per split
    training_batch_size=1000,
    validation_batch_size=1000,
    test_batch_size=1000,
)

## Train the model

In [6]:
# Create the output locations up front; `exist_ok=True` keeps this cell
# safe to re-run.
output_directory.mkdir(parents=True, exist_ok=True)
checkpoint_directory.mkdir(parents=True, exist_ok=True)

logger = TensorBoardLogger(output_directory)

# Keep only the single best checkpoint (by validation loss) and write it into
# `checkpoint_directory`, which was previously created but never used.
callbacks = [
    TQDMProgressBar(),
    ModelCheckpoint(dirpath=checkpoint_directory, save_top_k=1, monitor="val_loss"),
]

# `Trainer(gpus=...)` is deprecated (see the deprecation warning this cell used
# to emit); select the hardware through `accelerator`/`devices` instead.
# A falsy `n_gpus` (0 or None) falls back to a single CPU process.
trainer = Trainer(
    accelerator="gpu" if n_gpus else "cpu",
    devices=n_gpus if n_gpus else 1,
    min_epochs=n_epochs,
    max_epochs=n_epochs,
    logger=logger,
    callbacks=callbacks,
)

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [7]:
# Resume only when the checkpoint file is actually present; on a fresh run the
# file does not exist yet and passing its path would abort training.
resume_from = str(checkpoint_file) if checkpoint_file.is_file() else None

# NOTE(review): the saved output of this cell shows a FileExistsError for a
# cached .pkl under `data/`, raised right after featurization. Presumably the
# data module refuses to overwrite an existing featurization cache -- confirm,
# and clear or reuse that cache before re-running.
trainer.fit(
    model,
    datamodule=data_module,
    ckpt_path=resume_from,
)

featurizing molecules: 100%|████████████████████████████████████████| 10/10 [00:00<00:00, 66.02it/s]


FileExistsError: [Errno 17] File exists: PosixPath('/home/joshmitchell/Documents/openff/nagl/examples/train-gnn-notebook/data/charge-am1_bond-None_feat-806f51f8669c6ed90c6e5f8bb155490c65465513db8d5cde47765027636c80a6_paths-585232fe314a8b881e982c3270295f860e6963fb124266bff3cad0ad2d7d5839.pkl')

In [None]:
# Pass the data module by keyword (as in `trainer.fit`); the second positional
# parameter of `Trainer.test` is `dataloaders`, not `datamodule`.
trainer.test(model, datamodule=data_module)

## Results!

In [None]:
# Look the checkpoint callback up through the trainer rather than by list
# position: `callbacks[0]` is the progress bar, which has no `best_model_*`
# attributes.
checkpoint_callback = trainer.checkpoint_callback

print("--- Best model ---")
print(checkpoint_callback.best_model_path)
print(checkpoint_callback.best_model_score)

# Persist the metrics gathered by the trainer next to the other outputs.
metrics_file = Path(output_directory) / "metrics.pkl"
with open(metrics_file, "wb") as f:
    metrics = (trainer.callback_metrics, trainer.logged_metrics)
    pickle.dump(metrics, f)

print(f"Wrote metrics to {str(metrics_file)}")

In [None]:
# Run the model on a single molecule (neopentane) and display the result
# as the cell output.
neopentane = Molecule.from_smiles("CC(C)(C)C")
model.compute_property(neopentane)