# OpenFF Additional Generated Guanidine and Amidine Derivative TorsionDrives 4.0

This notebook generates additional TorsionDrives from hand-built molecules for the t18b (amidine) and t87a (guanidine) torsions in Sage 2.2.1.

In [1]:
import zstandard
import qcportal
import pathlib

from openff.toolkit import Molecule, ForceField
import numpy as np

from openff.qcsubmit.utils import get_symmetry_classes, get_symmetry_group
from openff.qcsubmit.workflow_components import TorsionIndexer
from openff.qcsubmit import workflow_components
from openff.qcsubmit.factories import TorsiondriveDatasetFactory
from openff.qcsubmit.utils.visualize import molecules_to_pdf

In [2]:
def load_molecules(
    files: list[str],
    parameter_id: str,
) -> list[Molecule]:
    """Load molecules from file and tag the torsions to drive.

    Each molecule is labelled with the force field in
    ``inputs/openff_unconstrained-2.2.1.offxml``. Every proper torsion whose
    assigned parameter id equals ``parameter_id`` is registered on a
    ``TorsionIndexer``, keeping only the first torsion seen per central bond
    and skipping central bonds that are part of a ring. The indexer is stored
    in ``mol.properties["dihedrals"]``.

    Parameters
    ----------
    files
        Paths of the molecule files to read (SMILES files in this notebook).
    parameter_id
        Id of the ProperTorsions parameter to select, e.g. ``"t18b"``.

    Returns
    -------
    list[Molecule]
        All loaded molecules, in file order, each with its torsion indexer
        attached.

    Raises
    ------
    AssertionError
        If a molecule has no matching torsion around a non-ring bond.
    """
    case_molecules = []
    forcefield = ForceField("inputs/openff_unconstrained-2.2.1.offxml")

    for file in files:
        molecules = Molecule.from_file(file, allow_undefined_stereo=True)
        # from_file hands back a bare Molecule rather than a list when the
        # file holds a single entry; normalise so the loop below always works.
        if isinstance(molecules, Molecule):
            molecules = [molecules]

        for mol in molecules:
            unique_central_bonds = set()
            torsion_indexer = TorsionIndexer()
            symmetry_classes = get_symmetry_classes(mol)

            labels = forcefield.label_molecules(mol.to_topology())[0]["ProperTorsions"]
            for (i, j, k, l), parameter in labels.items():
                if parameter.id != parameter_id:
                    continue
                # ignore torsions whose central bond is in a ring
                if mol.get_bond_between(j, k).is_in_ring():
                    continue

                # drive each rotatable bond only once
                central_bond = tuple(sorted([j, k]))
                if central_bond in unique_central_bonds:
                    continue
                unique_central_bonds.add(central_bond)

                symmetry_group = get_symmetry_group(central_bond, symmetry_classes)
                # third argument is presumably the scan range in degrees -- confirm
                torsion_indexer.add_torsion((i, j, k, l), symmetry_group, (-165, 180))

            # every input molecule is expected to exercise the target parameter
            assert len(torsion_indexer.torsions), (
                f"no non-ring {parameter_id} torsion found for "
                f"{mol.to_smiles()} (from {file})"
            )
            mol.properties["dihedrals"] = torsion_indexer
            case_molecules.append(mol)
    return case_molecules


def visualize(mols, filename):
    """Write a PDF containing one molecule copy per tagged torsion.

    Each molecule's ``properties["dihedrals"]`` must hold an indexer with a
    ``torsions`` mapping. For every entry a copy of the molecule is made whose
    ``"dihedrals"`` property is that entry's ``get_dihedrals`` value, and all
    copies are passed to ``molecules_to_pdf`` together with ``filename``.
    """
    per_torsion_copies = []
    for parent in mols:
        tagged_torsions = parent.properties["dihedrals"].torsions
        for torsion in tagged_torsions.values():
            duplicate = Molecule(parent)
            duplicate.properties["dihedrals"] = torsion.get_dihedrals
            per_torsion_copies.append(duplicate)
    molecules_to_pdf(per_torsion_copies, filename)

In [3]:
# Amidine set: load the t18b inputs, draw them, and display how many were loaded.
t18b_molecules = load_molecules(files=["inputs/t18b.smi"], parameter_id="t18b")
visualize(t18b_molecules, "inputs/t18b_molecules.pdf")
len(t18b_molecules)

10

In [4]:
# Guanidine set: load the t87a inputs, draw them, and display how many were loaded.
t87a_molecules = load_molecules(
    files=["inputs/t87a.smi"],
    parameter_id="t87a",
)
# Draw and count the t87a molecules themselves; the t18b set belongs to the
# previous cell and must not end up in the t87a PDF.
visualize(t87a_molecules, "inputs/t87a_molecules.pdf")
len(t87a_molecules)

10

In [5]:
# Combined input for the dataset: amidine molecules first, then guanidines.
all_molecules = t18b_molecules + t87a_molecules
len(all_molecules)

20

In [6]:
# Free-text summary stored on the dataset (also printed in the next cell).
description = """\
Molecules generated to add more coverage for the rare torsions t18b (matching amidine derivatives) and t87a (matching guanidines derivatives) in Sage 2.2.1.
Charged molecules were generated by hand to match rare patterns with the elements C, N, O, H.
10 molecules were generated per torsion.
"""

# The factory's only added workflow component is conformer generation,
# configured with max_conformers=5.
dataset_factory = TorsiondriveDatasetFactory()
dataset_factory.add_workflow_components(
    workflow_components.StandardConformerGenerator(
        max_conformers=5,
    ),
)

dataset = dataset_factory.create_dataset(
    molecules=all_molecules,
    dataset_name="OpenFF Additional Generated Guanidine and Amidine Derivative TorsionDrives 4.0",
    tagline="Additional TorsionDrives for t18b and t87a torsions in Sage 2.2.1",
    description=description,
)

# The URL's last path component is the name of the current working directory,
# so the notebook must be run from inside its submission folder.
dataset.metadata.submitter = "lilyminium"
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    f"submissions/{pathlib.Path.cwd().name}"
)

Deduplication                 : 100%|██████████| 20/20 [00:00<00:00, 986.67it/s]
StandardConformerGenerator    : 100%|███████████| 20/20 [00:03<00:00,  5.09it/s]
Preparation                   : 100%|███████████| 20/20 [00:00<00:00, 32.27it/s]


In [7]:
# Echo the description text passed to create_dataset so it can be copied as-is.
print(description)

Molecules generated to add more coverage for the rare torsions t18b (matching amidine derivatives) and t87a (matching guanidines derivatives) in Sage 2.2.1.
Charged molecules were generated by hand to match rare patterns with the elements C, N, O, H.
10 molecules were generated per torsion.



In [7]:
# Summarize the dataset as markdown bullet points for the README, then export it.
confs = np.array([len(mol.conformers) for mol in dataset.molecules])

print("* Number of unique molecules:", dataset.n_molecules)
# With multiple torsions per unique molecule, n_molecules * confs.mean() no
# longer equals the number of conformers. Instead, the number of dihedrals *
# confs.mean() should equal the number of conformers. The dataset contains one
# record per driven torsion (rather than combining multiple dihedrals into the
# same record), so n_records is the same as manually adding up len(dihedrals)
# for each record.
print("* Number of driven torsions:", dataset.n_records)
print("* Number of filtered molecules:", dataset.n_filtered)
print("* Number of conformers:", confs.sum())
print(
    "* Number of conformers per molecule (min, mean, max): "
    f"{confs.min()}, {confs.mean():.2f}, {confs.max()}"
)

# One molecular weight per molecule: the sum of its atoms' masses, held in a
# single flat array.
masses = np.array(
    [
        sum(atom.mass.m for atom in molecule.atoms)
        for molecule in dataset.molecules
    ]
)
print(f"* Mean molecular weight: {masses.mean():.2f}")
print(f"* Max molecular weight: {masses.max():.2f}")
print("* Charges:", sorted(set(m.total_charge.m for m in dataset.molecules)))


print("## Metadata")
# Sort the elements (as is done for the charges above) so the README line is
# identical from run to run regardless of the collection's iteration order.
print(f"* Elements: {{{', '.join(sorted(dataset.metadata.dict()['elements']))}}}")


# Fields of each QC specification to report, in print order.
fields = [
    "basis",
    "implicit_solvent",
    "keywords",
    "maxiter",
    "method",
    "program",
]
for spec, obj in dataset.qc_specifications.items():
    od = obj.dict()
    print("* Spec:", spec)
    for field in fields:
        print(f"\t * {field}: {od[field]}")
    print("\t* SCF properties:")
    for field in od["scf_properties"]:
        print(f"\t\t* {field}")


# export the dataset
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("output.smi", "smi")
dataset.visualize("dataset.pdf", columns=8)

* Number of unique molecules: 20
* Number of driven torsions: 38
* Number of filtered molecules: 0
* Number of conformers: 74
* Number of conformers per molecule (min, mean, max): 1, 1.95, 5
* Mean molecular weight: 113.72
* Max molecular weight: 178.26
* Charges: [1.0]
## Metadata
* Elements: {C, N, H, O}
* Spec: default
	 * basis: DZVP
	 * implicit_solvent: None
	 * keywords: {}
	 * maxiter: 200
	 * method: B3LYP-D3BJ
	 * program: psi4
	* SCF properties:
		* dipole
		* quadrupole
		* wiberg_lowdin_indices
		* mayer_indices
