# Add Cresset TorsionDrives

This notebook generates a TorsionDrive dataset based on particular molecules contributed by Cresset to address gaps in data coverage for five particular failure cases:

1) Sage 2.1 t48a is fit only to one molecule,
where the complementary torsion contributes most of the profile

2) Sage 2.1 t17 may benefit from splitting and a different shape for non-symmetric rings;
the current n=3 shape sums to a constant

3) Sage 2.1 t19 is mostly trained to terminal methyls.
It has a functional form where the n=1 term dominates unexpectedly,
instead of the more expected equal n=3 contributions.
More data with non-terminal methyls is added

4) Sage 2.1 t18 covers amide-adjacent torsions but is not trained to many examples.

5) Sage 2.1 t105 covers an O linker with an sp2 or sp3 terminus.
While the sp3 profiles match the QM well, the sp2 profiles look too stiff.
This adds more data.

In [1]:
import qcportal
import pathlib

from openff.toolkit import Molecule
import numpy as np

from openff.qcsubmit.utils import get_symmetry_classes, get_symmetry_group
from openff.qcsubmit.workflow_components import TorsionIndexer
from openff.qcsubmit import workflow_components
from openff.qcsubmit.factories import TorsiondriveDatasetFactory
from openff.qcsubmit.utils.visualize import molecules_to_pdf

In [2]:
def load_case_molecules(
    files: list[str],
    patterns: list[str],
    scan_range: tuple[int, int] = (-165, 180),
) -> list[Molecule]:
    """Load molecules from file and tag the torsions to drive, selected by pattern.

    For every molecule read from ``files``, each SMARTS in ``patterns`` is
    matched against the molecule. A match is kept as a torsion to drive when
    its central bond (the bond between the 2nd and 3rd matched atoms) is not
    in a ring and has not already been registered for this molecule. Only
    the first match found for a given central bond is kept. The resulting
    ``TorsionIndexer`` is stored in ``mol.properties["dihedrals"]``.

    Parameters
    ----------
    files
        Paths of the molecule files to read.
    patterns
        Tagged SMARTS patterns; each match is unpacked as four atom indices.
    scan_range
        Scan range passed to ``TorsionIndexer.add_torsion`` for every torsion.
        Defaults to ``(-165, 180)``.

    Returns
    -------
    list[Molecule]
        All loaded molecules, in file order, each carrying a ``"dihedrals"``
        property.

    Raises
    ------
    AssertionError
        If no torsion was selected for one of the loaded molecules.
    """
    case_molecules = []

    for file in files:
        loaded = Molecule.from_file(file, allow_undefined_stereo=True)
        # ``from_file`` hands back a bare Molecule (not a list) when the file
        # contains a single entry, so normalise before iterating.
        molecules = [loaded] if isinstance(loaded, Molecule) else loaded

        for mol in molecules:
            unique_central_bonds = set()
            torsion_indexer = TorsionIndexer()
            symmetry_classes = get_symmetry_classes(mol)

            for pattern in patterns:
                for match in mol.chemical_environment_matches(pattern):
                    central_atoms = match[1:3]

                    # skip torsions whose central bond is part of a ring
                    if mol.get_bond_between(*central_atoms).is_in_ring():
                        continue

                    # only the first match per central bond is driven
                    central_bond = tuple(sorted(central_atoms))
                    if central_bond in unique_central_bonds:
                        continue
                    unique_central_bonds.add(central_bond)

                    symmetry_group = get_symmetry_group(
                        central_atoms, symmetry_classes
                    )
                    torsion_indexer.add_torsion(match, symmetry_group, scan_range)

            assert len(torsion_indexer.torsions), (
                f"No torsion matched any pattern for molecule "
                f"{mol.to_smiles()} loaded from {file!r}"
            )
            mol.properties["dihedrals"] = torsion_indexer
            case_molecules.append(mol)
    return case_molecules


def visualize(mols, filename):
    """Write a PDF with one drawing per driven torsion.

    Each molecule in ``mols`` is copied once for every torsion stored in its
    ``"dihedrals"`` property; the copy's ``"dihedrals"`` property is replaced
    by that single torsion's ``get_dihedrals`` value before all copies are
    handed to ``molecules_to_pdf``.
    """
    per_torsion_copies = []
    for parent in mols:
        indexer = parent.properties["dihedrals"]
        for single_torsion in indexer.torsions.values():
            duplicate = Molecule(parent)
            duplicate.properties["dihedrals"] = single_torsion.get_dihedrals
            per_torsion_copies.append(duplicate)
    molecules_to_pdf(per_torsion_copies, filename)

In [3]:
# Failure case 1: two input files, two torsion-selection patterns.
case_1_molecules = load_case_molecules(
    patterns=[
        "[#6X3:1]=[#6X3:2]-[#6X3:3](~[#8X1])~[#8X1:4]",
        "[*:1]~[#6X3:2]-[#6X3$(*=[#8,#16,#7]):3]~[*:4]",
    ],
    files=["inputs/type1.smi", "inputs/type1_aro.smi"],
)
visualize(mols=case_1_molecules, filename="inputs/type1_molecules.pdf")
# last expression: number of molecules loaded for this case
len(case_1_molecules)

24

In [4]:
# Failure case 2: torsions around a non-ring bond between a 3-connected and a
# 4-connected carbon.
case_2_molecules = load_case_molecules(
    patterns=["[*:1]~[#6X3:2]-[#6X4:3]~[*:4]"],
    files=["inputs/type2.smi"],
)
visualize(mols=case_2_molecules, filename="inputs/type2_molecules.pdf")
# last expression: number of molecules loaded for this case
len(case_2_molecules)

17

In [5]:
# Failure case 3 inputs (the file name suggests cases 3 and 4 share this
# input file -- presumably; confirm against the input data).
case_3_molecules = load_case_molecules(
    patterns=["[*:1]-[#6X4:2]-[#6X3:3]=[*:4]"],
    files=["inputs/type3+4.smi"],
)
visualize(mols=case_3_molecules, filename="inputs/type3+4_molecules.pdf")
# last expression: number of molecules loaded for this case
len(case_3_molecules)

15

In [6]:
# Failure case 5: torsions around a non-ring bond between a 2-connected oxygen
# and a 3-connected carbon.
case_5_molecules = load_case_molecules(
    patterns=["[*:1]-[#8X2:2]-[#6X3:3]-[*:4]"],
    files=["inputs/type5.smi"],
)
visualize(mols=case_5_molecules, filename="inputs/type5_molecules.pdf")
# last expression: number of molecules loaded for this case
len(case_5_molecules)

14

In [7]:
# Pool every case's molecules into one new list, keeping case order.
all_molecules = (
    case_1_molecules
    + case_2_molecules
    + case_3_molecules
    + case_5_molecules
)
len(all_molecules)

70

In [8]:
# Free-text description stored with the dataset.
description = """\
Molecules contributed by Cresset to address lack of data coverage for particular torsion drives.
These molecules are contributed to address five failure cases:

1) Sage 2.1 t48a is fit only to one molecule,
where the complementary torsion contributes most of the profile

2) Sage 2.1 t17 may benefit from splitting and a different shape for non-symmetric rings;
the current n=3 shape sums to a constant

3) Sage 2.1 t19 is mostly trained to terminal methyls.
It has a functional form where the n=1 term dominates unexpectedly,
instead of the more expected equal n=3 contributions.
More data with non-terminal methyls is added

4) Sage 2.1 t18 covers amide-adjacent torsions but is not trained to many.

5) Sage 2.1 t105 covers an O linker with an sp2 or sp3 terminus.
While the sp3 profiles match the QM well, the sp2 profiles look too stiff.
This adds more data.
"""

# TorsionDrive factory with a conformer-generation step (max_conformers=5).
dataset_factory = TorsiondriveDatasetFactory()
dataset_factory.add_workflow_components(
    workflow_components.StandardConformerGenerator(max_conformers=5)
)

# The same text is used for both the dataset name and the tagline.
dataset = dataset_factory.create_dataset(
    molecules=all_molecules,
    description=description,
    dataset_name="OpenFF Cresset Additional Coverage TorsionDrives v4.0",
    tagline="OpenFF Cresset Additional Coverage TorsionDrives v4.0",
)

dataset.metadata.submitter = "lilyminium"
# The URL ends with the name of the current working directory.
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    f"submissions/{pathlib.Path.cwd().name}"
)

Deduplication                 : 100%|█████████| 70/70 [00:00<00:00, 1316.59it/s]
StandardConformerGenerator    : 100%|███████████| 70/70 [00:04<00:00, 15.95it/s]
Preparation                   : 100%|███████████| 70/70 [00:01<00:00, 56.34it/s]


In [9]:
# summarize dataset for readme
confs = np.array([len(mol.conformers) for mol in dataset.molecules])

print("* Number of unique molecules:", dataset.n_molecules)
# A molecule may have several driven torsions, and the dataset holds one
# record per driven torsion (dihedrals are not combined into a single
# record). n_records is therefore reported as the number of driven torsions,
# separately from the number of unique molecules.
print("* Number of driven torsions:", dataset.n_records)
print("* Number of filtered molecules:", dataset.n_filtered)
print("* Number of conformers:", sum(confs))
print(
    "* Number of conformers per molecule (min, mean, max): "
    f"{confs.min()}, {confs.mean():.2f}, {confs.max()}"
)

# one total mass per molecule (flat list; no extra nesting needed)
masses = [
    sum(atom.mass.m for atom in molecule.atoms)
    for molecule in dataset.molecules
]
print(f"* Mean molecular weight: {np.mean(masses):.2f}")
print(f"* Max molecular weight: {np.max(masses):.2f}")
print("* Charges:", sorted(set(m.total_charge.m for m in dataset.molecules)))


print("## Metadata")
# Sort the element symbols so the README line is identical on every run;
# iterating the stored collection directly gave an order that could change
# between kernel sessions.
elements = ", ".join(sorted(dataset.metadata.dict()["elements"]))
print(f"* Elements: {{{elements}}}")


fields = [
    "basis",
    "implicit_solvent",
    "keywords",
    "maxiter",
    "method",
    "program",
]
for spec, obj in dataset.qc_specifications.items():
    od = obj.dict()
    print("* Spec:", spec)
    for field in fields:
        print(f"\t * {field}: {od[field]}")
    print("\t* SCF properties:")
    for field in od["scf_properties"]:
        print(f"\t\t* {field}")


# export the dataset
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("output.smi", "smi")
dataset.visualize("dataset.pdf", columns=8)

* Number of unique molecules: 70
* Number of driven torsions: 82
* Number of filtered molecules: 0
* Number of conformers: 171
* Number of conformers per molecule (min, mean, max): 1, 2.09, 5
* Mean molecular weight: 145.98
* Max molecular weight: 280.75
* Charges: [-1.0, 0.0, 1.0]
## Metadata
* Elements: {S, Br, C, N, Cl, O, F, H}
* Spec: default
	 * basis: DZVP
	 * implicit_solvent: None
	 * keywords: {}
	 * maxiter: 200
	 * method: B3LYP-D3BJ
	 * program: psi4
	* SCF properties:
		* dipole
		* quadrupole
		* wiberg_lowdin_indices
		* mayer_indices
