# OpenFF Lipid Torsion Drives v4.0

This notebook generates additional torsion drives for lipid-like molecules, to be used in Sage training.

In [1]:
import zstandard
import qcportal
import pathlib

from openff.toolkit import Molecule, ForceField
import numpy as np

from openff.qcsubmit.utils import get_symmetry_classes, get_symmetry_group
from openff.qcsubmit.workflow_components import TorsionIndexer
from openff.qcsubmit import workflow_components
from openff.qcsubmit.factories import TorsiondriveDatasetFactory
from openff.qcsubmit.utils.visualize import molecules_to_pdf

# NOTE(review): pkg_resources is deprecated by setuptools and resource_filename
# is not referenced in the visible cells -- confirm before dropping this import.
from pkg_resources import resource_filename


In [2]:
def load_molecules(
    files: list[str],
    target_params: list[str],
) -> list[Molecule]:
    """Load molecules from files and tag the torsions that should be driven.

    Every molecule is labelled with the ``openff-2.1.0.offxml`` force field.
    For each proper torsion whose parameter id is in ``target_params`` and
    whose central bond is not in a ring, the first torsion found per central
    bond is registered in a ``TorsionIndexer`` with a scan range of
    ``(-165, 180)``. The indexer is stored in ``mol.properties["dihedrals"]``.

    Parameters
    ----------
    files
        Paths of molecule files readable by ``Molecule.from_file``.
    target_params
        Proper-torsion parameter ids (e.g. ``"t5"``) to select.

    Returns
    -------
    list[Molecule]
        The molecules that have at least one selected torsion, in the order
        they were read. Molecules without a matching torsion are dropped.
    """
    case_molecules = []
    forcefield = ForceField("openff-2.1.0.offxml")
    # set lookup instead of scanning the list for every labelled torsion
    wanted_ids = set(target_params)

    for file in files:
        loaded = Molecule.from_file(file, allow_undefined_stereo=True)
        # from_file returns a bare Molecule (not a list) when the file holds
        # exactly one molecule; normalise so the loop below always works.
        molecules = loaded if isinstance(loaded, list) else [loaded]

        for mol in molecules:
            unique_central_bonds = set()
            torsion_indexer = TorsionIndexer()
            symmetry_classes = get_symmetry_classes(mol)

            labels = forcefield.label_molecules(mol.to_topology())[0]["ProperTorsions"]
            for (atom_i, atom_j, atom_k, atom_l), parameter in labels.items():
                if parameter.id not in wanted_ids:
                    continue
                # skip ring torsions
                if mol.get_bond_between(atom_j, atom_k).is_in_ring():
                    continue
                # skip duplicates: only one torsion is driven per central bond
                central_bond = tuple(sorted([atom_j, atom_k]))
                if central_bond in unique_central_bonds:
                    continue

                symmetry_group = get_symmetry_group(central_bond, symmetry_classes)
                torsion_indexer.add_torsion(
                    (atom_i, atom_j, atom_k, atom_l), symmetry_group, (-165, 180)
                )
                unique_central_bonds.add(central_bond)

            if len(torsion_indexer.torsions) == 0:
                continue  # skip molecule if no matching torsions found

            mol.properties["dihedrals"] = torsion_indexer
            case_molecules.append(mol)

    return case_molecules


def visualize(mols, filename):
    """Write a PDF containing one drawing per tagged torsion of each molecule.

    ``mols`` must carry a torsion indexer under ``properties["dihedrals"]``;
    ``filename`` is passed straight to ``molecules_to_pdf``.
    """
    per_torsion_copies = []
    for parent in mols:
        indexer = parent.properties["dihedrals"]
        for single_torsion in indexer.torsions.values():
            # work on a copy so each drawing carries only this torsion's dihedrals
            clone = Molecule(parent)
            clone.properties["dihedrals"] = single_torsion.get_dihedrals
            per_torsion_copies.append(clone)
    molecules_to_pdf(per_torsion_copies, filename)

In [3]:
# Proper-torsion parameter ids to drive, grouped by the lipid fragment they cover.
target_param = [
    # --- glycerol backbone ---
    "t5",
    "t95",  # also applies to phosphate headgroup
    "t97",
    "t107",
    "t110",
    # --- ester backbone ---
    "t9",
    "t17",
    "t18",  # also applies to alkenes
    "t19",
    # --- alkenes ---
    "t45",
    "t46",
    # --- amine headgroup ---
    "t50",
    "t58",
    # --- phosphate headgroup ---
    "t159",
    "t160",
]

lipid_molecules = load_molecules(
    files=["input.smi"],
    target_params=target_param,
)
# show how many molecules carry at least one selected torsion
len(lipid_molecules)

16

In [4]:
# Preview the selected torsions: one drawing per driven torsion of each molecule.
# NOTE(review): the export cell later calls dataset.visualize("dataset.pdf", ...)
# with the same file name, so this preview is replaced once that cell runs.
visualize(lipid_molecules, "dataset.pdf")

# Dataset Preparation

In [6]:
# Build the torsiondrive dataset; conformer generation is capped at 5 per molecule.
dataset_factory = TorsiondriveDatasetFactory()
dataset_factory.add_workflow_components(
    workflow_components.StandardConformerGenerator(max_conformers=5),
)

description = (
    "A torsiondrive data set created to improve the coverage of "
    "lipid-like parameters in Sage."
)
dataset = dataset_factory.create_dataset(
    molecules=lipid_molecules,
    dataset_name="OpenFF Lipid Torsion Drives v4.0",
    description=description,
    tagline="Improve lipid torsiondrive coverage in Sage",
)

dataset.metadata.submitter = "JHoeflich1"
# The submission folder name is taken from the current working directory,
# so this cell must be run from inside the submission folder.
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    f"submissions/{pathlib.Path.cwd().name}"
)

Deduplication                 : 100%|██████████| 16/16 [00:00<00:00, 322.46it/s]
StandardConformerGenerator    : 100%|███████████| 16/16 [00:04<00:00,  3.81it/s]
Preparation                   : 100%|███████████| 16/16 [00:05<00:00,  3.19it/s]


In [7]:
# summarize dataset for readme
confs = np.array([len(mol.conformers) for mol in dataset.molecules])

print("* Number of unique molecules:", dataset.n_molecules)
# With multiple torsions per unique molecule, n_molecules * confs.mean() no
# longer equals the number of conformers. Instead, the number of dihedrals *
# confs.mean() should equal the number of conformers. The dataset contains one
# record per driven torsion (rather than combining multiple dihedrals into the
# same record), so n_records is the same as manually adding up len(dihedrals)
# for each record.
print("* Number of driven torsions:", dataset.n_records)
print("* Number of filtered molecules:", dataset.n_filtered)
print("* Number of conformers:", confs.sum())
print(
    "* Number of conformers per molecule (min, mean, max): "
    f"{confs.min()}, {confs.mean():.2f}, {confs.max()}"
)

# one summed atomic mass per entry of dataset.molecules
masses = np.array(
    [
        sum(atom.mass.m for atom in molecule.atoms)
        for molecule in dataset.molecules
    ]
)
print(f"* Mean molecular weight: {masses.mean():.2f}")
print(f"* Min molecular weight: {masses.min():.2f}")
print(f"* Max molecular weight: {masses.max():.2f}")
print("* Charges:", sorted(set(m.total_charge.m for m in dataset.molecules)))


print("## Metadata")
# Sort the elements (like the charges above) so the README text is identical
# from run to run instead of depending on set iteration order.
print(f"* Elements: {{{', '.join(sorted(dataset.metadata.dict()['elements']))}}}")


fields = [
    "basis",
    "implicit_solvent",
    "keywords",
    "maxiter",
    "method",
    "program",
]
for spec, obj in dataset.qc_specifications.items():
    od = obj.dict()
    print("* Spec:", spec)
    for field in fields:
        print(f"\t * {field}: {od[field]}")
    print("\t* SCF properties:")
    for field in od["scf_properties"]:
        print(f"\t\t* {field}")


# export the dataset
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("output.smi", "smi")
dataset.visualize("dataset.pdf", columns=8)

* Number of unique molecules: 16
* Number of driven torsions: 78
* Number of filtered molecules: 0
* Number of conformers: 321
* Number of conformers per molecule (min, mean, max): 1, 4.12, 5
* Mean molecular weight: 194.30
* Min molecular weight: 74.08
* Max molecular weight: 297.22
* Charges: [-1.0, 0.0, 1.0]
## Metadata
* Elements: {C, H, O, P, N}
* Spec: default
	 * basis: DZVP
	 * implicit_solvent: None
	 * keywords: {}
	 * maxiter: 200
	 * method: B3LYP-D3BJ
	 * program: psi4
	* SCF properties:
		* dipole
		* quadrupole
		* wiberg_lowdin_indices
		* mayer_indices
