In [1]:
import logging
import warnings
from collections import defaultdict
from dataclasses import dataclass
from pprint import pprint

import qcportal
import openeye # these two lines prevent zstd header error

import numpy as np
from openff.qcsubmit import workflow_components
from openff.qcsubmit.factories import TorsiondriveDatasetFactory
from openff.qcsubmit.workflow_components import TorsionIndexer
from openff.toolkit import ForceField, Molecule
from openff.toolkit.utils import GLOBAL_TOOLKIT_REGISTRY, OpenEyeToolkitWrapper
from tqdm import tqdm

In [2]:
# Quiet the notebook output: ignore Python warnings and only let ERROR-level
# (or worse) records through from the "openff.toolkit" logger. The messages
# being hidden are the undefined-stereo and charged-molecule notices.
warnings.simplefilter("ignore")
toolkit_logger = logging.getLogger("openff.toolkit")
toolkit_logger.setLevel(logging.ERROR)

In [3]:
# Deregister the OpenEye wrapper from the global toolkit registry so that
# RDKit is used throughout. The wrapper is registered again in the export
# cell, right before `dataset.visualize` is called.
GLOBAL_TOOLKIT_REGISTRY.deregister_toolkit(OpenEyeToolkitWrapper)

# Dataset Preparation

## Load SMILES and Force Field

In [4]:
# Map each parameter id to the list of SMILES strings listed for it in
# "all.smiles". Each non-blank line must have exactly two whitespace-separated
# fields, "pid smiles"; any other field count still raises ValueError.
p_smiles = defaultdict(list)  # pid -> [smiles]
with open("all.smiles") as inp:
    for line in inp:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing empty line at the end of
            # the file) instead of failing on the tuple unpacking below.
            continue
        pid, smiles = fields
        p_smiles[pid].append(smiles)

# Force field used below to label proper torsions; the pids loaded above are
# compared against the ids of its ProperTorsions parameters.
ff = ForceField("tm.v2.offxml")

## Tag the torsions to drive

In [5]:
# adapted from 2022-04-29-OpenFF-multiplicity-correction-torsion-drive-data
# in the function create_tagged_molecules
#
# For every (pid, SMILES) pair: drop molecules that contain an excluded
# element or whose absolute total charge exceeds 2, label the rest with `ff`,
# and tag the first proper torsion whose parameter id equals `pid`.
remove = {5, 14, 33}  # atomic numbers of B, Si, and As
filtered_elements = []
filtered_charge = []
molecules: list[Molecule] = []
for pid, smiles in tqdm(p_smiles.items(), desc="labeling molecules"):
    for smile in smiles:
        mol = Molecule.from_smiles(smile, allow_undefined_stereo=True)

        # Filter 1: any excluded element present in the molecule
        if remove.intersection(atom.atomic_number for atom in mol.atoms):
            filtered_elements.append(smile)
            continue

        # Filter 2: highly charged species
        if abs(mol.total_charge.magnitude) > 2.0:
            filtered_charge.append(smile)
            continue

        # One SMILES for t91 gets through Molecule.from_smiles but cannot be
        # labeled (or converted back to SMILES); report it and move on.
        try:
            labels = ff.label_molecules(mol.to_topology())[0]["ProperTorsions"]
        except ValueError:
            print(f"warning: failed to label molecule for {pid} with input smiles: {smile}")
            continue

        # Only the first torsion matching `pid` is used, so each input SMILES
        # contributes at most one tagged molecule.
        matched_indices = next(
            (indices for indices, parameter in labels.items() if parameter.id == pid),
            None,
        )
        if matched_indices is None:
            continue

        # NOTE(review): (0, 3) and (-165, 180) are presumably the symmetry
        # group and the scan range in degrees -- confirm against
        # TorsionIndexer.add_torsion.
        indexer = TorsionIndexer()
        indexer.add_torsion(matched_indices, (0, 3), (-165, 180))
        mol.properties["dihedrals"] = indexer
        molecules.append(mol)

print(f"removed {len(filtered_elements)} molecules containing B, Si, or As:")
pprint(filtered_elements)

print(f"removed {len(filtered_charge)} molecules containing charges > 2:")
pprint(filtered_charge)

labeling molecules:  44%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                     | 37/85 [00:09<00:13,  3.59it/s]



labeling molecules: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 85/85 [00:21<00:00,  4.00it/s]

removed 6 molecules containing B, Si, or As:
['[H]OC(=O)c1c(Cl)c(Cl)c(Cl)c(Cl)c1C1=C2C([H])=C([H])C(=O)C([As]3SC([H])([H])C([H])([H])S3)=C2Oc2c1c([H])c([H])c(O[H])c2[As]1SC([H])([H])C([H])([H])S1',
 '[H]OB(O[H])C([H])([H])C([H])([H])C([H])([H])[C@@]1([H])C([H])([H])[C@@]2([H])N([H])C([H])([H])C([H])([H])[C@@]2([H])[C@]1(C(=O)O[H])N([H])[H]',
 '[H]c1c(Br)nc([C@@](N([H])[S@+]([O-])C(C([H])([H])[H])(C([H])([H])[H])C([H])([H])[H])(C([H])(F)F)C([H])([H])C(=O)OC(C([H])([H])[H])(C([H])([H])[H])C([H])([H])[H])c(F)c1[Si](C([H])([H])C([H])([H])[H])(C([H])([H])C([H])([H])[H])C([H])([H])C([H])([H])[H]',
 '[H]C1=C2C(=O)[C@](O[Si](C([H])([H])[H])(C([H])([H])[H])C(C([H])([H])[H])(C([H])([H])[H])C([H])([H])[H])(C#C/C([H])=C(/[H])C#CC2([H])[H])C([H])([H])C1([H])[H]',
 '[H]C1=C([H])N([H])C(C([H])([H])[H])=C2O[B-](c3c([H])c([H])c([H])c([H])c3[H])(c3c([H])c([H])c([H])c([H])c3[H])[O+]=C12',
 '[H]c1c([H])c([H])c([B-]2(c3c([H])c([H])c([H])c([H])c3[H])[O+]=C(C([H])([H])[H])C(C(=O)C([H])([H])[H])=C(C(Cl)(Cl)Cl




# Prepare the dataset

Again, this is adapted from [the OpenFF multiplicity correction notebook][example]

[example]: https://github.com/openforcefield/qca-dataset-submission/blob/08ace97758087f06cf22a986fc9ee838f72edae7/submissions/2022-04-29-OpenFF-multiplicity-correction-torsion-drive-data/Dataset_Generation.ipynb

In [6]:
# Build a torsion-drive dataset factory whose only added workflow component is
# a conformer generator configured with max_conformers=10.
conformer_generator = workflow_components.StandardConformerGenerator(max_conformers=10)
dataset_factory = TorsiondriveDatasetFactory()
dataset_factory.add_workflow_components(conformer_generator)

# Submission directory: 2024-01-31-OpenFF-Torsion-Coverage-Supplement-v1.0
dataset = dataset_factory.create_dataset(
    molecules=molecules,
    dataset_name="OpenFF Torsion Coverage Supplement v1.0",
    tagline="OpenFF Torsion Coverage Supplement v1.0",
    description="Additional coverage for Sage 2.1.0 proper torsions and new parameters from the torsion multiplicity work",
)

# Record who submitted the dataset and where its long-form description lives.
dataset.metadata.submitter = "ntBre"
dataset.metadata.long_description_url = "/".join(
    [
        "https://github.com/openforcefield/qca-dataset-submission/tree/master",
        "submissions",
        "2024-01-31-OpenFF-Torsion-Coverage-Supplement-v1.0",
    ]
)

Deduplication                 : 100%|████████| 808/808 [00:00<00:00, 881.18it/s]
StandardConformerGenerator    :   0%|           | 2/688 [00:01<08:55,  1.28it/s][17:14:39] UFFTYPER: Unrecognized charge state for atom: 31
[17:14:39] UFFTYPER: Unrecognized charge state for atom: 15
StandardConformerGenerator    :   1%|           | 4/688 [00:02<04:29,  2.54it/s][17:14:39] UFFTYPER: Unrecognized charge state for atom: 15
[17:14:39] UFFTYPER: Unrecognized charge state for atom: 52
[17:14:39] UFFTYPER: Unrecognized charge state for atom: 44
[17:14:39] UFFTYPER: Unrecognized charge state for atom: 27
[17:14:39] UFFTYPER: Unrecognized charge state for atom: 20
[17:14:39] UFFTYPER: Unrecognized charge state for atom: 57
[17:14:39] UFFTYPER: Unrecognized charge state for atom: 50
[17:14:39] UFFTYPER: Unrecognized charge state for atom: 24
[17:14:40] UFFTYPER: Unrecognized charge state for atom: 52
[17:14:40] UFFTYPER: Unrecognized charge state for atom: 13
[17:14:40] UFFTYPER: Unrecognized charg

# Summarize the dataset
Make sure the molecules in the dataset match the input molecules

In [8]:
# Non-isomeric SMILES for every input molecule, regardless of which parameter
# id it was listed under.
old_smiles = set()
for smiles in p_smiles.values():
    for smile in smiles:
        try:
            old_smiles.add(Molecule.from_smiles(smile, allow_undefined_stereo=True).to_smiles(isomeric=False))
        except AssertionError:
            # Some input SMILES pass Molecule.from_smiles but cannot be turned
            # back into SMILES; leave those out of the comparison.
            continue

# Non-isomeric SMILES for the molecules that ended up in the dataset.
new_smiles = {molecule.to_smiles(isomeric=False) for molecule in dataset.molecules}

# Number of SMILES present in exactly one of the two sets.
diff = len(old_smiles.symmetric_difference(new_smiles))
want = 74 # it's hard to say this is exactly what I want, but it is what happens
assert diff == want, f"{diff} != {want}"

Describe the molecules in the dataset

In [9]:
# Conformer count for each molecule in the dataset.
confs = np.array([len(mol.conformers) for mol in dataset.molecules])

# NOTE(review): the third label says "conformers" but prints dataset.n_records;
# for a torsion-drive dataset this is presumably the number of torsion drives
# -- confirm before changing the label.
for label, value in (
    ("Number of unique molecules       ", dataset.n_molecules),
    ("Number of filtered molecules     ", dataset.n_filtered),
    ("Number of conformers             ", dataset.n_records),
):
    print(label, value)
print("Number of conformers min mean max", confs.min(), f"{confs.mean():6.2f}", confs.max())

# thanks matt for updating this section in #344
# One molecular weight per molecule: the sum of its atoms' masses.
masses = [[sum(atom.mass.m for atom in molecule.atoms) for molecule in dataset.molecules]]
molecular_weights = np.array(masses)
print(f'Mean molecular weight: {molecular_weights.mean():.2f}')
print(f'Max molecular weight: {molecular_weights.max():.2f}')

# Distinct total charges found in the dataset, in ascending order.
print("Charges:", sorted({m.total_charge.m for m in dataset.molecules}))

Number of unique molecules        671
Number of filtered molecules      17
Number of conformers              671
Number of conformers min mean max 1   4.07 10
Mean molecular weight: 399.72
Max molecular weight: 833.16
Charges: [-2.0, -1.0, 0.0, 1.0, 2.0]


Describe the dataset

In [10]:
# Pretty-print the dataset metadata as a dict for inspection.
pprint(dataset.metadata.dict())

{'collection_type': 'TorsionDriveDataset',
 'creation_date': datetime.date(2024, 1, 31),
 'dataset_name': 'OpenFF Torsion Coverage Supplement v1.0',
 'elements': {'O', 'P', 'H', 'Br', 'N', 'S', 'F', 'Cl', 'C', 'I'},
 'long_description': 'Additional coverage for Sage 2.1.0 proper torsions and '
                     'new parameters from the torsion multiplicity work',
 'long_description_url': HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2024-01-31-OpenFF-Torsion-Coverage-Supplement-v1.0', ),
 'short_description': 'OpenFF Torsion Coverage Supplement v1.0',
 'submitter': 'ntBre'}


In [11]:
# Show every QC specification attached to the dataset, keyed by its name.
for spec_name, qc_spec in dataset.qc_specifications.items():
    print("Spec:", spec_name)
    pprint(qc_spec.dict())

Spec: default
{'basis': 'DZVP',
 'implicit_solvent': None,
 'keywords': {},
 'maxiter': 200,
 'method': 'B3LYP-D3BJ',
 'program': 'psi4',
 'scf_properties': ['dipole',
                    'quadrupole',
                    'wiberg_lowdin_indices',
                    'mayer_indices'],
 'spec_description': 'Standard OpenFF optimization quantum chemistry '
                     'specification.',
 'spec_name': 'default',
 'store_wavefunction': 'none'}


In [12]:
# Show the SCF properties of the 'default' specification as stored on the
# specification object itself (rather than via its .dict() form).
pprint(dataset.qc_specifications['default'].scf_properties)

[<SCFProperties.Dipole: 'dipole'>,
 <SCFProperties.Quadrupole: 'quadrupole'>,
 <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>,
 <SCFProperties.MayerIndices: 'mayer_indices'>]


Export the dataset.

In [13]:
# Write the dataset itself and a SMILES listing of its molecules.
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("dataset.smi", "smi")

# Re-register the OpenEye wrapper (deregistered near the top of the notebook)
# before rendering the PDF.
# NOTE(review): presumably dataset.visualize needs or prefers OpenEye here --
# confirm; keep the two exports above ahead of this line.
GLOBAL_TOOLKIT_REGISTRY.register_toolkit(OpenEyeToolkitWrapper)
dataset.visualize("dataset.pdf", columns=8)

