In [1]:
import logging
import warnings
from collections import defaultdict
from dataclasses import dataclass
from pprint import pprint

import qcportal
import openeye # these two lines prevent zstd header error

import numpy as np
from openff.qcsubmit import workflow_components
from openff.qcsubmit.factories import TorsiondriveDatasetFactory
from openff.qcsubmit.workflow_components import TorsionIndexer
from openff.toolkit import ForceField, Molecule
from openff.toolkit.utils import GLOBAL_TOOLKIT_REGISTRY, OpenEyeToolkitWrapper
from tqdm import tqdm

In [2]:
# Quieten the notebook: the toolkit otherwise reports undefined stereo and
# charged molecules for many of the inputs.
toolkit_logger = logging.getLogger("openff.toolkit")
toolkit_logger.setLevel(logging.ERROR)
# NOTE(review): this silences every Python warning, not only the toolkit's.
warnings.simplefilter(action="ignore")

In [3]:
# Deregister the OpenEye wrapper so that RDKit is used throughout the
# notebook. The export cell at the end registers the wrapper again before
# visualizing the dataset.
GLOBAL_TOOLKIT_REGISTRY.deregister_toolkit(OpenEyeToolkitWrapper)

# Dataset Preparation

## Load SMILES and Force Field

In [4]:
# pid -> [smiles]: every SMILES string listed for a parameter id, in file order
p_smiles = defaultdict(list)
with open("all.smiles") as inp:
    for line in inp:
        # A blank line (for example a trailing empty line at the end of the
        # file) splits into zero fields and would break the unpacking below.
        if not line.strip():
            continue
        pid, smiles = line.split()  # lines of "pid smiles"
        p_smiles[pid].append(smiles)

# Force field whose proper-torsion parameter ids are matched against the pids
# read above when the molecules are labeled.
ff = ForceField("tm.v2.offxml")

## Tag the torsions to drive

In [5]:
# Adapted from the create_tagged_molecules function in
# 2022-04-29-OpenFF-multiplicity-correction-torsion-drive-data.
#
# For each (pid, SMILES) pair, build the molecule, label it with the force
# field, and tag the first proper torsion whose parameter id equals pid.
# Molecules without such a torsion are not added to the list.
molecules: list[Molecule] = []
for pid, smiles_list in tqdm(p_smiles.items(), desc="labeling molecules"):
    for smile in smiles_list:
        mol = Molecule.from_smiles(smile, allow_undefined_stereo=True)
        # One SMILES for t91 passes Molecule.from_smiles but can't be labeled
        # or turned back into SMILES, so report it and move on.
        try:
            topology = mol.to_topology()
            all_labels = ff.label_molecules(topology)
            labels = all_labels[0]["ProperTorsions"]
        except ValueError:
            print(f"warning: failed to label molecule for {pid} with input smiles: {smile}")
            continue

        # Only the first matching torsion is used, so each input SMILES
        # contributes at most one tagged molecule.
        matching_torsions = (k for k, v in labels.items() if v.id == pid)
        atom_indices = next(matching_torsions, None)
        if atom_indices is None:
            continue

        torsion_indexer = TorsionIndexer()
        torsion_indexer.add_torsion(atom_indices, (0, 3), (-165, 180))
        mol.properties["dihedrals"] = torsion_indexer
        molecules.append(mol)

labeling molecules:  44%|██████████████████████████████████████████████████████████████████████████████▎                                                                                                     | 37/85 [00:09<00:13,  3.64it/s]



labeling molecules: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 85/85 [00:21<00:00,  4.04it/s]


# Prepare the dataset

Again, this is adapted from [the OpenFF multiplicity correction notebook][example]

[example]: https://github.com/openforcefield/qca-dataset-submission/blob/08ace97758087f06cf22a986fc9ee838f72edae7/submissions/2022-04-29-OpenFF-multiplicity-correction-torsion-drive-data/Dataset_Generation.ipynb

In [6]:
# Build the factory and attach a conformer-generation step limited to ten
# conformers.
conformer_generator = workflow_components.StandardConformerGenerator(max_conformers=10)
dataset_factory = TorsiondriveDatasetFactory()
dataset_factory.add_workflow_components(conformer_generator)

# 2024-01-31-OpenFF-Torsion-Coverage-Supplement-v1.0
# The dataset name and the tagline are the same text.
dataset_title = "OpenFF Torsion Coverage Supplement v1.0"
dataset = dataset_factory.create_dataset(
    dataset_name=dataset_title,
    tagline=dataset_title,
    description="Additional coverage for Sage 2.1.0 proper torsions and new parameters from the torsion multiplicity work",
    molecules=molecules,
)

# Submission metadata: who submitted it and the URL of the submission directory.
submission_dir = "2024-01-31-OpenFF-Torsion-Coverage-Supplement-v1.0"
dataset.metadata.submitter = "ntBre"
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    f"submissions/{submission_dir}"
)

Deduplication                 : 100%|████████| 820/820 [00:00<00:00, 886.05it/s]
StandardConformerGenerator    :   0%|           | 1/697 [00:02<25:21,  2.19s/it][15:55:51] UFFTYPER: Unrecognized charge state for atom: 31
[15:55:51] UFFTYPER: Unrecognized charge state for atom: 15
[15:55:51] UFFTYPER: Unrecognized charge state for atom: 15
[15:55:51] UFFTYPER: Unrecognized charge state for atom: 52
[15:55:51] UFFTYPER: Unrecognized charge state for atom: 44
[15:55:51] UFFTYPER: Unrecognized charge state for atom: 57
[15:55:51] UFFTYPER: Unrecognized charge state for atom: 20
[15:55:51] UFFTYPER: Unrecognized charge state for atom: 27
StandardConformerGenerator    :   1%|           | 4/697 [00:02<05:46,  2.00it/s][15:55:51] UFFTYPER: Unrecognized charge state for atom: 50
[15:55:51] UFFTYPER: Unrecognized charge state for atom: 24
[15:55:51] UFFTYPER: Unrecognized charge state for atom: 52
[15:55:51] UFFTYPER: Unrecognized charge state for atom: 13
[15:55:51] UFFTYPER: Unrecognized charg

# Summarize the dataset
Compare the molecules in the dataset with the input molecules; the two sets are expected to differ by a known number of entries.

In [7]:
# Non-isomeric SMILES of every input molecule, after a round trip through
# Molecule so they are written the same way as the dataset molecules below.
old_smiles = set()
for smiles in p_smiles.values():
    for smile in smiles:
        try:
            old_smiles.add(Molecule.from_smiles(smile, allow_undefined_stereo=True).to_smiles(isomeric=False))
        except AssertionError:
            # Same problematic input as in the tagging cell: it parses with
            # from_smiles but cannot be turned back into SMILES, so skip it.
            continue

# Non-isomeric SMILES of the molecules that ended up in the dataset.
new_smiles = {molecule.to_smiles(isomeric=False) for molecule in dataset.molecules}

# Number of SMILES present in exactly one of the two sets.
diff = len(old_smiles.symmetric_difference(new_smiles))
want = 66 # it's hard to say this is exactly what I want, but it is what happens
assert diff == want, f"{diff} != {want}"

Describe the molecules in the dataset

In [8]:
# Conformer count of each molecule, used for the min/mean/max line below.
confs = np.array([len(mol.conformers) for mol in dataset.molecules])

# NOTE(review): the third label says "conformers" but the value printed is
# dataset.n_records; in the recorded output it equals the number of unique
# molecules. Confirm whether the label should name the records instead.
for label, attribute in (
    ("Number of unique molecules       ", "n_molecules"),
    ("Number of filtered molecules     ", "n_filtered"),
    ("Number of conformers             ", "n_records"),
):
    print(label, getattr(dataset, attribute))
print("Number of conformers min mean max",
      confs.min(), f"{confs.mean():6.2f}", confs.max())

# Molecular weight of each molecule as the sum of its atomic masses
# (this section was updated in #344). `masses` stays a one-row nested list.
molecular_weights = [
    sum(atom.mass.m for atom in molecule.atoms)
    for molecule in dataset.molecules
]
masses = [molecular_weights]
mass_array = np.array(masses)
print(f'Mean molecular weight: {np.mean(mass_array):.2f}')
print(f'Max molecular weight: {np.max(mass_array):.2f}')

# Distinct total charges present in the dataset, in ascending order.
unique_charges = {m.total_charge.m for m in dataset.molecules}
print("Charges:", sorted(unique_charges))

Number of unique molecules        679
Number of filtered molecules      18
Number of conformers              679
Number of conformers min mean max 1   4.07 10
Mean molecular weight: 400.69
Max molecular weight: 833.16
Charges: [-6.0, -5.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0]


Describe the dataset

In [9]:
# Pretty-print the dataset metadata as a dictionary.
pprint(dataset.metadata.dict())

{'collection_type': 'TorsionDriveDataset',
 'creation_date': datetime.date(2024, 1, 31),
 'dataset_name': 'OpenFF Torsion Coverage Supplement v1.0',
 'elements': {'As',
              'B',
              'Br',
              'C',
              'Cl',
              'F',
              'H',
              'I',
              'N',
              'O',
              'P',
              'S',
              'Si'},
 'long_description': 'Additional coverage for Sage 2.1.0 proper torsions and '
                     'new parameters from the torsion multiplicity work',
 'long_description_url': HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2024-01-31-OpenFF-Torsion-Coverage-Supplement-v1.0', ),
 'short_description': 'OpenFF Torsion Coverage Supplement v1.0',
 'submitter': 'ntBre'}


In [10]:
# Print the name of every QC specification on the dataset, followed by its
# settings as a dictionary.
for spec_name, qc_spec in dataset.qc_specifications.items():
    print("Spec:", spec_name)
    pprint(qc_spec.dict())

Spec: default
{'basis': 'DZVP',
 'implicit_solvent': None,
 'keywords': {},
 'maxiter': 200,
 'method': 'B3LYP-D3BJ',
 'program': 'psi4',
 'scf_properties': ['dipole',
                    'quadrupole',
                    'wiberg_lowdin_indices',
                    'mayer_indices'],
 'spec_description': 'Standard OpenFF optimization quantum chemistry '
                     'specification.',
 'spec_name': 'default',
 'store_wavefunction': 'none'}


In [11]:
# Show the SCF properties listed on the 'default' QC specification.
pprint(dataset.qc_specifications['default'].scf_properties)

[<SCFProperties.Dipole: 'dipole'>,
 <SCFProperties.Quadrupole: 'quadrupole'>,
 <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>,
 <SCFProperties.MayerIndices: 'mayer_indices'>]


Export the dataset.

In [12]:
# Write the dataset and a SMILES listing of its molecules to disk.
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("dataset.smi", "smi")

# Register the OpenEye wrapper again (it was deregistered near the top of the
# notebook) before writing the PDF of the molecules.
# NOTE(review): presumably visualize() is meant to use OpenEye here — confirm.
GLOBAL_TOOLKIT_REGISTRY.register_toolkit(OpenEyeToolkitWrapper)
dataset.visualize("dataset.pdf", columns=8)

