In [1]:
#imports 

import logging
from pprint import pprint
import sys
import warnings

import numpy as np

from openff.qcsubmit.factories import TorsiondriveDatasetFactory
from openff.qcsubmit.common_structures import TorsionIndexer
from openff.qcsubmit import workflow_components
from openforcefield.topology import Molecule

from openeye import oechem

In [2]:
# Quieten the toolkit: the suppressed messages are the ones that tell us we have
# undefined stereo and charged molecules.
# Only ERROR and above are emitted by the "openforcefield" logger from here on.
logging.getLogger("openforcefield").setLevel(logging.ERROR)
# NOTE(review): this ignores *every* Python warning raised in this kernel, not
# only the toolkit ones; narrow the filter if other warnings should stay visible.
warnings.simplefilter("ignore")

In [3]:
def test_submit(qcs_ds, client):
    result = qcs_ds.submit(client, threads=1)
    print(result)
    print("Total tasks:", sum(result.values()))

def test_submit_to_local_server(qcs_ds):
    """Test-submit ``qcs_ds`` to a server listening on ``localhost:7777``.

    Builds a ``FractalClient`` for ``localhost:7777`` with ``verify=False``
    and hands it to ``test_submit``.
    """
    # `ptl` was never imported in this notebook, so the original body raised
    # NameError on first use. Imported locally, the same way the in-memory
    # variant imports qcfractal, so the dependency is only needed when called.
    import qcportal as ptl

    print("Submitting to test-run to local server")
    client = ptl.FractalClient("localhost:7777", verify=False)
    test_submit(qcs_ds, client)
    
def test_submit_to_test_server(qcs_ds):
    """Test-submit ``qcs_ds`` to a freshly created ``FractalSnowflakeHandler``.

    The handler is created inside this call and its client is passed to
    ``test_submit``.
    """
    from qcfractal import FractalSnowflakeHandler

    print("Submitting test-run to in-memory server")
    snowflake = FractalSnowflakeHandler()
    snowflake_client = snowflake.client()
    test_submit(qcs_ds, snowflake_client)



# Datasets for WBO Interpolation

SMILES lists for every molecule group used to build the WBO interpolation dataset, one list per base functional group.

In [4]:
# Datasets
#
# One list of SMILES strings per "base" functional group. Every list has 13
# entries: the base group carrying one substituent each. The entry order
# appears to follow the substituent order given in the dataset's long
# description (hydroxy, thiol, carboxylic, ..., hydroxide) -- confirm before
# relying on positional correspondence between lists.

# Base group: enyl (prop-1-enyl on a para-substituted benzene ring).
enyl = [
    "CC=Cc1ccc(cc1)O",
    "CC=Cc1ccc(cc1)S",
    "CC=Cc1ccc(cc1)C(=O)O",
    "CC=Cc1ccc(cc1)N",
    "CC=Cc1ccc(cc1)[N+](C)(C)C",
    "CC=Cc1ccc(cc1)NC(=O)N",
    "CC=Cc1ccc(cc1)NC",
    "CC=Cc1ccc(cc1)NON",
    "CC=Cc1ccc(cc1)C#N",
    "CC=Cc1ccc(cc1)C",
    "CC=Cc1ccc(cc1)S(=O)O",
    "CCOc1ccc(cc1)C=CC",
    "CC=Cc1ccc(cc1)[O-]",
]

# Base group: styrene (vinyl on a para-substituted benzene ring).
styrene = [
    "C=Cc1ccc(cc1)O",
    "C=Cc1ccc(cc1)S",
    "C=Cc1ccc(cc1)C(=O)O",
    "C=Cc1ccc(cc1)N",
    "C[N+](C)(C)c1ccc(cc1)C=C",
    "C=Cc1ccc(cc1)NC(=O)N",
    "CNc1ccc(cc1)C=C",
    "C=Cc1ccc(cc1)NON",
    "C=Cc1ccc(cc1)C#N",
    "Cc1ccc(cc1)C=C",
    "C=Cc1ccc(cc1)S(=O)O",
    "CCOc1ccc(cc1)C=C",
    "C=Cc1ccc(cc1)[O-]",
]

# Base group: primary amide.
# NOTE(review): "CNC(=O)N(C)C" and "CN(C)C(=O)S(=O)O" below are character-for-
# character identical to entries in `amide_tertiary`; confirm they are intended
# here rather than the primary-amide analogues.
amide_primary = [
    "C(=O)(N)O",
    "C(=O)(N)S",
    "C(=O)(C(=O)O)N",
    "C(=O)(N)N",
    "C[N+](C)(C)C(=O)N",
    "C(=O)(N)NC(=O)N",
    "CNC(=O)N(C)C",
    "C(=O)(N)NON",
    "C(#N)C(=O)N",
    "CCC(=O)N",
    "CN(C)C(=O)S(=O)O",
    "CCOC(=O)N",
    "C(=O)(N)[O-]",
]

# Base group: secondary amide (N-methyl).
amide_secondary = [
    "CNC(=O)O",
    "CNC(=O)S",
    "CNC(=O)C(=O)O",
    "CNC(=O)N",
    "CNC(=O)[N+](C)(C)C",
    "CNC(=O)NC(=O)N",
    "CNC(=O)NC",
    "CNC(=O)NON",
    "CNC(=O)C#N",
    "CCC(=O)NC",
    "CNC(=O)S(=O)O",
    "CCOC(=O)NC",
    "CNC(=O)[O-]",
]

# Base group: tertiary amide (N,N-dimethyl).
amide_tertiary = [
    "CN(C)C(=O)O",
    "CN(C)C(=O)S",
    "CN(C)C(=O)C(=O)O",
    "CN(C)C(=O)N",
    "CN(C)C(=O)[N+](C)(C)C",
    "CN(C)C(=O)NC(=O)N",
    "CNC(=O)N(C)C",
    "CN(C)C(=O)NON",
    "CN(C)C(=O)C#N",
    "CCC(=O)N(C)C",
    "CN(C)C(=O)S(=O)O",
    "CCOC(=O)N(C)C",
    "CN(C)C(=O)[O-]",
]

# Base group: carbonyl (vinyl carbonyl, C=C-C(=O)-X).
carbonyl = [
    "C=CC(=O)O",
    "C=CC(=O)S",
    "C=CC(=O)C(=O)O",
    "C=CC(=O)N",
    "C[N+](C)(C)C(=O)C=C",
    "C=CC(=O)NC(=O)N",
    "CNC(=O)C=C",
    "C=CC(=O)NON",
    "C=CC(=O)C#N",
    "CCC(=O)C=C",
    "C=CC(=O)S(=O)O",
    "CCOC(=O)C=C",
    "C=CC(=O)[O-]",
]

# Base group: carbamate (N,N-dimethyl, substituent on the ester oxygen).
carbamate = [
    "CN(C)C(=O)OO",
    "CN(C)C(=O)OS",
    "CN(C)C(=O)OC(=O)O",
    "CN(C)C(=O)ON",
    "CN(C)C(=O)O[N+](C)(C)C",
    "CN(C)C(=O)ONC(=O)N",
    "CNOC(=O)N(C)C",
    "CN(C)C(=O)ONON",
    "CN(C)C(=O)OC#N",
    "CCOC(=O)N(C)C",
    "CN(C)C(=O)OS(=O)O",
    "CCOOC(=O)N(C)C",
    "CN(C)C(=O)O[O-]",
]

# Base group: urea (N,N-dimethyl, substituent on the other nitrogen).
urea = [
    "CN(C)C(=O)NO",
    "CN(C)C(=O)NS",
    "CN(C)C(=O)NC(=O)O",
    "CN(C)C(=O)NN",
    "CN(C)C(=O)N[N+](C)(C)C",
    "CN(C)C(=O)NNC(=O)N",
    "CNNC(=O)N(C)C",
    "CN(C)C(=O)NNON",
    "CN(C)C(=O)NC#N",
    "CCNC(=O)N(C)C",
    "CN(C)C(=O)NS(=O)O",
    "CCONC(=O)N(C)C",
    "CN(C)C(=O)N[O-]",
]


# Group name -> [SMILES list, mapped SMARTS].
# The SMARTS carries atom-map indices 1-4; every match of it in a molecule is
# registered as a torsion to drive (see find_torsions). The two-element value
# is unpacked as (smi, smarts) by the per-group loop further down.
groups = {
    "enyl": [enyl, "[C:1]-[C:2]=[C:3]-[c:4]"],
    "styrene": [styrene, "[H:1]-[C:2]=[C:3]-[c:4]"],
    "amide_primary": [amide_primary, "[H:1]-[N:2]-[C:3]=[O:4]"],
    "amide_secondary": [amide_secondary, "[C:1]-[N:2]-[C:3]=[O:4]"],
    "amide_tertiary": [amide_tertiary, "[C:1]-[N:2]-[C:3]=[O:4]"],
    "carbonyl": [carbonyl, "[C:1]=[C:2]-[C:3]=[O:4]"],
    "carbamate": [carbamate, "[C:1]-[N:2]-[C:3]=[O:4]"],
    "urea": [urea, "[C:1]-[N:2]-[C:3]-[N:4]"],
}


In [5]:
def find_torsions(mols: list, smarts: str, verbose=False):
    """Tag each molecule with the torsions matched by a mapped SMARTS pattern.

    For every molecule, all matches of ``smarts`` are added to a fresh
    ``TorsionIndexer`` with a scan range of (-180, 180), and that indexer is
    stored in ``mol.properties["dihedrals"]``. Molecules are modified in place;
    nothing is returned.

    Parameters
    ----------
    mols : list
        Molecules to search. Each must provide ``chemical_environment_matches``,
        ``to_smiles`` and a ``properties`` mapping.
    smarts : str
        Pattern handed to ``chemical_environment_matches``; each match becomes
        one torsion.
    verbose : bool
        When true, print the molecule, the pattern, every torsion added and the
        per-molecule torsion count.

    Raises
    ------
    AssertionError
        If a molecule yields no torsion for ``smarts``. The message names the
        molecule and the pattern.
    """

    for mol in mols:
        matches = mol.chemical_environment_matches(smarts)
        tds = TorsionIndexer()
        if verbose:
            print(
                "\nmolecule: {:32s} searching: {:16s}".format(
                    mol.to_smiles(explicit_hydrogens=True, mapped=True), smarts
                )
            )
        for match in matches:
            if verbose:
                print("    Adding torsion", match)
            # NOTE(review): the trailing True is passed positionally; its meaning
            # depends on the installed TorsionIndexer.add_torsion signature.
            tds.add_torsion(match, (-180, 180), True)
        mol.properties["dihedrals"] = tds
        # Raised explicitly (rather than via a bare `assert`) so the check is
        # not stripped under `python -O` and the failure says which molecule
        # and pattern were involved. The exception type is unchanged.
        if not tds.n_torsions > 0:
            raise AssertionError(
                "No torsion found for SMARTS {!r} in molecule {}".format(
                    smarts, mol.to_smiles(explicit_hydrogens=True, mapped=True)
                )
            )
        if verbose:
            print("    Total torsions: {:d}".format(tds.n_torsions))


In [6]:
def gen_molecules(smi: list):
    """Build one ``Molecule`` per SMILES string, in input order.

    Each string is parsed with ``allow_undefined_stereo=True``. Returns a new
    list of the resulting molecules.
    """
    return [
        Molecule.from_smiles(smiles, allow_undefined_stereo=True)
        for smiles in smi
    ]

## Generate the molecules and torsion indices per group

In [7]:
# Accumulates the expanded, torsion-tagged molecules from every group; this is
# the input to the combined dataset built in the next section.
all_mols = list()

# For each group: build molecules from SMILES, expand them through the
# protomer and stereoisomer workflow, then tag the torsions on the expanded set.
for name, (smi, smarts) in groups.items():
    print("Group: {:8s}".format(name))

    mols = gen_molecules(smi)

    # A fresh factory per group, so workflow components do not accumulate
    # across loop iterations.
    qcs_ds = TorsiondriveDatasetFactory()

    # The workflow only works in *this* order

    component = workflow_components.EnumerateProtomers()
    qcs_ds.add_workflow_component(component)

    component = workflow_components.EnumerateStereoisomers()
    qcs_ds.add_workflow_component(component)

    # Presumably flushed so the "Group:" line lands before the progress bars.
    sys.stdout.flush()
    # This round is designed to create the new molecules from the workflow.
    # Because we are searching each molecule for a SMARTS pattern,
    # we do not assume two protomers would have the same indices
    # for the same torsion, hence the pull of the molecules from the dataset
    # The name/tagline/description here are throwaway placeholders: this
    # intermediate dataset is only used to obtain the expanded molecules.
    dataset = qcs_ds.create_dataset(
        dataset_name="thismustbeeightchars",
        tagline="thismustbeeightchars",
        description="thismustbeeightchars",
        molecules=mols,
    )

    # Torsions are searched on the expanded molecules, not on the inputs.
    # find_torsions raises if any molecule has no match for this group's SMARTS.
    expanded_mols = list(dataset.molecules)
    find_torsions(expanded_mols, smarts, verbose=False)

    # Hack to get our torsions saved to file, rather than have it save all torsions
    # qcs_ds = TorsiondriveDatasetFactory()
    # dataset = qcs_ds.create_dataset(dataset_name="thismustbeeightchars",
    #                                 tagline="thismustbeeightchars",
    #                                 description="thismustbeeightchars",
    #                                 molecules=expanded_mols)
    # dataset.visualize(f"{name}.pdf")

    all_mols.extend(expanded_mols)


Group: enyl    


Deduplication                 : 100%|█████████| 13/13 [00:00<00:00, 1274.09it/s]
EnumerateProtomers            : 100%|██████████| 13/13 [00:00<00:00, 194.90it/s]
EnumerateStereoisomers        : 100%|███████████| 19/19 [00:01<00:00, 14.22it/s]
Preparation                   : 100%|███████████| 36/36 [00:01<00:00, 25.27it/s]


Group: styrene 


Deduplication                 : 100%|█████████| 13/13 [00:00<00:00, 1268.22it/s]
EnumerateProtomers            : 100%|██████████| 13/13 [00:00<00:00, 229.92it/s]
EnumerateStereoisomers        : 100%|███████████| 19/19 [00:00<00:00, 20.39it/s]
Preparation                   : 100%|███████████| 17/17 [00:00<00:00, 42.70it/s]


Group: amide_primary


Deduplication                 : 100%|█████████| 13/13 [00:00<00:00, 2366.99it/s]
EnumerateProtomers            : 100%|██████████| 13/13 [00:00<00:00, 369.48it/s]
EnumerateStereoisomers        : 100%|███████████| 19/19 [00:00<00:00, 20.56it/s]
Preparation                   : 100%|███████████| 17/17 [00:00<00:00, 47.65it/s]


Group: amide_secondary


Deduplication                 : 100%|█████████| 13/13 [00:00<00:00, 2114.80it/s]
EnumerateProtomers            : 100%|██████████| 13/13 [00:00<00:00, 352.63it/s]
EnumerateStereoisomers        : 100%|███████████| 19/19 [00:00<00:00, 19.79it/s]
Preparation                   : 100%|███████████| 17/17 [00:00<00:00, 40.61it/s]


Group: amide_tertiary


Deduplication                 : 100%|█████████| 13/13 [00:00<00:00, 2206.99it/s]
EnumerateProtomers            : 100%|██████████| 13/13 [00:00<00:00, 327.70it/s]
EnumerateStereoisomers        : 100%|███████████| 19/19 [00:00<00:00, 20.15it/s]
Preparation                   : 100%|███████████| 17/17 [00:00<00:00, 28.31it/s]


Group: carbonyl


Deduplication                 : 100%|█████████| 13/13 [00:00<00:00, 1663.44it/s]
EnumerateProtomers            : 100%|██████████| 13/13 [00:00<00:00, 344.00it/s]
EnumerateStereoisomers        : 100%|███████████| 19/19 [00:00<00:00, 20.38it/s]
Preparation                   : 100%|███████████| 17/17 [00:00<00:00, 52.46it/s]


Group: carbamate


Deduplication                 : 100%|█████████| 13/13 [00:00<00:00, 1619.71it/s]
EnumerateProtomers            : 100%|██████████| 13/13 [00:00<00:00, 297.12it/s]
EnumerateStereoisomers        : 100%|███████████| 20/20 [00:01<00:00, 18.39it/s]
Preparation                   : 100%|███████████| 18/18 [00:00<00:00, 31.78it/s]


Group: urea    


Deduplication                 : 100%|█████████| 13/13 [00:00<00:00, 1823.98it/s]
EnumerateProtomers            : 100%|██████████| 13/13 [00:00<00:00, 199.43it/s]
EnumerateStereoisomers        : 100%|███████████| 31/31 [00:01<00:00, 21.81it/s]
Preparation                   : 100%|███████████| 28/28 [00:01<00:00, 25.34it/s]


## Prepare the main dataset by combining the groups together and generate conformers

In [8]:
# Build the dataset that is actually submitted. `qcs_ds` and `dataset` are
# rebound here; the per-group values from the loop above are no longer needed.
qcs_ds = TorsiondriveDatasetFactory()

# Conformer generation is the only workflow component at this stage: protomer
# and stereoisomer expansion were already applied per group.
component = workflow_components.StandardConformerGenerator()
component.max_conformers = 10
# NOTE(review): units of rms_cutoff are not stated here -- confirm against the
# StandardConformerGenerator documentation.
component.rms_cutoff = 0.1
qcs_ds.add_workflow_component(component)

# `all_mols` carries the "dihedrals" property set by find_torsions on every
# molecule.
dataset = qcs_ds.create_dataset(
    dataset_name="OpenFF WBO Conjugated Series v1.0",
    tagline="A functional series of molecules with varying conjugation",
    description="A series of functional groups to study bond conjugation effects for FF parameter interpolation",
    molecules=all_mols,
)


Deduplication                 : 100%|████████| 693/693 [00:02<00:00, 236.27it/s]
StandardConformerGenerator    : 100%|█████████| 165/165 [00:03<00:00, 46.26it/s]
Preparation                   : 100%|█████████| 165/165 [00:04<00:00, 40.78it/s]


## Configure the dataset

In [9]:
# Optimization coordinate system and submission metadata for the main dataset.
dataset.optimization_procedure.coordsys = "dlc"
dataset.metadata.submitter = "jmaat"
# Same text as the `description` passed to create_dataset above.
dataset.metadata.short_description = "A series of functional groups to study bond conjugation effects for FF parameter interpolation"
dataset.metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/OpenFF-WBO-Conjugated-Series"
# NOTE(review): the text below contains typos ("principal" for "principle",
# "pronated" for "protonated") and stray spacing. It is left untouched because
# it is published metadata; fix it deliberately if the dataset is re-issued.
dataset.metadata.long_description = "This is a torsion drive dataset that probes a range of Wiberg bond orders for different chemistries to better understand the relationship between torsion barrier height and Wiberg bond order. The dataset is being used for developing Wiberg bond order interpolated torsion parameters in OpenFF.\nThe general principal behind the dataset is to start with a “base” chemical group and substitute chemical groups onto these “base” groups. The “base” groups included in this dataset are enyl, styrene, primary amide, secondary amide, tertiary amide, carbamate, urea and carbonyl group. For each of these “base” chemical groups, we substitute (1) hydroxy, (2) thiol, (3) carboxylic, (4) primary amine, (5) pronated amine, (6) urea , (7) secondary amine, (8) hydroxyl amine , (9) nitrile, (10) alkene, (11)  sulfone, (12) ethoxy, (13) hydroxide groups. The aim is to substitute chemical groups with varying electron withdrawing and donating properties, which will vary the Wiberg bond order of the central torsion bond. This dataset enables exploration of the effects of Wiberg bond order on the torsion barrier height for various chemistries.\nThis dataset enumerates both the protomeric and stereoisomers of the molecules."


## Describe the dataset

In [10]:
# Print the dataset metadata as a plain dict so it can be reviewed.
pprint(dataset.metadata.dict())

{'collection_type': 'TorsiondriveDataset',
 'creation_date': datetime.date(2021, 2, 3),
 'dataset_name': 'OpenFF WBO Conjugated Series v1.0',
 'elements': {'C', 'S', 'N', 'O', 'H'},
 'long_description': 'This is a torsion drive dataset that probes a range of '
                     'Wiberg bond orders for different chemistries to better '
                     'understand the relationship between torsion barrier '
                     'height and Wiberg bond order. The dataset is being used '
                     'for developing Wiberg bond order interpolated torsion '
                     'parameters in OpenFF.\n'
                     'The general principal behind the dataset is to start '
                     'with a “base” chemical group and substitute chemical '
                     'groups onto these “base” groups. The “base” groups '
                     'included in this dataset are enyl, styrene, primary '
                     'amide, secondary amide, tertiary amide, carbamate, u

In [11]:
# Print every QC specification attached to the dataset, keyed by its name.
for spec, obj in dataset.qc_specifications.items():
    print("Spec:", spec)
    pprint(obj.dict())

Spec: default
{'basis': 'DZVP',
 'implicit_solvent': None,
 'method': 'B3LYP-D3BJ',
 'program': 'psi4',
 'spec_description': 'Standard OpenFF optimization quantum chemistry '
                     'specification.',
 'spec_name': 'default',
 'store_wavefunction': 'none'}


In [12]:
# Print the SCF properties configured on the dataset.
pprint(dataset.scf_properties)

[<SCFProperties.Dipole: 'dipole'>,
 <SCFProperties.Quadrupole: 'quadrupole'>,
 <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>,
 <SCFProperties.MayerIndices: 'mayer_indices'>]


In [13]:
# TODO: get this into metadata
# Molecular weight of every dataset molecule, computed by OpenEye on the
# molecule's OpenEye representation.
masses = [
    oechem.OECalculateMolecularWeight(entry.to_openeye())
    for entry in dataset.molecules
]

print(f'Mean molecular weight: {np.mean(np.array(masses)):.2f}')
print(f'Max molecular weight: {np.max(np.array(masses)):.2f}')

Mean molecular weight: 119.24
Max molecular weight: 182.24


In [14]:

# Conformer count per molecule, used for the min / mean / max summary below.
confs = np.array([len(mol.conformers) for mol in dataset.molecules])
print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of torsion drives         ", dataset.n_records)
print("Number of conformers min mean max", 
      confs.min(), "{:6.2f}".format(confs.mean()), confs.max())

# Write the submission artifacts to relative paths: the serialized dataset,
# the molecules as SMILES, and a PDF visualization.
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("dataset.smi", "smi")
dataset.visualize('dataset.pdf')

Number of unique molecules        165
Number of filtered molecules      0
Number of torsion drives          190
Number of conformers min mean max 1   7.21 10


In [15]:
# Timed dry run: submit the dataset to a throwaway in-memory server.
%time test_submit_to_test_server(dataset)

Submitting test-run to in-memory server
{'default': 190}
Total tasks: 190
CPU times: user 8.24 s, sys: 194 ms, total: 8.44 s
Wall time: 27.5 s
