In [1]:
import logging
import bz2
import sys
import os

In [2]:
import numpy as np

In [3]:
import qcfractal.interface as ptl
import tqdm
import sys
from qcsubmit import workflow_components
from qcsubmit.factories import OptimizationDatasetFactory
from openforcefield.topology import Molecule





In [4]:
# Silence the undefined-stereochemistry warnings by raising the root logger
# threshold to ERROR. Note that this also hides every other message below
# ERROR that propagates to the root logger.
_root_logger = logging.getLogger()
_root_logger.setLevel(logging.ERROR)

In [5]:
def test_submit(qcs_ds, client):
    result = qcs_ds.submit(client)
    print(result)
    print("Total tasks:", sum(result.values()))

def test_submit_to_local_server(qcs_ds, address="localhost:7777", verify=False):
    """Submit a dataset as a test run to an already-running server.

    Parameters
    ----------
    qcs_ds
        Dataset object handed on to ``test_submit``.
    address : str, optional
        Address given to ``ptl.FractalClient``. Defaults to
        ``"localhost:7777"``, the value this function used to hard-code.
    verify : bool, optional
        Forwarded as the ``verify`` keyword of ``ptl.FractalClient``.
        Defaults to ``False``, as before.

    Returns
    -------
    None
        Progress and submission results are printed.
    """
    # Message fixed: it previously read "Submitting to test-run to local server".
    print("Submitting test-run to local server")
    client = ptl.FractalClient(address, verify=verify)
    test_submit(qcs_ds, client)
    
def test_submit_to_test_server(qcs_ds):
    """Submit a dataset as a test run to a freshly created snowflake server.

    A new ``FractalSnowflakeHandler`` is created on every call and its client
    is used for the submission via ``test_submit``.

    Parameters
    ----------
    qcs_ds
        Dataset object handed on to ``test_submit``.

    Returns
    -------
    None
        Progress and submission results are printed.
    """
    # Imported here so qcfractal's server module is only loaded when a test
    # submission is actually requested.
    from qcfractal import FractalSnowflakeHandler

    print("Submitting test-run to in-memory server")
    snowflake = FractalSnowflakeHandler()
    snowflake_client = snowflake.client()
    test_submit(qcs_ds, snowflake_client)


In [6]:
# Directory holding the input SMILES files, and the files to load from it.
smi_prefix_dir = "smi"
smi_files = ["AlkEthOH_chain.smi.bz2", "AlkEthOH_rings.smi.bz2", "PhEthOH.smi.bz2"]

# One Molecule per non-blank input line, accumulated across all files.
mols = []
for smi_file in smi_files:

    # bz2-compressed inputs are opened through bz2; anything else as plain text.
    opener = bz2.open if smi_file.endswith("bz2") else open

    with opener(os.path.join(smi_prefix_dir, smi_file), 'rt') as fd:
        # Drop blank/whitespace-only lines up front: `"".split()[0]` would
        # raise IndexError (e.g. on a trailing empty line at end of file).
        lines = [line for line in fd if line.strip()]

    # The file is already closed here; only the in-memory lines are needed.
    for line in tqdm.tqdm(
        lines, total=len(lines), ncols=80,
        desc="{:30s}".format(smi_file), file=sys.stdout
    ):
        # Only the first whitespace-separated token is used as the SMILES;
        # anything after it on the line is ignored.
        smi = line.split()[0]
        mols.append(
            Molecule.from_smiles(smi, allow_undefined_stereo=True)
        )
print("\nFiles loaded; molecules generated.")

AlkEthOH_chain.smi.bz2        : 100%|██████| 1303/1303 [00:01<00:00, 657.03it/s]
AlkEthOH_rings.smi.bz2        : 100%|██████| 1156/1156 [00:01<00:00, 629.37it/s]
PhEthOH.smi.bz2               : 100%|██████| 5082/5082 [00:13<00:00, 371.35it/s]

Files loaded; molecules generated.


In [7]:
# Generate the workflow to apply to the molecules
qcs_ds = OptimizationDatasetFactory()

component = workflow_components.EnumerateStereoisomers()
component.max_isomers = 100
component.toolkit = "rdkit"
qcs_ds.add_workflow_component(component)

component = workflow_components.StandardConformerGenerator()
component.max_conformers = 100
component.toolkit = "rdkit"
component.rms_cutoff = 3.0
qcs_ds.add_workflow_component(component)

In [8]:
# The same one-line summary is used for both the description and the tagline.
summary = "A diverse set of small CHO molecules"

# Run the registered workflow over the loaded molecules to build the dataset.
dataset = qcs_ds.create_dataset(
    molecules=mols,
    dataset_name="OpenFF Sandbox CHO PhAlkEthOH v1.0",
    description=summary,
    tagline=summary,
)
print("Workflow complete; dataset generated.")

Deduplication                 : 100%|██████| 7541/7541 [00:07<00:00, 977.29it/s]
EnumerateStereoisomers        : 100%|██████| 7408/7408 [01:03<00:00, 115.76it/s]
StandardConformerGenerator    : 100%|█████| 10505/10505 [36:59<00:00,  4.73it/s]
Preparation                   : 100%|█████| 10505/10505 [02:27<00:00, 71.00it/s]

Workflow complete; dataset generated.





In [9]:
# Fill in the descriptive metadata fields on the generated dataset.
metadata = dataset.metadata
metadata.short_description = "A diverse set of small CHO molecules"
metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2020-09-18-OpenFF-Sandbox-CHO-PhAlkEthOH"
metadata.long_description = "This dataset contains an expanded set of the AlkEthOH and PhEthOH datasets, which were used in the original derivation of the smirnoff99Frosst parameters."

# Conformer count of every molecule in the dataset, for the summary below.
conformer_counts = np.array(
    [len(molecule.conformers) for molecule in dataset.molecules]
)

# Summary statistics of the generated dataset.
print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
fewest = conformer_counts.min()
average = format(conformer_counts.mean(), "6.2f")
most = conformer_counts.max()
print("Number of conformers min mean max", fewest, average, most)

# Write the dataset out to a bz2-suffixed JSON file in the working directory.
dataset.export_dataset("dataset.json.bz2")

Number of unique molecules        10505
Number of filtered molecules      0
Number of conformers              12592
Number of conformers min mean max 1   1.20 5


In [10]:
# Trial submission of the generated dataset through test_submit_to_test_server
# (which spins up its own FractalSnowflakeHandler); the %time magic reports the
# CPU and wall-clock time taken by the call.
%time test_submit_to_test_server(dataset)

Submitting test-run to in-memory server
{'default': 12271}
Total tasks: 12271
CPU times: user 6min 30s, sys: 7.53 s, total: 6min 38s
Wall time: 16min 36s


In [1]:
from qcsubmit.datasets import OptimizationDataset

# Reload the previously exported dataset, amend its long description and
# submitter, then write it back over the same file.
dataset = OptimizationDataset.parse_file('./dataset.json.bz2')

metadata = dataset.metadata
metadata.long_description = "This dataset contains a stereo-expanded version of the AlkEthOH dataset, and the original PhEthOH dataset, which were used in the original derivation of the smirnoff99Frosst parameters."
metadata.submitter = 'trevorgokey'

dataset.export_dataset("dataset.json.bz2")





In [2]:
from qcsubmit.datasets import OptimizationDataset

# Load the exported dataset from disk and render it to a PDF report.
ds = OptimizationDataset.parse_file('./dataset.json.bz2')
ds.visualize(
    'visualize.pdf',
    annotate=True,
    hires=False,
)

visualize.pdf: 100%|██████████████████████| 10505/10505 [04:33<00:00, 38.48it/s]


Processing visualize.pdf; this could take several minutes for a large dataset
Loading pages (1/6)
Counting pages (2/6)                                               
Resolving links (4/6)                                                       
Loading headers and footers (5/6)                                           
Printing pages (6/6)
Done                                                                            
