In [1]:
import bz2
import logging
import os
import sys
import tarfile

In [2]:
import numpy as np

In [3]:
import qcfractal.interface as ptl
import tqdm
import sys
from qcsubmit import workflow_components
from qcsubmit.factories import OptimizationDatasetFactory
from openforcefield.topology import Molecule



In [4]:
# Raise the root logger threshold to ERROR so that the undefined-stereochemistry
# warnings emitted while loading molecules are not shown.
root_logger = logging.getLogger()
root_logger.setLevel(logging.ERROR)

In [5]:
def test_submit(qcs_ds, client):
    result = qcs_ds.submit(client)
    print(result)
    print("Total tasks:", sum(result.values()))

def test_submit_to_local_server(qcs_ds, address="localhost:7777"):
    """Dry-run a submission against a QCFractal server that is already running.

    Parameters
    ----------
    qcs_ds
        Dataset object exposing ``submit(client)``; forwarded to ``test_submit``.
    address : str, optional
        Address handed to ``ptl.FractalClient``. Defaults to
        ``"localhost:7777"``, the value that used to be hard-coded, so
        existing single-argument calls behave as before.

    Returns ``None``; the submission summary is printed by ``test_submit``.
    """
    print("Submitting test-run to local server")
    # verify=False is kept from the original call; presumably the local test
    # server uses a self-signed certificate -- confirm before pointing
    # `address` at anything that is not a local test instance.
    client = ptl.FractalClient(address, verify=False)
    test_submit(qcs_ds, client)
    
def test_submit_to_test_server(qcs_ds):
    """Dry-run a submission against a freshly created ``FractalSnowflakeHandler``.

    Parameters
    ----------
    qcs_ds
        Dataset object exposing ``submit(client)``; forwarded to ``test_submit``.

    Returns ``None``; the submission summary is printed by ``test_submit``.
    """
    # Imported at call time, so qcfractal's server code is only loaded when
    # this helper is actually used.
    from qcfractal import FractalSnowflakeHandler

    print("Submitting test-run to in-memory server")
    snowflake = FractalSnowflakeHandler()
    snowflake_client = snowflake.client()
    test_submit(qcs_ds, snowflake_client)


In [6]:
# Unpack the reference SDF archive from the sibling submission directory into
# the current directory. This uses the standard library instead of a `!tar`
# shell escape, so no external `tar` binary is needed and a missing or corrupt
# archive raises here instead of letting later cells run without the file.
# The archive is a file from this repository; do not reuse this pattern on
# archives from untrusted sources without restricting what may be extracted.
with tarfile.open(
    "../2020-11-11-OpenFF-Optimization-Set-1-charged/OpenFF_references.sdf.tar.gz",
    mode="r:gz",
) as archive:
    archive.extractall(path=".")

In [7]:
# Read the extracted SDF into OpenFF Molecule objects.
# allow_undefined_stereo=True is passed so that records whose stereochemistry
# is not fully specified are still accepted by the loader.
sdf_file = "./OpenFF_references.sdf"
mols = Molecule.from_file(sdf_file, file_format="sdf", allow_undefined_stereo=True)

print("\nFiles loaded; molecules generated.")


Files loaded; molecules generated.


In [8]:
# Build the factory and register the workflow components that will be applied
# to the molecules: tautomer enumeration, protomer enumeration, then conformer
# generation.
qcs_ds = OptimizationDatasetFactory()

# One entry per component, in registration order: the component class and the
# attributes to set on a fresh instance (assigned in the order listed).
# NOTE(review): the output recorded in this notebook reports exactly one
# conformer per molecule even though max_conformers is 100 -- confirm that
# rms_cutoff = 3.0 is the intended pruning threshold.
component_plan = (
    (
        workflow_components.EnumerateTautomers,
        {"max_tautomers": 100, "toolkit": "openeye"},
    ),
    (
        workflow_components.EnumerateProtomers,
        {"max_states": 100, "toolkit": "openeye"},
    ),
    (
        workflow_components.StandardConformerGenerator,
        {"max_conformers": 100, "toolkit": "openeye", "rms_cutoff": 3.0},
    ),
)

for component_cls, settings in component_plan:
    component = component_cls()
    for attribute, value in settings.items():
        setattr(component, attribute, value)
    qcs_ds.add_workflow_component(component)

In [9]:
# Run the loaded molecules through the factory's registered workflow
# components and collect the result as a named dataset.
dataset_spec = {
    "dataset_name": "OpenFF Optimization Set 1 charged",
    "tagline": "Optimization set with protomers and tautomers",
    "description": "Roche optimization dataset (of Gen 1 fitting) with Protonation states and Tautomers enumerated",
}
dataset = qcs_ds.create_dataset(molecules=mols, **dataset_spec)
print("Workflow complete; dataset generated.")

Deduplication                 : 100%|███████| 468/468 [00:00<00:00, 1143.05it/s]
EnumerateTautomers            : 100%|████████| 468/468 [00:03<00:00, 155.01it/s]
EnumerateProtomers            : 100%|████████| 553/553 [00:04<00:00, 119.59it/s]
StandardConformerGenerator    : 100%|███████| 1049/1049 [00:15<00:00, 67.35it/s]










Preparation                   : 100%|███████| 1043/1043 [00:13<00:00, 75.76it/s]

Workflow complete; dataset generated.





In [10]:
# Fill in the descriptive metadata fields on the generated dataset.
metadata = dataset.metadata
metadata.short_description = "Optimization set with protomers and tautomers"
metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2020-11-11-OpenFF-Optimization-Set-1-charged"
metadata.long_description = "Creating a protonation states and tautomers for Roche optimization dataset, which was used in Gen 1 fitting"

# Conformer count of every molecule in the dataset, used for the min/mean/max
# line of the summary below.
conformer_counts = [len(mol.conformers) for mol in dataset.molecules]
confs = np.array(conformer_counts)

# Print a short summary of what the workflow produced.
print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
mean_text = "{:6.2f}".format(confs.mean())
print("Number of conformers min mean max", confs.min(), mean_text, confs.max())

# Write the dataset to a bz2-compressed JSON file in the current directory.
dataset.export_dataset("dataset.json.bz2")

Number of unique molecules        1043
Number of filtered molecules      6
Number of conformers              1043
Number of conformers min mean max 1   1.00 1


In [11]:
# Dry-run the submission of the generated dataset against the in-memory test
# server; the %time magic reports the CPU and wall time of the call.
%time test_submit_to_test_server(dataset)

Submitting test-run to in-memory server
{'default': 1041}
Total tasks: 1041
CPU times: user 21.2 s, sys: 632 ms, total: 21.8 s
Wall time: 59.6 s


In [12]:
# Reload the exported dataset, set the final long description and the
# submitter, then write it back to the same file. The description text is
# stored in the exported file, so the misspelling of "tautomer" is corrected
# here.
from qcsubmit.datasets import OptimizationDataset

dataset = OptimizationDataset.parse_file('./dataset.json.bz2')
dataset.metadata.long_description = "This dataset contains a tautomer and protomer expanded version of the Roche Optimization set, which was used in Gen 1 fitting of FF."
dataset.metadata.submitter = 'pavankum'
dataset.export_dataset("dataset.json.bz2")









In [13]:
# Load the exported dataset file again and write a visualization of it to
# visualize.pdf in the current directory.
from qcsubmit.datasets import OptimizationDataset

ds = OptimizationDataset.parse_file("./dataset.json.bz2")
ds.visualize("visualize.pdf")







