In [1]:
import logging
import bz2
import sys
import os

In [2]:
import numpy as np

In [3]:
import qcfractal.interface as ptl
import tqdm
import sys
from qcsubmit import workflow_components
from qcsubmit.factories import OptimizationDatasetFactory
from openforcefield.topology import Molecule



In [4]:
# Undefined-stereochemistry warnings need to be turned off: raise the root
# logger's threshold so that only ERROR and above are emitted.
root_logger = logging.getLogger()
root_logger.setLevel(logging.ERROR)

In [5]:
def test_submit(qcs_ds, client):
    result = qcs_ds.submit(client)
    print(result)
    print("Total tasks:", sum(result.values()))

def test_submit_to_local_server(qcs_ds, address="localhost:7777", verify=False):
    """Test-submit ``qcs_ds`` to a server reachable at ``address``.

    Parameters
    ----------
    qcs_ds
        Dataset object exposing ``submit(client)``.
    address : str, optional
        Server address handed to ``ptl.FractalClient``. Defaults to
        ``"localhost:7777"``, the value that used to be hard-coded here.
    verify : bool, optional
        Forwarded as ``FractalClient(verify=...)``. Defaults to ``False``,
        matching the previous hard-coded behaviour.

    Returns
    -------
    None
        Results are printed by ``test_submit``.
    """
    # Wording aligned with test_submit_to_test_server (the original message
    # read "Submitting to test-run to local server").
    print("Submitting test-run to local server")
    client = ptl.FractalClient(address, verify=verify)
    test_submit(qcs_ds, client)
    
def test_submit_to_test_server(qcs_ds):
    """Test-submit ``qcs_ds`` to a freshly started ``FractalSnowflakeHandler``.

    Parameters
    ----------
    qcs_ds
        Dataset object exposing ``submit(client)``.

    Returns
    -------
    None
        Results are printed by ``test_submit``.

    Notes
    -----
    The server is stopped explicitly once the submission finishes (or fails),
    instead of leaving cleanup to garbage collection or interpreter exit.
    """
    # Import kept local to this function, as in the original cell.
    from qcfractal import FractalSnowflakeHandler

    print("Submitting test-run to in-memory server")
    server = FractalSnowflakeHandler()
    try:
        test_submit(qcs_ds, server.client())
    finally:
        # Always release the temporary server, even if submission raised.
        server.stop()


In [6]:
sdf_file = "./pubLigsNeutralGoodDensity.sdf"

# Read the SDF input; molecules with undefined stereochemistry are accepted
# here instead of being rejected at load time.
mols = Molecule.from_file(sdf_file, "sdf", allow_undefined_stereo=True)

print("\nFiles loaded; molecules generated.")


Files loaded; molecules generated.


In [7]:
# Build the factory and register, in order, the workflow components that will
# be applied to the molecules.
qcs_ds = OptimizationDatasetFactory()

# 1) Rotor filter, configured with maximum_rotors = 3.
rotor_filter = workflow_components.RotorFilter()
rotor_filter.maximum_rotors = 3
qcs_ds.add_workflow_component(rotor_filter)

# 2) Stereoisomer enumeration (up to 100 isomers, OpenEye toolkit).
stereo_enumerator = workflow_components.EnumerateStereoisomers()
stereo_enumerator.max_isomers = 100
stereo_enumerator.toolkit = "openeye"
qcs_ds.add_workflow_component(stereo_enumerator)

# 3) Conformer generation (up to 100 conformers, OpenEye toolkit).
# NOTE(review): the recorded run reports exactly one conformer per molecule
# (min/mean/max = 1); confirm that rms_cutoff = 3.0 is the intended setting.
conformer_generator = workflow_components.StandardConformerGenerator()
conformer_generator.max_conformers = 100
conformer_generator.toolkit = "openeye"
conformer_generator.rms_cutoff = 3.0
qcs_ds.add_workflow_component(conformer_generator)

In [8]:
# Run the registered workflow over the loaded molecules and collect the
# result as a named dataset.
dataset_name = "Genentech PDB Ligand Expo whole optimization neutral v1.0"
dataset_description = "Genentech PDB Ligand Expo optimization dataset with neutral molecules, rotors > 3 are filtered out"

dataset = qcs_ds.create_dataset(
    dataset_name=dataset_name,
    molecules=mols,
    description=dataset_description,
    tagline="Optimization set",
)
print("Workflow complete; dataset generated.")

Deduplication                 : 100%|████████| 648/648 [00:01<00:00, 435.17it/s]
RotorFilter                   : 100%|████████| 646/646 [00:04<00:00, 134.09it/s]
EnumerateStereoisomers        : 100%|█████████| 127/127 [02:50<00:00,  1.34s/it]
StandardConformerGenerator    : 100%|█████████| 445/445 [01:23<00:00,  5.35it/s]
Preparation                   : 100%|█████████| 445/445 [00:16<00:00, 27.39it/s]

Workflow complete; dataset generated.





In [9]:
# Fill in the descriptive metadata carried by the dataset.
dataset.metadata.short_description = "Optimization set"
dataset.metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2020-11-19-Genentech-PDB-Ligand-Expo-whole-optimization-neutral-v1.0"
dataset.metadata.long_description = "Genentech PDB Ligand Expo neutral molecules set"

# Summary statistics: one conformer count per molecule in the dataset.
conformer_counts = np.array([len(molecule.conformers) for molecule in dataset.molecules])
print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
print(
    "Number of conformers min mean max",
    conformer_counts.min(),
    f"{conformer_counts.mean():6.2f}",
    conformer_counts.max(),
)

# Write the dataset to disk in the working directory.
dataset.export_dataset("dataset.json.bz2")

Number of unique molecules        445
Number of filtered molecules      519
Number of conformers              445
Number of conformers min mean max 1   1.00 1


In [10]:
# Time a trial submission of the dataset via test_submit_to_test_server
# (%time is an IPython line magic reporting CPU and wall time for the call).
%time test_submit_to_test_server(dataset)

Submitting test-run to in-memory server
{'default': 445}
Total tasks: 445
CPU times: user 10.8 s, sys: 382 ms, total: 11.2 s
Wall time: 34.5 s


In [11]:
from qcsubmit.datasets import OptimizationDataset

# Re-read the exported file, set the submitter and a new long description,
# then export to the same file name again.
dataset = OptimizationDataset.parse_file("./dataset.json.bz2")
dataset.metadata.submitter = "pavankum"
dataset.metadata.long_description = "This dataset contains Genentech PDB Ligand Expo neutral set of molecules that filters out greater than 3 rotors"
dataset.export_dataset("dataset.json.bz2")

In [12]:
from qcsubmit.datasets import OptimizationDataset

# Parse the exported dataset file and write its visualization to visualize.pdf.
exported_path = "./dataset.json.bz2"
ds = OptimizationDataset.parse_file(exported_path)
ds.visualize("visualize.pdf")