In [9]:
import logging
import bz2
import sys
import os

In [10]:
import numpy as np

In [11]:
import sys

import qcfractal.interface as ptl
import tqdm
from openforcefield.topology import Molecule
from qcsubmit import workflow_components
from qcsubmit.datasets import OptimizationDataset
from qcsubmit.factories import OptimizationDatasetFactory

In [12]:
# Raise the root logger threshold to ERROR; this is done to turn off the
# undefined stereochemistry warnings.
logging.root.setLevel(logging.ERROR)

In [13]:
def test_submit(qcs_ds, client):
    result = qcs_ds.submit(client)
    print(result)
    print("Total tasks:", sum(result.values()))

def test_submit_to_local_server(qcs_ds, address="localhost:7777"):
    """Test-submit ``qcs_ds`` to a server that is already running.

    Parameters
    ----------
    qcs_ds
        Object exposing ``submit(client)``; it is forwarded to ``test_submit``.
    address : str, optional
        Address handed to ``ptl.FractalClient``. Defaults to
        ``"localhost:7777"``, the value that was previously hard-coded.
    """
    print("Submitting test-run to local server")
    # NOTE(review): verify=False is passed through to the client; presumably
    # this is only acceptable for a local test server - confirm before
    # pointing `address` at anything else.
    client = ptl.FractalClient(address, verify=False)
    test_submit(qcs_ds, client)
    
def test_submit_to_test_server(qcs_ds):
    """Test-submit ``qcs_ds`` to a temporary ``FractalSnowflakeHandler`` server.

    A new handler is created for this call, ``qcs_ds`` is submitted through
    the handler's client via ``test_submit``, and the handler is stopped
    afterwards whether or not the submission succeeded.

    Parameters
    ----------
    qcs_ds
        Object exposing ``submit(client)``; it is forwarded to ``test_submit``.
    """
    from qcfractal import FractalSnowflakeHandler

    print("Submitting test-run to in-memory server")
    server = FractalSnowflakeHandler()
    try:
        test_submit(qcs_ds, server.client())
    finally:
        # Stop the throwaway server here so that repeated calls in one kernel
        # session do not each leave a server running.
        server.stop()


In [14]:
# Load the molecules from the SDF file, allowing undefined stereochemistry.
sdf_file = "./pubLigsNeutralGoodDensity.sdf"
mols = Molecule.from_file(sdf_file, file_format="sdf", allow_undefined_stereo=True)

print("\nFiles loaded; molecules generated.")


Files loaded; molecules generated.


In [16]:
# Build the factory whose workflow is applied to the molecules. Each entry
# below is (component class, attributes to set on it); components are created,
# configured and registered in the order listed.
qcs_ds = OptimizationDatasetFactory()

workflow_spec = (
    (workflow_components.RotorFilter, {"maximum_rotors": 3}),
    (
        workflow_components.EnumerateStereoisomers,
        {"max_isomers": 100, "toolkit": "openeye"},
    ),
    (
        workflow_components.StandardConformerGenerator,
        {"max_conformers": 100, "toolkit": "openeye", "rms_cutoff": 3.0},
    ),
)
for component_cls, settings in workflow_spec:
    component = component_cls()
    for attr_name, attr_value in settings.items():
        setattr(component, attr_name, attr_value)
    qcs_ds.add_workflow_component(component)

In [17]:
# Run the factory's workflow over the loaded molecules to build the dataset.
dataset = qcs_ds.create_dataset(
    molecules=mols,
    dataset_name="Genentech PDB Ligand Expo whole optimization neutral v1.0",
    tagline="Optimization set",
    description="Genentech PDB Ligand Expo optimization dataset neutral",
)
print("Workflow complete; dataset generated.")

Deduplication                 : 100%|████████| 648/648 [00:00<00:00, 677.74it/s]
RotorFilter                   : 100%|████████| 646/646 [00:02<00:00, 216.35it/s]
EnumerateStereoisomers        : 100%|█████████| 127/127 [01:51<00:00,  1.14it/s]
StandardConformerGenerator    : 100%|█████████| 445/445 [00:55<00:00,  7.99it/s]
Preparation                   : 100%|█████████| 445/445 [00:11<00:00, 40.37it/s]

Workflow complete; dataset generated.





In [18]:
# Descriptive metadata stored on the dataset.
dataset.metadata.short_description = "Optimization set"
dataset.metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2020-11-19-Genentech-PDB-Ligand-Expo-whole-optimization-neutral-v1.0"
dataset.metadata.long_description = "Genentech PDB Ligand Expo neutral molecules set"

# Conformer count per molecule, used for the min/mean/max summary line.
confs = np.array([len(entry.conformers) for entry in dataset.molecules])

print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
print("Number of conformers min mean max", confs.min(), f"{confs.mean():6.2f}", confs.max())

# Write the dataset to disk.
dataset.export_dataset("dataset.json.bz2")

Number of unique molecules        445
Number of filtered molecules      519
Number of conformers              445
Number of conformers min mean max 1   1.00 1


In [19]:
# Test-submit the dataset to the temporary in-memory server and time the call.
%time test_submit_to_test_server(dataset)

Submitting test-run to in-memory server
{'default': 445}
Total tasks: 445
CPU times: user 8.7 s, sys: 316 ms, total: 9.01 s
Wall time: 27.5 s


In [20]:
# Reload the exported dataset, replace its long description, record the
# submitter, and write it back to the same file.
dataset = OptimizationDataset.parse_file('./dataset.json.bz2')
dataset.metadata.long_description = "This dataset contains Genentech PDB Ligand Expo neutral set of molecules that filters greater than 3 rotors"
dataset.metadata.submitter = 'pavankum'
dataset.export_dataset("dataset.json.bz2")

In [21]:
# Reload the exported dataset from disk and write its visualization to
# visualize.pdf.
ds = OptimizationDataset.parse_file('./dataset.json.bz2')
ds.visualize('visualize.pdf')

In [22]:
# Generate the workflow to apply to the molecules
qcs_ds = OptimizationDatasetFactory()

component = workflow_components.EnumerateStereoisomers()
component.max_isomers = 100
component.toolkit = "openeye"
qcs_ds.add_workflow_component(component)

component = workflow_components.StandardConformerGenerator()
component.max_conformers = 100
component.toolkit = "openeye"
component.rms_cutoff = 3.0
qcs_ds.add_workflow_component(component)

In [23]:
# Run the factory's workflow over the loaded molecules to build the dataset.
dataset_whole = qcs_ds.create_dataset(
    molecules=mols,
    dataset_name="Genentech PDB Ligand Expo whole optimization neutral v1.0",
    tagline="Optimization set",
    description="Genentech PDB Ligand Expo optimization dataset neutral",
)
print("Workflow complete; dataset generated.")

Deduplication                 : 100%|████████| 648/648 [00:00<00:00, 653.26it/s]
EnumerateStereoisomers        : 100%|█████████| 646/646 [09:19<00:00,  1.16it/s]
StandardConformerGenerator    : 100%|███████| 3101/3101 [09:11<00:00,  5.62it/s]
Preparation                   : 100%|███████| 3090/3090 [01:33<00:00, 33.07it/s]

Workflow complete; dataset generated.





In [28]:
# Conformer count per molecule, used for the min/mean/max summary line.
confs = np.array([len(entry.conformers) for entry in dataset_whole.molecules])

print("Number of unique molecules       ", dataset_whole.n_molecules)
print("Number of filtered molecules     ", dataset_whole.n_filtered)
print("Number of conformers             ", dataset_whole.n_records)
print("Number of conformers min mean max", confs.min(), f"{confs.mean():6.2f}", confs.max())

# Write the dataset to disk.
dataset_whole.export_dataset("dataset_whole.json.bz2")

Number of unique molecules        3090
Number of filtered molecules      2
Number of conformers              3574
Number of conformers min mean max 1   1.16 4


In [29]:
# Reload the exported dataset from disk and write its visualization to
# visualize_whole.pdf.
ds = OptimizationDataset.parse_file('./dataset_whole.json.bz2')
ds.visualize('visualize_whole.pdf')