In [1]:
import logging
import bz2
import sys
import os

In [2]:
import numpy as np

In [3]:
import qcfractal.interface as ptl
import tqdm
import sys
from openff.qcsubmit import workflow_components
from openff.qcsubmit.factories import OptimizationDatasetFactory
from openforcefield.topology import Molecule

In [4]:
# Raise the root logger's threshold to ERROR so that the undefined
# stereochemistry warnings are not emitted while molecules are processed.
logging.root.setLevel(logging.ERROR)

In [5]:
def test_submit(qcs_ds, client):
    result = qcs_ds.submit(client)
    print(result)
    print("Total tasks:", sum(result.values()))

def test_submit_to_local_server(qcs_ds):
    """Submit ``qcs_ds`` via a ``FractalClient`` connected to ``localhost:7777``.

    A status line is printed, a client is created with ``verify=False``
    (presumably disabling certificate verification -- confirm against the
    QCFractal client documentation), and the submission is delegated to
    ``test_submit``.
    """
    print("Submitting to test-run to local server")
    test_submit(qcs_ds, ptl.FractalClient("localhost:7777", verify=False))
    
def test_submit_to_test_server(qcs_ds):
    """Submit ``qcs_ds`` to a freshly created ``FractalSnowflakeHandler``.

    The printed status line describes the target as an in-memory server.
    The submission itself is delegated to ``test_submit`` using the client
    obtained from the handler.
    """
    # Imported inside the function, so qcfractal's server-side handler is only
    # needed when this helper is actually called.
    from qcfractal import FractalSnowflakeHandler

    print("Submitting test-run to in-memory server")
    # Bind the handler to a local name so it stays referenced for the whole
    # duration of the submission below.
    snowflake = FractalSnowflakeHandler()
    snowflake_client = snowflake.client()
    test_submit(qcs_ds, snowflake_client)


In [6]:
sdf_file = "./pubLigsNeutralGoodDensity.sdf"

# Read the ligands from the SDF file; undefined stereochemistry is explicitly
# allowed at load time.
molecules = Molecule.from_file(sdf_file, "sdf", allow_undefined_stereo=True)

# Molecule.from_file returns a bare Molecule (not a list) when the file holds
# a single record; normalise to a list so the filter below always iterates
# over molecules.
if isinstance(molecules, Molecule):
    molecules = [molecules]

# Considering molecules that have rotors greater than 3 for fragmentation down the line
mols = [
    molecule
    for molecule in molecules
    if len(molecule.find_rotatable_bonds()) > 3
]
print("\nFiles loaded; molecules generated.")


Files loaded; molecules generated.


In [7]:
# Generate the workflow to apply to the molecules.
# NOTE(review): despite its name, `qcs_ds` is the dataset *factory*; the
# dataset itself is only produced later by `qcs_ds.create_dataset(...)`.
# Components are registered in the order fragment -> enumerate stereoisomers
# -> generate conformers, which matches the stage order shown in the progress
# output of the dataset-creation cell.
qcs_ds = OptimizationDatasetFactory()

# Stage 1: fragment the input molecules with the WBO fragmenter, with
# `keep_non_rotor_ring_substituents` switched on.
component = workflow_components.WBOFragmenter()
component.keep_non_rotor_ring_substituents = True
qcs_ds.add_workflow_component(component)

# Stage 2: enumerate stereoisomers (limit set to 100 via `max_isomers`),
# using the "openeye" toolkit backend.
component = workflow_components.EnumerateStereoisomers()
component.max_isomers = 100
component.toolkit = "openeye"
qcs_ds.add_workflow_component(component)

# Stage 3: generate conformers (limit set to 100 via `max_conformers`),
# again with the "openeye" backend.
# NOTE(review): `rms_cutoff` is presumably in Angstrom -- TODO confirm; the
# statistics printed after dataset creation report a mean of 1.02 conformers
# per molecule with this setting.
component = workflow_components.StandardConformerGenerator()
component.max_conformers = 100
component.toolkit = "openeye"
component.rms_cutoff = 3.0
qcs_ds.add_workflow_component(component)

In [8]:
# Apply the registered workflow components to the filtered molecules and
# collect the result into a named dataset.
dataset = qcs_ds.create_dataset(
    dataset_name="Genentech PDB Ligand Expo fragment optimization neutral v1.0",
    molecules=mols,
    description=(
        "Genentech PDB Ligand Expo optimization dataset with neutral molecules, "
        "molecules with rotors > 3 are fragmented"
    ),
    tagline="Optimization set",
)
print("Workflow complete; dataset generated.")

Deduplication                 : 100%|████████| 521/521 [00:01<00:00, 332.13it/s]
WBOFragmenter                 :   0%|                   | 0/519 [00:00<?, ?it/s]







  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))






WBOFragmenter                 :   0%|         | 1/519 [00:16<2:25:40, 16.87s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))






  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :   1%|           | 5/519 [00:22<31:27,  3.67s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))














  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :   1%|         | 6/519 [00:52<1:24:55,  9.93s/it]





WBOFragmenter                 :   5%|▌         | 27/519 [01:10<11:18,  1.38s/it]



WBOFragmenter                 :   6%|▌         | 32/519 [01:15<10:04,  1.24s/it]





WBOFragmenter                 :   7%|▋         | 35/519 [01:21<11:09,  1.38s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))






  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :   8%|▊         | 42/519 [01:37<13:19,  1.68s/it]





  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :   8%|▊         | 42/519 [01:48<13:19,  1.68s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :   9%|▉         | 48/519 [02:01<21:22,  2.72s/it]



WBOFragmenter                 :  10%|█         | 53/519 [02:01<13:50,  1.78s/it]





WBOFragmenter                 :  12%|█▏        | 60/519 [02:07<10:37,  1.39s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  13%|█▎        | 66/519 [02:08<07:37,  1.01s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))






  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  13%|█▎        | 68/519 [02:25<15:16,  2.03s/it]







  problematic_bonds, oechem.OEMolToSmiles(molecule)))








WBOFragmenter                 :  19%|█▋       | 100/519 [03:06<06:46,  1.03it/s]



WBOFragmenter                 :  19%|█▋       | 100/519 [03:18<06:46,  1.03it/s]



WBOFragmenter                 :  20%|█▊       | 104/519 [03:21<10:37,  1.53s/it]



WBOFragmenter                 :  20%|█▊       | 106/519 [03:27<11:37,  1.69s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  20%|█▊       | 106/519 [03:38<11:37,  1.69s/it]



WBOFragmenter                 :  21%|█▊       | 108/519 [03:43<18:22,  2.68s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  24%|██▏      | 127/519 [04:05<08:39,  1.33s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  26%|██▍      | 137/519 [04:08<05:57,  1.07it/s]





  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  27%|██▍      | 142/519 [04:14<06:26,  1.03s/it]



WBOFragmenter                 :  28%|██▌      | 146/519 [04:17<06:01,  1.03it/s]







WBOFragmenter                 :  28%|██▌      | 146/519 [04:28<06:01,  1.03it/s]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))








  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  29%|██▌      | 149/519 [04:40<12:59,  2.11s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  33%|██▉      | 171/519 [04:51<05:08,  1.13it/s]



WBOFragmenter                 :  34%|███      | 174/519 [05:04<08:11,  1.43s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  37%|███▎     | 191/519 [05:21<05:22,  1.02it/s]



WBOFragmenter                 :  37%|███▎     | 193/519 [05:28<08:26,  1.55s/it]



WBOFragmenter                 :  38%|███▍     | 195/519 [05:30<07:49,  1.45s/it]



WBOFragmenter                 :  38%|███▍     | 196/519 [05:53<24:54,  4.63s/it]



WBOFragmenter                 :  39%|███▌     | 202/519 [05:53<10:48,  2.05s/it]



WBOFragmenter                 :  40%|███▌     | 206/519 [05:53<07:00,  1.34s/it]



WBOFragmenter                 :  41%|███▋     | 213/519 [06:10<09:13,  1.81s/it]



WBOFragmenter                 :  43%|███▉     | 225/519 [06:20<04:51,  1.01it/s]





WBOFragmenter                 :  43%|███▉     | 225/519 [06:38<04:51,  1.01it/s]



WBOFragmenter                 :  44%|███▉     | 229/519 [06:39<09:14,  1.91s/it]



WBOFragmenter                 :  45%|████     | 233/519 [06:46<08:30,  1.79s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  47%|████▏    | 243/519 [06:53<04:38,  1.01s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  47%|████▏    | 244/519 [07:16<13:17,  2.90s/it]



WBOFragmenter                 :  48%|████▎    | 251/519 [07:20<07:31,  1.68s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  50%|████▍    | 257/519 [07:27<06:18,  1.44s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  51%|████▌    | 264/519 [07:43<06:43,  1.58s/it]







  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))






WBOFragmenter                 :  60%|█████▎   | 309/519 [09:47<02:43,  1.28it/s]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  60%|█████▎   | 309/519 [09:58<02:43,  1.28it/s]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  61%|█████▍   | 314/519 [10:01<04:29,  1.31s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  62%|█████▌   | 321/519 [10:22<05:35,  1.70s/it]



WBOFragmenter                 :  63%|█████▋   | 326/519 [10:35<06:14,  1.94s/it]



WBOFragmenter                 :  65%|█████▊   | 335/519 [10:41<03:43,  1.21s/it]



WBOFragmenter                 :  66%|█████▉   | 340/519 [10:46<03:35,  1.21s/it]



WBOFragmenter                 :  66%|█████▉   | 343/519 [10:49<03:22,  1.15s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  67%|██████   | 346/519 [10:53<03:26,  1.19s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  67%|██████   | 348/519 [10:58<03:59,  1.40s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  67%|██████   | 348/519 [11:18<03:59,  1.40s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  69%|██████▏  | 357/519 [11:27<04:46,  1.77s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  72%|██████▌  | 375/519 [11:35<01:44,  1.38it/s]



WBOFragmenter                 :  72%|██████▌  | 375/519 [11:48<01:44,  1.38it/s]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  73%|██████▌  | 378/519 [11:51<03:57,  1.68s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  75%|██████▋  | 388/519 [12:06<03:10,  1.46s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  75%|██████▋  | 388/519 [12:18<03:10,  1.46s/it]



WBOFragmenter                 :  75%|██████▊  | 390/519 [12:20<05:26,  2.53s/it]



WBOFragmenter                 :  76%|██████▊  | 393/519 [12:20<03:48,  1.81s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  76%|██████▊  | 396/519 [12:23<03:06,  1.52s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  77%|██████▉  | 398/519 [12:29<03:50,  1.91s/it]



WBOFragmenter                 :  77%|██████▉  | 400/519 [12:30<03:02,  1.53s/it]



WBOFragmenter                 :  78%|███████  | 407/519 [12:45<03:06,  1.67s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  78%|███████  | 407/519 [12:58<03:06,  1.67s/it]



WBOFragmenter                 :  79%|███████  | 408/519 [13:01<06:09,  3.33s/it]



WBOFragmenter                 :  79%|███████▏ | 411/519 [13:01<03:57,  2.20s/it]



WBOFragmenter                 :  80%|███████▏ | 415/519 [13:02<02:26,  1.41s/it]



WBOFragmenter                 :  80%|███████▏ | 415/519 [13:18<02:26,  1.41s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  81%|███████▏ | 418/519 [13:25<05:39,  3.37s/it]





WBOFragmenter                 :  83%|███████▍ | 430/519 [13:54<02:56,  1.99s/it]



WBOFragmenter                 :  83%|███████▌ | 433/519 [14:01<03:01,  2.11s/it]



WBOFragmenter                 :  85%|███████▋ | 440/519 [14:18<02:41,  2.04s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  86%|███████▋ | 446/519 [14:33<02:28,  2.03s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  87%|███████▊ | 451/519 [15:02<03:45,  3.32s/it]



WBOFragmenter                 :  87%|███████▊ | 451/519 [15:18<03:45,  3.32s/it]



WBOFragmenter                 :  88%|███████▉ | 459/519 [15:24<02:58,  2.98s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  92%|████████▎| 480/519 [16:04<00:33,  1.16it/s]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  92%|████████▎| 480/519 [16:18<00:33,  1.16it/s]



WBOFragmenter                 :  94%|████████▍| 490/519 [16:30<00:44,  1.53s/it]



WBOFragmenter                 :  95%|████████▌| 495/519 [16:34<00:32,  1.35s/it]



WBOFragmenter                 :  96%|████████▌| 497/519 [16:42<00:37,  1.73s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  97%|████████▋| 501/519 [16:48<00:29,  1.64s/it]



WBOFragmenter                 :  97%|████████▋| 503/519 [16:51<00:25,  1.60s/it]



WBOFragmenter                 :  98%|████████▊| 508/519 [17:03<00:18,  1.71s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  98%|████████▊| 508/519 [17:18<00:18,  1.71s/it]



WBOFragmenter                 : 100%|█████████| 519/519 [17:24<00:00,  2.01s/it]
EnumerateStereoisomers        : 100%|███████| 1043/1043 [05:37<00:00,  3.09it/s]
StandardConformerGenerator    : 100%|███████| 2319/2319 [04:02<00:00,  9.54it/s]
Preparation                   : 100%|███████| 2319/2319 [00:55<00:00, 41.83it/s]

Workflow complete; dataset generated.





In [9]:
# Fill in the descriptive metadata fields of the dataset.
dataset.metadata.short_description = "Optimization set"
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2020-12-02-Genentech-PDB-Ligand-Expo-fragment-optimization-neutral-v1.0"
)
dataset.metadata.long_description = "Genentech PDB Ligand Expo neutral molecules set"

# Per-molecule conformer counts, used for the min / mean / max summary line.
confs = np.array([len(mol.conformers) for mol in dataset.molecules])

print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
print(
    "Number of conformers min mean max",
    confs.min(),
    f"{confs.mean():6.2f}",
    confs.max(),
)

# Write the dataset out as "dataset.json.bz2" in the working directory.
dataset.export_dataset("dataset.json.bz2")

Number of unique molecules        2319
Number of filtered molecules      98
Number of conformers              2366
Number of conformers min mean max 1   1.02 3


In [10]:
# Trial submission of the generated dataset through test_submit_to_test_server,
# timed with the IPython %time line magic.
%time test_submit_to_test_server(dataset)

Submitting test-run to in-memory server
{'default': 2363}
Total tasks: 2363
CPU times: user 49.4 s, sys: 2.46 s, total: 51.9 s
Wall time: 1min 29s


In [13]:
from openff.qcsubmit.datasets import OptimizationDataset

# Reload the previously exported dataset, amend its long description and
# submitter, then write it back to the same file name.
dataset = OptimizationDataset.parse_file("./dataset.json.bz2")
dataset.metadata.long_description = (
    "This dataset contains Genentech PDB Ligand Expo neutral set of molecules "
    "that fragments molecules with greater than 3 rotors"
)
dataset.metadata.submitter = "pavankum"
dataset.export_dataset("dataset.json.bz2")

In [15]:
from openff.qcsubmit.datasets import OptimizationDataset

# Re-read the exported dataset from disk and call its visualize method,
# passing "visualize.pdf" as the target.
ds = OptimizationDataset.parse_file("./dataset.json.bz2")
ds.visualize("visualize.pdf")