In [1]:
import logging
import bz2
import sys
import os
from pprint import pprint

In [2]:
import numpy as np

In [3]:
import qcfractal.interface as ptl
import tqdm
import sys
from openff.qcsubmit import workflow_components
from openff.qcsubmit.factories import TorsiondriveDatasetFactory
from openforcefield.topology import Molecule

In [4]:
# Raise the root logger threshold to ERROR so the undefined-stereochemistry
# warnings emitted while loading molecules do not flood the cell output.
root_logger = logging.getLogger()
root_logger.setLevel(logging.ERROR)

In [5]:
def test_submit(qcs_ds, client):
    result = qcs_ds.submit(client, threads=1)
    print(result)
    print("Total tasks:", sum(result.values()))

def test_submit_to_local_server(qcs_ds, address="localhost:7777", verify=False):
    """Test-submit a dataset to a QCFractal server that is already running.

    Parameters
    ----------
    qcs_ds
        Object exposing ``submit(client, threads=...)``; forwarded unchanged
        to ``test_submit``.
    address : str, optional
        Address handed to ``ptl.FractalClient``. Defaults to
        ``"localhost:7777"``, the value that used to be hard-coded here.
    verify : bool, optional
        Forwarded to ``ptl.FractalClient`` as ``verify``. Defaults to
        ``False``, the value that used to be hard-coded here.
    """
    # Message wording matches the in-memory helper ("Submitting test-run to ...").
    print("Submitting test-run to local server")
    client = ptl.FractalClient(address, verify=verify)
    test_submit(qcs_ds, client)
    
def test_submit_to_test_server(qcs_ds):
    """Test-submit a dataset to a temporary ``FractalSnowflakeHandler`` server.

    A fresh handler is created for the call, ``qcs_ds`` is submitted through
    the handler's client via ``test_submit``, and the handler is stopped
    afterwards so repeated calls do not leave servers running.

    Parameters
    ----------
    qcs_ds
        Object exposing ``submit(client, threads=...)``; forwarded unchanged
        to ``test_submit``.
    """
    # Imported here rather than at module level, so qcfractal's server
    # handler is only needed when this helper is actually called.
    from qcfractal import FractalSnowflakeHandler

    print("Submitting test-run to in-memory server")
    server = FractalSnowflakeHandler()
    try:
        test_submit(qcs_ds, server.client())
    finally:
        # The handler is not reachable once this function returns, so shut it
        # down here, also when the submission raises.
        server.stop()


In [6]:
sdf_file = "./pubLigsNeutralGoodDensity.sdf"

molecules = Molecule.from_file(sdf_file, "sdf", allow_undefined_stereo=True)
# Molecule.from_file hands back a single Molecule (not a list) when the file
# contains exactly one record; wrap it so the filter below always iterates
# over molecules.
if isinstance(molecules, Molecule):
    molecules = [molecules]

# Keep only molecules with more than 3 rotatable bonds; these are the ones
# passed on to the fragmentation workflow further down.
mols = [
    molecule
    for molecule in molecules
    if len(molecule.find_rotatable_bonds()) > 3
]
print("\nFiles loaded; molecules generated.")


Files loaded; molecules generated.


In [7]:
# Build the dataset factory and register the processing steps that are
# applied to the molecules, in the order listed below.
qcs_ds = TorsiondriveDatasetFactory()

# Each entry pairs a workflow component class with the attribute overrides
# set on a freshly constructed instance before it is registered.
workflow_steps = [
    (workflow_components.WBOFragmenter,
     {"keep_non_rotor_ring_substituents": True}),
    (workflow_components.EnumerateStereoisomers,
     {"max_isomers": 10, "toolkit": "openeye"}),
    (workflow_components.StandardConformerGenerator,
     {"max_conformers": 10, "toolkit": "openeye", "rms_cutoff": 3}),
]
for component_cls, overrides in workflow_steps:
    component = component_cls()
    for attribute, value in overrides.items():
        setattr(component, attribute, value)
    qcs_ds.add_workflow_component(component)

In [8]:
# Run the filtered molecules through the factory's workflow to build the dataset.
# NOTE(review): the description text says "optimization dataset" while the
# name and tagline say torsiondrive -- confirm the wording is intended.
dataset_settings = {
    "dataset_name": "Genentech PDB Ligand Expo fragment torsiondrive neutral v1.0",
    "description": "Genentech PDB Ligand Expo optimization dataset with fragmented neutral molecules",
    "tagline": "TorsionDriveDataset",
}
dataset = qcs_ds.create_dataset(molecules=mols, **dataset_settings)
print("Workflow complete; dataset generated.")

Deduplication                 : 100%|████████| 521/521 [00:00<00:00, 639.43it/s]
WBOFragmenter                 :   0%|                   | 0/519 [00:00<?, ?it/s]







  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))






WBOFragmenter                 :   0%|         | 1/519 [00:10<1:28:40, 10.27s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))






  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :   1%|           | 5/519 [00:14<21:38,  2.53s/it]











  problematic_bonds, oechem.OEMolToSmiles(molecule)))






  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :   1%|▏          | 6/519 [00:32<50:39,  5.93s/it]



WBOFragmenter                 :   3%|▎         | 14/519 [00:38<18:44,  2.23s/it]



WBOFragmenter                 :   6%|▌         | 29/519 [00:43<06:08,  1.33it/s]



WBOFragmenter                 :   6%|▌         | 32/519 [00:46<06:27,  1.26it/s]





WBOFragmenter                 :   7%|▋         | 35/519 [00:52<09:05,  1.13s/it]



WBOFragmenter                 :   7%|▋         | 37/519 [00:54<08:55,  1.11s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))






  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :   8%|▊         | 42/519 [01:08<12:43,  1.60s/it]





  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :   8%|▊         | 42/519 [01:18<12:43,  1.60s/it]





  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  10%|█         | 53/519 [01:31<13:24,  1.73s/it]





  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))






  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  14%|█▍        | 73/519 [01:50<07:55,  1.07s/it]







  problematic_bonds, oechem.OEMolToSmiles(molecule)))






WBOFragmenter                 :  14%|█▍        | 73/519 [02:08<07:55,  1.07s/it]



WBOFragmenter                 :  15%|█▍        | 77/519 [02:25<21:06,  2.86s/it]



WBOFragmenter                 :  19%|█▉        | 98/519 [02:26<06:48,  1.03it/s]



WBOFragmenter                 :  19%|█▉        | 98/519 [02:38<06:48,  1.03it/s]



WBOFragmenter                 :  20%|█▊       | 104/519 [02:40<09:16,  1.34s/it]



WBOFragmenter                 :  20%|█▊       | 106/519 [02:46<10:23,  1.51s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))






  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  20%|█▊       | 106/519 [02:58<10:23,  1.51s/it]



WBOFragmenter                 :  21%|█▊       | 108/519 [03:00<15:24,  2.25s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  24%|██▏      | 123/519 [03:22<09:41,  1.47s/it]



WBOFragmenter                 :  26%|██▎      | 134/519 [03:22<05:28,  1.17it/s]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))






  problematic_bonds, oechem.OEMolToSmiles(molecule)))






  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  29%|██▌      | 148/519 [03:32<04:40,  1.32it/s]







  problematic_bonds, oechem.OEMolToSmiles(molecule)))








  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  29%|██▌      | 149/519 [03:55<12:13,  1.98s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  33%|██▉      | 169/519 [04:03<04:39,  1.25it/s]



WBOFragmenter                 :  34%|███      | 174/519 [04:16<07:26,  1.29s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  36%|███▎     | 188/519 [04:33<07:17,  1.32s/it]



WBOFragmenter                 :  37%|███▎     | 193/519 [04:38<06:31,  1.20s/it]



WBOFragmenter                 :  38%|███▍     | 195/519 [04:43<07:47,  1.44s/it]



WBOFragmenter                 :  40%|███▌     | 206/519 [05:05<06:46,  1.30s/it]



WBOFragmenter                 :  40%|███▌     | 206/519 [05:18<06:46,  1.30s/it]



WBOFragmenter                 :  41%|███▋     | 213/519 [05:21<08:44,  1.71s/it]



WBOFragmenter                 :  43%|███▉     | 225/519 [05:30<04:37,  1.06it/s]





WBOFragmenter                 :  44%|███▉     | 227/519 [05:47<11:56,  2.45s/it]



WBOFragmenter                 :  45%|████     | 233/519 [05:52<08:14,  1.73s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  45%|████     | 235/519 [06:03<11:08,  2.35s/it]



WBOFragmenter                 :  47%|████▏    | 243/519 [06:03<05:23,  1.17s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  47%|████▏    | 243/519 [06:18<05:23,  1.17s/it]



WBOFragmenter                 :  49%|████▎    | 252/519 [06:28<07:05,  1.59s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  50%|████▍    | 257/519 [06:37<06:54,  1.58s/it]



WBOFragmenter                 :  50%|████▍    | 257/519 [06:48<06:54,  1.58s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  50%|████▍    | 258/519 [06:52<14:15,  3.28s/it]







  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))






  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  59%|█████▎   | 307/519 [08:50<02:34,  1.37it/s]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  61%|█████▍   | 314/519 [09:03<03:45,  1.10s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  62%|█████▌   | 321/519 [09:25<05:07,  1.55s/it]



WBOFragmenter                 :  63%|█████▋   | 326/519 [09:39<06:01,  1.87s/it]



WBOFragmenter                 :  65%|█████▊   | 335/519 [09:43<03:29,  1.14s/it]



WBOFragmenter                 :  66%|█████▉   | 340/519 [09:51<03:52,  1.30s/it]



WBOFragmenter                 :  66%|█████▉   | 343/519 [09:52<03:14,  1.10s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  67%|██████   | 346/519 [09:56<03:11,  1.11s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  67%|██████   | 348/519 [10:01<03:54,  1.37s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  67%|██████   | 348/519 [10:18<03:54,  1.37s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  70%|██████▎  | 361/519 [10:30<03:26,  1.31s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  72%|██████▍  | 372/519 [10:39<02:18,  1.06it/s]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  73%|██████▌  | 378/519 [10:53<03:17,  1.40s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  75%|██████▋  | 388/519 [11:09<03:03,  1.40s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))






WBOFragmenter                 :  76%|██████▊  | 393/519 [11:21<03:31,  1.68s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  76%|██████▊  | 396/519 [11:27<03:29,  1.71s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  77%|██████▉  | 398/519 [11:31<03:39,  1.81s/it]



WBOFragmenter                 :  78%|██████▉  | 403/519 [11:46<04:26,  2.29s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))






WBOFragmenter                 :  80%|███████▏ | 417/519 [12:02<01:56,  1.14s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  80%|███████▏ | 417/519 [12:19<01:56,  1.14s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  81%|███████▏ | 418/519 [12:27<06:34,  3.90s/it]





WBOFragmenter                 :  81%|███████▎ | 419/519 [12:55<12:16,  7.37s/it]



WBOFragmenter                 :  82%|███████▎ | 425/519 [12:56<05:14,  3.35s/it]



WBOFragmenter                 :  83%|███████▍ | 430/519 [12:56<03:01,  2.03s/it]



WBOFragmenter                 :  83%|███████▍ | 432/519 [13:02<03:11,  2.20s/it]



WBOFragmenter                 :  85%|███████▋ | 440/519 [13:18<02:33,  1.94s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))
WBOFragmenter                 :  85%|███████▋ | 440/519 [13:29<02:33,  1.94s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  86%|███████▋ | 445/519 [13:33<02:35,  2.10s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  88%|███████▉ | 456/519 [14:03<01:58,  1.88s/it]



WBOFragmenter                 :  88%|███████▉ | 456/519 [14:19<01:58,  1.88s/it]



WBOFragmenter                 :  88%|███████▉ | 459/519 [14:23<03:09,  3.15s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  92%|████████▎| 478/519 [15:03<00:49,  1.20s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  92%|████████▎| 478/519 [15:19<00:49,  1.20s/it]



WBOFragmenter                 :  93%|████████▍| 484/519 [15:24<01:09,  1.97s/it]



WBOFragmenter                 :  94%|████████▍| 490/519 [15:25<00:40,  1.39s/it]



WBOFragmenter                 :  95%|████████▌| 495/519 [15:31<00:31,  1.32s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 :  97%|████████▋| 501/519 [15:44<00:28,  1.61s/it]



WBOFragmenter                 :  97%|████████▋| 503/519 [15:46<00:24,  1.52s/it]



WBOFragmenter                 :  97%|████████▋| 503/519 [15:59<00:24,  1.52s/it]



WBOFragmenter                 :  98%|████████▊| 508/519 [16:00<00:19,  1.78s/it]



  problematic_bonds, oechem.OEMolToSmiles(molecule)))




  problematic_bonds, oechem.OEMolToSmiles(molecule)))
  problematic_bonds, oechem.OEMolToSmiles(molecule)))




WBOFragmenter                 : 100%|█████████| 519/519 [16:19<00:00,  1.89s/it]
EnumerateStereoisomers        : 100%|███████| 1043/1043 [05:20<00:00,  3.26it/s]
StandardConformerGenerator    : 100%|███████| 1448/1448 [00:59<00:00, 24.29it/s]
Preparation                   : 100%|███████| 1448/1448 [00:51<00:00, 27.87it/s]

Workflow complete; dataset generated.





In [9]:
# Fill in the descriptive metadata fields of the generated dataset.
dataset_metadata = dataset.metadata
dataset_metadata.short_description = "TorsionDrive set"
dataset_metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-02-22-Genentech-PDB-Ligand-Expo-fragment-torsiondrive-neutral-v1.0"
dataset_metadata.long_description = "Genentech PDB Ligand Expo neutral molecules set"

# Per-molecule conformer counts, used for the min/mean/max line below.
conformer_counts = [len(mol.conformers) for mol in dataset.molecules]
confs = np.array(conformer_counts)

# Summary counters read straight off the dataset, printed one per line.
# NOTE(review): the third label says "conformers" but the value is
# dataset.n_records -- confirm the label matches what n_records counts.
summary_rows = (
    ("Number of unique molecules       ", "n_molecules"),
    ("Number of filtered molecules     ", "n_filtered"),
    ("Number of conformers             ", "n_records"),
)
for label, attribute in summary_rows:
    print(label, getattr(dataset, attribute))
print("Number of conformers min mean max",
      confs.min(), format(confs.mean(), "6.2f"), confs.max())

# Record the submitter and write the dataset to a bz2-compressed JSON file.
dataset_metadata.submitter = "pavankum"
dataset.export_dataset("dataset.json.bz2")

Number of unique molecules        1448
Number of filtered molecules      98
Number of conformers              3816
Number of conformers min mean max 1   1.02 3


In [10]:
# Dump the dataset metadata as a plain dictionary for a visual check.
metadata_fields = dataset.metadata.dict()
pprint(metadata_fields)

{'collection_type': 'TorsiondriveDataset',
 'creation_date': datetime.date(2021, 3, 24),
 'dataset_name': 'Genentech PDB Ligand Expo fragment torsiondrive neutral v1.0',
 'elements': {'N', 'F', 'S', 'Cl', 'H', 'O', 'Br', 'I', 'C'},
 'long_description': 'Genentech PDB Ligand Expo neutral molecules set',
 'long_description_url': HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-02-22-Genentech-PDB-Ligand-Expo-fragment-torsiondrive-neutral-v1.0', scheme='https', host='github.com', tld='com', host_type='domain', path='/openforcefield/qca-dataset-submission/tree/master/submissions/2021-02-22-Genentech-PDB-Ligand-Expo-fragment-torsiondrive-neutral-v1.0'),
 'short_description': 'TorsionDrive set',
 'submitter': 'pavankum'}


In [11]:
# Test-submit the dataset to the in-memory server; %time reports how long the call takes.
%time test_submit_to_test_server(dataset)

Submitting test-run to in-memory server
{'default': 3816}
Total tasks: 3816
CPU times: user 12min 33s, sys: 7.02 s, total: 12min 40s
Wall time: 31min 53s


In [12]:
from openff.qcsubmit.datasets import TorsiondriveDataset

# Reload the exported dataset, replace its long description, set the
# submitter, and write it back out to dataset.json.bz2.
dataset = TorsiondriveDataset.parse_file('./dataset.json.bz2')
dataset_metadata = dataset.metadata
dataset_metadata.long_description = (
    "This is the third of the Genentech PDB Ligand Expo Datasets referenced here "
    "(https://github.com/openforcefield/qca-dataset-submission/pull/48). "
    "This contains torsiondrives for fragmented neutral molecules. "
    "Conformers were generated using a RMS cutoff of 3 Ångstroms."
)
dataset_metadata.submitter = 'pavankum'
dataset.export_dataset("dataset.json.bz2")

In [13]:
# Write the dataset's visualization to a PDF next to the notebook.
visualization_pdf = 'visualize.pdf'
dataset.visualize(visualization_pdf)

In [14]:
# Print every QC specification attached to the dataset, keyed by its name.
for spec_name, specification in dataset.qc_specifications.items():
    print("Spec:", spec_name)
    pprint(specification.dict())

Spec: default
{'basis': 'DZVP',
 'implicit_solvent': None,
 'method': 'B3LYP-D3BJ',
 'program': 'psi4',
 'spec_description': 'Standard OpenFF optimization quantum chemistry '
                     'specification.',
 'spec_name': 'default',
 'store_wavefunction': 'none'}
