In [1]:
from openff.qcsubmit.datasets import load_dataset, OptimizationDataset
from openforcefield.utils.toolkits import GLOBAL_TOOLKIT_REGISTRY, OpenEyeToolkitWrapper
import os
import re

#GLOBAL_TOOLKIT_REGISTRY.deregister_toolkit(OpenEyeToolkitWrapper())
#print(GLOBAL_TOOLKIT_REGISTRY)

  Trajectory.opt_id == id
  select([func.array_agg(TorsionInitMol.molecule_id)]).where(TorsionInitMol.torsion_id == id)


## Preparation

This is a re-preparation of the `OpenFF Industry Benchmark Season 1 v1.0` dataset. The only change is that the MRK (Merck) conformers are replaced with a set derived from source structures that carry explicit hydrogens. The original Merck submission did not include explicit hydrogens, so this release serves as a correction.

In [2]:
# Load the re-prepared Merck (MRK) reference set that replaces the MRK entries of v1.0.
merck = load_dataset("merck/refdata.json.bz2")

 - Atom N (index 25)

 - Atom N (index 25)

 - Atom N (index 25)

 - Atom N (index 25)

 - Atom N (index 25)

 - Atom N (index 25)

 - Atom N (index 25)

 - Atom N (index 25)



In [4]:
# Number of entries in the loaded Merck dataset.
len(merck.dataset)

14457

In [11]:
# Distinct partner prefixes (text before the first '-') among the Merck keys.
{entry_key.split('-')[0] for entry_key in merck.dataset}

{'MRK'}

In [3]:
# Load the previously prepared v1.0 dataset; it is modified in place below to build v1.1.
full_dataset = load_dataset("../2021-03-30-OpenFF-Industry-Benchmark-Season-1-v1.0/dataset.json.bz2")

 - Atom C (index 26)

 - Atom N (index 22)

 - Atom C (index 4)

 - Atom N (index 21)
 - Atom N (index 80)

 - Atom C (index 8)

 - Atom C (index 38)

 - Atom C (index 8)

 - Atom C (index 10)

 - Atom N (index 14)

 - Atom N (index 9)
 - Atom N (index 16)

 - Atom N (index 8)

 - Atom N (index 40)

 - Atom N (index 53)

 - Atom N (index 5)
 - Atom N (index 22)

 - Atom N (index 5)

 - Atom C (index 6)

 - Atom N (index 11)
 - Atom N (index 32)

 - Atom N (index 21)

 - Atom N (index 33)

 - Atom N (index 2)
 - Atom N (index 13)

 - Atom C (index 8)

 - Atom N (index 6)
 - Atom N (index 13)

 - Atom N (index 13)

 - Bond 7 (atoms 7-8 of element (N-N)

 - Atom N (index 32)

 - Atom C (index 9)

 - Atom N (index 8)

 - Atom N (index 23)



In [5]:
# Entry count of the v1.0 dataset before any MRK entries are removed.
len(full_dataset.dataset)

9104

First, we'll excise the MRK molecules from the full v1.0 dataset.

In [6]:
# Collect the keys of every Merck (MRK) entry in the v1.0 dataset so they can be
# removed. Keys are matched on the partner prefix (the text before the first
# '-'), the same rule the summary cells in this notebook use, instead of a
# substring test that would also match 'MRK' appearing elsewhere in a key.
mrk_keys = [key for key in full_dataset.dataset.keys() if key.split('-')[0] == 'MRK']

In [7]:
len(mrk_keys)

792

In [8]:
# Remove each previously collected MRK entry from the v1.0 dataset in place.
for mrk_key in mrk_keys:
    del full_dataset.dataset[mrk_key]

In [9]:
# Entry count after the MRK entries have been removed.
len(full_dataset.dataset)

8312

In [10]:
# Partner prefixes still present once the MRK entries are gone.
{entry_key.split('-')[0] for entry_key in full_dataset.dataset}

{'BRI', 'GNT', 'JNS', 'RCH', 'WCS', 'XTP'}

Now, we'll process the Merck set: any molecule containing `B`, `Si`, or `I` is diverted to a separate filtered dataset, and all conformers of the same molecule are combined into a single entry.

In [12]:
# Split the re-prepared Merck entries between the main dataset and a separate
# "filtered" dataset, merging all conformers of a molecule into one entry.

# Matches the partner/molecule part of an entry index, e.g. "MRK-00808".
# Compiled once here instead of on every loop iteration.
MOL_ID_PATTERN = re.compile("[A-Z]{3}-[0-9]{5}")

filtered_dataset = OptimizationDataset(
    dataset_name="Merck Public Filtered v1.0",
    description=(
        "The set of molecules filtered from the season 1 openff industry benchmark merck public dataset. "
        "These molecules were filtered due to missing forcefield coverage and an issue with density fitting in psi4 regarding iodine."
    ),
)
# Any molecule containing one of these elements is diverted to `filtered_dataset`.
filtered_elements = {"B", "Si", "I"}

for index, entry in merck.dataset.items():
    # Extract the molecule id from the index; fail loudly with the offending
    # index rather than with an AttributeError on `None.group()`.
    id_match = MOL_ID_PATTERN.search(index)
    if id_match is None:
        raise ValueError(f"Cannot extract a molecule id from dataset index {index!r}")
    # All entries sharing a molecule id are collapsed onto the "-00" key.
    mol_id = id_match.group() + "-00"

    # Element symbols of the first initial molecule are used as a quick filter.
    qcmol_elements = set(entry.initial_molecules[0].symbols)
    if qcmol_elements.isdisjoint(filtered_elements):
        target = full_dataset.dataset      # master dataset
    else:
        target = filtered_dataset.dataset  # filtered-out molecules

    if mol_id in target:
        # Molecule already present: append this entry's conformers to it.
        target[mol_id].initial_molecules.extend(entry.initial_molecules)
    else:
        # First occurrence: the entry object itself is stored under the merged key.
        target[mol_id] = entry

In [13]:
# Entry count after the re-prepared Merck entries have been merged in.
len(full_dataset.dataset)

9919

In [14]:
# Partner prefixes present after the merge; 'MRK' is expected to be back.
{entry_key.split('-')[0] for entry_key in full_dataset.dataset}

{'BRI', 'GNT', 'JNS', 'MRK', 'RCH', 'WCS', 'XTP'}

In [15]:
# Molecule count reported by the filtered dataset.
filtered_dataset.n_molecules

7

In [16]:
# Record count reported by the filtered dataset.
filtered_dataset.n_records

66

In [17]:
# Merged entry keys of the molecules that were diverted to the filtered dataset.
filtered_dataset.dataset.keys()

dict_keys(['MRK-00808-00', 'MRK-01219-00', 'MRK-00826-00', 'MRK-01112-00', 'MRK-01107-00', 'MRK-01596-00', 'MRK-01595-00'])

In [18]:
from rdkit.Chem import Descriptors
import numpy as np

In [19]:
# Summary statistics for the combined dataset.
#
# `full_dataset.molecules` is materialised once and reused. The original cell
# iterated it in three separate passes, and the captured output shows the
# toolkit warning block repeated each time, so each pass appears to rebuild
# every molecule.
unique_molecules = list(full_dataset.molecules)

# Conformer count per molecule.
confs = np.array([len(mol.conformers) for mol in unique_molecules])

# Union of element symbols, taken from the first initial molecule of each entry.
elements = set()
for entry in full_dataset.dataset.values():
    elements.update(entry.initial_molecules[0].symbols)

print("Unique list of elements          ", elements)
print("Number of unique molecules       ", full_dataset.n_molecules)
print("Number of filtered molecules     ", filtered_dataset.n_molecules)
print("Number of optimizations         ", full_dataset.n_records)
print("Number of conformers min mean max",
      confs.min(), "{:6.2f}".format(confs.mean()), confs.max())

# NOTE(review): Descriptors.ExactMolWt is RDKit's exact (monoisotopic) mass,
# while the labels below say "molecular weight" (Descriptors.MolWt is the
# average weight) - confirm which one is intended for the report.
masses = []
for molecule in unique_molecules:
    rd_mol = molecule.to_rdkit()
    mass = Descriptors.ExactMolWt(rd_mol)
    masses.append(mass)

mass_array = np.array(masses)
print(f'Mean molecular weight: {np.mean(mass_array):.2f}')
print(f'Max molecular weight: {np.max(mass_array):.2f}')
# Dividing by the unit strips it, leaving plain numbers for sorting and display.
print("Charges:", sorted(set(m.total_charge/m.total_charge.unit for m in unique_molecules)))


 - Atom C (index 26)

 - Atom N (index 22)

 - Atom C (index 4)

 - Atom N (index 21)
 - Atom N (index 80)

 - Atom C (index 8)

 - Atom C (index 38)

 - Atom C (index 8)

 - Atom C (index 10)

 - Atom N (index 14)

 - Atom N (index 9)
 - Atom N (index 16)

 - Atom N (index 8)

 - Atom N (index 40)

 - Atom N (index 53)

 - Atom N (index 5)
 - Atom N (index 22)

 - Atom N (index 5)

 - Atom C (index 6)

 - Atom N (index 11)
 - Atom N (index 32)

 - Atom N (index 21)

 - Atom N (index 33)

 - Atom N (index 2)
 - Atom N (index 13)

 - Atom C (index 8)

 - Atom N (index 6)
 - Atom N (index 13)

 - Bond 7 (atoms 7-8 of element (N-N)

 - Atom N (index 32)

 - Atom C (index 9)

 - Atom N (index 8)

 - Atom N (index 23)

 - Atom N (index 25)



Unique list of elements           {'Br', 'F', 'P', 'H', 'N', 'S', 'Cl', 'O', 'C'}
Number of unique molecules        9847
Number of filtered molecules      7
Number of optimizations          77055
Number of conformers min mean max 1   7.77 10


 - Atom C (index 26)

 - Atom N (index 22)

 - Atom C (index 4)

 - Atom N (index 21)
 - Atom N (index 80)

 - Atom C (index 8)

 - Atom C (index 38)

 - Atom C (index 8)

 - Atom C (index 10)

 - Atom N (index 14)

 - Atom N (index 9)
 - Atom N (index 16)

 - Atom N (index 8)

 - Atom N (index 40)

 - Atom N (index 53)

 - Atom N (index 5)
 - Atom N (index 22)

 - Atom N (index 5)

 - Atom C (index 6)

 - Atom N (index 11)
 - Atom N (index 32)

 - Atom N (index 21)

 - Atom N (index 33)

 - Atom N (index 2)
 - Atom N (index 13)

 - Atom C (index 8)

 - Atom N (index 6)
 - Atom N (index 13)

 - Bond 7 (atoms 7-8 of element (N-N)

 - Atom N (index 32)

 - Atom C (index 9)

 - Atom N (index 8)

 - Atom N (index 23)

 - Atom N (index 25)



Mean molecular weight: 348.07
Max molecular weight: 1104.40


 - Atom C (index 26)

 - Atom N (index 22)

 - Atom C (index 4)

 - Atom N (index 21)
 - Atom N (index 80)

 - Atom C (index 8)

 - Atom C (index 38)

 - Atom C (index 8)

 - Atom C (index 10)

 - Atom N (index 14)

 - Atom N (index 9)
 - Atom N (index 16)

 - Atom N (index 8)

 - Atom N (index 40)

 - Atom N (index 53)

 - Atom N (index 5)
 - Atom N (index 22)

 - Atom N (index 5)

 - Atom C (index 6)

 - Atom N (index 11)
 - Atom N (index 32)

 - Atom N (index 21)

 - Atom N (index 33)

 - Atom N (index 2)
 - Atom N (index 13)

 - Atom C (index 8)

 - Atom N (index 6)
 - Atom N (index 13)

 - Bond 7 (atoms 7-8 of element (N-N)

 - Atom N (index 32)

 - Atom C (index 9)

 - Atom N (index 8)

 - Atom N (index 23)

 - Atom N (index 25)



Charges: [-2.0, -1.0, 0.0, 1.0, 2.0]


In [45]:
from openff import qcsubmit
import openforcefield
import rdkit
from pprint import pprint
import datetime

In [46]:
# Rename the dataset to v1.1 (on the dataset and in its metadata) and stamp
# the metadata with today's date.
v1_1_name = 'OpenFF Industry Benchmark Season 1 v1.1'
full_dataset.dataset_name = v1_1_name
full_dataset.metadata.dataset_name = v1_1_name
full_dataset.metadata.creation_date = datetime.date.today()

In [47]:
# change the metadata
full_dataset.metadata.submitter = "dotsdl"
full_dataset.metadata.long_description_url = ("https://github.com/openforcefield/qca-dataset-submission/tree/master/"
                                                "submissions/"
                                                "2021-06-04-OpenFF-Industry-Benchmark-Season-1-v1.1")
full_dataset.metadata.elements = elements
full_dataset.provenance = {"openff-qcsubmit": qcsubmit.__version__,
                            "openff-toolkit": openforcefield.__version__,
                            "rdkit": rdkit.__version__,
                            "openff-benchmark": "2021.03.17.0"}

full_dataset.metadata.long_description = 'The combination of all publicly chosen compound sets by industry partners from the OpenFF season 1 industry benchmark.'

In [49]:
# Pretty-print the final metadata for a visual check before exporting.
pprint(full_dataset.metadata.dict())

{'collection_type': 'OptimizationDataset',
 'creation_date': datetime.date(2021, 6, 4),
 'dataset_name': 'OpenFF Industry Benchmark Season 1 v1.1',
 'elements': {'Br', 'F', 'P', 'H', 'N', 'S', 'Cl', 'O', 'C'},
 'long_description': 'The combination of all publicly chosen compound sets by '
                     'industry partners from the OpenFF season 1 industry '
                     'benchmark.',
 'long_description_url': HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-06-04-OpenFF-Industry-Benchmark-Season-1-v1.1', scheme='https', host='github.com', tld='com', host_type='domain', path='/openforcefield/qca-dataset-submission/tree/master/submissions/2021-06-04-OpenFF-Industry-Benchmark-Season-1-v1.1'),
 'short_description': 'The public molecules from the OpenFF Industry '
                      'Benchmark.',
 'submitter': 'dotsdl'}


In [50]:
# Write the assembled v1.1 dataset, plus a companion molecule listing in "smi" format,
# to the current working directory.
full_dataset.export_dataset("dataset.json.bz2")
full_dataset.molecules_to_file("dataset.smi", "smi")

# Dataset visualization 
A manual workaround for RDKit not correctly producing a PDF.

In [51]:
from rdkit.Chem import AllChem, Draw
# Build an RDKit molecule with 2D coordinates for every entry (conformers are
# not needed for the depiction).
molecules = []
for dataset_entry in full_dataset.dataset.values():
    depiction = dataset_entry.get_off_molecule(include_conformers=False).to_rdkit()
    AllChem.Compute2DCoords(depiction)
    molecules.append(depiction)

# One grid image per page: 24 molecules per page, laid out 4 per row.
# The images are collected in a list and written out as a PDF in a later cell.
images = [
    Draw.MolsToGridImage(
        molecules[page_start : page_start + 24],
        molsPerRow=4,
        subImgSize=(500, 500),
        returnPNG=False
    )
    for page_start in range(0, len(molecules), 24)
]


 - Atom C (index 26)

 - Atom N (index 22)

 - Atom C (index 4)

 - Atom N (index 21)
 - Atom N (index 80)

 - Atom C (index 8)

 - Atom C (index 38)

 - Atom C (index 8)

 - Atom C (index 10)

 - Atom N (index 14)

 - Atom N (index 9)
 - Atom N (index 16)

 - Atom N (index 8)

 - Atom N (index 40)

 - Atom N (index 53)

 - Atom N (index 5)
 - Atom N (index 22)

 - Atom N (index 5)

 - Atom C (index 6)

 - Atom N (index 11)
 - Atom N (index 32)

 - Atom N (index 21)

 - Atom N (index 33)

 - Atom N (index 2)
 - Atom N (index 13)

 - Atom C (index 8)

 - Atom N (index 6)
 - Atom N (index 13)

 - Bond 7 (atoms 7-8 of element (N-N)

 - Atom N (index 32)

 - Atom C (index 9)

 - Atom N (index 8)

 - Atom N (index 23)

 - Atom N (index 25)



In [52]:
# Save the first page image and append the remaining pages to the same PDF.
first_page = images[0]
remaining_pages = images[1:]
first_page.save("dataset.pdf", save_all=True, append_images=remaining_pages)

# Filter Dataset
The dataset of filtered molecules is saved separately as `filtered-merck.json.bz2`, so that it is not submitted by qcsubmit.

In [53]:
# Save the filtered-out Merck molecules to their own file, separate from dataset.json.bz2.
filtered_dataset.export_dataset("filtered-merck.json.bz2")