In [None]:
from openff.qcsubmit.datasets import load_dataset, OptimizationDataset
from openforcefield.utils.toolkits import GLOBAL_TOOLKIT_REGISTRY, OpenEyeToolkitWrapper
import os
import re

# Remove the OpenEye wrapper from the global toolkit registry, then print the
# registry so the remaining registered toolkits are visible in the cell output.
# NOTE(review): presumably done so the rest of the notebook runs without
# OpenEye; this line constructs an OpenEyeToolkitWrapper instance, so confirm
# it behaves as intended on machines where OpenEye is not available.
GLOBAL_TOOLKIT_REGISTRY.deregister_toolkit(OpenEyeToolkitWrapper())
print(GLOBAL_TOOLKIT_REGISTRY)

# Preparation

Each partner has had their chosen molecules run through the openff-benchmark pipeline to produce a reference qcsubmit dataset. Here we combine all of the datasets together, keeping the partner identifiers, and move molecules containing unwanted elements that were missed by the benchmark infrastructure into a separate dataset.

In [None]:
# Load every partner's reference dataset and sort its entries into two new
# datasets: entries whose first initial molecule contains B, Si or I go to
# `filtered_dataset`; every other entry goes to `master_dataset`.
filtered_dataset = OptimizationDataset(
    dataset_name="OpenFF Industry Benchmark Filtered v1.0",
    description="The set of molecules filtered from the season 1 openff industry benchmark public dataset. These molecules were filtered due to missing forcefield coverage and an issue with density fitting in psi4 regarding iodine.",
)

master_dataset = OptimizationDataset(
    dataset_name="OpenFF Industry Benchmark Season 1 v1.0",
    dataset_tagline="The public molecules from the OpenFF Industry Benchmark.",
    description="The combination of all publicly chossen compound sets by industry partners from the OpenFF season 1 industry benchmark.",
)

filtered_elements = {"B", "Si", "I"}

# Molecule identifier embedded in each entry index: three capital letters, a
# dash and five digits. Compiled once because it is applied to every entry.
mol_id_pattern = re.compile("[A-Z]{3}-[0-9]{5}")

for ref_dir in ["boehringer", "genentech", "janssen", "merck", "roche", "swope", "xtalpi"]:

    ref_path = os.path.join(ref_dir, "refdata.json.bz2")
    dataset = load_dataset(ref_path)

    for index, entry in dataset.dataset.items():
        # Pull the molecule id out of the index; fail with a message that names
        # the offending index instead of an AttributeError on None.
        id_match = mol_id_pattern.search(index)
        if id_match is None:
            raise ValueError(
                f"Entry index {index!r} in {ref_path!r} does not contain a "
                "molecule id of the form 'XXX-00000'."
            )
        # Every entry sharing a molecule id is keyed under the same "<id>-00".
        # NOTE(review): the fixed "-00" suffix presumably mirrors the
        # benchmark's conformer-index naming - confirm.
        mol_id = id_match.group() + "-00"

        # Quick element check using the symbols of the first initial molecule.
        # NOTE(review): assumes all initial molecules of an entry share the
        # same elements - confirm.
        qcmol_elements = set(entry.initial_molecules[0].symbols)
        if qcmol_elements.isdisjoint(filtered_elements):
            target = master_dataset.dataset
        else:
            target = filtered_dataset.dataset

        # First occurrence of an id stores the entry itself; later occurrences
        # only contribute their initial molecules to the stored entry.
        if mol_id in target:
            target[mol_id].initial_molecules.extend(entry.initial_molecules)
        else:
            target[mol_id] = entry
                

In [None]:
# Display the molecule count reported by the filtered dataset.
filtered_dataset.n_molecules

In [None]:
# Display the record count reported by the filtered dataset.
filtered_dataset.n_records

In [None]:
# Display the entry keys (the "<molecule id>-00" identifiers) that ended up in
# the filtered dataset.
filtered_dataset.dataset.keys()

In [None]:
from rdkit.Chem import Descriptors
import numpy as np

In [None]:
# Summary statistics for the master dataset.
#
# `master_dataset.molecules` is walked exactly once, collecting the conformer
# count, mass and total charge of each molecule in the same pass, rather than
# iterating the dataset three separate times.
conformer_counts = []
masses = []
charges = set()
for molecule in master_dataset.molecules:
    conformer_counts.append(len(molecule.conformers))
    # NOTE(review): ExactMolWt is reported below as "molecular weight";
    # confirm that the exact mass rather than the average weight is intended.
    masses.append(Descriptors.ExactMolWt(molecule.to_rdkit()))
    # Divide by the unit to get a plain number that can be sorted and printed.
    charges.add(molecule.total_charge / molecule.total_charge.unit)

confs = np.array(conformer_counts)
mass_array = np.array(masses)

print("Number of unique molecules       ", master_dataset.n_molecules)
print("Number of filtered molecules     ", filtered_dataset.n_molecules)
print("Number of optimizations         ", master_dataset.n_records)
print("Number of conformers min mean max",
      confs.min(), "{:6.2f}".format(confs.mean()), confs.max())

print(f'Mean molecular weight: {np.mean(mass_array):.2f}')
print(f'Max molecular weight: {np.max(mass_array):.2f}')
print("Charges:", sorted(charges))


In [None]:
from openff import qcsubmit
import openforcefield
import rdkit
from pprint import pprint

In [None]:
# Fill in the submission metadata: who submitted the dataset and where its
# long-form description lives.
master_dataset.metadata.submitter = "jthorton"
master_dataset.metadata.long_description_url = ("https://github.com/openforcefield/qca-dataset-submission/tree/master/"
                                                "submissions/"
                                                "2021-03-30-OpenFF-Industry-Benchmark-Season-1-v1.0")

# Record the software versions used to build the dataset. The first three are
# read from the imported packages; the openff-benchmark version is a
# hard-coded string.
master_dataset.provenance = {
    "openff-qcsubmit": qcsubmit.__version__,
    "openff-toolkit": openforcefield.__version__,
    "rdkit": rdkit.__version__,
    # Key corrected from the misspelled "openff-benchamrk" so the provenance
    # names the package correctly.
    "openff-benchmark": "2021.03.17.0",
}



In [None]:
# Pretty-print the metadata as a dict to check the values before exporting.
pprint(master_dataset.metadata.dict())

In [None]:
# Write the combined dataset to "dataset.json.bz2" and its molecules to
# "dataset.smi" using the "smi" file format.
master_dataset.export_dataset("dataset.json.bz2")
master_dataset.molecules_to_file("dataset.smi", "smi")

# Dataset visualization 
A manual workaround for rdkit not correctly producing a pdf.

In [None]:
from rdkit.Chem import AllChem, Draw
# Build one grid image per page of the visualization: 24 molecules per page,
# laid out four to a row.
page_size = 24

# 2D depictions of every entry in the master dataset; conformers are skipped
# because only the 2D layout is drawn.
molecules = []
for record in master_dataset.dataset.values():
    flat_mol = record.get_off_molecule(include_conformers=False).to_rdkit()
    AllChem.Compute2DCoords(flat_mol)
    molecules.append(flat_mol)

# One grid image per consecutive slice of `page_size` molecules (the last page
# may hold fewer). returnPNG=False is passed so the images can be combined
# by the following cell, which calls .save() on them.
images = [
    Draw.MolsToGridImage(
        molecules[start : start + page_size],
        molsPerRow=4,
        subImgSize=(500, 500),
        returnPNG=False,
    )
    for start in range(0, len(molecules), page_size)
]


In [None]:
# Write every page into one multi-page "dataset.pdf": the first image performs
# the save and the remaining images are appended after it.
first_page, remaining_pages = images[0], images[1:]
first_page.save("dataset.pdf", save_all=True, append_images=remaining_pages)