In [1]:
import os
import json
import requests

import numpy as np
import tqdm

from qcportal import PortalClient
from qcportal.record_models import BaseRecord, RecordStatusEnum

from openff.units import unit

from openff.toolkit.topology import Molecule
from openff.toolkit.utils import OpenEyeToolkitWrapper, ToolkitRegistry

from openff.qcsubmit.common_structures import MoleculeAttributes
from openff.qcsubmit.datasets import OptimizationDataset
from openff.qcsubmit.factories import OptimizationDatasetFactory
from openff.qcsubmit.results import OptimizationResultCollection
from openff.qcsubmit.results.filters import (
    ConnectivityFilter,
    RecordStatusFilter,
    UnperceivableStereoFilter,
    SinglepointRecordFilter,
)

from openff.qcsubmit._pydantic import Field

ADDRESS = "https://api.qcarchive.molssi.org:443/"
client = PortalClient(ADDRESS, cache_dir=".",)
#client = PortalClient(
#    ADDRESS, 
#    username=os.environ['QCARCHIVE_USER'],
#    password=os.environ['QCARCHIVE_PASSWORD'],
#    cache_dir=".",
#)

# Get Records to Remove

In [2]:
# _________ Pull Record IDs of Relevant Datasets ____________

file = requests.get(
    "https://raw.githubusercontent.com/openforcefield/sage-2.2.0/refs/heads/main/05_benchmark_forcefield/process_bm/problem_ids/all_r7_outliers.txt"
)
remove_record_ids = set([int(x) for x in file.content.decode().splitlines()])
print(f"There are {len(remove_record_ids)} records to remove")


There are 29 records to remove


## Get v1.1 Dataset and Filter

In [3]:
dataset = OptimizationResultCollection.from_server(
    client=client,
    datasets=["OpenFF Industry Benchmark Season 1 v1.1"],
    spec_name="default",
)
print(f"Number of records before filtering: {dataset.n_results}")



Number of records before filtering: 76666


In [4]:
# 50 min
dataset = dataset.filter(
    RecordStatusFilter(status=RecordStatusEnum.complete),
    ConnectivityFilter(tolerance=1.2),
    UnperceivableStereoFilter(),
)



In [5]:
# 12 min
print(f"Number of records before filtering: {dataset.n_results}")
nrec = dataset.n_results
class RecordIDFilter(SinglepointRecordFilter):
    """A filter which will only retain records if their status matches a specified
    value.
    """

    record_list: list = Field(
        [],
        description="Records whose ID is NOT in this list will be retains",
    )

    def _filter_function(
        self, result: "_BaseResult", record: BaseRecord, molecule: Molecule
    ) -> bool:
        return record.id not in self.record_list

dataset = dataset.filter(
    RecordIDFilter(record_list=remove_record_ids),
)

Number of records before filtering: 74570




In [6]:
print(f"Number of records after filtering: {dataset.n_results}. {nrec-dataset.n_results} records were removed, equal to number expected: {nrec-dataset.n_results == len(remove_record_ids)}")

Number of records after filtering: 74541. 29 records were removed, equal to number expected: True


In [7]:
rec_and_mol = dataset.to_records()
initial_mols = [rec[0].initial_molecule for rec in rec_and_mol]
print('Finished converting to records and extracting entries',flush = True)



Finished converting to records and extracting entries


# Make New Dataset

In [37]:
with open("ds_info.json") as f:
    dataset_information = json.load(f)

dataset_factory1 = OptimizationDatasetFactory()
provenance1 = dataset_factory1.provenance(ToolkitRegistry([OpenEyeToolkitWrapper]))

dataset1 = OptimizationDataset(
    dataset_name=dataset_information["dataset_name"],
    dataset_tagline=dataset_information["dataset_tagline"],
    description=dataset_information["description"],
    provenance=provenance1
)
dataset1.metadata.submitter = "jaclark5"
dataset1.metadata.long_description_url = dataset_information["metadata.long_description_url"]


In [38]:
# Have to add records this way to avoid a round trip through the toolkit.
records_by_cmiles= {}
for record, molecule in rec_and_mol:
    cmiles = molecule.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True)
    if cmiles in records_by_cmiles.keys():
        records_by_cmiles[cmiles].append((record, molecule))
    else:
        records_by_cmiles[cmiles]=[(record, molecule)]

count = 0 # NoteHere
hashes = set()
prev_len, prev_smiles = 0, ""
for records in tqdm.tqdm(records_by_cmiles.values()):
    base_record, base_molecule = records[0]
    base_molecule._conformers = [m.conformers[0] for _, m in records]

    dataset1.add_molecule(
        index=base_molecule.to_smiles(
            isomeric=True, explicit_hydrogens=False, mapped=False
        ),
        molecule=None,
        initial_molecules=[rec.initial_molecule for rec, _ in records],
        attributes=MoleculeAttributes.from_openff_molecule(base_molecule),
        extras=base_record.extras,
        keywords=base_record.specification.keywords,
    )
    
    current_count = len({
        qcemol.identifiers.molecule_hash
        for moldata in dataset1.dataset.values()
        for qcemol in moldata.initial_molecules
    })
    if current_count != count + len(records):
        print(len(dataset1.dataset), prev_len)
        print(prev_smiles)
        print(base_molecule.to_smiles(isomeric=True, explicit_hydrogens=False, mapped=False))
        print("Same?", prev_smiles == base_molecule.to_smiles(isomeric=True, explicit_hydrogens=False, mapped=False))
        raise ValueError(f"There should be {count + len(records)}, not {current_count} molecules")
    else:
        prev_len = len(dataset1.dataset)
        prev_smiles = base_molecule.to_smiles(isomeric=True, explicit_hydrogens=False, mapped=False)
    count += len(records)
    for rec, _ in records:
        hashes.add(rec.initial_molecule.identifiers.molecule_hash)

 38%|███▊      | 3720/9829 [03:11<05:14, 19.40it/s]  


3720 3720
C[C@@H]1CN(CCN1C(=O)C=C)c2c3cc(c(c(c3ncn2)F)c4c(cccc4F)O)Cl
c1cc2cccnc2c(c1)NC(=O)c3ccc(cc3)N4C(=O)[C@H]5[C@@H]6C[C@H]([C@H]5C4=O)C=C6
Same? False


ValueError: There should be 30472, not 30463 molecules

In [None]:
print(current_count, count, len(records))
# How do we have one fewer molecule than before we tried to add 8...

30463 30464 8


In [27]:
new_hashes = [rec.initial_molecule.identifiers.molecule_hash for rec, _ in records]
test_hashes = hashes.copy().union(new_hashes)

In [29]:
print(len(hashes) + len(new_hashes), len(test_hashes))

30472 30472


In [15]:
from pprint import pprint
from qcportal.serialization import encode_to_json
pprint(encode_to_json(records[0]))

[{'created_on': '2021-04-20T15:11:40.129611+00:00',
  'creator_user': None,
  'energies': [-1354.2532051055496,
               -1354.294529401838,
               -1354.308747123733,
               -1354.3095908944638,
               -1354.3127072999548,
               -1354.3083354516334,
               -1354.3125137884006,
               -1354.3139773390255,
               -1354.314675228994,
               -1354.3150541254315,
               -1354.3155633351791,
               -1354.3159278067233,
               -1354.3164003125705,
               -1354.3166517466643,
               -1354.3167650198143,
               -1354.3168228768332,
               -1354.3168265000822,
               -1354.3168375963983,
               -1354.3168456086757,
               -1354.3168525157914,
               -1354.3168510324685,
               -1354.316847148259,
               -1354.3168519402877,
               -1354.3168540189972,
               -1354.3168499712367,
               -1354.3168480

In [None]:
# Check that the molecules are identical
opt_hashes = {rec.initial_molecule.get_hash() for rec, _mol in rec_and_mol}

new_hashes = {
    qcemol.identifiers.molecule_hash
    for moldata in dataset1.dataset.values()
    for qcemol in moldata.initial_molecules
}

print('Molecules are the same? ',opt_hashes==new_hashes)

Molecules are the same?  False


## Write Statistics

In [None]:
n_confs = np.array([mol.n_conformers for mol in dataset1.molecules])
n_heavy_atoms = np.array([mol.to_rdkit().GetNumHeavyAtoms() for mol in dataset1.molecules])
masses = np.array([sum([atom.mass.m for atom in mol.atoms]) for mol in dataset1.molecules])
elements = sorted(set(atom.symbol for mol in dataset1.molecules for atom in mol.atoms))
unique_charges = set()
for mol in dataset1.molecules:
    unique_charges.add(mol.total_charge.m_as(unit.elementary_charge))
unique_charges = sorted(unique_charges)




In [None]:
print(f"* Number of unique molecules: {dataset1.n_molecules}")
print("* Number of conformers:", dataset1.n_records)
print(
    "* Number of conformers (min, mean, max): {}, {}, {}".format(
        int(min(n_confs)), int(np.mean(n_confs)), int(max(n_confs))
))
print(
    "* Molecular weight (min, mean, max): {:.2f}, {:.2f}, {:.2f}".format(
        min(masses), np.mean(masses), max(masses)
    )
)
print("* Charges: {}".format(", ".join([str(x) for x in unique_charges])))
print("* Elements: {}".format(", ".join([x for x in elements])))

* Number of unique molecules: 9784
* Number of conformers: 74362
* Number of conformers (min, mean, max): 1, 7, 10
* Molecular weight (min, mean, max): 16.04, 348.76, 1105.16
* Charges: -2.0, -1.0, 0.0, 1.0, 2.0
* Elements: Br, C, Cl, F, H, N, O, P, S


In [None]:

print("* Name: {}".format(dataset1.dataset_name))
print("* Purpose: {}".format(dataset1.dataset_tagline))
print("* Description: {}".format(dataset1.description))
print("* Submitter: {}\n".format(dataset1.metadata.submitter))

print("\n## Metadata")
print(f"* Elements: {{{', '.join(sorted(dataset1.metadata.elements))}}}")

for spec, obj in dataset1.qc_specifications.items():
    od = obj.dict()
    print("* Program:", od["program"])
    print("* QC Specifications:", spec)
    for field, value in od.items():
        if field in ["scf_properties", 'program', "spec_name", "spec_description"]:
            continue
        print(f"  * {field}: {od[field]}")
    print("  * SCF Properties:")
    for field in od["scf_properties"]:
        print(f"    * {field}")


* Name: OpenFF Industry Benchmark Season 1 v1.2
* Purpose: The combination of all publicly chosen compound sets by industry partners from the OpenFF season 1 industry benchmark with unrealistic conformers removed.
* Description: This dataset is the public counterpart of the OpenFF Industry Benchmark Season 1. Each industry partner has selected a range of diverse molecules which represent their current chemical interests. The dataset will be used in conjunction with private counterparts also designed by each partner to give an unbiased assessment of the progress and current performance of the OpenFF line of force fields in comparison with other contemporary force fields.
The v1.1 dataset features corrected Merck (MRK) molecules with explicit hydrogens. The original v1.0 dataset did not have explicit hydrogens on these molecules, resulting in poor starting conformers that have largely failed to geometry optimize under QM.
The v1.1 dataset was prepared from the v1.0 dataset, excising the 

In [None]:
dataset1.export_dataset("dataset.json.bz2")
dataset1.molecules_to_file('dataset.smi', 'smi')
dataset1.visualize("dataset.pdf", columns=8)

