In [1]:
import numpy as np
import json
import requests
import datetime
from collections import Counter, defaultdict

import periodictable

from qcportal.external import scaffold
from qcportal import PortalClient
from qcportal.optimization import OptimizationDatasetEntry
from qcportal.torsiondrive import TorsiondriveDatasetEntry
DatasetEntry = {"optimization": OptimizationDatasetEntry, "torsiondrive": TorsiondriveDatasetEntry}

ADDRESS = "https://api.qcarchive.molssi.org:443/"
client = PortalClient(ADDRESS, cache_dir=".")

# Get Records and Molecular Statistics

In [2]:
# _________ Pull Record IDs of Relevant Datasets ____________
print("Getting record ids")

file = requests.get(
    "https://raw.githubusercontent.com/openforcefield/openff-sage/37a36e7eeaf6cdca795847089a288bdff168c08a/data-set-curation/quantum-chemical/data-sets/1-2-0-opt-set-v3.json"
)
data = json.loads(file.content)
provenance = data["provenance"]
# list with: {type, record_id, cmiles, inchi_key}
entry_dicts = data["entries"][ADDRESS]
dataset_type = entry_dicts[0]["type"]


Getting record ids


In [3]:
# _________ Get Records ____________
print("Getting records")
records = client.get_records([int(x["record_id"]) for x in entry_dicts], missing_ok=False)

Getting records


In [5]:
cmiles_by_record_id = {
    int(x["record_id"]): {"cmiles": x["cmiles"], "mol": None} 
    for x in entry_dicts
}
for record in records:
    cmiles_by_record_id[record.id]["mol"] = record.initial_molecule
    
cmiles_count = defaultdict(Counter)
molecules = []
for recid, x in cmiles_by_record_id.items():
    cmiles = x["cmiles"]

    if cmiles not in cmiles_count:
        molecules.append(x["mol"])
    hash = x["mol"].get_hash()
    cmiles_count[cmiles][hash] += 1

In [6]:
# _________ Pull Statistics from Dataset ____________
     
print("Generating Molecular Statistics")

lx = len(cmiles_count)
n_confs, n_heavy_atoms, masses, unique_charges = np.zeros(lx), [], np.zeros(lx), np.zeros(lx)
elements = []
for i, (cmiles, hashes) in enumerate(cmiles_count.items()):
    n_confs[i] = len(hashes)
    n_heavy_atoms.append(len([x for x in molecules[i].symbols if x != "H"]))
    elements.extend(list(set([x for x in molecules[i].symbols])))
    masses[i] = sum([getattr(periodictable, x).mass for x in molecules[i].symbols])
    unique_charges[i] = molecules[i].molecular_charge
    
unique_charges = sorted(set(unique_charges))

elements = sorted(list(set(elements)))

Generating Molecular Statistics


In [7]:
# _________ Write Output Part 1 (Run Before Approval) ____________

print("\n# Heavy Atom Counts")
counts1 = Counter(n_heavy_atoms)
for n_heavy in sorted(counts1):
    print(f"{str(n_heavy):>3}: {counts1[n_heavy]}")

print("* Number of unique molecules: {}".format(len(cmiles_count)))
print("* Number of conformers:", int(sum(n_confs)))
print(
    "* Number of conformers (min, mean, max): {:.2f}, {:.2f}, {:.2f}".format(
        min(n_confs), np.mean(n_confs), max(n_confs)
    )
)
print(
    "* Molecular weight (min, mean, max): {:.2f}, {:.2f}, {:.2f}".format(
        min(masses), np.mean(masses), max(masses)
    )
)
print("* Charges: {}".format(", ".join([str(x) for x in unique_charges])))


# Heavy Atom Counts
  4: 2
  5: 3
  6: 4
  7: 7
  8: 20
  9: 19
 10: 46
 11: 56
 12: 81
 13: 81
 14: 94
 15: 94
 16: 59
 17: 59
 18: 47
 19: 38
 20: 36
 21: 45
 22: 30
 23: 23
 24: 21
 25: 29
 26: 6
 27: 11
 28: 16
 29: 10
 30: 8
 31: 16
 32: 16
 33: 11
 34: 11
 35: 5
 36: 15
 37: 7
 38: 8
 39: 5
* Number of unique molecules: 1039
* Number of conformers: 3663
* Number of conformers (min, mean, max): 1.00, 3.53, 10.00
* Molecular weight (min, mean, max): 76.05, 261.37, 544.64
* Charges: -2.0, -1.0, 0.0, 1.0


# Make New Dataset

In [None]:
# _________ Initialize New Dataset ____________
print("Initializing new dataset")

with open("ds_info.json") as f:
    dataset_information = json.load(f)

#dataset = client.get_dataset(dataset_type, dataset_information["dataset_name"])
dataset = client.add_dataset(
    dataset_type,
    dataset_information["dataset_name"],
    tagline=dataset_information["dataset_tagline"],
    description=dataset_information["description"],
    provenance=provenance,
    default_tag="openff",
    owner_user="openffbot",
    extras={
        "submitter": dataset_information["metadata.submitter"],
        "creation_data": str(datetime.date.today()),
        'collection_type': 'OptimizationDataset',
        'long_description_url': dataset_information["metadata.long_description_url"],
        "short description": dataset_information["dataset_tagline"],
        "dataset_name": dataset_information["dataset_name"],
        "elements": provenance['applied-filters']['ElementFilter-3']['allowed_elements'],
    },
)


Initializing new dataset


In [9]:
# Get ds associated with specifications
# _________ Get Records and Find Associated Dataset Name ____________
print("Getting records")

records = client.get_records([int(x["record_id"]) for x in entry_dicts], missing_ok=False)
records_to_copy = defaultdict(lambda: defaultdict(list))
for rec in records:
    try:
        response = client.query_dataset_records(record_id=[rec.id])
        records_to_copy[response[0]["dataset_id"]][response[0]["specification_name"]].append(response[0]["entry_name"])
    except Exception:
        print(f"Failed record {rec.id}, {response}")
    
for i, (ds_id, tmp_dict) in enumerate(records_to_copy.items()):
    print(f"Copying entries from ds-{ds_id} ({i+1} of {len(records_to_copy)})")
    for spec_name, entry_names in tmp_dict.items():
        dataset.copy_records_from( ds_id, entry_names=entry_names, specification_names=[spec_name])

Getting records
Failed record 19095358, [{'record_id': 19095355, 'dataset_id': 270, 'dataset_type': 'optimization', 'dataset_name': 'OpenFF Gen 2 Opt Set 5 Bayer', 'entry_name': 'cc(=ccoc1cc(ccc1oc)[c@@h]2cc(=o)nc2)c-0', 'specification_name': 'default'}, {'record_id': 19095355, 'dataset_id': 427, 'dataset_type': 'optimization', 'dataset_name': 'OpenFF Sage 2.0.0 Optimization Training Dataset v1.1', 'entry_name': 'cc(=ccoc1cc(ccc1oc)[c@@h]2cc(=o)nc2)c-0', 'specification_name': 'default'}]
Failed record 19095359, [{'record_id': 19095355, 'dataset_id': 270, 'dataset_type': 'optimization', 'dataset_name': 'OpenFF Gen 2 Opt Set 5 Bayer', 'entry_name': 'cc(=ccoc1cc(ccc1oc)[c@@h]2cc(=o)nc2)c-0', 'specification_name': 'default'}, {'record_id': 19095355, 'dataset_id': 427, 'dataset_type': 'optimization', 'dataset_name': 'OpenFF Sage 2.0.0 Optimization Training Dataset v1.1', 'entry_name': 'cc(=ccoc1cc(ccc1oc)[c@@h]2cc(=o)nc2)c-0', 'specification_name': 'default'}]
Copying entries from ds-251 (1

In [11]:
new_record_ids = [rec.id for _, _, rec in dataset.iterate_records()]
set(new_record_ids) == set([rec.id for rec in records])

True

In [12]:
# _________ Write Output Part 2 (Run After Approval) ____________

elements = set(
    sym
    for entry in dataset.iterate_entries()
    for sym in entry.initial_molecule.symbols
)

print("\n\n# Output for README Part 2\n")
print("* Description: {}".format(dataset.description))
print("* Purpose: {}".format(dataset.tagline))
print("* Name: {}".format(dataset.name))
print("* Submitter: {}\n".format(dataset.extras["submitter"]))

print("\n## Metadata")
print(f"* Elements: {{{', '.join(elements)}}}")

for spec, obj in dataset.specifications.items():
    od = obj.dict()['specification']
    print("* Program:", od["program"])
    od = od["qc_specification"]
    print("* QC Specifications:", spec)
    for field, value in od.items():
        print(f"  * {field}: {od[field]}")
    print("  * SCF Properties:")
    for field in od["keywords"]["scf_properties"]:
        print(f"    * {field}")




# Output for README Part 2

* Description: A quantum chemical (QC) dataset curated to train the OpenFF 2.0.0 Sage, with reparametrized Lennard-Jones (LJ) and valence parameters, the latter relevant to this dataset. This QC dataset with the OpenFF default level of theory, B3LYP-D3BJ/DZVP, is used to train Sage parameters. These optimized conformer geometries were used in conjunction with the QC dataset used to train one dimensional torsional profiles. This Generation 2 dataset increases chemical diversity when compared to Generation 1, which are of value to our industry partners. Large molecules (>20 heavy atoms) were also included, offering more flexible molecules and a greater degree of conformational variation which provide intramolecular interactions. This is the complete Optimization dataset used for training OpenFF 2.0.0 Sage, consisting of molecules from the following datasets: OpenFF Gen 2 Opt Set 1 Roche', 'OpenFF Gen 2 Opt Set 2 Coverage', 'OpenFF Gen 2 Opt Set 3 Pfizer Disc

In [13]:
scaffold.to_json(dataset, compress=True)