In [20]:
import json
import os
import requests
import datetime
from collections import Counter, defaultdict

import numpy as np
from deepdiff import DeepDiff
import periodictable

import qcportal
from qcportal.serialization import encode_to_json
from qcportal.external import scaffold
from qcportal.optimization import OptimizationDatasetEntry
from qcportal.torsiondrive import TorsiondriveDatasetEntry
# Entry model class to use for each dataset type handled by this notebook.
DatasetEntry = dict(
    optimization=OptimizationDatasetEntry,
    torsiondrive=TorsiondriveDatasetEntry,
)

# Server address; it is also the key under which entries are stored in the
# curated JSON file loaded below.
ADDRESS = "https://api.qcarchive.molssi.org:443/"

# Credentials are read from the environment so they never appear in the notebook.
client = qcportal.PortalClient(
    ADDRESS,
    cache_dir=".",
    username=os.environ["QCARCHIVE_USER"],
    password=os.environ["QCARCHIVE_PASSWORD"],
)

# Get Records and Molecular Statistics

In [2]:
# _________ Pull Record IDs of Relevant Datasets ____________
print("Getting record ids")

# Curated input file, addressed by commit hash so the content cannot drift between runs.
url = "https://raw.githubusercontent.com/openforcefield/ash-sage-rc2/32345dddeb6cb249367059fd99607ac2950a5c86/03_fit-valence/02_curate-data/output/optimizations-single-v3.json"
# Bound the wait: without a timeout an unresponsive host would hang the notebook indefinitely.
response = requests.get(url, timeout=60)
response.raise_for_status()  # Raise an error if the request fails
data = response.json()  # Load the JSON content into a Python dictionary

# Entries in the file are grouped by server address.
entry_dicts = data["entries"][ADDRESS]
types = set(entry["type"] for entry in entry_dicts)
# Cast to int so these ids agree with the int(x["record_id"]) form used
# everywhere else this notebook rebuilds the id list.
record_ids = [int(entry["record_id"]) for entry in entry_dicts]
# All entries must be of one record type; it is used below as the dataset type.
if len(types) != 1:
    raise ValueError(f"Expected exactly one unique type, but found: {types}")
dataset_type = types.pop()

Getting record ids


In [3]:
# _________ Get Records ____________
print("Getting records")
# Fetch one record per id collected above. missing_ok=False is passed so that,
# presumably, an unknown id raises instead of yielding a placeholder — confirm
# against the qcportal documentation.
records = client.get_records(record_ids, missing_ok=False)

Getting records


In [4]:
# Index every curated entry by record id, with its CMILES string and a slot
# that is filled with the record's initial molecule below.
cmiles_by_record_id = {
    int(x["record_id"]): {"cmiles": x["cmiles"], "mol": None}
    for x in entry_dicts
}
for record in records:
    cmiles_by_record_id[record.id]["mol"] = record.initial_molecule

# Fail with a clear message instead of an AttributeError on None further down.
records_without_molecule = [
    rec_id for rec_id, entry in cmiles_by_record_id.items() if entry["mol"] is None
]
if records_without_molecule:
    raise ValueError(f"No molecule was retrieved for record ids: {records_without_molecule}")

# cmiles -> Counter of molecule hashes (one hash per distinct conformer geometry).
cmiles_count = defaultdict(Counter)
# One representative molecule per unique CMILES, in first-seen order, so that
# molecules[i] lines up with the i-th key of cmiles_count.
molecules = []
for entry in cmiles_by_record_id.values():
    cmiles = entry["cmiles"]
    mol = entry["mol"]

    if cmiles not in cmiles_count:
        molecules.append(mol)
    # Deliberately not bound to the name `hash`: at notebook top level that
    # would shadow the builtin hash() for every later cell.
    mol_hash = mol.get_hash()
    cmiles_count[cmiles][mol_hash] += 1

print(f"There are {len(records)} records (conformers) and {len(cmiles_count)} unique SMILES strings (unique molecules)")

There are 4696 records (conformers) and 4696 unique SMILES strings (unique molecules)


In [5]:
# _________ Pull Statistics from Dataset ____________

print("Generating Molecular Statistics")

# One slot per unique molecule; molecules[i] is the representative molecule
# for the i-th key of cmiles_count.
lx = len(cmiles_count)
n_confs = np.zeros(lx)
masses = np.zeros(lx)
unique_charges = np.zeros(lx)
n_heavy_atoms = []
elements = []
for i, hashes in enumerate(cmiles_count.values()):
    mol = molecules[i]
    symbols = mol.symbols

    # Number of distinct molecule hashes recorded for this CMILES.
    n_confs[i] = len(hashes)
    # Heavy atoms are everything except hydrogen.
    n_heavy_atoms.append(sum(1 for sym in symbols if sym != "H"))
    elements.extend(set(symbols))
    # Look each element up on the periodictable module by its symbol.
    masses[i] = sum(getattr(periodictable, sym).mass for sym in symbols)
    unique_charges[i] = mol.molecular_charge

# Collapse the per-molecule values into sorted unique lists.
unique_charges = sorted(set(unique_charges))
elements = sorted(set(elements))

Generating Molecular Statistics


In [6]:
# _________ Write Output Part 1 (Run Before Approval) ____________

# Histogram of heavy-atom counts, printed in ascending order of atom count.
print("\n# Heavy Atom Counts")
counts1 = Counter(n_heavy_atoms)
for n_heavy, n_molecules in sorted(counts1.items()):
    print(f"{str(n_heavy):>3}: {n_molecules}")

print("* Number of unique molecules: {}".format(len(cmiles_count)))
print("* Number of conformers:", int(sum(n_confs)))

# (min, mean, max) summaries, each computed immediately before it is printed.
conformer_summary = (min(n_confs), np.mean(n_confs), max(n_confs))
print("* Number of conformers (min, mean, max): {:.2f}, {:.2f}, {:.2f}".format(*conformer_summary))

mass_summary = (min(masses), np.mean(masses), max(masses))
print("* Molecular weight (min, mean, max): {:.2f}, {:.2f}, {:.2f}".format(*mass_summary))

charge_labels = [str(x) for x in unique_charges]
print("* Charges: {}".format(", ".join(charge_labels)))


# Heavy Atom Counts
  2: 5
  3: 10
  4: 53
  5: 123
  6: 225
  7: 308
  8: 352
  9: 365
 10: 491
 11: 383
 12: 294
 13: 253
 14: 271
 15: 255
 16: 197
 17: 178
 18: 134
 19: 113
 20: 72
 21: 82
 22: 76
 23: 73
 24: 49
 25: 43
 26: 44
 27: 24
 28: 14
 29: 21
 30: 16
 31: 22
 32: 18
 33: 11
 34: 14
 35: 12
 36: 13
 37: 13
 38: 16
 39: 6
 41: 3
 42: 5
 43: 5
 44: 5
 45: 2
 46: 3
 47: 6
 48: 3
 49: 2
 50: 1
 52: 2
 53: 3
 54: 1
 55: 1
 57: 1
 58: 2
 61: 2
* Number of unique molecules: 4696
* Number of conformers: 4696
* Number of conformers (min, mean, max): 1.00, 1.00, 1.00
* Molecular weight (min, mean, max): 32.05, 207.67, 878.25
* Charges: -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0


# Validate Inter-database Record Entry Names and Specifications 

In [7]:
# Get Dataset Ids of Interest:
dataset_names = [
    "OpenFF Optimization Set 1",
    "SMIRNOFF Coverage Set 1",
    "OpenFF VEHICLe Set 1",
    "OpenFF Discrepancy Benchmark 1",
    "OpenFF Ehrman Informative Optimization v0.2",
    "Pfizer Discrepancy Optimization Dataset 1",
    "FDA Optimization Dataset 1",
    "Kinase Inhibitors: WBO Distributions",
    "OpenFF Gen 2 Opt Set 1 Roche",
    "OpenFF Gen 2 Opt Set 2 Coverage",
    "OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy",
    "OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy",
    "OpenFF Gen 2 Opt Set 5 Bayer",
    "OpenFF Sandbox CHO PhAlkEthOH v1.0",
    "OpenFF Aniline Para Opt v1.0",
    "OpenFF Industry Benchmark Season 1 v1.2",
    "OpenFF Gen2 Optimization Dataset Protomers v1.0",
    "OpenFF Protein Capped 1-mers 3-mers Optimization Dataset v1.0",
    "OpenFF Iodine Chemistry Optimization Dataset v1.0",
    "XtalPi Shared Fragments OptimizationDataset v1.0",
    "XtalPi 20-percent Fragments OptimizationDataset v1.0",
    "OpenFF Torsion Benchmark Supplement v1.0",
    "OpenFF Torsion Multiplicity Optimization Training Coverage Supplement v1.0",
    "OpenFF Torsion Multiplicity Optimization Benchmarking Coverage Supplement v1.0",
    "OpenFF Iodine Fragment Opt v1.0",
    "OpenFF Sulfur Optimization Training Coverage Supplement v1.0",
    "OpenFF Sulfur Optimization Benchmarking Coverage Supplement v1.0",
    "OpenFF Lipid Optimization Training Supplement v1.0",
    "OpenFF Lipid Optimization Benchmark Supplement v1.0",
    "OpenFF Cresset Additional Coverage Optimizations v4.0",
    "OpenFF Protein PDB 4-mers v4.0",
    "OpenFF Additional Generated ChEMBL Optimizations v4.0"
]
dataset_ids = [client.get_dataset(dataset_type, ds_name).id for ds_name in dataset_names]
print(f"We expect our records to come from the following datasets: {dataset_ids}")

# Membership is decided on dataset ids, the same criterion the record-organizing
# cell later uses (resp["dataset_id"] in dataset_ids). Comparing the name
# strings typed above would only work if they match the server-side names
# character for character.
# NOTE(review): assumes get_dataset may resolve names more leniently than `in` — confirm.
expected_ds_ids = set(dataset_ids)

record_ids = set([int(x["record_id"]) for x in entry_dicts])
# Ids of the expected datasets that actually contain at least one of our records.
tmp_ds_ids1 = set()
# record id -> names of the (unexpected) datasets it was found in.
wrong_ds1 = defaultdict(list)
for rec_id in record_ids:
    response = client.query_dataset_records(record_id=[rec_id])
    matched_ds_ids = [
        resp["dataset_id"] for resp in response if resp["dataset_id"] in expected_ds_ids
    ]
    if matched_ds_ids:
        tmp_ds_ids1.update(matched_ds_ids)
    else:
        wrong_ds1[rec_id] = [resp["dataset_name"] for resp in response]
print(f"There are {len(wrong_ds1)} records that aren't in the datasets that we expect.")

We expect our records to come from the following datasets: [41, 43, 45, 50, 281, 68, 69, 232, 251, 253, 255, 254, 270, 296, 315, 453, 345, 365, 372, 379, 383, 385, 387, 388, 392, 393, 396, 399, 412, 415, 416, 426]
There are 0 records that aren't in the datasets that we expect.
There are 0 records that aren't in the datasets that we expect.


In [8]:
# __________ Collect the distinct specifications used by the records __________
# Each record's specification is serialized, its "constraints" keyword (if any)
# is dropped, and it is kept only when it differs from every one kept so far.
specification_list = []
for rec in records:
    tmp = encode_to_json(rec.specification)
    keywords = tmp["keywords"]
    if "constraints" in keywords:
        del keywords["constraints"]

    # An empty DeepDiff means the two specifications are identical.
    already_known = any(len(DeepDiff(tmp, known)) == 0 for known in specification_list)
    if not already_known:
        specification_list.append(tmp)

print(f"These records have {len(specification_list)} unique specifications")

These records have 9 unique specifications


In [9]:
# Group the source datasets by which of the unique record specifications their
# "default" specification matches: specification_ds_ids[i] holds the ids of the
# datasets that share specification_list[i].
specification_ds_ids = [[] for _ in range(len(specification_list))]
# Datasets whose default specification matched none of the record specifications.
unmatched_ds_names = []
for ds_name in dataset_names:
    tmp_ds = client.get_dataset(dataset_type, ds_name)
    spec = encode_to_json(tmp_ds.specifications["default"].specification)
    # Constraints were stripped from the record specifications too, so strip
    # them here before comparing.
    if "keywords" in spec and "constraints" in spec["keywords"]:
        del spec['keywords']['constraints']
    for i, ref_spec in enumerate(specification_list):
        if len(DeepDiff(spec, ref_spec)) == 0:
            specification_ds_ids[i].append(tmp_ds.id)
            break
    else:
        # No break: nothing matched. Record it instead of dropping the dataset
        # silently, because later steps look up every dataset id in these groups.
        unmatched_ds_names.append(ds_name)
if unmatched_ds_names:
    print(f"WARNING: no record specification matches the default specification of: {unmatched_ds_names}")
print(f"The follow datasets (represented by their ids) share a spec: {specification_ds_ids}")

The follow datasets (represented by their ids) share a spec: [[379, 383, 385, 387, 388, 392, 393, 396, 399, 412, 415, 416, 426], [281], [315, 453, 372], [43, 45, 50], [68, 69, 251, 253, 255, 254, 270], [345, 365], [296], [41], [232]]


In [10]:
# Find entry names that point at different records depending on the dataset.
# entry name -> {"orig records": [(record id, dataset name), ...]}
entry_dict = defaultdict(lambda: defaultdict(list))
for rec in records:
    memberships = client.query_dataset_records(record_id=[rec.id])
    for resp in memberships:
        # This one dataset is left out of the comparison.
        if resp["dataset_name"] == "OpenFF Sage 2.0.0 Torsion Drive Training Dataset v1.0":
            continue
        entry_dict[resp["entry_name"]]["orig records"].append((rec.id, resp["dataset_name"]))

print("Entry names representing different records when in different datasets.")
# entry name -> ids of the distinct records that carry that name.
repeat_entry_names = defaultdict(list)
for entry_name, tmp_record_dict in entry_dict.items():
    occurrences = tmp_record_dict["orig records"]
    if len(occurrences) <= 1:
        # The name occurs only once, so it cannot clash.
        continue

    # Regroup the occurrences as record id -> dataset names.
    datasets_by_record = defaultdict(list)
    for rec_id, ds_label in occurrences:
        datasets_by_record[rec_id].append(ds_label)
    if len(datasets_by_record) <= 1:
        # Same record every time: repeated name, but no ambiguity.
        continue

    print(len(occurrences), entry_name)
    for rec_id, tmp_ds_names in datasets_by_record.items():
        repeat_entry_names[entry_name].append(rec_id)
        print("    ", rec_id, tmp_ds_names)
print("We will rename all of these entry names later to include their respective dataset id of origin.")

Connection error for https://api.qcarchive.molssi.org:443/api/v1/datasets/queryrecords: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) - retrying in 0.50 seconds [1/5]


Entry names representing different records when in different datasets.
2 C[N+](C)(C)c1ccccc1
     127904750 ['OpenFF Torsion Benchmark Supplement v1.0']
     137157785 ['OpenFF Torsion Multiplicity Optimization Benchmarking Coverage Supplement v1.0']
2 C1CSSC1
     120134272 ['XtalPi 20-percent Fragments OptimizationDataset v1.0']
     138533300 ['OpenFF Lipid Optimization Training Supplement v1.0']
2 C1C[N+]23CCC[N+]2(C1)CCC3
     127904781 ['OpenFF Torsion Benchmark Supplement v1.0']
     137149064 ['OpenFF Torsion Multiplicity Optimization Training Coverage Supplement v1.0']
2 C[N+](C)(C)C1(CC1)C(=O)[O-]
     127904748 ['OpenFF Torsion Benchmark Supplement v1.0']
     137157784 ['OpenFF Torsion Multiplicity Optimization Benchmarking Coverage Supplement v1.0']
2 C(CBr)Br
     127904742 ['OpenFF Torsion Benchmark Supplement v1.0']
     146497251 ['OpenFF Additional Generated ChEMBL Optimizations v4.0']
We will rename all of these entry names later to include their respective dataset i

In [11]:
# ___________ Check that given a dataset id, entry_name, and spec_name, the same record is returned ________________
records = client.get_records([int(x["record_id"]) for x in entry_dicts], missing_ok=False)
# {True/False: {record id: [[dataset name, entry name, spec name, looked-up record id], ...]}}
# The outer key says whether the looked-up record is the record we started from.
track_records_dict = defaultdict(lambda: defaultdict(list))
# Dataset handles are reused across records: the same few datasets come back
# for thousands of records, and fetching one per hit repeats the same request.
datasets_by_name = {}
for rec in records:
    response = client.query_dataset_records(record_id=rec.id)
    for resp in response:
        source_name = resp["dataset_name"]
        if source_name not in datasets_by_name:
            datasets_by_name[source_name] = client.get_dataset(dataset_type, source_name)
        tmp_ds = datasets_by_name[source_name]
        rec2 = tmp_ds.get_record(resp["entry_name"], resp["specification_name"])
        track_records_dict[rec.id == rec2.id][rec.id].append([resp["dataset_name"], resp["entry_name"], resp["specification_name"], rec2.id])

print("Given a record id, ensure that the same record is returned given a dataset name, entry name, and spec name")
for key, value in track_records_dict.items():
    print(key, len(value))

Given a record id, ensure that the same record is returned given a dataset name, entry name, and spec name
True 4696


# Make New Dataset

In [12]:
# _________ Initialize New Dataset ____________
print("Initializing new dataset")

# Name, tagline, description and submitter metadata live in a side file.
# JSON is UTF-8 by specification, so do not depend on the platform default encoding.
with open("opt_ds_info.json", encoding="utf-8") as f:
    dataset_information = json.load(f)

dataset = client.add_dataset(
    dataset_type,
    dataset_information["dataset_name"],
    tagline=dataset_information["dataset_tagline"],
    description=dataset_information["description"],
    tags=["openff"],
    provenance={
        "qcportal": qcportal.__version__,
    },
    extras={
        "submitter": dataset_information["metadata.submitter"],
        "creation_date": str(datetime.date.today()),
        "collection_type": "OptimizationDataset",
        "long_description": dataset_information["description"],
        "long_description_url": dataset_information["metadata.long_description_url"],
        # Underscore, like every other key in this mapping; the previous
        # "short description" (with a space) did not match its siblings.
        "short_description": dataset_information["dataset_tagline"],
        "dataset_name": dataset_information["dataset_name"],
        # Sorted element symbols gathered by the molecular-statistics cell.
        "elements": elements,
    },
)
# Display the id of the dataset that was just created.
dataset.id


Initializing new dataset


471

In [13]:
# Empty the dataset so the copy below starts from a clean slate.
dataset.delete_entries(dataset.entry_names)
# Snapshot the names before deleting: removing items while iterating over a
# live keys() view is fragile.
# NOTE(review): whether qcportal mutates this mapping in place is not visible
# here; the snapshot is safe either way.
for spec_name in list(dataset.specifications.keys()):
    dataset.delete_specification(spec_name)

print(f"There are {dataset.record_count} records, {len(dataset.specifications)} specifications, and {len(dataset.entry_names)} entries")

There are 0 records, 0 specifications, and 0 entries


In [14]:
# _________ Organize Records by Dataset ____________
# dataset id -> index of the specification group it belongs to. Built from the
# groups themselves (first group wins), so a dataset that was never assigned a
# group is simply absent instead of aborting this cell with a bare StopIteration.
spec_index = {}
for i, sublist in enumerate(specification_ds_ids):
    for ds_id in sublist:
        spec_index.setdefault(ds_id, i)
print("Organizing records")
records = client.get_records([int(x["record_id"]) for x in entry_dicts], missing_ok=False)
# One {dataset id: [entry names]} mapping per specification group.
records_to_copy = [defaultdict(list) for _ in range(len(specification_ds_ids))]
# dataset id -> entry name -> record id
record_ids_to_copy = defaultdict(lambda: defaultdict(list))
# record id -> id of the dataset it will be copied from
record_dataset = defaultdict(int)
dataset_names_from_id = defaultdict(str)
target_ds_ids = set(dataset_ids)
for rec in records:
    # Reset per record so the failure message below never shows the previous
    # record's response (or an unbound name) when the query itself raises.
    response = None
    try:
        response = client.query_dataset_records(record_id=rec.id)
        # The same rule applies however many datasets hold the record: only
        # target datasets qualify, and when several qualify the last one wins.
        matches = [resp for resp in response if resp["dataset_id"] in target_ds_ids]
        if not matches:
            raise ValueError(f"This record, {rec.id}, is not found in a target dataset.")
        match = matches[-1]
        ds_id = match["dataset_id"]
        ds_name = match["dataset_name"]
        entry_name = match["entry_name"]

        if ds_id not in spec_index:
            raise ValueError(f"Dataset {ds_id} ({ds_name}) was not assigned to a specification group.")
        dataset_names_from_id[ds_id] = ds_name

        records_to_copy[spec_index[ds_id]][ds_id].append(entry_name)
        record_ids_to_copy[ds_id][entry_name] = rec.id
        record_dataset[rec.id] = ds_id

    except Exception as e:
        # Best effort: report the record and keep going. The next cell compares
        # the number of collected entry names with the number of records.
        print(f"Failed record {rec.id}, {response}, {str(e)}")

Organizing records


In [15]:
# Sanity check: every record should have contributed exactly one entry name.
tmp = sum(
    len(names)
    for ds_dict in records_to_copy
    for names in ds_dict.values()
)
print(f"There are {tmp} entry names, expect {len(records)}")

There are 4696 entry names, expect 4696


In [16]:

# _________ Copy Records by Dataset ____________
print("Copying records")
for i, ds_dicts in enumerate(records_to_copy):
    if not ds_dicts:
        # Nothing is copied for this specification group, so the new dataset
        # has no "default" specification to rename afterwards.
        continue

    # Records are copied under the source specification name, which is then
    # renamed to carry the ids of every dataset sharing that specification.
    old_spec_name = "default"
    new_spec_name = "default-" + "-".join([str(x) for x in specification_ds_ids[i]])

    for j, (ds_id, entry_names) in enumerate(ds_dicts.items()):
        print(f"Copying entries from ds-{ds_id} ({j+1} of {len(ds_dicts)})")

        dataset.copy_records_from( ds_id, entry_names=entry_names, specification_names=[old_spec_name])
        dataset.fetch_specifications(force_refetch=True)

        # Handle Repeat Entry Names: suffix clashing names with the id of the
        # dataset they came from, right after that dataset has been copied.
        name_map = {
            entry_name: f"{entry_name}-{ds_id}"
            for entry_name in entry_names
            if entry_name in repeat_entry_names
        }
        if name_map:  # skip the call when there is nothing to rename
            dataset.rename_entries(name_map)

    # Handle Repeat Spec Names
    dataset.rename_specification(old_spec_name, new_spec_name)
    dataset.fetch_specifications(force_refetch=True)

Copying records
Copying entries from ds-383 (1 of 13)
Copying entries from ds-426 (2 of 13)
Copying entries from ds-426 (2 of 13)
Copying entries from ds-385 (3 of 13)
Copying entries from ds-385 (3 of 13)
Copying entries from ds-399 (4 of 13)
Copying entries from ds-399 (4 of 13)
Copying entries from ds-387 (5 of 13)
Copying entries from ds-387 (5 of 13)
Copying entries from ds-392 (6 of 13)
Copying entries from ds-392 (6 of 13)
Copying entries from ds-412 (7 of 13)
Copying entries from ds-412 (7 of 13)
Copying entries from ds-393 (8 of 13)
Copying entries from ds-393 (8 of 13)
Copying entries from ds-415 (9 of 13)
Copying entries from ds-415 (9 of 13)
Copying entries from ds-379 (10 of 13)
Copying entries from ds-379 (10 of 13)
Copying entries from ds-416 (11 of 13)
Copying entries from ds-416 (11 of 13)
Copying entries from ds-388 (12 of 13)
Copying entries from ds-388 (12 of 13)
Copying entries from ds-396 (13 of 13)
Copying entries from ds-396 (13 of 13)
Copying entries from ds-28

# Validate that New Dataset Contains the Expected Records

In [17]:
# Compare the record ids now in the new dataset with the ids we set out to copy.
new_record_ids = {rec.id for _, _, rec in dataset.iterate_records()}
old_record_ids = {rec.id for rec in records}

unexpected_ids = new_record_ids - old_record_ids  # copied but never requested
missing_ids = old_record_ids - new_record_ids  # requested but not copied

print(f"New record IDs match old record IDs: {new_record_ids == old_record_ids}")
print(f"There are {len(unexpected_ids)} record ids in the new dataset that aren't in the target list")
print(f"There are {len(missing_ids)} record ids in the target list that aren't in the new dataset")

New record IDs match old record IDs: True
There are 0 record ids in the new dataset that aren't in the target list
There are 0 record ids in the target list that aren't in the new dataset


# Write Out Dataset Metadata

In [18]:
# _________ Write Output Part 2 (Run After Approval) ____________

# Element symbols present in the new dataset's entries. Sorted so the README
# line is identical on every run; joining a bare set gives an arbitrary order.
elements = sorted({
    sym
    for entry in dataset.iterate_entries()
    for sym in entry.initial_molecule.symbols
})

print("\n\n# Output for README Part 2\n")
print("* Description: {}".format(dataset.description))
print("* Purpose: {}".format(dataset.tagline))
print("* Name: {}".format(dataset.name))
print("* Submitter: {}\n".format(dataset.extras["submitter"]))

print("\n## Metadata")
print(f"* Elements: {{{', '.join(elements)}}}")

# One section per specification in the new dataset: the program first, then
# every field of its QC specification.
for spec, obj in dataset.specifications.items():
    od = obj.dict()['specification']
    print("\n* Program:", od["program"])
    od = od["qc_specification"]
    print("* QC Specifications:", spec)
    for field, value in od.items():
        print(f"  * {field}: {value}")
    # List the requested SCF properties one per line when the keyword is present.
    if "scf_properties" in od["keywords"]:
        print("  * SCF Properties:")
        for field in od["keywords"]["scf_properties"]:
            print(f"    * {field}")




# Output for README Part 2

* Description: A quantum chemical (QC) optimization and torsiondrive datasets generated at the OpenFF default level of theory, B3LYP-D3BJ/DZVP, and curated to train parameters in [OpenFF 2.3.0 Sage](https://github.com/openforcefield/ash-sage-rc2) with NAGL partial charge model AshGC. Targets were curated from the following datasets:  OpenFF Optimization Set 1, SMIRNOFF Coverage Set 1, OpenFF VEHICLe Set 1, OpenFF Discrepancy Benchmark 1, OpenFF Ehrman Informative Optimization v0.2, Pfizer discrepancy optimization dataset 1, FDA optimization dataset 1, Kinase Inhibitors: WBO Distributions, OpenFF Gen 2 Opt Set 1 Roche, OpenFF Gen 2 Opt Set 2 Coverage, OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy, OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy, OpenFF Gen 2 Opt Set 5 Bayer, OpenFF Sandbox CHO PhAlkEthOH v1.0, OpenFF Aniline Para Opt v1.0, OpenFF Industry Benchmark Season 1 v1.2, OpenFF Gen2 Optimization Dataset Protomers v1.0, OpenFF Protein Capped 1-mers 3-mers

In [19]:
# Export the new dataset to "scaffold_opt.json" with qcportal's scaffold helper,
# with compression requested via compress=True.
scaffold.to_json(dataset, filename="scaffold_opt.json", compress=True)

In [None]:
# Ask for a view of the dataset that includes everything ('**') except the
# "wavefunction" field, without children (include_children=False).
# NOTE(review): `ij` is presumably an internal-job handle (the captured output
# ends with "Internal job final status") — confirm against the qcportal docs.
ij = dataset.create_view(description="Dataset without wavefunctions include in view", provenance={}, include=['**'], exclude=["wavefunction"], include_children=False)
# Follow the job's progress until it finishes.
ij.watch()

Complete: 100%|██████████| 100/100 [00:26<00:00,  3.82it/s]                     

Internal job final status: complete



