In [1]:
import json
import requests
import datetime
from collections import Counter, defaultdict

import numpy as np
from deepdiff import DeepDiff
import periodictable

from qcportal import PortalClient
from qcportal.external import scaffold
from qcportal.serialization import encode_to_json
from qcportal.optimization import OptimizationDatasetEntry
from qcportal.torsiondrive import TorsiondriveDatasetEntry
DatasetEntry = {"optimization": OptimizationDatasetEntry, "torsiondrive": TorsiondriveDatasetEntry}

ADDRESS = "https://api.qcarchive.molssi.org:443/"
client = PortalClient(ADDRESS, cache_dir=".")

# Get Records and Molecular Statistics

In [2]:
# _________ Pull Record IDs of Relevant Datasets ____________
print("Getting record ids")

file = requests.get(
    "https://raw.githubusercontent.com/openforcefield/sage-2.2.0/5a4b058336506865e85fcdc6fd6d10c745a3fa7c/02_curate-data/output/optimization-training-set.json"
)
data = json.loads(file.content)
provenance = data["provenance"]
# list with: {type, record_id, cmiles, inchi_key}
entry_dicts = data["entries"][ADDRESS]
dataset_type = entry_dicts[0]["type"]


Getting record ids


In [3]:
# _________ Get Records ____________
print("Getting records")
records = client.get_records([int(x["record_id"]) for x in entry_dicts], missing_ok=False)

Getting records


In [4]:
cmiles_by_record_id = {
    int(x["record_id"]): {"cmiles": x["cmiles"], "mol": None} 
    for x in entry_dicts
}
for record in records:
    cmiles_by_record_id[record.id]["mol"] = record.initial_molecule
    
cmiles_count = defaultdict(Counter)
molecules = []
for recid, x in cmiles_by_record_id.items():
    cmiles = x["cmiles"]

    if cmiles not in cmiles_count:
        molecules.append(x["mol"])
    hash = x["mol"].get_hash()
    cmiles_count[cmiles][hash] += 1
    
print(f"There are {len(records)} records (conformers) and {len(cmiles_count)} unique SMILES strings (unique molecules)")

There are 5126 records (conformers) and 1691 unique SMILES strings (unique molecules)


In [5]:
# _________ Pull Statistics from Dataset ____________
     
print("Generating Molecular Statistics")

lx = len(cmiles_count)
n_confs, n_heavy_atoms, masses, unique_charges = np.zeros(lx), [], np.zeros(lx), np.zeros(lx)
elements = []
for i, (cmiles, hashes) in enumerate(cmiles_count.items()):
    n_confs[i] = len(hashes)
    n_heavy_atoms.append(len([x for x in molecules[i].symbols if x != "H"]))
    elements.extend(list(set([x for x in molecules[i].symbols])))
    masses[i] = sum([getattr(periodictable, x).mass for x in molecules[i].symbols])
    unique_charges[i] = molecules[i].molecular_charge
    
unique_charges = sorted(set(unique_charges))

elements = sorted(list(set(elements)))

Generating Molecular Statistics


In [6]:
# _________ Write Output Part 1 (Run Before Approval) ____________

print("\n# Heavy Atom Counts")
counts1 = Counter(n_heavy_atoms)
for n_heavy in sorted(counts1):
    print(f"{str(n_heavy):>3}: {counts1[n_heavy]}")

print("* Number of unique molecules: {}".format(len(cmiles_count)))
print("* Number of conformers:", int(sum(n_confs)))
print(
    "* Number of conformers (min, mean, max): {:.2f}, {:.2f}, {:.2f}".format(
        min(n_confs), np.mean(n_confs), max(n_confs)
    )
)
print(
    "* Molecular weight (min, mean, max): {:.2f}, {:.2f}, {:.2f}".format(
        min(masses), np.mean(masses), max(masses)
    )
)
print("* Charges: {}".format(", ".join([str(x) for x in unique_charges])))


# Heavy Atom Counts
  1: 1
  2: 3
  3: 4
  4: 9
  5: 12
  6: 17
  7: 33
  8: 62
  9: 73
 10: 121
 11: 152
 12: 153
 13: 151
 14: 130
 15: 137
 16: 74
 17: 73
 18: 56
 19: 51
 20: 41
 21: 47
 22: 33
 23: 29
 24: 24
 25: 32
 26: 7
 27: 14
 28: 17
 29: 14
 30: 9
 31: 21
 32: 21
 33: 13
 34: 12
 35: 5
 36: 18
 37: 8
 38: 9
 39: 5
* Number of unique molecules: 1691
* Number of conformers: 5126
* Number of conformers (min, mean, max): 1.00, 3.03, 12.00
* Molecular weight (min, mean, max): 16.04, 236.01, 544.64
* Charges: -3.0, -2.0, -1.0, 0.0, 1.0, 2.0


# Validate Inter-database Record Entry Names and Specifications 

In [7]:
# Get Dataset Ids of Interest:
dataset_names = [
    "OpenFF Gen 2 Opt Set 1 Roche",
    "OpenFF Gen 2 Opt Set 2 Coverage",
    "OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy",
    "OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy",
    "OpenFF Gen 2 Opt Set 5 Bayer",
    "OpenFF Gen2 Optimization Dataset Protomers v1.0",
    "OpenFF Iodine Chemistry Optimization Dataset v1.0",
    "OpenFF Optimization Set 1",
    "SMIRNOFF Coverage Set 1",
    "OpenFF Aniline Para Opt v1.0",
]
dataset_ids = [client.get_dataset(dataset_type, ds_name).id for ds_name in dataset_names]
print(f"We expect our records to come from the following datasets: {dataset_ids}")

record_ids = set([int(x["record_id"]) for x in entry_dicts])
tmp_ds_ids1 = []
wrong_ds1 = defaultdict(list)
for rec_id in record_ids:
    response = client.query_dataset_records(record_id=[rec_id])
    ds_name = None
    for resp in response:
        if resp["dataset_name"] in dataset_names:
            tmp_ds_ids1.append(resp["dataset_name"])
            ds_name = resp["dataset_name"]
    if ds_name is None:
        wrong_ds1[rec_id] = [resp["dataset_name"] for resp in response]
tmp_ds_ids1 = set(tmp_ds_ids1)
print(f"There are {len(wrong_ds1)} records that aren't in the datasets that we expect.")

We expect our records to come from the following datasets: [251, 253, 255, 254, 270, 345, 372, 41, 43, 315]
There are 0 records that aren't in the datasets that we expect.


In [8]:
# __________ Check that all records share a single specification __________
specification_list = []
for rec in records:
    tmp = encode_to_json(rec.specification)
    if all(len(DeepDiff(tmp, x)) > 0 for x in specification_list) or not specification_list:
        specification_list.append(tmp)
        
print(f"These records have {len(specification_list)} unique specifications")

These records have 5 unique specifications


In [9]:
specification_ds_ids = [[] for _ in range(len(specification_list))]
for ds_name in dataset_names:
    tmp_ds = client.get_dataset(dataset_type, ds_name)
    spec = encode_to_json(tmp_ds.specifications["default"].specification)
    for i, ref_spec in enumerate(specification_list):
        if len(DeepDiff(spec, ref_spec)) == 0:
            specification_ds_ids[i].append(tmp_ds.id)
            break
print(f"The follow datasets (represented by their ids) share a spec: {specification_ds_ids}")

The follow datasets (represented by their ids) share a spec: [[251, 253, 255, 254, 270], [41], [43], [345], [372, 315]]


In [10]:
# Determine if multiple datasets have entries with the same name
entry_dict = defaultdict(lambda: defaultdict(list))
for rec in records:
    response = client.query_dataset_records(record_id=[rec.id])
    for resp in response:
        if resp["dataset_name"] != "OpenFF Sage 2.0.0 Torsion Drive Training Dataset v1.0":
            entry_dict[resp["entry_name"]]["orig records"].append((rec.id, resp["dataset_name"]))
        
print("Entry names representing different records when in different datasets.")
repeat_entry_names = defaultdict(list)
for entry_name, tmp_record_dict in entry_dict.items():
    tmp = tmp_record_dict["orig records"]
    if len(tmp) > 1: # entry name is in multiple datasets
        tmp_dict = defaultdict(list)
        for x in tmp:
            tmp_dict[x[0]].append(x[1])
        if len(tmp_dict) > 1: # entry name is assigned to multiple different records
            print(len(tmp), entry_name)
            for rec_id, tmp_ds_names in tmp_dict.items():
                repeat_entry_names[entry_name].append(rec_id)
                print("    ", rec_id, tmp_ds_names)
print("We will rename all of these entry names later to include their respective dataset id of origin.")

Entry names representing different records when in different datasets.
3 c1cc(c(=o)[nh]c1)cn2cc[nh2+]cc2-1
     18433009 ['OpenFF Gen 2 Opt Set 1 Roche', 'OpenFF Sage 2.0.0 Optimization Training Dataset v1.1']
     1760625 ['OpenFF Optimization Set 1']
3 cc1ccccc1c(=o)c2ccccc2f-6
     1760494 ['OpenFF Optimization Set 1']
     18433027 ['OpenFF Gen 2 Opt Set 1 Roche', 'OpenFF Sage 2.0.0 Optimization Training Dataset v1.1']
3 cc1ccccc1c(=o)c2ccccc2f-3
     18433024 ['OpenFF Gen 2 Opt Set 1 Roche', 'OpenFF Sage 2.0.0 Optimization Training Dataset v1.1']
     1760491 ['OpenFF Optimization Set 1']
3 c1ccc(cc1)c[n@@]2ccccs2(=o)=o-3
     18433032 ['OpenFF Gen 2 Opt Set 1 Roche', 'OpenFF Sage 2.0.0 Optimization Training Dataset v1.1']
     1760743 ['OpenFF Optimization Set 1']
3 c[c@@h](c1ccccc1)oc2ccccc2-3
     1760423 ['OpenFF Optimization Set 1']
     18433061 ['OpenFF Gen 2 Opt Set 1 Roche', 'OpenFF Sage 2.0.0 Optimization Training Dataset v1.1']
3 c1ccc(c(c1)o)oc2ccccn2-5
     1760673 ['

In [11]:
# ___________ Check that given a dataset id, entry_name, and spec_name, the same record is returned ________________
records = client.get_records([int(x["record_id"]) for x in entry_dicts], missing_ok=False)
track_records_dict = defaultdict(lambda: defaultdict(list))
for rec in records:
    response = client.query_dataset_records(record_id=rec.id)
    for resp in response:
        tmp_ds = client.get_dataset(dataset_type, resp["dataset_name"])
        rec2 = tmp_ds.get_record(resp["entry_name"], resp["specification_name"])
        track_records_dict[rec.id == rec2.id][rec.id].append([resp["dataset_name"], resp["entry_name"], resp["specification_name"], rec2.id])

print("Given a record id, ensure that the same record is returned given a dataset name, entry name, and spec name")
for key, value in track_records_dict.items():
    print(key, len(value))

Given a record id, ensure that the same record is returned given a dataset name, entry name, and spec name
True 5126


# Make New Dataset

In [None]:
# _________ Initialize New Dataset ____________
print("Initializing new dataset")

with open("opt_ds_info.json") as f:
    dataset_information = json.load(f)

#dataset = client.get_dataset(dataset_type, dataset_information["dataset_name"])
dataset = client.add_dataset(
    dataset_type,
    dataset_information["dataset_name"],
    tagline=dataset_information["dataset_tagline"],
    description=dataset_information["description"],
    provenance=provenance,
    default_tag="openff",
    owner_user="openffbot",
    extras={
        "submitter": dataset_information["metadata.submitter"],
        "creation_data": str(datetime.date.today()),
        'collection_type': 'OptimizationDataset',
        'long_description_url': dataset_information["metadata.long_description_url"],
        "short description": dataset_information["dataset_tagline"],
        "dataset_name": dataset_information["dataset_name"],
        "elements": provenance['applied-filters']['ElementFilter-3']['allowed_elements'],
    },
)


Initializing new dataset


KeyError: 'ElementFilter-3'

In [None]:
# _________ Organize Records by Dataset ____________
print("Organizing records")
records = client.get_records([int(x["record_id"]) for x in entry_dicts], missing_ok=False)
records_to_copy = defaultdict(lambda: defaultdict(list))
record_ids_to_copy = defaultdict(lambda: defaultdict(list))
dataset_names_from_id = defaultdict(str)
for rec in records:
    try:
        response = client.query_dataset_records(record_id=rec.id)
        if len(response) > 1:
            ds_id, spec_name, entry_name, ds_name = None, None, None, None
            for resp in response:
                if resp["dataset_id"] in dataset_ids:
                    ds_id = resp["dataset_id"]
                    ds_name = resp["dataset_name"]
                    spec_name = resp["specification_name"]
                    entry_name = resp["entry_name"]
        else:
            ds_id = response[0]["dataset_id"]
            ds_name = response[0]["dataset_name"]
            spec_name = response[0]["specification_name"]
            entry_name = response[0]["entry_name"]

        if ds_id is None:
            raise ValueError(f"This record, {rec.id}, is not found in a target dataset.")
        dataset_names_from_id[ds_id] = ds_name
        
        records_to_copy[ds_id][spec_name].append(entry_name)
        record_ids_to_copy[ds_id][entry_name] = rec.id

    except Exception as e:
        print(f"Failed record {rec.id}, {response}, {str(e)}")

In [None]:

# _________ Copy Records by Dataset ____________
print("Copying records")
for i, (ds_id, tmp_dict) in enumerate(records_to_copy.items()):
    print(f"Copying entries from ds-{ds_id} ({i+1} of {len(records_to_copy)})")
    new_spec_name = "default-" + "-".join([[str(y) for y in x] for x in specification_ds_ids if ds_id in x][0])
    for spec_name, entry_names in tmp_dict.items():
        dataset.copy_records_from( ds_id, entry_names=entry_names, specification_names=[spec_name])
        
        # Handle Repeat Spec Names
        dataset.rename_specification({spec_name: new_spec_name})
        
        # Handle Repeat Entry Names
        tmp_repeat_entry_names = list(set(repeat_entry_names.keys()) & set(entry_names))
        for entry_name in tmp_repeat_entry_names.keys():
            if record_ids_to_copy[ds_id][entry_name] in repeat_entry_names[entry_name]:
                repeat_entry_names[entry_name].remove(record_ids_to_copy[ds_id][entry_name])
            else:
                tmp_repeat_entry_names.remove(entry_name)
        name_map = {entry_name: f"{entry_name}-{ds_id}" for entry_name in tmp_repeat_entry_names}
        dataset.rename_entries(name_map)

# Validate that New Dataset Contains the Expected Records

In [None]:
new_record_ids = set([rec.id for _, _, rec in dataset.iterate_records()])
old_record_ids = set([rec.id for rec in records])
print(f"New record IDs match old record IDs: {set(new_record_ids) == set(old_record_ids)}")
print(f"There are {len(new_record_ids - old_record_ids)} record ids in the new dataset that aren't in the target list")
print(f"There are {len(old_record_ids - new_record_ids)} record ids in the target list that aren't in the new dataset")

# Write Out Dataset Metadata

In [None]:
# _________ Write Output Part 2 (Run After Approval) ____________

elements = set(
    sym
    for entry in dataset.iterate_entries()
    for sym in entry.initial_molecule.symbols
)

print("\n\n# Output for README Part 2\n")
print("* Description: {}".format(dataset.description))
print("* Purpose: {}".format(dataset.tagline))
print("* Name: {}".format(dataset.name))
print("* Submitter: {}\n".format(dataset.extras["submitter"]))

print("\n## Metadata")
print(f"* Elements: {{{', '.join(elements)}}}")

for spec, obj in dataset.specifications.items():
    od = obj.dict()['specification']
    print("* Program:", od["program"])
    od = od["qc_specification"]
    print("* QC Specifications:", spec)
    for field, value in od.items():
        print(f"  * {field}: {od[field]}")
    print("  * SCF Properties:")
    for field in od["keywords"]["scf_properties"]:
        print(f"    * {field}")


In [None]:
scaffold.to_json(dataset, filename="scaffold_opt.json", compress=True)