In [26]:
import json
import requests
import datetime
from collections import Counter, defaultdict

import numpy as np
from deepdiff import DeepDiff
import periodictable

from qcportal import PortalClient
from qcportal.serialization import encode_to_json
from qcportal.external import scaffold
from qcportal.optimization import OptimizationDatasetEntry
from qcportal.torsiondrive import TorsiondriveDatasetEntry
# Entry model to use for each supported dataset type, keyed by type name.
DatasetEntry = dict(
    optimization=OptimizationDatasetEntry,
    torsiondrive=TorsiondriveDatasetEntry,
)

# Server address; it is also the key used to look up entries in the input file.
ADDRESS = "https://api.qcarchive.molssi.org:443/"
# The current working directory is handed to the client as its cache directory.
client = PortalClient(ADDRESS, cache_dir=".")

# Get Records and Molecular Statistics

In [2]:
# _________ Pull Record IDs of Relevant Datasets ____________
print("Getting record ids")

# The URL is pinned to a specific commit of the sage-2.1.0 repository, so the
# list of record ids does not change between runs.
file = requests.get(
    "https://raw.githubusercontent.com/openforcefield/sage-2.1.0/8d196aa104f83b8c901d922073ee68b875ae8c32/inputs-and-outputs/data-sets/td-set-for-fitting-2.1.0.json",
    timeout=60,  # never hang the notebook indefinitely on a stalled connection
)
# Surface HTTP errors (404, rate limiting, ...) here rather than as a confusing
# JSON decoding error on the error page body.
file.raise_for_status()
data = json.loads(file.content)
provenance = data["provenance"]
# list with: {type, record_id, cmiles, inchi_key}
entry_dicts = data["entries"][ADDRESS]
dataset_type = "torsiondrive"


Getting record ids


In [3]:
# _________ Get Records ____________
print("Getting records")
# Record ids are cast to int before being sent to the server; with
# missing_ok=False the lookup is asked not to tolerate unknown ids.
target_record_ids = [int(entry_info["record_id"]) for entry_info in entry_dicts]
records = client.get_records(target_record_ids, missing_ok=False)

Getting records


In [4]:
# Index the input entries by record id so that every fetched record can be
# paired with the CMILES string listed for it in the input file.
cmiles_by_record_id = {
    int(x["record_id"]): {"cmiles": x["cmiles"], "mol": None}
    for x in entry_dicts
}
for record in records:
    cmiles_by_record_id[record.id]["mol"] = record.initial_molecules

# cmiles -> Counter of hashes of the first initial molecule of each record
cmiles_count = defaultdict(Counter)
# First initial molecule encountered for each unique cmiles, in first-seen
# order (the same order as the keys of `cmiles_count`).
molecules = []
# Number of initial molecules attached to each record.
torsionstats = []
for recid, x in cmiles_by_record_id.items():
    cmiles = x["cmiles"]

    # Membership test does not create a key in the defaultdict, so this is
    # true only the first time a cmiles string is seen.
    if cmiles not in cmiles_count:
        molecules.append(x["mol"][0])

    torsionstats.append(len(x["mol"]))
    # Named `mol_hash` rather than `hash`: a notebook global called `hash`
    # would shadow the builtin for every later cell in the kernel.
    mol_hash = x["mol"][0].get_hash()
    cmiles_count[cmiles][mol_hash] += 1

print(f"There are {len(records)} records (conformers) and {len(cmiles_count)} unique SMILES strings (unique molecules)")

There are 1300 records (conformers) and 953 unique SMILES strings (unique molecules)


In [5]:
# _________ Pull Statistics from Dataset ____________
print("Generating Molecular Statistics")

# One slot per unique cmiles; `molecules[i]` is the representative molecule of
# the i-th key of `cmiles_count` (both were filled in the same order).
lx = len(cmiles_count)
n_confs = np.zeros(lx)
masses = np.zeros(lx)
unique_charges = np.zeros(lx)
n_heavy_atoms = []
elements = []
for i, hashes in enumerate(cmiles_count.values()):
    mol = molecules[i]
    symbols = mol.symbols
    # Distinct molecule hashes recorded for this cmiles.
    n_confs[i] = len(hashes)
    # Every atom that is not hydrogen counts as heavy.
    n_heavy_atoms.append(sum(1 for sym in symbols if sym != "H"))
    elements.extend(set(symbols))
    masses[i] = sum(getattr(periodictable, sym).mass for sym in symbols)
    unique_charges[i] = mol.molecular_charge

# Collapse the per-molecule charges into the sorted distinct values.
unique_charges = sorted(set(unique_charges))

elements = sorted(str(sym) for sym in set(elements))
print(elements)

Generating Molecular Statistics
['Br', 'C', 'Cl', 'F', 'H', 'N', 'O', 'P', 'S']


In [6]:
# _________ Write Output Part 1 (Run Before Approval) ____________

# Histogram: number of unique molecules per heavy-atom count.
print("\n# Heavy Atom Counts")
counts1 = Counter(n_heavy_atoms)
for n_heavy, n_molecules in sorted(counts1.items()):
    print(f"{n_heavy:>3}: {n_molecules}")

# Summary bullet points, formatted for pasting into the README.
print("\n\n# Output for README Part 1\n")
print(f"* Number of unique molecules: {len(cmiles_count)}")
print(f"* Number of driven torsions: {len(records)}")
print(f"* Number of conformers: {int(sum(n_confs))}")
print(
    f"* Number of conformers (min, mean, max): "
    f"{min(n_confs):.0f}, {np.mean(n_confs):.0f}, {max(n_confs):.0f}"
)
print(
    f"* Molecular weight (min, mean, max): "
    f"{min(masses):.2f}, {np.mean(masses):.2f}, {max(masses):.2f}"
)
print("* Charges: " + ", ".join(str(charge) for charge in unique_charges))


# Heavy Atom Counts
  2: 1
  3: 2
  4: 23
  5: 27
  6: 62
  7: 66
  8: 97
  9: 80
 10: 82
 11: 76
 12: 47
 13: 63
 14: 57
 15: 56
 16: 27
 17: 33
 18: 17
 19: 26
 20: 16
 21: 12
 22: 7
 23: 11
 24: 13
 25: 6
 26: 5
 27: 11
 28: 15
 29: 10
 30: 4
 32: 1


# Output for README Part 1

* Number of unique molecules: 953
* Number of driven torsions: 1300
* Number of conformers: 974
* Number of conformers (min, mean, max): 1, 1, 3
* Molecular weight (min, mean, max): 32.04, 185.54, 503.42
* Charges: -1.0, 0.0, 1.0


# Validate Inter-database Record Entry Names and Specifications 

In [7]:
# Get Dataset Ids of Interest:
dataset_names = [
    "OpenFF Gen 2 Torsion Set 1 Roche 2",
    "OpenFF Gen 2 Torsion Set 2 Coverage 2",
    "OpenFF Gen 2 Torsion Set 3 Pfizer Discrepancy 2",
    "OpenFF Gen 2 Torsion Set 4 eMolecules Discrepancy 2",
    "OpenFF Gen 2 Torsion Set 5 Bayer 2",
    "OpenFF Gen 2 Torsion Set 6 Supplemental 2",
    "SMIRNOFF Coverage Torsion Set 1",
    "OpenFF Group1 Torsions",
    "OpenFF Group1 Torsions 2",
    "OpenFF Group1 Torsions 3",
    "Pfizer Discrepancy Torsion Dataset 1",
    "OpenFF Gen3 Torsion Set v1.0",
    "OpenFF Amide Torsion Set v1.0",
    "OpenFF WBO Conjugated Series v1.0",
    "OpenFF DANCE 1 eMolecules t142 v1.0",
]
# Resolve each expected dataset name to its server-side id, in list order.
dataset_ids = []
for ds_name in dataset_names:
    dataset_ids.append(client.get_dataset(dataset_type, ds_name).id)
print(f"We expect our records to come from the following datasets: {dataset_ids}")

record_ids = {int(x["record_id"]) for x in entry_dicts}
# Names of expected datasets that at least one record was found in.
tmp_ds_ids1 = set()
# record id -> names of every dataset it belongs to, for records that belong
# to none of the expected datasets.
wrong_ds1 = defaultdict(list)
for rec_id in record_ids:
    response = client.query_dataset_records(record_id=[rec_id])
    expected_hits = [
        resp["dataset_name"] for resp in response if resp["dataset_name"] in dataset_names
    ]
    if expected_hits:
        tmp_ds_ids1.update(expected_hits)
    else:
        wrong_ds1[rec_id] = [resp["dataset_name"] for resp in response]
print(f"There are {len(wrong_ds1)} records that aren't in the datasets that we expect.")

We expect our records to come from the following datasets: [256, 257, 258, 259, 265, 266, 48, 36, 242, 243, 70, 317, 314, 308, 282]
There are 0 records that aren't in the datasets that we expect.


In [8]:
def remove_dihedrals(dict1):
    """Strip the per-torsion settings from a specification dictionary.

    The keys ``dihedrals``, ``dihedral_ranges``, ``grid_spacing`` and
    ``energy_upper_limit`` are deleted from ``dict1['keywords']`` so that
    specifications can be compared without them.

    The dictionary is modified in place and the same object is returned.
    A ``KeyError`` is raised if ``'keywords'`` or any of the four keys is absent.
    """
    keywords = dict1['keywords']
    for key in ('dihedrals', 'dihedral_ranges', 'grid_spacing', 'energy_upper_limit'):
        del keywords[key]
    return dict1

def check_diff(dict1, dict2):
    """Return True when DeepDiff reports at least one difference between the inputs."""
    differences = DeepDiff(dict1, dict2)
    return len(differences) != 0

# __________ Check that all records share a single specification __________
# Collect one representative of every distinct specification, ignoring the
# per-torsion keywords that remove_dihedrals strips out.
specification_list = []
for rec in records:
    candidate = remove_dihedrals(encode_to_json(rec.specification))
    already_known = any(not check_diff(candidate, known) for known in specification_list)
    if not already_known:
        specification_list.append(candidate)

print(f"These records have {len(specification_list)} unique specifications")

These records have 3 unique specifications


In [9]:
from pprint import pprint

# One bucket of dataset ids per unique specification found among the records.
specification_ds_ids = [[] for _ in specification_list]
for ds_name in dataset_names:
    tmp_ds = client.get_dataset(dataset_type, ds_name)
    spec = remove_dihedrals(encode_to_json(tmp_ds.specifications["default"].specification))
    # Index of the first reference specification identical to this dataset's
    # "default" specification, or None when there is no such specification.
    matching_index = next(
        (
            idx
            for idx, ref_spec in enumerate(specification_list)
            if len(DeepDiff(spec, ref_spec)) == 0
        ),
        None,
    )
    if matching_index is not None:
        specification_ds_ids[matching_index].append(tmp_ds.id)
print(f"The follow datasets (represented by their ids) share a spec: {specification_ds_ids}")

The follow datasets (represented by their ids) share a spec: [[256, 257, 258, 259, 265, 266, 48, 242, 243, 70, 282], [36], [317, 314, 308]]


In [10]:
# Determine if multiple datasets have entries with the same name
# entry name -> {"orig records": [(record id, dataset name), ...]} over every
# dataset the target records belong to, except the Sage 2.0.0 training dataset.
entry_dict = defaultdict(lambda: defaultdict(list))
for rec in records:
    response = client.query_dataset_records(record_id=[rec.id])
    for resp in response:
        if resp["dataset_name"] != "OpenFF Sage 2.0.0 Torsion Drive Training Dataset v1.0":
            entry_dict[resp["entry_name"]]["orig records"].append((rec.id, resp["dataset_name"]))

print("Entry names representing different records when in different datasets.")
# entry name -> distinct record ids sharing that name. Only names that really
# refer to more than one record get a key here: the copy step renames every
# key of this mapping, so a name that merely appears in several datasets for
# the SAME record must not be added.
repeat_entry_names = defaultdict(list)
for entry_name, tmp_record_dict in entry_dict.items():
    tmp = tmp_record_dict["orig records"]
    if len(tmp) <= 1:
        continue  # entry name occurs in a single dataset only

    # record id -> names of the datasets holding it under this entry name
    tmp_dict = defaultdict(list)
    for rec_id, ds_name in tmp:
        tmp_dict[rec_id].append(ds_name)

    if len(tmp_dict) > 1:  # entry name is assigned to multiple different records
        print(len(tmp), entry_name)
        for rec_id, tmp_ds_names in tmp_dict.items():
            print("    ", rec_id, tmp_ds_names)
        # Keys of tmp_dict are already unique record ids.
        repeat_entry_names[entry_name] = list(tmp_dict)
print("We will rename all of these entry names later to include their respective dataset id of origin.")

Entry names representing different records when in different datasets.
2 [ch3:4][ch2:3][c:2]1([ch2:1]oc1)c
     18535806 ['OpenFF Gen 2 Torsion Set 1 Roche 2']
     1762309 ['OpenFF Group1 Torsions']
3 c1[ch:1][c:2](c(=o)[nh]c1)[ch:3]2c=ccc=[ch:4]2
     18045320 ['OpenFF Gen 2 Torsion Set 1 Roche', 'OpenFF Gen 2 Torsion Set 1 Roche 2']
     1762103 ['OpenFF Group1 Torsions']
3 c1c[ch:1][c:2](cc1)[c:3]2(cc2)[nh:4]c3ccccc3
     18045325 ['OpenFF Gen 2 Torsion Set 1 Roche', 'OpenFF Gen 2 Torsion Set 1 Roche 2']
     1762745 ['OpenFF Group1 Torsions']
3 c[c:4](=o)[nh:3][c:2]1(cc1)[c:1]2nccn2c
     18045337 ['OpenFF Gen 2 Torsion Set 1 Roche', 'OpenFF Gen 2 Torsion Set 1 Roche 2']
     1762490 ['OpenFF Group1 Torsions']
3 c1c[ch:1][c:2](c(c1)c2c=cnc=c2)[n+:3](=o)[o-:4]
     18045344 ['OpenFF Gen 2 Torsion Set 1 Roche', 'OpenFF Gen 2 Torsion Set 1 Roche 2']
     1762095 ['OpenFF Group1 Torsions']
2 c[c:4](=o)[o:3][c:2]1[ch:1]cccc1
     18535872 ['OpenFF Gen 2 Torsion Set 1 Roche 2']
     176

In [11]:
# ___________ Check that given a dataset id, entry_name, and spec_name, the same record is returned ________________
records = client.get_records([int(x["record_id"]) for x in entry_dicts], missing_ok=False)
# match flag (True/False) -> record id -> list of
# [dataset name, entry name, specification name, id of the record looked up]
track_records_dict = defaultdict(lambda: defaultdict(list))
# Dataset handles are reused across records: the same few datasets come up for
# every record, so fetching each one once avoids a request per membership row.
datasets_by_name = {}
for rec in records:
    response = client.query_dataset_records(record_id=rec.id)
    for resp in response:
        ds_name = resp["dataset_name"]
        if ds_name not in datasets_by_name:
            datasets_by_name[ds_name] = client.get_dataset(dataset_type, ds_name)
        tmp_ds = datasets_by_name[ds_name]
        rec2 = tmp_ds.get_record(resp["entry_name"], resp["specification_name"])
        track_records_dict[rec.id == rec2.id][rec.id].append(
            [ds_name, resp["entry_name"], resp["specification_name"], rec2.id]
        )

print("Given a record id, ensure that the same record is returned given a dataset name, entry name, and spec name")
# A single "True" line covering every record means all lookups round-tripped.
for key, value in track_records_dict.items():
    print(key, len(value))

Given a record id, ensure that the same record is returned given a dataset name, entry name, and spec name
True 1300


# Make New Dataset

In [None]:
# _________ Initialize New Dataset ____________
print("Initializing new dataset")
# Name, tagline, description and submitter details are read from a JSON file
# expected next to this notebook.
with open("td_ds_info.json") as f:
    dataset_information = json.load(f)

dataset = client.add_dataset(
    dataset_type,
    dataset_information["dataset_name"],
    tagline=dataset_information["dataset_tagline"],
    description=dataset_information["description"],
    provenance=provenance,
    default_tag="openff",
    owner_user="openffbot",
    extras={
        "submitter": dataset_information["metadata.submitter"],
        # NOTE(review): key is spelled "creation_data", not "creation_date" — confirm this is intended.
        "creation_data": str(datetime.date.today()),
        # This dataset is created with dataset_type "torsiondrive", so it must
        # not be labelled as an optimization collection.
        # NOTE(review): value chosen to match QCSubmit's torsiondrive dataset
        # type name — confirm the exact spelling expected by downstream tools.
        'collection_type': 'TorsiondriveDataset',
        'long_description_url': dataset_information["metadata.long_description_url"],
        # NOTE(review): key contains a space ("short description") — confirm.
        "short description": dataset_information["dataset_tagline"],
        "dataset_name": dataset_information["dataset_name"],
        "elements": elements,
    },
)
# Last expression of the cell: display the id of the new dataset.
dataset.id


Initializing new dataset


445

In [13]:
# Start from an empty dataset: drop every entry and every specification that
# may be left over from an earlier run of the copy step.
dataset.delete_entries(dataset.entry_names)
# Iterate over a snapshot of the names: deleting a specification may shrink
# the mapping returned by `dataset.specifications` while it is being walked.
for spec_name in list(dataset.specifications):
    dataset.delete_specification(spec_name)

# Refresh the local view before reporting what is left.
dataset.fetch_entries()
dataset.fetch_specifications()
print(f"There are {dataset.record_count} records, {len(dataset.entry_names)} entries, {len(dataset.specifications)} specifications")

There are 0 records, 0 entries, 0 specifications


In [14]:
# _________ Organize Records by Dataset ____________
# dataset id -> index of the specification group (in specification_ds_ids)
# that the dataset belongs to.
spec_index = {ds_id: next(i for i, sublist in enumerate(specification_ds_ids) if ds_id in sublist) for ds_id in dataset_ids}
print("Organizing records")
records = client.get_records([int(x["record_id"]) for x in entry_dicts], missing_ok=False)
# One mapping per specification group: dataset id -> entry names to copy.
records_to_copy = [defaultdict(list) for _ in range(len(specification_ds_ids))]
# dataset id -> entry name -> record id
record_ids_to_copy = defaultdict(lambda: defaultdict(list))
# record id -> id of the dataset it will be copied from
record_dataset = defaultdict(int)
# dataset id -> dataset name
dataset_names_from_id = defaultdict(str)
for rec in records:
    # Reset per record so the failure message below can never show a stale
    # response from the previous record (or hit an unbound name) when the
    # query itself raises.
    response = None
    try:
        response = client.query_dataset_records(record_id=rec.id)
        # Use the first membership that belongs to one of the target datasets.
        # An empty response, or one that only lists non-target datasets, is
        # reported with the explicit error below.
        match = next((resp for resp in response if resp["dataset_id"] in dataset_ids), None)
        if match is None:
            raise ValueError(f"This record, {rec.id}, is not found in a target dataset.")

        ds_id = match["dataset_id"]
        ds_name = match["dataset_name"]
        spec_name = match["specification_name"]
        entry_name = match["entry_name"]

        dataset_names_from_id[ds_id] = ds_name
        records_to_copy[spec_index[ds_id]][ds_id].append(entry_name)
        record_ids_to_copy[ds_id][entry_name] = rec.id
        record_dataset[rec.id] = ds_id

    except Exception as e:
        # Best effort: report the record and carry on; the following cell
        # compares the number of organized entries with the number of records.
        print(f"Failed record {rec.id}, {response}, {str(e)}")

Organizing records


In [15]:
# Sanity check: the number of entry names queued for copying, summed over
# every specification group and source dataset, next to the number of records.
tmp = 0
for ds_dict in records_to_copy:
    for queued_names in ds_dict.values():
        tmp += len(queued_names)
print(f"There are {tmp} entry names, expect {len(records)}")

There are 1300 entry names, expect 1300


In [16]:

# _________ Copy Records by Dataset ____________
print("Copying records")
for group_idx, ds_dicts in enumerate(records_to_copy):
    # Records are copied under the source specification name "default"; once
    # the whole group is copied that specification is renamed to carry the ids
    # of the datasets in the group.
    old_spec_name = "default"
    new_spec_name = "default-" + "-".join(map(str, specification_ds_ids[group_idx]))

    n_sources = len(ds_dicts)
    for position, (ds_id, entry_names) in enumerate(ds_dicts.items(), start=1):
        print(f"Copying entries from ds-{ds_id} ({position} of {n_sources})")

        dataset.copy_records_from(ds_id, entry_names=entry_names, specification_names=[old_spec_name])
        dataset.fetch_specifications(force_refetch=True)

        # Handle Repeat Entry Names
        # Entry names flagged as repeated get the source dataset id appended.
        flagged_names = set(repeat_entry_names.keys()) & set(entry_names)
        name_map = {}
        for flagged in flagged_names:
            name_map[flagged] = f"{flagged}-{ds_id}"
        dataset.rename_entries(name_map)

    # Handle Repeat Spec Names
    dataset.rename_specification(old_spec_name, new_spec_name)
    dataset.fetch_specifications(force_refetch=True)

Copying records
Copying entries from ds-256 (1 of 10)
Copying entries from ds-257 (2 of 10)
Copying entries from ds-48 (3 of 10)
Copying entries from ds-258 (4 of 10)
Copying entries from ds-70 (5 of 10)
Copying entries from ds-259 (6 of 10)
Copying entries from ds-265 (7 of 10)
Copying entries from ds-242 (8 of 10)
Copying entries from ds-266 (9 of 10)
Copying entries from ds-243 (10 of 10)
Copying entries from ds-36 (1 of 1)
Copying entries from ds-317 (1 of 3)
Copying entries from ds-314 (2 of 3)
Copying entries from ds-308 (3 of 3)


# Validate that New Dataset Contains the Expected Records

In [17]:
# Compare the record ids now present in the new dataset with the target ids.
new_record_ids = {rec.id for _, _, rec in dataset.iterate_records(force_refetch=True)}
old_record_ids = {rec.id for rec in records}
unexpected_ids = new_record_ids - old_record_ids
missing_ids = old_record_ids - new_record_ids
print(f"New record IDs match old record IDs: {new_record_ids == old_record_ids}")
print(f"Number of target records: {len(records)}; Number of dataset records {len(new_record_ids)}")
print(f"There are {len(unexpected_ids)} record ids in the new dataset that aren't in the target list")
print(f"There are {len(missing_ids)} record ids in the target list that aren't in the new dataset")

New record IDs match old record IDs: True
Number of target records: 1300; Number of dataset records 1300
There are 0 record ids in the new dataset that aren't in the target list
There are 0 record ids in the target list that aren't in the new dataset


# Write Out Dataset Metadata

In [24]:
# _________ Write Output Part 2 (Run After Approval) ____________

# Element symbols taken from the first initial molecule of every entry.
elements = {
    sym
    for entry in dataset.iterate_entries()
    for sym in entry.initial_molecules[0].symbols
}

print("\n\n# Output for README Part 2\n")
print(f"* Description: {dataset.description}")
print(f"* Purpose: {dataset.tagline}")
print(f"* Name: {dataset.name}")
print(f"* Submitter: {dataset.extras['submitter']}")

print("\n## Metadata")
print("* Elements: {" + ", ".join(sorted(elements)) + "}")

# Walk down each specification: torsiondrive level -> optimization level -> QC level.
for spec, obj in dataset.specifications.items():
    td_level = obj.dict()['specification']
    print("\n* Program:", td_level["program"])
    opt_level = td_level["optimization_specification"]
    print("* Optimization Specification:", opt_level["program"])
    qc_level = opt_level["qc_specification"]
    print("* QC Specifications:", spec)
    for field, value in qc_level.items():
        print(f"  * {field}: {value}")
    if "scf_properties" in qc_level["keywords"]:
        print("  * SCF Properties:")
        for scf_property in qc_level["keywords"]["scf_properties"]:
            print(f"    * {scf_property}")



# Output for README Part 2

* Description: A quantum chemical (QC) dataset of torsiondrive profiles was generated at the OpenFF default level of theory, B3LYP-D3BJ/DZVP, and curated to train valence parameters in [OpenFF 2.1.0 Sage](https://github.com/openforcefield/sage-2.1.0/). Torsion profiles were curated from the following datasets: 'OpenFF Gen 2 Torsion Set 1 Roche 2', 'OpenFF Gen 2 Torsion Set 2 Coverage 2', 'OpenFF Gen 2 Torsion Set 3 Pfizer Discrepancy 2', 'OpenFF Gen 2 Torsion Set 4 eMolecules Discrepancy 2', 'OpenFF Gen 2 Torsion Set 5 Bayer 2', 'OpenFF Gen 2 Torsion Set 6 Supplemental 2', 'SMIRNOFF Coverage Torsion Set 1', 'OpenFF Group1 Torsions', 'OpenFF Group1 Torsions 2', 'OpenFF Group1 Torsions 3', 'Pfizer Discrepancy Torsion Dataset 1', 'OpenFF Gen3 Torsion Set v1.0', 'OpenFF Amide Torsion Set v1.0', 'OpenFF WBO Conjugated Series v1.0', and 'OpenFF DANCE 1 eMolecules t142 v1.0'. These combined datasets were filtered with `ElementFilter(allowed_elements=['H', 'C', 'N

In [23]:
# Hand the new dataset to qcportal's scaffold helper, targeting the file
# "scaffold_td.json" with compress=True.
# NOTE(review): presumably this writes the scaffold file that accompanies the
# dataset submission — confirm what `compress=True` does to the file name/format.
scaffold.to_json(dataset, filename="scaffold_td.json", compress=True)