In [1]:
import numpy as np
import json
import requests
import datetime
from collections import Counter, defaultdict

import periodictable

from qcportal import PortalClient
from qcportal.external import scaffold
from qcportal.optimization import OptimizationDatasetEntry
from qcportal.torsiondrive import TorsiondriveDatasetEntry
# Entry model to use for each supported dataset type.
DatasetEntry = {
    "optimization": OptimizationDatasetEntry,
    "torsiondrive": TorsiondriveDatasetEntry,
}

# Public QCArchive server; the client keeps its cache in the current directory.
ADDRESS = "https://api.qcarchive.molssi.org:443/"
client = PortalClient(ADDRESS, cache_dir=".")



# Get Records and Molecular Statistics

In [2]:
# _________ Pull Record IDs of Relevant Datasets ____________
print("Getting record ids")

# The URL embeds a full commit hash, so the downloaded record list is fixed.
# A timeout keeps the cell from hanging forever if the host is unreachable.
file = requests.get(
    "https://raw.githubusercontent.com/openforcefield/openff-sage/37a36e7eeaf6cdca795847089a288bdff168c08a/data-set-curation/quantum-chemical/data-sets/1-2-0-td-set.json",
    timeout=60,
)
# Fail loudly on an HTTP error rather than handing an error page to the JSON parser.
file.raise_for_status()
data = json.loads(file.content)
provenance = data["provenance"]
# list with: {type, record_id, cmiles, inchi_key}
# The entries are keyed by server address, so ADDRESS must match the key in the file.
entry_dicts = data["entries"][ADDRESS]
dataset_type = "torsiondrive"


Getting record ids


In [3]:
# _________ Get Records ____________
print("Getting records")
# missing_ok=False is passed so that an unknown record id is not silently skipped.
records = client.get_records(
    [int(entry["record_id"]) for entry in entry_dicts],
    missing_ok=False,
)

Getting records


In [4]:
# Map each record id to its CMILES string and, once filled in below, its initial molecules.
cmiles_by_record_id = {
    int(x["record_id"]): {"cmiles": x["cmiles"], "mol": None}
    for x in entry_dicts
}
for record in records:
    cmiles_by_record_id[record.id]["mol"] = record.initial_molecules

# cmiles -> Counter of the hashes of the first initial molecule seen for that cmiles.
cmiles_count = defaultdict(Counter)
# One molecule per unique cmiles, appended in first-seen order so that
# molecules[i] lines up with the i-th key of cmiles_count.
molecules = []
# Number of initial molecules attached to each record.
torsionstats = []
for entry in cmiles_by_record_id.values():
    cmiles = entry["cmiles"]
    first_mol = entry["mol"][0]

    # Test membership before indexing: indexing the defaultdict below creates the key.
    if cmiles not in cmiles_count:
        molecules.append(first_mol)

    torsionstats.append(len(entry["mol"]))
    # Named mol_hash so the builtin hash() is not rebound in the notebook namespace.
    mol_hash = first_mol.get_hash()
    cmiles_count[cmiles][mol_hash] += 1

print(len(records), len(cmiles_count))

713 562


In [5]:
# _________ Pull Statistics from Dataset ____________
print("Generating Molecular Statistics")

# One slot per unique cmiles; molecules[i] belongs to the i-th cmiles key.
lx = len(cmiles_count)
n_confs = np.zeros(lx)
masses = np.zeros(lx)
unique_charges = np.zeros(lx)
n_heavy_atoms = []
elements = []
for i, hashes in enumerate(cmiles_count.values()):
    symbols = molecules[i].symbols
    # Distinct molecule hashes recorded for this cmiles.
    n_confs[i] = len(hashes)
    # Every atom whose symbol is not "H" counts as heavy.
    n_heavy_atoms.append(sum(1 for sym in symbols if sym != "H"))
    elements.extend(set(symbols))
    # Element masses are looked up by symbol on the periodictable module.
    masses[i] = sum(getattr(periodictable, sym).mass for sym in symbols)
    unique_charges[i] = molecules[i].molecular_charge

# Collapse the per-molecule values down to sorted unique lists.
unique_charges = sorted(set(unique_charges))

elements = sorted(set(elements))

Generating Molecular Statistics


In [6]:
# _________ Write Output Part 1 (Run Before Approval) ____________

# Histogram of heavy-atom counts, one line per distinct count in ascending order.
print("\n# Heavy Atom Counts")
counts1 = Counter(n_heavy_atoms)
for n_heavy, n_molecules in sorted(counts1.items()):
    print(f"{str(n_heavy):>3}: {n_molecules}")

# Summary bullets intended to be pasted into the dataset README.
print("\n\n# Output for README Part 1\n")
print(f"* Number of unique molecules: {len(cmiles_count)}")
print(f"* Number of driven torsions: {len(records)}")
print(f"* Number of conformers: {int(sum(n_confs))}")
print(
    f"* Number of conformers (min, mean, max): "
    f"{min(n_confs):.0f}, {np.mean(n_confs):.0f}, {max(n_confs):.0f}"
)
print(
    f"* Molecular weight (min, mean, max): "
    f"{min(masses):.2f}, {np.mean(masses):.2f}, {max(masses):.2f}"
)
charge_labels = [str(charge) for charge in unique_charges]
print(f"* Charges: {', '.join(charge_labels)}")


# Heavy Atom Counts
  3: 1
  4: 3
  5: 4
  6: 13
  7: 18
  8: 29
  9: 23
 10: 41
 11: 39
 12: 30
 13: 44
 14: 47
 15: 50
 16: 25
 17: 35
 18: 16
 19: 25
 20: 16
 21: 13
 22: 10
 23: 11
 24: 14
 25: 9
 26: 5
 27: 11
 28: 15
 29: 10
 30: 4
 32: 1


# Output for README Part 1

* Number of unique molecules: 562
* Number of driven torsions: 713
* Number of conformers: 563
* Number of conformers (min, mean, max): 1, 1, 2
* Molecular weight (min, mean, max): 46.07, 224.91, 503.41
* Charges: -1.0, 0.0, 1.0


# Make New Dataset

In [7]:
# _________ Initialize New Dataset ____________
print("Initializing new dataset")
# ds_info.json is read relative to the current working directory. Only
# "dataset_name" is used by live code; the disabled block below also reads
# "dataset_tagline", "description", "metadata.submitter" and
# "metadata.long_description_url".
with open("ds_info.json") as f:
    dataset_information = json.load(f)

# Look up a dataset of type `dataset_type` by name on the server; creating one
# is what the disabled add_dataset call below is for.
dataset = client.get_dataset(dataset_type, dataset_information["dataset_name"])

# Disabled creation call, kept for reference.
# NOTE(review): before re-enabling, confirm two things in `extras`:
#   - 'collection_type' is 'OptimizationDataset' although dataset_type is "torsiondrive";
#   - "creation_data" looks like a typo for "creation_date".
#dataset = client.add_dataset(
#    dataset_type,
#    dataset_information["dataset_name"],
#    tagline=dataset_information["dataset_tagline"],
#    description=dataset_information["description"],
#    provenance=provenance,
#    default_tag="openff",
#    owner_user="openffbot",
#    extras={
#        "submitter": dataset_information["metadata.submitter"],
#        "creation_data": str(datetime.date.today()),
#        'collection_type': 'OptimizationDataset',
#        'long_description_url': dataset_information["metadata.long_description_url"],
#        "short description": dataset_information["dataset_tagline"],
#        "dataset_name": dataset_information["dataset_name"],
#        "elements": elements,
#    },
#)


Initializing new dataset


In [12]:
# Get ds associated with specifications
# _________ Get Records and Find Associated Dataset Name ____________
print("Getting records")
records = client.get_records([int(x["record_id"]) for x in entry_dicts], missing_ok=False)
# dataset_id -> specification_name -> [entry_name, ...]
records_to_copy = defaultdict(lambda: defaultdict(list))
for rec in records:
    # Reset per record so the failure message never raises NameError (first
    # iteration) or reports the previous record's response (later iterations).
    response = None
    try:
        response = client.query_dataset_records(record_id=[rec.id])
        # Only the first dataset association returned for the record is used.
        first = response[0]
        records_to_copy[first["dataset_id"]][first["specification_name"]].append(first["entry_name"])
    except Exception as exc:
        # Best effort: report the failure, including its cause, and keep going.
        print(f"Failed record {rec.id}, {response} ({exc!r})")

# Copy the collected entries into the new dataset, one source dataset and
# specification at a time.
for i, (ds_id, tmp_dict) in enumerate(records_to_copy.items()):
    print(f"Copying entries from ds-{ds_id} ({i+1} of {len(records_to_copy)})")
    for spec_name, entry_names in tmp_dict.items():
        dataset.copy_records_from(ds_id, entry_names=entry_names, specification_names=[spec_name])

Getting records
Copying entries from ds-256 (1 of 17)


PortalRequestError: Request failed: Failed to authenticate user session or JWT: User {'user_id': None, 'username': None} is not authorized to access '{'type': '/api/v1/datasets'}' (HTTP status 401)

In [9]:
# Calls submit() on the dataset fetched above.
# NOTE(review): `client` is created without credentials, and the recorded run of
# this cell was rejected with HTTP 401 — an authenticated client appears to be required.
dataset.submit()

PortalRequestError: Request failed: Failed to authenticate user session or JWT: User {'user_id': None, 'username': None} is not authorized to access '{'type': '/api/v1/datasets'}' (HTTP status 401)

In [13]:
# Record ids now attached to the new dataset versus the ids from the source list.
new_record_ids = {rec.id for _, _, rec in dataset.iterate_records()}
old_record_ids = {rec.id for rec in records}
# The comparison is the last expression, so its value is the cell's output.
new_record_ids == old_record_ids

False

In [14]:
# Sizes of the old and new id sets, then the number of new ids that are absent from the old set.
print(len(old_record_ids), len(new_record_ids), len(new_record_ids - old_record_ids))

713 712 115


In [15]:
# Fetch every record that appears in either the source list or the new dataset.
combined_records = client.get_records(record_ids=list(old_record_ids | new_record_ids))

# entry_name -> {record_id: dataset-record query response}
entry_organization = defaultdict(dict)
for rec in combined_records:
    # Reset per record so the failure message never raises NameError (first
    # iteration) or reports the previous record's response (later iterations).
    response = None
    try:
        response = client.query_dataset_records(record_id=[rec.id])
        entry_organization[response[0]["entry_name"]][rec.id] = response
    except Exception as exc:
        # Best effort: report the failure, including its cause, and keep going.
        print(f"Failed record {rec.id}, {response} ({exc!r})")

# Entry names backed by exactly one record id go to entries_merged; entry names
# that map to any other number of record ids go to entries_new_record.
entries_new_record = {}
entries_merged = {}
for entry_name, response_dict in entry_organization.items():
    if len(response_dict) == 1:
        entries_merged[entry_name] = response_dict
    else:
        entries_new_record[entry_name] = response_dict

print(len(entries_new_record), len(entries_merged))

116 596


In [29]:
from deepdiff import DeepDiff
from qcportal.serialization import encode_to_json

# Inspect one entry that maps to more than one record id: show the ids, whether
# each one is in the new dataset, and the field-level diff of the first two records.
index = 2
key = list(entries_new_record)[index]
entry_keys = list(entries_new_record[key])
print(entry_keys)
print(
    entry_keys[0] in new_record_ids,
    entry_keys[1] in new_record_ids,
)
record1 = client.get_records(record_ids=[entry_keys[0]])[0]
record2 = client.get_records(record_ids=[entry_keys[1]])[0]
# DeepDiff(old, new): record2 is the "old" side and record1 the "new" side.
DeepDiff(encode_to_json(record2), encode_to_json(record1))

[18045609, 18536962]
True False


{'values_changed': {"root['id']": {'new_value': 18045609,
   'old_value': 18536962},
  "root['created_on']": {'new_value': '2020-03-13T20:20:21.609930+00:00',
   'old_value': '2020-03-24T17:41:35.905767+00:00'},
  "root['modified_on']": {'new_value': '2020-03-13T20:20:21.609927+00:00',
   'old_value': '2020-03-24T17:41:35.905765+00:00'}}}

In [None]:
# _________ Write Output Part 2 (Run After Approval) ____________

# Element symbols taken from the first initial molecule of every dataset entry.
# Sorted so the README line is the same on every run (a bare set has no stable
# order) and matches the sorted `elements` list built in the statistics cell.
elements = sorted({
    sym
    for entry in dataset.iterate_entries()
    for sym in entry.initial_molecules[0].symbols
})

print("\n\n# Output for README Part 2\n")
print("* Description: {}".format(dataset.description))
print("* Purpose: {}".format(dataset.tagline))
print("* Name: {}".format(dataset.name))
print("* Submitter: {}".format(dataset.extras["submitter"]))

print("\n## Metadata")
print(f"* Elements: {{{', '.join(elements)}}}")

# Walk each specification from the outer level down to the QC level:
# specification -> optimization_specification -> qc_specification.
for spec, obj in dataset.specifications.items():
    od = obj.dict()['specification']
    print("* Program:", od["program"])
    od = od["optimization_specification"]
    print("* Optimization Specification:", od["program"])
    od = od["qc_specification"]
    print("* QC Specifications:", spec)
    for field, value in od.items():
        print(f"  * {field}: {value}")
    print("  * SCF Properties:")
    for field in od["keywords"]["scf_properties"]:
        print(f"    * {field}")



# Output for README Part 2

* Description: A quantum chemical (QC) dataset curated to train the OpenFF 2.0.0 Sage torsion potentials. This QC dataset with the OpenFF default level of theory, B3LYP-D3BJ/DZVP, consists of one dimensional torsional profiles used to train torsion parameters. This Generation 2 dataset increases chemical diversity when compared to Generation 1, which are of value to our industry partners. Large molecules (>20 heavy atoms) were also included, offering more flexible molecules and a greater degree of conformational variation which provide intramolecular interactions. This is the complete TorsionDrive dataset used for training OpenFF 2.0.0 Sage, consisting of data drawn from the following datasets: 'OpenFF Gen 2 Torsion Set 1 Roche', 'OpenFF Gen 2 Torsion Set 2 Coverage', 'OpenFF Gen 2 Torsion Set 3 Pfizer Discrepancy', 'OpenFF Gen 2 Torsion Set 4 eMolecules  - Discrepancy', 'OpenFF Gen 2 Torsion Set 5 Bayer' and 'OpenFF Gen 2 Torsion Set 6 supplemental 2'. Th

In [None]:
# Pass `dataset` to qcportal.external.scaffold.to_json with compress=True.
# NOTE(review): the output location and file name are chosen inside scaffold.to_json — confirm where it writes.
scaffold.to_json(dataset, compress=True)