In [None]:
import numpy as np
import json
import requests
import datetime
from collections import Counter, defaultdict

import periodictable

from qcportal.external import scaffold
from qcportal import PortalClient
from qcportal.optimization import OptimizationDatasetEntry
from qcportal.torsiondrive import TorsiondriveDatasetEntry
# Lookup from dataset-type string to the matching qcportal entry class.
DatasetEntry = {
    "optimization": OptimizationDatasetEntry,
    "torsiondrive": TorsiondriveDatasetEntry,
}

# Server address; the same string is later used as a key into the curated
# JSON file's "entries" mapping, so it must match that file exactly.
ADDRESS = "https://api.qcarchive.molssi.org:443/"
client = PortalClient(ADDRESS, cache_dir=".")

# Get Records and Molecular Statistics

In [2]:
# _________ Pull Record IDs of Relevant Datasets ____________
print("Getting record ids")

# The URL is pinned to a specific commit so the input file cannot change
# underneath the notebook. A timeout keeps the cell from hanging forever.
file = requests.get(
    "https://raw.githubusercontent.com/openforcefield/openff-sage/37a36e7eeaf6cdca795847089a288bdff168c08a/data-set-curation/quantum-chemical/data-sets/1-2-0-opt-set-v3.json",
    timeout=60,
)
# Fail here on an HTTP error (404, 5xx, ...) instead of letting the error page
# reach json.loads and produce a misleading JSONDecodeError.
file.raise_for_status()
data = json.loads(file.content)
provenance = data["provenance"]
# list with: {type, record_id, cmiles, inchi_key}
entry_dicts = data["entries"][ADDRESS]
if not entry_dicts:
    raise ValueError(f"No entries found for server address {ADDRESS!r}")
# The dataset type is read from the first entry only.
# NOTE(review): assumes all entries share one type -- confirm.
dataset_type = entry_dicts[0]["type"]


Getting record ids


In [3]:
# _________ Get Records ____________
print("Getting records")
# Record ids are cast to int before the lookup; missing_ok=False is passed so
# the client does not silently skip ids it cannot find.
record_ids = [int(entry["record_id"]) for entry in entry_dicts]
records = client.get_records(record_ids, missing_ok=False)

Getting records


In [None]:
# Map each record id to its CMILES string and (once filled in below) the
# record's initial molecule.
cmiles_by_record_id = {
    int(x["record_id"]): {"cmiles": x["cmiles"], "mol": None}
    for x in entry_dicts
}
for record in records:
    cmiles_by_record_id[record.id]["mol"] = record.initial_molecule

# cmiles_count: CMILES -> Counter of molecule hashes (one key per distinct
# geometry hash seen for that CMILES).
# molecules: one representative molecule per CMILES, in first-seen order, so
# molecules[i] lines up with the i-th key of cmiles_count.
cmiles_count = defaultdict(Counter)
molecules = []
for x in cmiles_by_record_id.values():
    cmiles = x["cmiles"]

    # Membership test must come before the increment below: indexing the
    # defaultdict would create the key and hide first occurrences.
    if cmiles not in cmiles_count:
        molecules.append(x["mol"])
    # Named mol_hash (not `hash`) so the builtin hash() is not shadowed for
    # the rest of the kernel session.
    mol_hash = x["mol"].get_hash()
    cmiles_count[cmiles][mol_hash] += 1

In [15]:
# _________ Pull Statistics from Dataset ____________

print("Generating Molecular Statistics")

# One slot per unique CMILES; molecules[i] is the representative molecule for
# the i-th key of cmiles_count.
lx = len(cmiles_count)
n_confs = np.zeros(lx)
masses = np.zeros(lx)
unique_charges = np.zeros(lx)
n_heavy_atoms = []
elements = []
for i, hashes in enumerate(cmiles_count.values()):
    symbols = molecules[i].symbols
    # Number of distinct molecule hashes recorded for this CMILES.
    n_confs[i] = len(hashes)
    # Heavy atoms are every atom whose symbol is not "H".
    n_heavy_atoms.append(sum(1 for symbol in symbols if symbol != "H"))
    elements.extend(set(symbols))
    # Mass is the sum of the periodictable mass of each atom's element.
    masses[i] = sum(getattr(periodictable, symbol).mass for symbol in symbols)
    unique_charges[i] = molecules[i].molecular_charge

# Collapse to the sorted distinct values for reporting.
unique_charges = sorted(set(unique_charges))

elements = sorted(set(elements))

Generating Molecular Statistics


In [16]:
# _________ Write Output Part 1 (Run Before Approval) ____________

# Histogram: how many unique molecules have each heavy-atom count.
print("\n# Heavy Atom Counts")
counts1 = Counter(n_heavy_atoms)
for n_heavy in sorted(counts1.keys()):
    print(f"{str(n_heavy):>3}: {counts1[n_heavy]}")

n_unique_molecules = len(cmiles_count)
print("* Number of unique molecules: {}".format(n_unique_molecules))

n_conformers_total = int(sum(n_confs))
print("* Number of conformers:", n_conformers_total)

# (min, mean, max) over the per-molecule conformer counts.
conformer_stats = (min(n_confs), np.mean(n_confs), max(n_confs))
print(
    "* Number of conformers (min, mean, max): {:.2f}, {:.2f}, {:.2f}".format(*conformer_stats)
)

# (min, mean, max) over the per-molecule masses.
mass_stats = (min(masses), np.mean(masses), max(masses))
print(
    "* Molecular weight (min, mean, max): {:.2f}, {:.2f}, {:.2f}".format(*mass_stats)
)

charge_labels = [str(charge) for charge in unique_charges]
print("* Charges: {}".format(", ".join(charge_labels)))


# Heavy Atom Counts
  4: 2
  5: 3
  6: 4
  7: 7
  8: 20
  9: 19
 10: 46
 11: 56
 12: 81
 13: 81
 14: 94
 15: 94
 16: 59
 17: 59
 18: 47
 19: 38
 20: 36
 21: 45
 22: 30
 23: 23
 24: 21
 25: 29
 26: 6
 27: 11
 28: 16
 29: 10
 30: 8
 31: 16
 32: 16
 33: 11
 34: 11
 35: 5
 36: 15
 37: 7
 38: 8
 39: 5
* Number of unique molecules: 1039
* Number of conformers: 3663
* Number of conformers (min, mean, max): 1.00, 3.53, 10.00
* Molecular weight (min, mean, max): 76.05, 261.37, 544.64
* Charges: -2.0, -1.0, 0.0, 1.0


# Make New Dataset

In [1]:
# NOTE(review): this whole cell is commented out, yet it contains the only
# visible assignment of `dataset` (and of `dataset_information`). Later cells
# use `dataset`, so on a fresh kernel they fail with NameError unless
# `dataset` is defined some other way. Presumably the cell was disabled to
# avoid re-creating the dataset on the server -- confirm.
# NOTE(review): the metadata key "creation_data" looks like a typo for
# "creation_date" -- confirm before re-enabling.
## _________ Initialize New Dataset ____________
#print("Initializing new dataset")
#
#with open("ds_info.json") as f:
#    dataset_information = json.load(f)
#
#dataset = client.add_dataset(
#    dataset_type,
#    dataset_name=dataset_information["dataset_name"],
#    tagline=dataset_information["dataset_tagline"],
#    description=dataset_information["description"],
#    provenance=provenance,
#    default_tag="openff",
#    owner_user="openffbot",
#    metadata={
#        "submitter": dataset_information["metadata.submitter"],
#        "creation_data": str(datetime.date.today()),
#        'collection_type': 'OptimizationDataset',
#        'long_description_url': dataset_information["metadata.long_description_url"],
#        "short description": dataset_information["dataset_tagline"],
#        "dataset_name": dataset_information["dataset_name"],
#        "elements": provenance['applied-filters']['ElementFilter-3']['allowed_elements'],
#    },
#)


In [None]:
# Get ds associated with specifications
# _________ Get Records and Find Associated Dataset Name ____________
# For every curated record, find the dataset entry that owns it and copy that
# entry/specification pair into the new dataset.
# `dataset` must already be defined in the kernel before this cell runs.
print("Getting records")
records = client.get_records([int(x["record_id"]) for x in entry_dicts], missing_ok=False)
for rec in records:
    response = client.query_dataset_records(record_id=[rec.id])
    # Exactly one owning dataset entry is expected per record; anything else
    # is ambiguous, so stop instead of guessing.
    if len(response) == 0:
        raise ValueError(f"No dataset is using record {rec.id}!")
    if len(response) > 1:
        raise ValueError("More than one dataset is using this record!")

    owner = response[0]
    dataset.copy_records_from(
        source_dataset_id=owner["dataset_id"],
        entry_names=[owner["entry_name"]],
        specification_names=[owner["specification_name"]],
    )

In [None]:
# _________ Write Output Part 2 (Run After Approval) ____________

# NOTE(review): this cell reads dataset.molecules, .dataset_tagline,
# .dataset_name, .metadata.submitter, .n_filtered and .qc_specifications.
# Confirm that the class of `dataset` actually exposes these attributes.

# Sorted so the README line is deterministic; joining a bare set would print
# the elements in arbitrary order.
elements = sorted(
    {
        atom.symbol
        for mol in dataset.molecules
        for atom in mol.atoms
    }
)

print("\n\n# Output for README Part 2\n")
print("* Description: {}".format(dataset.description))
print("* Purpose: {}".format(dataset.dataset_tagline))
print("* Name: {}".format(dataset.dataset_name))
print("* Submitter: {}\n".format(dataset.metadata.submitter))
print("* Number of filtered molecules:", dataset.n_filtered)

print("\n## Metadata")
print(f"* Elements: {{{', '.join(elements)}}}")

# Fields printed for every QC specification, in this order.
fields = ["basis", "implicit_solvent", "keywords", "maxiter", "method", "program"]
for spec, obj in dataset.qc_specifications.items():
    od = obj.dict()
    print("* QC Specifications:", spec)
    for field in fields:
        print(f"  * {field}: {od[field]}")
    print("  * SCF Properties:")
    for field in od["scf_properties"]:
        print(f"    * {field}")


In [None]:
# Export `dataset` through qcportal's scaffold helper with compression on.
# NOTE(review): presumably this writes a compressed JSON file to disk --
# confirm the output location and filename.
scaffold.to_json(dataset, compress=True)