In [19]:

import json, sys, os
import requests
import datetime
from collections import Counter, defaultdict

import numpy as np
import periodictable

from qcportal.external import scaffold
from qcportal import PortalClient
from qcportal.serialization import encode_to_json
from qcportal.optimization import OptimizationDatasetEntry
from qcportal.torsiondrive import TorsiondriveDatasetEntry
# Entry model class keyed by the dataset type it belongs to.
DatasetEntry = dict(
    optimization=OptimizationDatasetEntry,
    torsiondrive=TorsiondriveDatasetEntry,
)

ADDRESS = "https://api.qcarchive.molssi.org:443/"

# Credentials are read from the environment so they never appear in the
# notebook; a missing variable raises KeyError right here.
# Alternative call without credentials: PortalClient(ADDRESS, cache_dir=".")
client = PortalClient(
    ADDRESS,
    cache_dir=".",
    username=os.environ["QCARCHIVE_USER"],
    password=os.environ["QCARCHIVE_PASSWORD"],
)

# Get Dataset Statistics

In [2]:
# _________ Pull Record IDs of Relevant Datasets ____________
print("Getting datasets")

# Names of the source datasets whose records are collected below.
source_dataset_names = (
    "MLPepper RECAP Optimized Fragments v1.0",
    "MLPepper RECAP Optimized Fragments v1.0 Add Iodines",
)
datasets = [client.get_dataset("singlepoint", ds_name) for ds_name in source_dataset_names]

# The new dataset created later reuses the type of the first source dataset.
dataset_type = datasets[0].dataset_type


Getting datasets


In [3]:
# _________ Get Records ____________
print("Getting records")

# Flat list of every record across all source datasets.
records = []
# dataset id -> specification name -> entry names that have a record for it.
entry_spec_by_ds_id = defaultdict(lambda: defaultdict(list))

for source_ds in datasets:
    for entry_name, spec_name, record in source_ds.iterate_records():
        entry_spec_by_ds_id[source_ds.id][spec_name].append(entry_name)
        records.append(record)

Getting records


In [4]:
# Group records by molecule identity.
#   cmiles_count: mapped SMILES -> Counter of molecule hashes (one hash per
#                 distinct geometry; the count is how many records share it).
#   molecules:    the first molecule seen for each SMILES, in the same order
#                 as the keys of cmiles_count (later cells index both by position).
cmiles_count = defaultdict(Counter)
molecules = []
for rec in records:
    molecule = rec.molecule
    cmiles = molecule.extras['canonical_isomeric_explicit_hydrogen_mapped_smiles']

    # A membership test does not insert into the defaultdict, so this is true
    # only the first time a given SMILES string is encountered.
    if cmiles not in cmiles_count:
        molecules.append(molecule)

    # Deliberately not stored in a variable called `hash`: at notebook scope
    # that would shadow the builtin hash() for every cell that runs afterwards.
    cmiles_count[cmiles][molecule.get_hash()] += 1

print(f"There are {len(records)} records and {len(cmiles_count)} unique SMILES strings (unique molecules)")

There are 150194 records and 56351 unique SMILES strings (unique molecules)


In [5]:
# _________ Pull Statistics from Dataset ____________

print("Generating Molecular Statistics")

lx = len(cmiles_count)

# Per-molecule accumulators, indexed in the insertion order of cmiles_count
# (which matches the order of `molecules`).
n_confs = np.zeros(lx)
masses = np.zeros(lx)
unique_charges = np.zeros(lx)
n_heavy_atoms = []
elements = set()

for i, hashes in enumerate(cmiles_count.values()):
    symbols = molecules[i].symbols

    # Number of distinct molecule hashes recorded for this SMILES.
    n_confs[i] = len(hashes)
    # Heavy atoms are everything except hydrogen.
    n_heavy_atoms.append(sum(1 for sym in symbols if sym != "H"))
    elements.update(symbols)
    # periodictable exposes each element as a module attribute named by its symbol.
    masses[i] = sum(getattr(periodictable, sym).mass for sym in symbols)
    unique_charges[i] = molecules[i].molecular_charge

# Collapse the per-molecule values to their sorted distinct values.
unique_charges = sorted(set(unique_charges))

elements = sorted(elements)

Generating Molecular Statistics


In [6]:
# _________ Write Output Part 1 (Run Before Approval) ____________

# Histogram: number of unique molecules per heavy-atom count.
print("\n# Heavy Atom Counts")
counts1 = Counter(n_heavy_atoms)
for n_heavy, n_molecules in sorted(counts1.items()):
    print(f"{str(n_heavy):>3}: {n_molecules}")

print("* Number of unique molecules: {}".format(len(cmiles_count)))
print("* Number of conformers:", int(sum(n_confs)))

# (min, mean, max) triples for the two per-molecule arrays.
conformer_summary = (min(n_confs), np.mean(n_confs), max(n_confs))
print("* Number of conformers (min, mean, max): {:.2f}, {:.2f}, {:.2f}".format(*conformer_summary))

mass_summary = (min(masses), np.mean(masses), max(masses))
print("* Molecular weight (min, mean, max): {:.2f}, {:.2f}, {:.2f}".format(*mass_summary))

print("* Charges: {}".format(", ".join(map(str, unique_charges))))


# Heavy Atom Counts
  1: 1
  2: 2
  3: 73
  4: 197
  5: 558
  6: 1338
  7: 2957
  8: 5728
  9: 8617
 10: 10872
 11: 12697
 12: 12632
 13: 52
 14: 102
 15: 61
 16: 90
 17: 128
 18: 111
 19: 71
 20: 46
 21: 2
 22: 2
 23: 9
 24: 2
 25: 1
 29: 2
* Number of unique molecules: 56351
* Number of conformers: 75097
* Number of conformers (min, mean, max): 1.00, 1.33, 5.00
* Molecular weight (min, mean, max): 32.12, 163.20, 701.59
* Charges: -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0


# Make New Dataset

In [None]:
# _________ Initialize New Dataset ____________
print("Initializing new dataset")
with open("ds_info.json") as f:
    dataset_information = json.load(f)

# Fields from ds_info.json that are used both as top-level dataset
# attributes and again inside `extras`.
new_ds_name = dataset_information["dataset_name"]
new_ds_tagline = dataset_information["dataset_tagline"]
new_ds_description = dataset_information["description"]

# NOTE(review): 'collection_type' is hard-coded to 'OptimizationDataset' while
# `dataset_type` is taken from datasets fetched as "singlepoint" - confirm this
# value is intended before submitting.
new_ds_extras = {
    "submitter": dataset_information["metadata.submitter"],
    "creation_date": str(datetime.date.today()),
    'collection_type': 'OptimizationDataset',
    'long_description': new_ds_description,
    'long_description_url': dataset_information["metadata.long_description_url"],
    "short_description": new_ds_tagline,
    "dataset_name": new_ds_name,
    "elements": elements,
}

# To attach to a dataset that already exists instead of creating one:
#dataset = client.get_dataset(dataset_type, dataset_information["dataset_name"])
dataset = client.add_dataset(
    dataset_type,
    new_ds_name,
    tagline=new_ds_tagline,
    description=new_ds_description,
    provenance={},
    default_tag="openff",
    owner_user="openffbot",
    tags=["openff"],
    extras=new_ds_extras,
)


Initializing new dataset


In [21]:
# WARNING: destructive reset of the new dataset - removes every entry and the
# two specifications below. Only run this to start the copy step over.
dataset.delete_entries(dataset.entry_names)
for stale_spec_name in ('wb97x-d/def2-tzvpp', 'wb97x-d/def2-tzvpp/ddx-water'):
    dataset.delete_specification(stale_spec_name)

# Show how many records remain after the reset.
print(dataset.record_count)

0


In [None]:
# _________ Copy Records From Each Source Dataset ____________
# For every source dataset, copy the entries/specifications collected earlier
# into the new dataset.
# NOTE(review): the saved output of this cell is a PortalRequestError
# ("destination already has entries with the same name") - confirm the
# destination is empty, and that the source datasets do not share entry
# names, before re-running.
print("Getting records")

for source_ds_id, entries_by_spec in entry_spec_by_ds_id.items():
    spec_names = list(entries_by_spec)

    # An entry name appears once per specification; de-duplicate it.
    unique_entry_names = set()
    for names_for_spec in entries_by_spec.values():
        unique_entry_names.update(names_for_spec)

    dataset.copy_records_from(
        source_ds_id,
        entry_names=list(unique_entry_names),
        specification_names=spec_names,
        existing_ok=True,
    )

Getting records


PortalRequestError: Request failed: Cannot copy entries from dataset - destination already has entries with the same name (HTTP status 400)

In [None]:
# Sanity check: the new dataset should reference exactly the record ids that
# were collected from the source datasets. The cell displays True or False.
new_record_ids = [copied.id for _, _, copied in dataset.iterate_records()]
source_record_ids = {rec.id for rec in records}
set(new_record_ids) == source_record_ids

False

In [None]:
# _________ Write Output Part 2 (Run After Approval) ____________

# Collect the element symbols present in the new dataset's entries.
# Singlepoint entries expose their molecule as `molecule`, optimization
# entries as `initial_molecule`; accept either so this cell works for the
# dataset type actually created above.
element_symbols = set()
for entry in dataset.iterate_entries():
    entry_molecule = getattr(entry, "molecule", None)
    if entry_molecule is None:
        entry_molecule = entry.initial_molecule
    element_symbols.update(entry_molecule.symbols)

# Sorted so the README line is deterministic and `elements` keeps the same
# type (sorted list) it had when the statistics were computed.
elements = sorted(element_symbols)

print("\n\n# Output for README Part 2\n")
print("* Description: {}".format(dataset.description))
print("* Purpose: {}".format(dataset.tagline))
print("* Name: {}".format(dataset.name))
print("* Submitter: {}\n".format(dataset.extras["submitter"]))

print("\n## Metadata")
print(f"* Elements: {{{', '.join(elements)}}}")

for spec, obj in dataset.specifications.items():
    od = obj.dict()['specification']
    print("* Program:", od["program"])
    # Optimization specifications nest the QC settings under
    # "qc_specification"; when that key is absent the specification dict
    # itself is used as the QC specification.
    qc_spec = od.get("qc_specification", od)
    print("* QC Specifications:", spec)
    for field, value in qc_spec.items():
        print(f"  * {field}: {value}")
    print("  * SCF Properties:")
    # Keywords (or the scf_properties list inside them) may be missing.
    for scf_property in (qc_spec.get("keywords") or {}).get("scf_properties", []):
        print(f"    * {scf_property}")


In [None]:
# Export the new dataset through qcportal's scaffold helper, with compression
# enabled. NOTE(review): the output file name/location is whatever
# scaffold.to_json uses by default - confirm before committing the result.
scaffold.to_json(dataset, compress=True)