In [31]:
import os
import json
import requests
import datetime
from collections import Counter, defaultdict

import numpy as np

from qcportal.external import scaffold
from qcportal import PortalClient

from qcfractal.snowflake import FractalSnowflake
# Create a FractalSnowflake instance and take a client from it; every later
# cell talks to the server through `client`.
# NOTE(review): presumably this is a throwaway local server used for dry runs —
# confirm. If so, the later `client.get_dataset(...)` lookup of the source
# dataset would need the PortalClient configuration shown commented out below.
snowflake = FractalSnowflake()
client = snowflake.client()

#ADDRESS = "https://api.qcarchive.molssi.org:443/"
#client = PortalClient(ADDRESS, cache_dir=".",)
#client = PortalClient(
#    ADDRESS, 
#    username=os.environ['QCARCHIVE_USER'],
#    password=os.environ['QCARCHIVE_PASSWORD'],
#    cache_dir=".",
#)

# Get Records and Molecular Statistics

In [28]:
# _________ Pull Record IDs of Relevant Datasets ____________
print("Getting record ids to remove")

# Plain-text file with one integer record id per line (the "all_r7_outliers"
# list from the sage-2.2.0 repository, per the URL). Records whose id appears
# here are skipped when selecting entries to copy.
file = requests.get(
    "https://raw.githubusercontent.com/openforcefield/sage-2.2.0/refs/heads/main/05_benchmark_forcefield/process_bm/problem_ids/all_r7_outliers.txt",
    timeout=60,  # do not hang the notebook forever on a stalled connection
)
# Fail loudly on an HTTP error instead of feeding an error page to int().
file.raise_for_status()

# Skip blank lines (e.g. a trailing newline) so int() never sees an empty string.
remove_record_ids = {
    int(line) for line in file.content.decode().splitlines() if line.strip()
}


Getting record ids to remove


In [30]:
# Look up the existing dataset whose records will be copied into the new one.
dataset_type = "optimization"
source_dataset_name = "OpenFF Industry Benchmark Season 1 v1.1"
dataset_to_copy = client.get_dataset(dataset_type, source_dataset_name)

# Carry the specification names and provenance over to the new dataset.
specs_to_copy, provenance = (
    dataset_to_copy.specification_names,
    dataset_to_copy.provenance,
)

In [None]:
# Fetch the source dataset's records, asking for each record's initial
# molecule to be included. Slow: the original author's note says ~30 min.
dataset_to_copy.fetch_records(include=["initial_molecule"]) # 30 min

In [None]:
# Total number of records in the source dataset (used for progress output).
nrecords = dataset_to_copy.record_count

# Print one progress line every this many records.
PROGRESS_INTERVAL = 100000

old_record_ids = []
entries_to_copy = []
conformer_counts = defaultdict(int)

# Accessing initial molecule information from record information is slow, so
# only entry names and record ids are gathered in this pass.
for index, (entry_name, spec, rec) in enumerate(dataset_to_copy.iterate_records()):
    if index % PROGRESS_INTERVAL == 0:
        print(f"{index + 1} of {nrecords}")

    # Records flagged for removal are left out of the copy entirely.
    if rec.id in remove_record_ids:
        continue

    entries_to_copy.append(entry_name)
    old_record_ids.append(rec.id)

    # Conformers are grouped by the second "-"-separated field of the entry name.
    # NOTE(review): if entry names look like "<prefix>-<molecule>-<conformer>",
    # this key ignores the prefix and may merge different molecules that share
    # a molecule index — confirm against the dataset's naming scheme.
    molecule_key = entry_name.split("-")[1]
    conformer_counts[molecule_key] += 1

1 of 539385
10001 of 539385
20001 of 539385
30001 of 539385
40001 of 539385
50001 of 539385
60001 of 539385
70001 of 539385
80001 of 539385
90001 of 539385
100001 of 539385
110001 of 539385
120001 of 539385
130001 of 539385
140001 of 539385
150001 of 539385
160001 of 539385
170001 of 539385
180001 of 539385
190001 of 539385
200001 of 539385
210001 of 539385
220001 of 539385
230001 of 539385
240001 of 539385
250001 of 539385
260001 of 539385
270001 of 539385
280001 of 539385
290001 of 539385
300001 of 539385
310001 of 539385
320001 of 539385
330001 of 539385
340001 of 539385
350001 of 539385
360001 of 539385
370001 of 539385
380001 of 539385
390001 of 539385
400001 of 539385
410001 of 539385
420001 of 539385
430001 of 539385
440001 of 539385
450001 of 539385
460001 of 539385
470001 of 539385
480001 of 539385
490001 of 539385
500001 of 539385
510001 of 539385
520001 of 539385
530001 of 539385


In [17]:
# Iterating over entries is much faster than reading molecules off the records.
#
# Values are accumulated in growable containers and the mass array is built at
# the end, so its length always equals the number of entries actually visited.
# Pre-sizing it as `nrecords - len(remove_record_ids)` is only right when every
# id in the removal list belongs to this dataset and each entry has exactly one
# record; otherwise it either overflows or leaves trailing zeros that corrupt
# the min/mean statistics.
charge_set = set()
element_set = set()
mass_list = []

for entry in dataset_to_copy.iterate_entries(entry_names=entries_to_copy):
    molecule = entry.initial_molecule
    charge_set.add(molecule.molecular_charge)
    element_set.update(molecule.symbols)
    # Molecular weight = sum of the per-atom masses of the initial molecule.
    mass_list.append(sum(molecule.masses))

molecular_properties = {
    "charges": charge_set,
    "elements": element_set,
    "masses": np.array(mass_list, dtype=float),
}

# Make New Dataset

In [32]:
# _________ Initialize New Dataset ____________
print("Initializing new dataset")

# Name, tagline, description and submitter metadata are read from a JSON file
# next to the notebook.
with open("ds_info.json") as info_file:
    dataset_information = json.load(info_file)

new_dataset_name = dataset_information["dataset_name"]
new_dataset_tagline = dataset_information["dataset_tagline"]

# Free-form metadata stored alongside the dataset.
# NOTE(review): "short description" contains a space while the other keys use
# underscores — confirm this is the key downstream tooling expects.
dataset_extras = {
    "submitter": dataset_information["metadata.submitter"],
    "creation_date": str(datetime.date.today()),
    "collection_type": "OptimizationDataset",
    "long_description_url": dataset_information["metadata.long_description_url"],
    "short description": new_dataset_tagline,
    "dataset_name": new_dataset_name,
}

# Create the new dataset, reusing the type and provenance of the source dataset.
dataset = client.add_dataset(
    dataset_type,
    new_dataset_name,
    tagline=new_dataset_tagline,
    description=dataset_information["description"],
    provenance=provenance,
    default_tag="openff",
    owner_user="openffbot",
    extras=dataset_extras,
)


Initializing new dataset


In [None]:
# _________ Copy Records ____________
print("Copy records")

# Copy only the selected entries (outliers already filtered out) for the same
# specifications as the source dataset.
dataset.copy_records_from(
    dataset_to_copy.id,
    entry_names=entries_to_copy,
    specification_names=specs_to_copy,
)

In [None]:
# Gather the record ids now attached to the new dataset.
new_record_ids = [
    record.id for _entry_name, _spec_name, record in dataset.iterate_records()
]

# Shown as the cell result: True when the new dataset holds exactly the record
# ids that were selected from the source dataset.
set(old_record_ids) == set(new_record_ids)

## Write Statistics

In [21]:
# _________ Write Output Part 1 (Run Before Approval) ____________
print("Generating Molecular Statistics")

# Replace the collected charges/elements with sorted lists for stable output.
for property_name in ("charges", "elements"):
    molecular_properties[property_name] = sorted(molecular_properties[property_name])

per_molecule_counts = list(conformer_counts.values())
mass_values = molecular_properties["masses"]

print(f"* Number of unique molecules: {len(conformer_counts)}")
print(f"* Number of conformers: {int(sum(per_molecule_counts))}")

# Conformer counts are truncated to integers, including the mean.
fewest = int(min(per_molecule_counts))
average = int(np.mean(per_molecule_counts))
most = int(max(per_molecule_counts))
print(f"* Number of conformers (min, mean, max): {fewest}, {average}, {most}")

lightest = min(mass_values)
mean_mass = np.mean(mass_values)
heaviest = max(mass_values)
print(
    f"* Molecular weight (min, mean, max): {lightest:.2f}, {mean_mass:.2f}, {heaviest:.2f}"
)

charge_text = ", ".join(str(charge) for charge in molecular_properties["charges"])
element_text = ", ".join(str(symbol) for symbol in molecular_properties["elements"])
print("* Charges: " + charge_text)
print("* Elements: " + element_text)

Generating Molecular Statistics
* Number of unique molecules: 2973
* Number of conformers: 539356
* Number of conformers (min, mean, max): 7, 181, 469
* Molecular weight (min, mean, max): 16.03, 362.35, 1104.40
* Charges: -2.0, -1.0, 0.0, 1.0, 2.0
* Elements: Br, C, Cl, F, H, N, O, P, S


In [None]:
# _________ Write Output Part 2 (Run After Approval) ____________

# Sort here as well so the listing is ordered even if the Part 1 cell (which
# sorts in place) has not been run in this kernel session.
element_text = ", ".join(str(symbol) for symbol in sorted(molecular_properties["elements"]))

print("* Elements: {}".format(element_text))
print("\n\n# Output for README Part 2\n")
print("* Description: {}".format(dataset.description))
print("* Purpose: {}".format(dataset.tagline))
print("* Name: {}".format(dataset.name))
print("* Submitter: {}\n".format(dataset.extras["submitter"]))

print("\n## Metadata")
# Built by concatenation: reusing double quotes inside a double-quoted f-string
# is a SyntaxError on Python versions before 3.12.
print("* Elements: {" + element_text + "}")

# One section per specification attached to the new dataset.
for spec_name, spec_obj in dataset.specifications.items():
    spec_fields = spec_obj.dict()["specification"]
    print("* Program:", spec_fields["program"])

    qc_spec = spec_fields["qc_specification"]
    print("* QC Specifications:", spec_name)
    for field, value in qc_spec.items():
        print(f"  * {field}: {value}")

    print("  * SCF Properties:")
    # Tolerate specifications that carry no keywords / no scf_properties entry.
    scf_properties = (qc_spec.get("keywords") or {}).get("scf_properties") or []
    for scf_property in scf_properties:
        print(f"    * {scf_property}")


In [35]:
# Export the new dataset through qcportal's scaffold helper with compression on.
# NOTE(review): presumably this writes a compressed JSON scaffold file for
# submission — confirm the output file name and location.
scaffold.to_json(dataset, compress=True)