In [33]:
import os
import json
import requests
import datetime
from collections import Counter, defaultdict

import numpy as np

from qcportal.external import scaffold
from qcportal import PortalClient

from qcfractal.snowflake import FractalSnowflake
snowflake = FractalSnowflake()
client = snowflake.client()

#ADDRESS = "https://api.qcarchive.molssi.org:443/"
#client = PortalClient(ADDRESS, cache_dir=".",)
#client = PortalClient(
#    ADDRESS, 
#    username=os.environ['QCARCHIVE_USER'],
#    password=os.environ['QCARCHIVE_PASSWORD'],
#    cache_dir=".",
#)

# Get Records and Molecular Statistics

In [22]:
# _________ Pull Record IDs of Relevant Datasets ____________

file = requests.get(
    "https://raw.githubusercontent.com/openforcefield/sage-2.2.0/refs/heads/main/05_benchmark_forcefield/process_bm/problem_ids/all_r7_outliers.txt"
)
remove_record_ids = set([int(x) for x in file.content.decode().splitlines()])
print(f"There are {len(remove_record_ids)} records to remove")


There are 29 records to remove


In [29]:
dataset_type = "optimization"
dataset_to_copy = client.get_dataset(dataset_type, "OpenFF Industry Benchmark Season 1 v1.1")
specs_to_copy = ["default"]
provenance = dataset_to_copy.provenance
print(f"There are {dataset_to_copy.record_count} records.")
n_rel_records = len([*dataset_to_copy.iterate_records(specification_names=specs_to_copy)])
print(f"For the default specification there are: {n_rel_records}")

There are 539385 records.
For the default specification there are: 77055


In [None]:
dataset_to_copy.fetch_records(include=["initial_molecule"]) # 30 min

In [30]:
old_record_ids = []
entries_to_copy = []
conformer_counts = defaultdict(lambda: 0)

# Accessing initial molecule information from record information is slow, so get entry names first
for i, (entry_name, spec, rec) in enumerate(dataset_to_copy.iterate_records(specification_names=specs_to_copy)):
    if i % 10000 == 0:
        print(f"{i+1} of {n_rel_records}")
    if rec.id not in remove_record_ids:
        entries_to_copy.append(entry_name)
        old_record_ids.append(rec.id)
        conformer_counts[entry_name.split("-")[1]] += 1

1 of 77055
10001 of 77055
20001 of 77055
30001 of 77055
40001 of 77055
50001 of 77055
60001 of 77055
70001 of 77055


In [31]:
# Iterating over entries is much faster
molecular_properties = {
    "charges": set(), 
    "elements": set(), 
    "masses": np.zeros(n_rel_records-len(remove_record_ids))
}
for i, entry in enumerate(dataset_to_copy.iterate_entries(entry_names=entries_to_copy)):
    molecular_properties["charges"].add(entry.initial_molecule.molecular_charge)
    for sym in set(entry.initial_molecule.symbols):
        molecular_properties["elements"].add(sym)
    molecular_properties["masses"][i] = sum(entry.initial_molecule.masses)

molecular_properties["charges"] = sorted(molecular_properties["charges"])
molecular_properties["elements"] = sorted(molecular_properties["elements"])

# Make New Dataset

In [34]:
# _________ Initialize New Dataset ____________
print("Initializing new dataset")
with open("ds_info.json") as f:
    dataset_information = json.load(f)

dataset = client.add_dataset(
    dataset_type,
    dataset_information["dataset_name"],
    tagline=dataset_information["dataset_tagline"],
    description=dataset_information["description"],
    provenance=provenance,
    default_tag="openff",
    owner_user="openffbot",
    extras={
        "submitter": dataset_information["metadata.submitter"],
        "creation_date": str(datetime.date.today()),
        'collection_type': 'OptimizationDataset',
        'long_description_url': dataset_information["metadata.long_description_url"],
        "short description": dataset_information["dataset_tagline"],
        "dataset_name": dataset_information["dataset_name"],
        "elements": ["Br", "C", "Cl", "F", "H", "N", "O", "P", "S"],
#        "elements": molecular_properties["elements"],
    },
)


Initializing new dataset


In [None]:
# _________ Copy Records ____________
print("Copy records")
dataset.copy_records_from( dataset_to_copy.id, entry_names=entries_to_copy, specification_names=specs_to_copy)

In [None]:
new_record_ids = [rec.id for _, _, rec in dataset.iterate_records()]
set(new_record_ids) == set(old_record_ids)

## Write Statistics

In [32]:
# _________ Write Output Part 1 (Run Before Approval) ____________
print("Generating Molecular Statistics")

print(f"* Number of unique molecules: {len(conformer_counts)}")
print("* Number of conformers:", int(sum(conformer_counts.values())))
print(
    "* Number of conformers (min, mean, max): {}, {}, {}".format(
        int(min(conformer_counts.values())), 
        int(np.mean(list(conformer_counts.values()))), 
        int(max(conformer_counts.values()))
    )
)
print(
    "* Molecular weight (min, mean, max): {:.2f}, {:.2f}, {:.2f}".format(
        min(molecular_properties["masses"]), np.mean(molecular_properties["masses"]), max(molecular_properties["masses"])
    )
)
print("* Charges: {}".format(", ".join([str(x) for x in molecular_properties["charges"]])))
print("* Elements: {}".format(", ".join([str(x) for x in molecular_properties["elements"]])))

Generating Molecular Statistics
* Number of unique molecules: 2973
* Number of conformers: 77026
* Number of conformers (min, mean, max): 1, 25, 67
* Molecular weight (min, mean, max): 16.03, 362.35, 1104.40
* Charges: -2.0, -1.0, 0.0, 1.0, 2.0
* Elements: Br, C, Cl, F, H, N, O, P, S


In [35]:
# _________ Write Output Part 2 (Run After Approval) ____________

print("\n\n# Output for README Part 2\n")
print("* Description: {}".format(dataset.description))
print("* Purpose: {}".format(dataset.tagline))
print("* Name: {}".format(dataset.name))
print("* Submitter: {}\n".format(dataset.extras["submitter"]))

print("\n## Metadata")
print(f"* Elements: {{{', '.join(dataset.extras['elements'])}}}")

for spec, obj in dataset.specifications.items():
    od = obj.dict()['specification']
    print("* Program:", od["program"])
    od = od["qc_specification"]
    print("* QC Specifications:", spec)
    for field, value in od.items():
        print(f"  * {field}: {od[field]}")
    print("  * SCF Properties:")
    for field in od["keywords"]["scf_properties"]:
        print(f"    * {field}")




# Output for README Part 2

* Description: This dataset is the public counterpart of the OpenFF Industry Benchmark Season 1. Each industry partner has selected a range of diverse molecules which represent their current chemical interests. The dataset will be used in conjunction with private counterparts also designed by each partner to give an unbiased assessment of the progress and current performance of the OpenFF line of force fields in comparison with other contemporary force fields.
The v1.1 dataset features corrected Merck (MRK) molecules with explicit hydrogens. The original v1.0 dataset did not have explicit hydrogens on these molecules, resulting in poor starting conformers that have largely failed to geometry optimize under QM.
The v1.1 dataset was prepared from the v1.0 dataset, excising the MRK molecules and replacing them with the explicit hydrogen variants prepared using the [Season 1 protocol](https://openforcefield.atlassian.net/wiki/spaces/PS/pages/971898891/Optimiza

In [35]:
scaffold.to_json(dataset, compress=True)