In [87]:
import os, sys
import json
import requests
import datetime

import numpy as np
from collections import defaultdict, Counter
import periodictable

from qcportal import PortalClient
from qcportal.record_models import BaseRecord, RecordStatusEnum
from qcportal.external import scaffold

from openff.units import unit

from openff.toolkit.topology import Molecule
from openff.toolkit.utils import OpenEyeToolkitWrapper, ToolkitRegistry

from openff.qcsubmit.common_structures import MoleculeAttributes
from openff.qcsubmit.datasets import OptimizationDataset
from openff.qcsubmit.factories import OptimizationDatasetFactory
from openff.qcsubmit.results import OptimizationResultCollection
from openff.qcsubmit.results.filters import (
    ConnectivityFilter,
    RecordStatusFilter,
    UnperceivableStereoFilter,
    SinglepointRecordFilter,
)

from openff.qcsubmit._pydantic import Field

#ADDRESS = "https://api.qcarchive.molssi.org:443/"
#client = PortalClient(ADDRESS, cache_dir=".",)
#client = PortalClient(
#    ADDRESS, 
#    username=os.environ['QCARCHIVE_USER'],
#    password=os.environ['QCARCHIVE_PASSWORD'],
#    cache_dir=".",
#)

from qcfractal.snowflake import FractalSnowflake
snowflake = FractalSnowflake()
client = snowflake.client()

Process ForkProcess-11:
Traceback (most recent call last):
  File "/Users/jenniferclark/mamba/envs/qca-clean-openff/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/jenniferclark/mamba/envs/qca-clean-openff/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/jenniferclark/mamba/envs/qca-clean-openff/lib/python3.11/site-packages/qcfractal/snowflake.py", line 95, in _compute_process
    compute = ComputeManager(compute_config)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jenniferclark/mamba/envs/qca-clean-openff/lib/python3.11/site-packages/qcfractalcompute/compute_manager.py", line 157, in __init__
    raise ValueError(f"Executor {ex} has no available programs")
ValueError: Executor local has no available programs


# 1. Get List of Records to Keep (Using QCA Server)
### Get Records to Remove

In [44]:
# _________ Pull Record IDs of Relevant Datasets ____________

file = requests.get(
    "https://raw.githubusercontent.com/openforcefield/sage-2.2.0/refs/heads/main/05_benchmark_forcefield/process_bm/problem_ids/all_r7_outliers.txt"
)
remove_record_ids = set([int(x) for x in file.content.decode().splitlines()])
print(f"There are {len(remove_record_ids)} records to remove")


There are 29 records to remove


### Get v1.1 Dataset and Filter

In [46]:
ResultCollection = OptimizationResultCollection.from_server(
    client=client,
    datasets=["OpenFF Industry Benchmark Season 1 v1.1"],
    spec_name="default",
)
print(f"Number of records before filtering: {ResultCollection.n_results}")



Number of records before filtering: 76666


In [47]:
# 101 min
ResultCollection = ResultCollection.filter(
    RecordStatusFilter(status=RecordStatusEnum.complete),
    ConnectivityFilter(tolerance=1.2),
    UnperceivableStereoFilter(),
)



In [48]:
# 12 min
print(f"Number of records before filtering: {ResultCollection.n_results}")
nrec = ResultCollection.n_results
class RecordIDFilter(SinglepointRecordFilter):
    """A filter which will only retain records if their status matches a specified
    value.
    """

    record_list: list = Field(
        [],
        description="Records whose ID is NOT in this list will be retains",
    )

    def _filter_function(
        self, result: "_BaseResult", record: BaseRecord, molecule: Molecule
    ) -> bool:
        return record.id not in self.record_list

ResultCollection = ResultCollection.filter(
    RecordIDFilter(record_list=remove_record_ids),
)

Number of records before filtering: 74614




In [49]:
print(f"Number of records after filtering: {ResultCollection.n_results}. {nrec-ResultCollection.n_results} records were removed, equal to number expected: {nrec-ResultCollection.n_results == len(remove_record_ids)}")

Number of records after filtering: 74585. 29 records were removed, equal to number expected: True


In [70]:
record_ids = [rec.id for rec, _ in ResultCollection.to_records()]

ds_old = client.get_dataset("optimization", "OpenFF Industry Benchmark Season 1 v1.1")

cmiles_count = defaultdict(Counter)
molecules, rec_all_info = [], []
records = []
for entry_name, spec_name, record in ds_old.iterate_records():
    records.append(records)
    if record.id not in record_ids:
        continue
    rec_all_info.append([entry_name, spec_name, record.id])
    cmiles = record.initial_molecule.extras["canonical_isomeric_explicit_hydrogen_mapped_smiles"]

    if cmiles not in cmiles_count:
        molecules.append(record.initial_molecule)
    hash = record.initial_molecule.get_hash()
    cmiles_count[cmiles][hash] += 1

print(f"There are {len(rec_all_info)} records (conformers) and {len(cmiles_count)} unique SMILES strings (unique molecules)")



There are 74585 records (conformers) and 9835 unique SMILES strings (unique molecules)


In [72]:
open("final_record_info.txt", "w").write("\n".join(["{}, {}, {}".format(*x) for x in rec_all_info]))
print(f"Old Dataset ID: {ds_old.id}")

Old Dataset ID: 319


### Get Dataset Statistics

In [73]:
print("Generating Molecular Statistics")

lx = len(cmiles_count)
n_confs, n_heavy_atoms, masses, unique_charges = np.zeros(lx), [], np.zeros(lx), np.zeros(lx)
elements = []
for i, (cmiles, hashes) in enumerate(cmiles_count.items()):
    n_confs[i] = len(hashes)
    n_heavy_atoms.append(len([x for x in molecules[i].symbols if x != "H"]))
    elements.extend(list(set([x for x in molecules[i].symbols])))
    masses[i] = sum([getattr(periodictable, x).mass for x in molecules[i].symbols])
    unique_charges[i] = molecules[i].molecular_charge
    
unique_charges = sorted(set(unique_charges))

elements = sorted(list(set(elements)))

Generating Molecular Statistics


In [74]:
# _________ Write Output Part 1 (Run Before Approval) ____________

print("\n# Heavy Atom Counts")
counts1 = Counter(n_heavy_atoms)
for n_heavy in sorted(counts1):
    print(f"{str(n_heavy):>3}: {counts1[n_heavy]}")

print("* Number of unique molecules: {}".format(len(cmiles_count)))
print("* Number of conformers:", int(sum(n_confs)))
print(
    "* Number of conformers (min, mean, max): {:.2f}, {:.2f}, {:.2f}".format(
        min(n_confs), np.mean(n_confs), max(n_confs)
    )
)
print(
    "* Molecular weight (min, mean, max): {:.2f}, {:.2f}, {:.2f}".format(
        min(masses), np.mean(masses), max(masses)
    )
)
print("* Charges: {}".format(", ".join([str(x) for x in unique_charges])))


# Heavy Atom Counts
  1: 1
  2: 4
  3: 8
  4: 25
  5: 36
  6: 27
  7: 19
  8: 37
  9: 30
 10: 43
 11: 39
 12: 59
 13: 76
 14: 84
 15: 109
 16: 146
 17: 189
 18: 239
 19: 353
 20: 445
 21: 605
 22: 737
 23: 793
 24: 670
 25: 576
 26: 605
 27: 598
 28: 564
 29: 521
 30: 538
 31: 475
 32: 393
 33: 317
 34: 260
 35: 85
 36: 29
 37: 21
 38: 21
 39: 16
 40: 10
 41: 7
 42: 6
 43: 3
 44: 3
 45: 1
 46: 1
 48: 1
 50: 2
 52: 1
 53: 2
 56: 1
 61: 1
 68: 1
 76: 1
 82: 1
* Number of unique molecules: 9835
* Number of conformers: 74585
* Number of conformers (min, mean, max): 1.00, 7.58, 10.00
* Molecular weight (min, mean, max): 16.04, 348.58, 1105.15
* Charges: -2.0, -1.0, 0.0, 1.0, 2.0


In [56]:
sys.exit("If PR is not Approved, Stop Here and Switch to Snowflake Server")

Old Dataset ID: 319


SystemExit: If PR is not Approved, Stop Here and Switch to Snowflake Server

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


# 2. New Dataset
### Get Record IDs for New Dataset

In [76]:
print("Getting records")

rec_all_info = []
with open("final_record_info.txt", "r") as f:
    for line in f:
        tmp = line.split(",")
        rec_all_info.append([tmp[0], tmp[1], int(tmp[2])])

entry_names, spec_names = [], set()
for entry_name, spec_name, rec_id in rec_all_info:
    spec_names.add(spec_name)
    entry_names.append(entry_name)

Getting records


### Initiate Dataset and Copy Records

In [88]:

# _________ Initialize New Dataset ____________
print("Initializing new dataset")
with open("ds_info.json") as f:
    dataset_information = json.load(f)

dataset = client.add_dataset(
    ds_old.dataset_type,
    dataset_information["dataset_name"],
    tagline=dataset_information["dataset_tagline"],
    description=dataset_information["description"],
    provenance=ds_old.provenance,
    default_tag="openff",
    owner_user="openffbot",
    tags=["openff"],
    extras={
        "submitter": dataset_information["metadata.submitter"],
        "creation_date": str(datetime.date.today()),
        'collection_type': 'OptimizationDataset',
        'long_description_url': dataset_information["metadata.long_description_url"],
        "short description": dataset_information["dataset_tagline"],
        "dataset_name": dataset_information["dataset_name"],
        "elements": elements,
    },
)


Initializing new dataset


In [None]:
# Run after approval

dataset.copy_records_from(ds_old.id, entry_names=entry_names, specification_names=spec_names)

In [None]:
# Run after approval

new_record_ids = [rec.id for _, _, rec in dataset.iterate_records()]
set(new_record_ids) == set([rec.id for rec in records])

### Write Output: Part 2

In [None]:
elements = set(
    sym
    for entry in dataset.iterate_entries()
    for sym in entry.initial_molecule.symbols
)

print("\n\n# Output for README Part 2\n")
print("* Description: {}".format(dataset.description))
print("* Purpose: {}".format(dataset.tagline))
print("* Name: {}".format(dataset.name))
print("* Submitter: {}\n".format(dataset.extras["submitter"]))

print("\n## Metadata")
print(f"* Elements: {{{', '.join(elements)}}}")

for spec, obj in dataset.specifications.items():
    od = obj.dict()['specification']
    print("* Program:", od["program"])
    od = od["qc_specification"]
    print("* QC Specifications:", spec)
    for field, value in od.items():
        print(f"  * {field}: {od[field]}")
    print("  * SCF Properties:")
    for field in od["keywords"]["scf_properties"]:
        print(f"    * {field}")



In [89]:
scaffold.to_json(dataset, filename="scaffold_opt.json", compress=True)