In [1]:
from qcportal import PortalClient
from openff.qcsubmit.results import OptimizationResultCollection,BasicResultCollection
from openff.qcsubmit.datasets import BasicDataset
from openff.qcsubmit.results.filters import ConnectivityFilter, RecordStatusEnum, RecordStatusFilter
from openff.qcsubmit.factories import BasicDatasetFactory
from openff.qcsubmit.common_structures import Metadata, QCSpec

# Load optimization dataset

In [2]:
# Address of the QCArchive server the source dataset is read from.
QCARCHIVE_ADDRESS = "https://api.qcarchive.molssi.org:443/"
client = PortalClient(QCARCHIVE_ADDRESS)

In [3]:
# Collect the optimization results of the source dataset from the server.
SOURCE_DATASET_NAME = "OpenFF Cresset Additional Coverage Optimizations v4.0"
opt_ds = OptimizationResultCollection.from_server(
    client=client,
    datasets=[SOURCE_DATASET_NAME],
)

In [4]:
# Filters applied, in order, to the downloaded results:
#   1. keep only records whose status is `complete`;
#   2. connectivity check with tolerance 1.2
#      (presumably drops results whose bonding changed during optimization -- confirm).
record_filters = (
    RecordStatusFilter(status=RecordStatusEnum.complete),
    ConnectivityFilter(tolerance=1.2),
)
filtered = opt_ds.filter(*record_filters)

In [5]:
# One line per collection (unfiltered first, filtered second):
# number of molecules followed by number of results.
for collection in (opt_ds, filtered):
    print(collection.n_molecules, collection.n_results)

70 393
70 393


# Set up single points

In [6]:
from qcelemental.models import DriverEnum

# Descriptive fields of the new Hessian dataset.
hessian_dataset_name = "OpenFF Cresset Additional Coverage Hessian v4.0"
hessian_tagline = "Hessian single points for the OpenFF Cresset Additional Coverage Optimizations v4.0 dataset."
hessian_description = "Hessian single points for the final molecules in the OpenFF Cresset Additional Coverage Optimizations v4.0 dataset at the B3LYP-D3BJ/DZVP level of theory."
submission_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2025-03-31-OpenFF-Cresset-Additional-Coverage-Hessian-v4.0"

hessian_metadata = Metadata(
    submitter="lilyminium",
    long_description_url=submission_url,
)

# No QC specification is passed here on purpose: the default QCSpec
# shouldn't need to be explicitly stated.
dataset = filtered.create_basic_dataset(
    dataset_name=hessian_dataset_name,
    tagline=hessian_tagline,
    description=hessian_description,
    driver=DriverEnum.hessian,
    metadata=hessian_metadata,
)

In [7]:
# Hashes of the final molecules of the filtered optimization records.
opt_hashes = {
    record.final_molecule.get_hash() for record, _ in filtered.to_records()
}

# Hashes of the molecules stored as initial molecules in the new dataset.
new_hashes = {
    qcemol.identifiers.molecule_hash
    for entry in dataset.dataset.values()
    for qcemol in entry.initial_molecules
}

hashes_match = opt_hashes == new_hashes

# Stop the notebook here (before anything is exported) if the new dataset does
# not contain exactly the final geometries of the optimization records; a bare
# displayed `False` is too easy to overlook on Restart & Run All.
assert hashes_match, (
    "Molecules in the new dataset differ from the optimized final molecules: "
    f"{len(opt_hashes - new_hashes)} missing from the dataset, "
    f"{len(new_hashes - opt_hashes)} unexpected in the dataset."
)

hashes_match

True

In [8]:
# Show the specifications produced by the dataset's (private) `_get_specifications`
# helper, to check the driver/method/basis before exporting.
qcportal_specifications = dataset._get_specifications()
qcportal_specifications

{'default': QCSpecification(program='psi4', driver=<SinglepointDriver.hessian: 'hessian'>, method='b3lyp-d3bj', basis='dzvp', keywords={'maxiter': 200, 'scf_properties': [<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>]}, protocols=AtomicResultProtocols(wavefunction=<WavefunctionProtocolEnum.none: 'none'>, stdout=True, error_correction=ErrorCorrectionProtocol(default_policy=True, policies=None), native_files=<NativeFilesProtocolEnum.none: 'none'>))}

# Exporting dataset

In [9]:
# Output files (relative paths) written by this cell.
dataset_file = "dataset.json.bz2"
smiles_file = "dataset.smi"
figure_file = "dataset.pdf"

dataset.export_dataset(dataset_file)
dataset.molecules_to_file(smiles_file, "smi")
dataset.visualize(figure_file, columns=8)

# Print the QC specifications stored on the dataset.
print(dataset.qc_specifications)

{'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>, implicit_solvent=None, maxiter=200, scf_properties=[<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>], keywords={})}


# Dataset information

In [10]:
import numpy as np
from collections import Counter

In [11]:
# Overall size of the dataset; the "n_conformers" line reports `dataset.n_records`.
print(f"n_molecules: {dataset.n_molecules}")
print(f"n_conformers: {dataset.n_records}")

n_molecules: 70
n_conformers: 393


In [12]:
# Gather both per-molecule statistics in a single pass over `dataset.molecules`
# instead of traversing it once per statistic (the molecules may be rebuilt on
# every traversal -- TODO confirm; a single pass is correct either way).
conformer_counts = []
heavy_atom_counts = []
for mol in dataset.molecules:
    conformer_counts.append(mol.n_conformers)
    # Heavy-atom count as reported by RDKit for this molecule.
    heavy_atom_counts.append(mol.to_rdkit().GetNumHeavyAtoms())

# Number of conformers per molecule, in dataset order.
n_confs = np.array(conformer_counts)
# Number of heavy atoms per molecule, in dataset order.
n_heavy_atoms = np.array(heavy_atom_counts)

In [13]:
# Spread of the per-molecule conformer counts.
conformer_stats = (n_confs.min(), n_confs.mean(), n_confs.max())
print("Number of conformers (min, mean, max):", *conformer_stats)

# Histogram of heavy-atom counts: "<n heavy atoms>: <n molecules>", ascending.
print("# heavy atoms")
counts = Counter(n_heavy_atoms)
for n_heavy, n_mols in sorted(counts.items()):
    print(f"{n_heavy!s:>3}: {n_mols}")

Number of conformers (min, mean, max): 1 5.614285714285714 10
# heavy atoms
  4: 1
  5: 1
  6: 5
  7: 5
  8: 8
  9: 13
 10: 7
 11: 9
 12: 8
 13: 6
 14: 2
 15: 1
 16: 2
 17: 1
 19: 1


In [14]:
from openff.units import unit

# Distinct total molecular charges in the dataset, as magnitudes in
# elementary-charge units.
unique_charges = {
    mol.total_charge.m_as(unit.elementary_charge) for mol in dataset.molecules
}
unique_charges

{-1.0, 0.0, 1.0}

In [15]:

def _molecular_weight(mol):
    """Return the sum of the mass magnitudes (``atom.mass.m``) of all atoms in ``mol``."""
    return sum(atom.mass.m for atom in mol.atoms)


# One molecular weight per molecule, in dataset order.
masses = np.array([_molecular_weight(mol) for mol in dataset.molecules])
print("MW (min, mean, max):", masses.min(), masses.mean(), masses.max())

MW (min, mean, max): 58.079452 144.97917499642858 280.7482890000001


In [16]:
# Every element symbol that occurs in at least one molecule of the dataset.
elements = {atom.symbol for mol in dataset.molecules for atom in mol.atoms}
print(elements)

{'F', 'N', 'H', 'O', 'Cl', 'S', 'Br', 'C'}
