In [1]:
from qcportal import PortalClient
from openff.qcsubmit.results import OptimizationResultCollection,BasicResultCollection
from openff.qcsubmit.datasets import BasicDataset
from openff.qcsubmit.results.filters import ConnectivityFilter, RecordStatusEnum, RecordStatusFilter
from openff.qcsubmit.factories import BasicDatasetFactory
from openff.qcsubmit.common_structures import Metadata, QCSpec

# Load optimization dataset

In [2]:
# Address of the QCArchive server that this notebook connects to.
QCARCHIVE_ADDRESS = "https://api.qcarchive.molssi.org:443/"
client = PortalClient(QCARCHIVE_ADDRESS)

In [3]:
# Names of the optimization datasets whose results are pulled from the server.
SOURCE_DATASETS = ["OpenFF Gen2 Optimization Dataset Protomers v1.0"]

opt_ds = OptimizationResultCollection.from_server(
    client=client,
    datasets=SOURCE_DATASETS,
)

In [None]:
# Apply, in order, a record-status filter (complete records only) and a
# connectivity filter with a tolerance of 1.2.
complete_only = RecordStatusFilter(status=RecordStatusEnum.complete)
connectivity_check = ConnectivityFilter(tolerance=1.2)
filtered = opt_ds.filter(complete_only, connectivity_check)

In [None]:
# Report molecule and result counts, first for the unfiltered collection and
# then for the filtered one.
for collection in (opt_ds, filtered):
    print(collection.n_molecules, collection.n_results)

In [None]:
# Convert the filtered collection to records via `to_records()`.
# NOTE(review): `records` is not referenced by any later cell visible here —
# confirm it is still needed before keeping this call.
records = filtered.to_records()

# Set up single points

In [None]:
from qcelemental.models import DriverEnum

# Submitter and long-description URL recorded with the new dataset.
submission_metadata = Metadata(
    submitter="amcisaac",
    long_description_url="https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2024-10-07-OpenFF-Gen2-Hessian-Dataset-Protomers-v1.0",
)

# Build a basic dataset with the Hessian driver from the filtered results.
dataset = filtered.create_basic_dataset(
    dataset_name="OpenFF Gen2 Hessian Dataset Protomers v1.0",
    tagline="Hessian single points for the OpenFF Gen2 Optimization Dataset Protomers v1.0 dataset.",
    description="Hessian single points for the final molecules in the OpenFF Gen2 Optimization Dataset Protomers v1.0 dataset at the B3LYP-D3BJ/DZVP level of theory.",
    driver=DriverEnum.hessian,
    metadata=submission_metadata,
)

In [None]:
# Show the dataset's specifications as the cell output.
# NOTE(review): `_get_specifications` is underscore-prefixed, i.e. private by
# convention — confirm it is safe to rely on across qcsubmit versions.
dataset._get_specifications()

# Exporting dataset

In [None]:
# Write the dataset to "dataset.json.bz2", its molecules to "dataset.smi"
# (passing 'smi' as the file format argument), and a visualization to
# "dataset.pdf" with columns=8. All paths are relative to the working directory.
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file('dataset.smi', 'smi')
dataset.visualize("dataset.pdf", columns=8)

# Print the QC specifications attached to the dataset.
print(dataset.qc_specifications)

# Dataset information

In [None]:
import numpy as np
from collections import Counter

In [None]:
# Headline sizes of the dataset; the record count is reported under the
# "n_conformers:" label.
size_report = (
    ("n_molecules:", dataset.n_molecules),
    ("n_conformers:", dataset.n_records),
)
for label, value in size_report:
    print(label, value)

In [None]:
# Per-molecule conformer counts, one entry per molecule in the dataset.
conformer_counts = [molecule.n_conformers for molecule in dataset.molecules]
n_confs = np.array(conformer_counts)

# Per-molecule heavy-atom counts, obtained through each molecule's RDKit form.
heavy_atom_counts = [
    molecule.to_rdkit().GetNumHeavyAtoms() for molecule in dataset.molecules
]
n_heavy_atoms = np.array(heavy_atom_counts)

In [None]:
# Summarise the per-molecule conformer counts.
conf_min, conf_mean, conf_max = n_confs.min(), n_confs.mean(), n_confs.max()
print("Number of conformers (min, mean, max):", conf_min, conf_mean, conf_max)

# Tally how many molecules have each heavy-atom count and list the tallies in
# ascending order of heavy-atom count.
print("# heavy atoms")
counts = Counter(n_heavy_atoms)
for n_heavy, n_mols in sorted(counts.items()):
    print(f"{str(n_heavy).rjust(3)}: {n_mols}")

In [None]:
from openff.units import unit

# Distinct total charges across the dataset's molecules, expressed as
# magnitudes in units of elementary charge.
unique_charges = {
    mol.total_charge.m_as(unit.elementary_charge) for mol in dataset.molecules
}
unique_charges

In [None]:
# Molecular weight of each molecule as the sum of its atoms' mass magnitudes.
# NOTE(review): `.m` takes the magnitude in whatever unit the mass is stored
# in — presumably daltons; confirm.
masses = np.array(
    [sum(atom.mass.m for atom in mol.atoms) for mol in dataset.molecules]
)
print("MW (min, mean, max):", masses.min(), masses.mean(), masses.max())

In [None]:
# Set of element symbols that occur anywhere in the dataset's molecules.
elements = {atom.symbol for mol in dataset.molecules for atom in mol.atoms}
print(elements)