In [1]:
from qcportal import PortalClient
from openff.qcsubmit.results import OptimizationResultCollection,BasicResultCollection
from openff.qcsubmit.datasets import BasicDataset
from openff.qcsubmit.results.filters import ConnectivityFilter, RecordStatusEnum, RecordStatusFilter
from openff.qcsubmit.factories import BasicDatasetFactory
from openff.qcsubmit.common_structures import Metadata, QCSpec

# Load optimization dataset

In [2]:
# Address of the QCArchive server that the notebook reads from.
QCARCHIVE_URL = "https://api.qcarchive.molssi.org:443/"
client = PortalClient(QCARCHIVE_URL)

In [3]:
# Name of the optimization dataset whose results are the inputs for this notebook.
SOURCE_DATASET_NAME = 'OpenFF Gen2 Optimization Dataset Protomers v1.0'
opt_ds = OptimizationResultCollection.from_server(
    client=client,
    datasets=[SOURCE_DATASET_NAME],
)

In [4]:
# Filters handed to the collection, in this order: keep records whose status
# is `complete`, then apply the connectivity filter with a tolerance of 1.2.
result_filters = (
    RecordStatusFilter(status=RecordStatusEnum.complete),
    ConnectivityFilter(tolerance=1.2),
)
filtered = opt_ds.filter(*result_filters)

In [5]:
# One line per collection: molecule count then result count,
# first for the unfiltered collection and then for the filtered one.
for collection in (opt_ds, filtered):
    print(collection.n_molecules, collection.n_results)

109 598
108 597


# Set up single points

In [10]:
from qcelemental.models import DriverEnum

# Submission metadata: who submitted the dataset and where its long
# description lives in the qca-dataset-submission repository.
hessian_metadata = Metadata(
    submitter="amcisaac",
    long_description_url=(
        "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2024-11-12-OpenFF-Gen2-Hessian-Dataset-Protomers-v1.1"
    ),
)

# Build the Hessian dataset from the filtered optimization results.
dataset = filtered.create_basic_dataset(
    dataset_name="OpenFF Gen2 Hessian Dataset Protomers v1.1",
    tagline="Hessian single points for the OpenFF Gen2 Optimization Dataset Protomers v1.0 dataset.",
    description="Hessian single points for the final molecules in the OpenFF Gen2 Optimization Dataset Protomers v1.0 dataset at the B3LYP-D3BJ/DZVP level of theory. Compared to OpenFF Gen2 Hessian Dataset Protomers v1.0, this dataset has been re-generated with an updated `create_basic_dataset` so as to preserve the molecule_ids between optimization and Hessian datasets.",
    driver=DriverEnum.hessian,
    metadata=hessian_metadata,
)

In [11]:
# Show the specifications the dataset produces (via its private helper) so the
# program, driver, method, basis and keywords can be inspected before export.
specifications = dataset._get_specifications()
specifications

{'default': QCSpecification(program='psi4', driver=<SinglepointDriver.hessian: 'hessian'>, method='b3lyp-d3bj', basis='dzvp', keywords={'maxiter': 200, 'scf_properties': [<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>]}, protocols=AtomicResultProtocols(wavefunction=<WavefunctionProtocolEnum.none: 'none'>, stdout=True, error_correction=ErrorCorrectionProtocol(default_policy=True, policies=None), native_files=<NativeFilesProtocolEnum.none: 'none'>))}

In [12]:
# Hashes of the final molecule of every record in the filtered optimization results.
opt_hashes = {
    rec.final_molecule.get_hash() for rec, _mol in filtered.to_records()
}

# Hashes of every initial molecule stored in the newly created dataset.
new_hashes = {
    qcemol.identifiers.molecule_hash
    for moldata in dataset.dataset.values()
    for qcemol in moldata.initial_molecules
}

# Stop here on a mismatch: with only a displayed boolean, "Run All" would go on
# to export a dataset whose molecules do not match the optimization results.
assert opt_hashes == new_hashes, (
    "Molecule hashes differ between the optimization results and the new dataset: "
    f"{len(opt_hashes - new_hashes)} only in the optimization results, "
    f"{len(new_hashes - opt_hashes)} only in the new dataset."
)

# Keep the comparison as the cell's displayed result.
opt_hashes == new_hashes

True

# Exporting dataset

In [13]:
# Output files written next to the notebook.
dataset_json_path = "dataset.json.bz2"
smiles_path, smiles_format = 'dataset.smi', 'smi'
overview_pdf_path = "dataset.pdf"

dataset.export_dataset(dataset_json_path)
dataset.molecules_to_file(smiles_path, smiles_format)
dataset.visualize(overview_pdf_path, columns=8)

# Print the QC specifications stored on the dataset.
print(dataset.qc_specifications)

{'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>, implicit_solvent=None, maxiter=200, scf_properties=[<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>], keywords={})}


# Dataset information

In [14]:
import numpy as np
from collections import Counter

In [15]:
# Headline sizes of the new dataset; the "n_conformers" line reports the record count.
print(f"n_molecules: {dataset.n_molecules}")
print(f"n_conformers: {dataset.n_records}")

n_molecules: 108
n_conformers: 597


In [16]:
# Gather (conformer count, heavy-atom count) for every molecule in one pass,
# then split the pairs into the two arrays used by the summary below.
per_molecule_counts = [
    (mol.n_conformers, mol.to_rdkit().GetNumHeavyAtoms())
    for mol in dataset.molecules
]
n_confs = np.array([n_conf for n_conf, _n_heavy in per_molecule_counts])
n_heavy_atoms = np.array([n_heavy for _n_conf, n_heavy in per_molecule_counts])

In [17]:
# Smallest, average and largest number of conformers per molecule.
conformer_stats = (n_confs.min(), n_confs.mean(), n_confs.max())
print("Number of conformers (min, mean, max):", *conformer_stats)

# Number of molecules for each heavy-atom count, in ascending order of size.
print("# heavy atoms")
counts = Counter(n_heavy_atoms)
for n_heavy, n_mols in sorted(counts.items()):
    print(f"{str(n_heavy).rjust(3)}: {n_mols}")

Number of conformers (min, mean, max): 1 5.527777777777778 10
# heavy atoms
  4: 1
  5: 1
  6: 2
  7: 2
  8: 5
  9: 1
 10: 4
 11: 3
 12: 4
 13: 6
 14: 6
 15: 10
 16: 6
 17: 6
 18: 2
 19: 5
 20: 4
 21: 2
 22: 1
 23: 5
 24: 1
 25: 3
 26: 1
 27: 3
 28: 1
 29: 4
 30: 1
 31: 5
 32: 5
 33: 2
 34: 1
 36: 3
 37: 1
 38: 1


In [18]:
from openff.units import unit
# Distinct total charges across the dataset, as magnitudes in elementary charge units.
unique_charges = {
    mol.total_charge.m_as(unit.elementary_charge)
    for mol in dataset.molecules
}
unique_charges

{-3.0, -2.0, -1.0, 0.0, 1.0, 2.0}

In [19]:
# Molecular weight of each molecule: the sum of its atomic masses.
# The magnitude is taken explicitly in daltons (matching how the charge cell
# uses `m_as`) rather than relying on whatever unit `atom.mass` happens to carry.
masses = np.array([
    sum(atom.mass.m_as(unit.dalton) for atom in mol.atoms)
    for mol in dataset.molecules
])
print("MW (min, mean, max):", masses.min(), masses.mean(), masses.max())

MW (min, mean, max): 82.064334 282.0491347347222 542.59233875


In [20]:
# Every element symbol that occurs in at least one molecule of the dataset.
elements = {
    atom.symbol
    for mol in dataset.molecules
    for atom in mol.atoms
}
print(elements)

{'C', 'N', 'O', 'F', 'Cl', 'H', 'P', 'S', 'Br'}
