In [1]:
from qcportal import PortalClient
from qcelemental.models.results import WavefunctionProtocolEnum
from openff.qcsubmit.results import OptimizationResultCollection
from openff.qcsubmit.datasets import BasicDataset
from openff.qcsubmit.results.filters import ConnectivityFilter, ConformerRMSDFilter
from openff.qcsubmit.factories import BasicDatasetFactory
from openff.qcsubmit.common_structures import Metadata, QCSpec

## Downloading optimization results

In [2]:
# Portal client pointed at the server address below; it is handed to
# `OptimizationResultCollection.from_server` in the next cell.
client = PortalClient("https://api.qcarchive.molssi.org:443/")

In [3]:
# Build a result collection from the server, restricted to the named
# optimization dataset and to the "HF/6-31G*" specification.
optimization_results = OptimizationResultCollection.from_server(
    client=client,
    datasets=["OpenFF multi-Br ESP Fragment Conformers v1.0"],
    spec_name="HF/6-31G*"
)

In [4]:
# Apply the connectivity filter and the conformer-RMSD filter (tolerance 0.05)
# to the downloaded collection.
filtered_results = optimization_results.filter(
    ConnectivityFilter(),
    ConformerRMSDFilter(rmsd_tolerance=0.05),
)

# Convert whatever survived the filters into (record, molecule) pairs, which
# is the shape the grouping cell further down unpacks.
filtered_records = filtered_results.to_records()

## Setting up the single point dataset

In [12]:
from qcelemental.models import DriverEnum

# Provenance attached to the dataset: the submitter and the URL of the
# submission directory.
submission_metadata = Metadata(
    submitter="lilyminium",
    long_description_url=(
        "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
        "submissions/"
        "2023-11-30-OpenFF-multi-Br-ESP-Fragment-Conformers-v1.1-single-point"
    ),
)

# The only QC specification in this dataset: psi4 HF/6-31G*, with the
# wavefunction protocol set to keep orbitals and eigenvalues.
hf_spec = QCSpec(
    program="psi4",
    method="hf",
    basis="6-31G*",
    spec_name="HF/6-31G*",
    spec_description=(
        "The standard HF/6-31G* basis used to derive RESP style charges."
    ),
    store_wavefunction=WavefunctionProtocolEnum.orbitals_and_eigenvalues,
)

# Energy-driver dataset; molecules are added to it in a later cell.
dataset = BasicDataset(
    dataset_name="OpenFF multi-Br ESP Fragment Conformers v1.1",
    dataset_tagline="HF/6-31G* single point calculations of diverse fragment molecules with multiple Br.",
    description=(
        "A dataset containing optimized molecules from the "
        "`OpenFF multi-Br ESP Fragment Conformers v1.0` optimization. "
    ),
    driver=DriverEnum.energy,
    metadata=submission_metadata,
    qc_specifications={"HF/6-31G*": hf_spec},
)

In [13]:
# Display the specifications derived from the QCSpec configured above, as a
# sanity check before any molecules are added.
# NOTE(review): `_get_specifications` is underscore-prefixed (private), so it
# may change between qcsubmit releases.
dataset._get_specifications()

{'HF/6-31G*': QCSpecification(program='psi4', driver=<SinglepointDriver.energy: 'energy'>, method='hf', basis='6-31g*', keywords={'maxiter': 200, 'scf_properties': ['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices']}, protocols=AtomicResultProtocols(wavefunction=<WavefunctionProtocolEnum.orbitals_and_eigenvalues: 'orbitals_and_eigenvalues'>, stdout=True, error_correction=ErrorCorrectionProtocol(default_policy=True, policies=None), native_files=<NativeFilesProtocolEnum.none: 'none'>))}

In [14]:
from collections import defaultdict

# Bucket every (record, molecule) pair under the molecule's mapped,
# explicit-hydrogen, isomeric SMILES so that pairs sharing that string end up
# in the same list.
records_by_cmiles = defaultdict(list)
for qc_record, off_molecule in filtered_records:
    mapped_smiles = off_molecule.to_smiles(
        isomeric=True,
        explicit_hydrogens=True,
        mapped=True,
    )
    records_by_cmiles[mapped_smiles].append((qc_record, off_molecule))

In [15]:
from openff.qcsubmit.common_structures import MoleculeAttributes
import tqdm

# Add one dataset entry per group in `records_by_cmiles`. Each group is the
# list of (record, molecule) pairs that share a mapped SMILES; the first pair
# supplies the molecule, extras and keywords, and every pair contributes one
# conformer.
added_indices = set()
for records in tqdm.tqdm(records_by_cmiles.values(), total=len(records_by_cmiles)):
    base_record, base_molecule = records[0]
    # Replace the first molecule's conformer list with the first conformer of
    # every molecule in the group (this writes the private `_conformers`).
    base_molecule._conformers = [m.conformers[0] for _, m in records]

    entry_index = base_molecule.to_smiles(
        isomeric=True, explicit_hydrogens=False, mapped=False
    )
    # Groups are keyed by *mapped* SMILES, but entries are indexed by the
    # *unmapped* SMILES, so two groups of the same molecule with different
    # atom ordering would share an index. Fail loudly instead of passing a
    # repeated index to `add_molecule`.
    # NOTE(review): a repeated index presumably replaces the earlier entry and
    # drops its conformers -- confirm against qcsubmit.
    if entry_index in added_indices:
        raise ValueError(
            f"Duplicate dataset index {entry_index!r}: more than one "
            "mapped-SMILES group resolves to the same unmapped SMILES."
        )
    added_indices.add(entry_index)

    dataset.add_molecule(
        index=entry_index,
        molecule=base_molecule,
        attributes=MoleculeAttributes.from_openff_molecule(base_molecule),
        extras=base_record.extras,
        keywords=base_record.to_qcschema_result().keywords,
    )


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 610/610 [04:13<00:00,  2.41it/s]


## Exporting dataset

In [16]:
# Write the dataset out in three forms: the bz2-compressed JSON export, a
# SMILES file of the molecules, and a PDF visualization laid out 8 columns wide.
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file('dataset.smi', 'smi')
dataset.visualize("dataset.pdf", columns=8)

# Echo the QC specifications so they are recorded in the notebook output.
print(dataset.qc_specifications)

{'HF/6-31G*': QCSpec(method='hf', basis='6-31G*', program='psi4', spec_name='HF/6-31G*', spec_description='The standard HF/6-31G* basis used to derive RESP style charges.', store_wavefunction=<WavefunctionProtocolEnum.orbitals_and_eigenvalues: 'orbitals_and_eigenvalues'>, implicit_solvent=None, maxiter=200, scf_properties=[<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>], keywords={})}


## Dataset information

In [17]:
import numpy as np
from collections import Counter

In [18]:
# Headline sizes: `n_molecules` and `n_records` as reported by the dataset
# (the latter is labelled as the conformer count).
print(f"n_molecules: {dataset.n_molecules}")
print(f"n_conformers: {dataset.n_records}")

n_molecules: 610
n_conformers: 650


In [19]:
# Conformer count of each molecule, in the order the dataset yields them.
conformer_counts = [entry_mol.n_conformers for entry_mol in dataset.molecules]
n_confs = np.array(conformer_counts)

# Heavy-atom count of each molecule, taken from its RDKit representation.
heavy_atom_counts = [
    entry_mol.to_rdkit().GetNumHeavyAtoms() for entry_mol in dataset.molecules
]
n_heavy_atoms = np.array(heavy_atom_counts)

In [20]:
# Spread of conformers per molecule.
conformer_summary = (n_confs.min(), n_confs.mean(), n_confs.max())
print("Number of conformers (min, mean, max):", *conformer_summary)

# Histogram of heavy-atom counts, smallest first, with the count
# right-aligned to a width of three characters.
print("# heavy atoms")
counts = Counter(n_heavy_atoms)
for n_heavy, n_with_count in sorted(counts.items()):
    print(f"{str(n_heavy).rjust(3)}: {n_with_count}")

Number of conformers (min, mean, max): 1 1.0655737704918034 4
# heavy atoms
  4: 1
  5: 8
  6: 10
  7: 17
  8: 31
  9: 81
 10: 121
 11: 171
 12: 170


In [22]:
from openff.units import unit

# Distinct total charges present in the dataset, as plain magnitudes in
# elementary-charge units.
unique_charges = {
    mol.total_charge.m_as(unit.elementary_charge) for mol in dataset.molecules
}
unique_charges

{-2.0, -1.0, 0.0, 1.0, 2.0}

In [23]:
def _total_mass(mol):
    """Return the sum of the `mass.m` magnitudes over the atoms of `mol`."""
    return sum(atom.mass.m for atom in mol.atoms)


# One summed mass per molecule in the dataset.
masses = np.array([_total_mass(mol) for mol in dataset.molecules])
print("MW (min, mean, max):", masses.min(), masses.mean(), masses.max())

MW (min, mean, max): 201.84508399999999 292.1294328785246 466.5868909999999


In [24]:
# Every element symbol that occurs in at least one molecule of the dataset.
elements = {atom.symbol for mol in dataset.molecules for atom in mol.atoms}
print(elements)

{'O', 'S', 'C', 'F', 'P', 'H', 'Br', 'N'}
