# Vacuum dataset

In [1]:
from qcportal import PortalClient
from qcelemental.models.results import WavefunctionProtocolEnum
from openff.qcsubmit.results import OptimizationResultCollection,BasicResultCollection
from openff.qcsubmit.datasets import BasicDataset
from openff.qcsubmit.results.filters import ConnectivityFilter, ConformerRMSDFilter
from openff.qcsubmit.factories import BasicDatasetFactory
from openff.qcsubmit.common_structures import Metadata, QCSpec


# Load datasets prepared in `subsample_esp_ds.ipynb`

In [2]:
# Address of the QCArchive server this notebook talks to.
# NOTE(review): `client` is not referenced again in the cells shown here.
QCARCHIVE_ADDRESS = "https://api.qcarchive.molssi.org:443/"
client = PortalClient(QCARCHIVE_ADDRESS)



In [3]:
# Sub-sampled, pre-filtered result collection written by `subsample_esp_ds.ipynb`.
BR_SUBSAMPLE_JSON = 'br_subsample_filtered.json'
br_dataset = BasicResultCollection.parse_file(BR_SUBSAMPLE_JSON)

In [4]:
# Sub-sampled, pre-filtered result collection written by `subsample_esp_ds.ipynb`.
ESP50K_SUBSAMPLE_JSON = 'esp_subsample_filtered.json'
esp50k_dataset = BasicResultCollection.parse_file(ESP50K_SUBSAMPLE_JSON)

In [5]:
# Resolve both collections into (record, molecule) pairs. No filters are
# applied here because the collections were already filtered upstream.
# The generator is consumed left to right, so the BR collection is still
# resolved before the ESP50k one.
br_records, esp50k_records = (
    collection.to_records() for collection in (br_dataset, esp50k_dataset)
)



# Set up single points

In [6]:
# `Metadata` is already imported in the first cell and is not used here, so
# only `SCFProperties` needs to be brought into scope.
from openff.qcsubmit.common_structures import SCFProperties

# Extra program keywords passed through the QC specification: we want a finer
# DFT integration grid for wB97 functionals.
keywords = {
    "dft_spherical_points": 590,
    "dft_radial_points": 99,
}

# Properties requested from each single point. This is deliberately broad:
# compute a bunch of quantities now in case we decide to train on any of them.
properties = [
    SCFProperties.Dipole,
    SCFProperties.Quadrupole,
    SCFProperties.LowdinCharges,
    SCFProperties.MullikenCharges,
    SCFProperties.MBISCharges,
    SCFProperties.MayerIndices,
    SCFProperties.WibergLowdinIndices,
    SCFProperties.DipolePolarizabilities,
]


In [7]:
from qcelemental.models import DriverEnum

# Label for the level of theory; used both as the key in `qc_specifications`
# and as the specification's own name.
spec_label = "wB97X-V/def2-TZVPPD"

# Submission metadata: who is submitting and where the long description lives.
submission_metadata = Metadata(
    submitter="amcisaac",
    long_description_url="https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2024-09-06-OpenFF-NAGL2-ESP-Timing-Benchmark-v1.0",
)

# The single QC specification for this dataset. It reuses the `keywords` and
# `properties` defined in the previous cell and asks for the orbitals and
# eigenvalues of the wavefunction to be stored.
vacuum_spec = QCSpec(
    method="wb97X-V",
    basis="def2-TZVPPD",
    program="psi4",
    spec_name=spec_label,
    spec_description="wB97X-V functional with def2-TZVPPD basis set",
    store_wavefunction=WavefunctionProtocolEnum.orbitals_and_eigenvalues,
    scf_properties=properties,
    keywords=keywords,
)

dataset = BasicDataset(
    dataset_name="OpenFF NAGL2 ESP Timing Benchmark v1.0",
    dataset_tagline="wB97X-V/def2-TZVPPD single point calculations of ~1000 diverse molecules.",
    description="wB97X-V/def2-TZVPPD/vacuum single point calculations of ~1000 diverse molecules sub-sampled from the ESP50k and multi-BR ESP datasets, to benchmark computational cost.",
    # The other ESP datasets used DriverEnum.energy; the author's working
    # assumption is that `properties` is needed to get dipoles etc. computed.
    driver=DriverEnum.properties,
    metadata=submission_metadata,
    qc_specifications={spec_label: vacuum_spec},
)

In [8]:
# Sanity check: display the specification(s) built from `qc_specifications`
# so the driver, keywords and protocols can be inspected before submission.
# NOTE(review): `_get_specifications` is a private method of the dataset class.
dataset._get_specifications()

{'wB97X-V/def2-TZVPPD': QCSpecification(program='psi4', driver=<SinglepointDriver.properties: 'properties'>, method='wb97x-v', basis='def2-tzvppd', keywords={'maxiter': 200, 'function_kwargs': {'properties': [<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.LowdinCharges: 'lowdin_charges'>, <SCFProperties.MullikenCharges: 'mulliken_charges'>, <SCFProperties.MBISCharges: 'mbis_charges'>, <SCFProperties.MayerIndices: 'mayer_indices'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.DipolePolarizabilities: 'dipole_polarizabilities'>]}, 'dft_spherical_points': 590, 'dft_radial_points': 99}, protocols=AtomicResultProtocols(wavefunction=<WavefunctionProtocolEnum.orbitals_and_eigenvalues: 'orbitals_and_eigenvalues'>, stdout=True, error_correction=ErrorCorrectionProtocol(default_policy=True, policies=None), native_files=<NativeFilesProtocolEnum.none: 'none'>))}

In [10]:
from collections import defaultdict

# Bucket every (record, molecule) pair from both sources under the molecule's
# isomeric, explicit-hydrogen, atom-mapped SMILES, so that entries sharing
# that string end up in the same list.
records_by_cmiles = defaultdict(list)
all_records = br_records + esp50k_records
for record, molecule in all_records:
    mapped_smiles = molecule.to_smiles(
        isomeric=True, explicit_hydrogens=True, mapped=True
    )
    records_by_cmiles[mapped_smiles].append((record, molecule))

In [11]:
from openff.qcsubmit.common_structures import MoleculeAttributes
import tqdm

# Add one dataset entry per mapped-SMILES group. The first molecule of each
# group is reused as the entry's molecule and receives the first conformer of
# every member of the group.
# NOTE(review): the recorded run of this cell took ~20 minutes and hit a
# connection timeout; `to_qcschema_result()` presumably fetches data from the
# server for each record. Confirm whether a cheaper source for the keywords
# (e.g. the record's specification) would be equivalent.
for grouped in tqdm.tqdm(records_by_cmiles.values(), total=len(records_by_cmiles)):
    first_record, parent_molecule = grouped[0]

    # Overwrite the (private) conformer list with one conformer per member.
    parent_molecule._conformers = [
        member.conformers[0] for _, member in grouped
    ]

    # The entry index is the unmapped, implicit-hydrogen isomeric SMILES.
    entry_index = parent_molecule.to_smiles(
        isomeric=True, explicit_hydrogens=False, mapped=False
    )
    entry_attributes = MoleculeAttributes.from_openff_molecule(parent_molecule)
    entry_extras = first_record.extras
    entry_keywords = first_record.to_qcschema_result().keywords

    dataset.add_molecule(
        index=entry_index,
        molecule=parent_molecule,
        attributes=entry_attributes,
        extras=entry_extras,
        keywords=entry_keywords,
    )

 31%|██████████████████████████████████████████████▏                                                                                                       | 306/995 [03:21<07:21,  1.56it/s]Connection failed: ('Connection aborted.', TimeoutError(60, 'Operation timed out')) - retrying in 0.52 seconds [1/5]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 995/995 [19:38<00:00,  1.18s/it]


# Exporting dataset

In [12]:
# Write the submission artefacts next to the notebook:
#   dataset.json.bz2 - the exported dataset
#   dataset.smi      - the molecules, written in 'smi' format
#   dataset.pdf      - a visualisation of the molecules, 8 columns wide
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file('dataset.smi', 'smi')
dataset.visualize("dataset.pdf", columns=8)

# Echo the QC specifications so they are recorded in the notebook output.
print(dataset.qc_specifications)

{'wB97X-V/def2-TZVPPD': QCSpec(method='wb97X-V', basis='def2-TZVPPD', program='psi4', spec_name='wB97X-V/def2-TZVPPD', spec_description='wB97X-V functional with def2-TZVPPD basis set', store_wavefunction=<WavefunctionProtocolEnum.orbitals_and_eigenvalues: 'orbitals_and_eigenvalues'>, implicit_solvent=None, maxiter=200, scf_properties=[<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.LowdinCharges: 'lowdin_charges'>, <SCFProperties.MullikenCharges: 'mulliken_charges'>, <SCFProperties.MBISCharges: 'mbis_charges'>, <SCFProperties.MayerIndices: 'mayer_indices'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.DipolePolarizabilities: 'dipole_polarizabilities'>], keywords={'dft_spherical_points': 590, 'dft_radial_points': 99})}


# Dataset information

In [13]:
import numpy as np
from collections import Counter

In [14]:
# Headline sizes of the dataset; the record count is reported under the
# "n_conformers" label.
print(f"n_molecules: {dataset.n_molecules}")
print(f"n_conformers: {dataset.n_records}")

n_molecules: 995
n_conformers: 1001


In [15]:
def _per_molecule(getter):
    """Apply ``getter`` to every molecule in ``dataset`` and collect the results in an array."""
    return np.array([getter(mol) for mol in dataset.molecules])


# Conformer count of each molecule in the dataset.
n_confs = _per_molecule(lambda mol: mol.n_conformers)
# Heavy-atom count of each molecule, obtained via its RDKit representation.
n_heavy_atoms = _per_molecule(lambda mol: mol.to_rdkit().GetNumHeavyAtoms())

In [16]:
# Summary statistics of the per-molecule conformer counts.
conformer_summary = (n_confs.min(), n_confs.mean(), n_confs.max())
print("Number of conformers (min, mean, max):", *conformer_summary)

# Histogram of heavy-atom counts, one line per distinct count in ascending
# order, with the count right-aligned in a 3-character column.
print("# heavy atoms")
counts = Counter(n_heavy_atoms)
for n_heavy, n_mols in sorted(counts.items()):
    print(f"{str(n_heavy).rjust(3)}: {n_mols}")

Number of conformers (min, mean, max): 1 1.0060301507537688 2
# heavy atoms
  4: 2
  5: 4
  6: 24
  7: 46
  8: 118
  9: 170
 10: 179
 11: 235
 12: 217


In [17]:
from openff.units import unit
# Distinct total charges present in the dataset, as plain numbers in units
# of the elementary charge.
unique_charges = {
    mol.total_charge.m_as(unit.elementary_charge) for mol in dataset.molecules
}
unique_charges

{-4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0}

In [18]:

def _total_mass(mol):
    """Add up the magnitudes of the per-atom masses of ``mol``."""
    # NOTE(review): the unit is whatever `atom.mass` carries, presumably
    # daltons - confirm.
    return sum(atom.mass.m for atom in mol.atoms)


# Molecular weight of every molecule in the dataset.
masses = np.array([_total_mass(mol) for mol in dataset.molecules])
print("MW (min, mean, max):", masses.min(), masses.mean(), masses.max())

MW (min, mean, max): 58.10263600000002 149.83921897015077 329.815488


In [19]:
# Every element symbol that occurs in at least one molecule of the dataset.
elements = {atom.symbol for mol in dataset.molecules for atom in mol.atoms}
print(elements)

{'P', 'S', 'N', 'C', 'Cl', 'F', 'Br', 'O', 'H'}
