In [1]:
import logging
import warnings
from pprint import pprint

import numpy as np
from openff.qcsubmit.common_structures import QCSpec, PCMSettings
from openff.qcsubmit.factories import OptimizationDatasetFactory
from openff.qcsubmit.workflow_components import StandardConformerGenerator
from openff.toolkit.topology import Molecule
from qcelemental.models.results import WavefunctionProtocolEnum
from tqdm import tqdm

In [2]:
# Silence the messages that tell us we have undefined stereo and charged
# molecules: the "openff.toolkit" logger is limited to ERROR and above, and
# Python warnings are ignored.
# NOTE(review): simplefilter("ignore") hides *every* warning raised after this
# cell, not only the toolkit ones — narrow the filter if other warnings matter.
logging.getLogger("openff.toolkit").setLevel(logging.ERROR)
warnings.simplefilter("ignore")

In [3]:
from openff.toolkit.utils import GLOBAL_TOOLKIT_REGISTRY, OpenEyeToolkitWrapper
# Deregister the OpenEye wrapper from the global toolkit registry, but only
# when OpenEye is actually available in this environment.
# NOTE(review): presumably done so the dataset is prepared with the remaining
# registered toolkits regardless of whether OpenEye is installed — confirm.
if OpenEyeToolkitWrapper.is_available():
    GLOBAL_TOOLKIT_REGISTRY.deregister_toolkit(OpenEyeToolkitWrapper)

# Dataset Preparation

If starting from SMILES, load in the SMILES patterns of the molecules to include with something like:

In [4]:
# Read one SMILES pattern per line from "molecules.smi", dropping empty lines
# (e.g. the one produced by a trailing newline).
with open("molecules.smi") as file:
    smiles_patterns = [line for line in file.read().split("\n") if line]

# Build one molecule per SMILES pattern, with a progress bar.
molecules = [Molecule.from_smiles(pattern) for pattern in tqdm(smiles_patterns)]

100%|██████████| 200/200 [00:04<00:00, 47.73it/s]


Prepare the main dataset from the molecule list.

In [5]:
# Extra keywords selecting an 'ultra-fine' DFT grid. They are needed because
# of occasional SCF failures; see the V1 dataset as well as
# http://forum.psicode.org/t/dft-scf-not-converging/1725/3
dft_ultra_fine_keywords = {
    "dft_spherical_points": 590,
    "dft_radial_points": 99,
    "dft_pruning_scheme": "robust",
}

# Factory holding the three QC specifications every molecule is computed with.
# Each specification is registered under its own spec_name, so the dictionary
# key and the name stored on the specification always agree.
dataset_factory = OptimizationDatasetFactory(
    qc_specifications={
        qc_spec.spec_name: qc_spec
        for qc_spec in (
            # HF/6-31G* with no extra keywords.
            QCSpec(
                method="hf",
                basis="6-31G*",
                spec_name="hf/6-31G*",
                spec_description=(
                    "The quantum chemistry specification used to generate the original AM1BCCs."
                ),
                store_wavefunction=WavefunctionProtocolEnum.orbitals_and_eigenvalues,
            ),
            # PW6B95/aug-cc-pV(D+d)Z without implicit solvent, ultra-fine grid.
            QCSpec(
                method="pw6b95",
                basis="aug-cc-pV(D+d)Z",
                spec_name="pw6b95/aug-cc-pV(D+d)Z/uf",
                spec_description=(
                    "The quantum chemistry specification used in the RESP2 publication "
                    "for the vacuum (i.e. no PCM) calculations modified to use an 'ultra-fine' "
                    "DFT grid to improve SCF convergence."
                ),
                keywords=dft_ultra_fine_keywords,
                store_wavefunction=WavefunctionProtocolEnum.orbitals_and_eigenvalues,
            ),
            # Same method, basis and grid, with PCM implicit solvent (water).
            QCSpec(
                method="pw6b95",
                basis="aug-cc-pV(D+d)Z",
                spec_name="pw6b95/aug-cc-pV(D+d)Z/uf,pcm",
                spec_description=(
                    "The quantum chemistry specification used in the RESP2 publication "
                    "for the aqueous (i.e. with PCM) calculations modified to use an 'ultra-fine' "
                    "DFT grid to improve SCF convergence."
                ),
                implicit_solvent=PCMSettings(
                    units="angstrom",
                    cavity_Type="GePol",
                    cavity_Area=0.3,
                    cavity_Scaling=True,
                    cavity_RadiiSet="Bondi",
                    cavity_Mode="Implicit",
                    medium_SolverType="CPCM",
                    medium_Solvent="Water",
                ),
                keywords=dft_ultra_fine_keywords,
                store_wavefunction=WavefunctionProtocolEnum.orbitals_and_eigenvalues,
            ),
        )
    }
)

# Add a conformer-generation step to the workflow. Going by the argument
# names: at most 10 conformers per molecule, an RMS cutoff of 0.1, and any
# existing conformers are cleared first.
dataset_factory.add_workflow_components(
    StandardConformerGenerator(
        max_conformers=10,
        rms_cutoff=0.1,
        clear_existing=True,
    )
)

# Run the factory workflow over the input molecules to build the dataset.
# The tagline previously described "para-substituted aniline derivatives",
# which cannot apply here: this set is limited to C, O, H molecules (see the
# description below), so the tagline now summarises the actual content.
dataset = dataset_factory.create_dataset(
    dataset_name="OpenFF BCC Refit Study COH v2.0",
    tagline="Optimizations of diverse alcohol, ether, and carbonyl containing molecules.",
    description=(
        "A data set curated for the initial stage of the on-going OpenFF "
        "study which aims to co-optimize the AM1BCC bond charge correction (BCC) "
        "parameters against an experimental training set of density and enthalpy of "
        "mixing data points and a QM training set of electric field data."
        "\n\n"
        "The initial data set is limited to only molecules composed of C, O, H. This "
        "limited scope significantly reduces the number of BCC parameters which must "
        "be retrained, thus allowing for easier convergence of the initial "
        "optimizations."
        "\n\n"
        "The included molecules were combinatorially generated to cover a range of "
        "alcohol, ether, and carbonyl containing molecules."
    ),
    molecules=molecules,
)

# Submission metadata: who submits the dataset and where its long-form
# description lives in the qca-dataset-submission repository.
dataset.metadata.submitter = "simonboothroyd"
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    "submissions/"
    "2021-06-22-OpenFF-BCC-Refit-Study-COH-v2.0"
)

# Record the constructure version in the dataset provenance.
# NOTE(review): presumably the tool used to combinatorially generate the input
# molecules — confirm the version matches what was actually used.
dataset.provenance["constructure"] = "0.0.1"

Deduplication                 : 100%|████████| 200/200 [00:00<00:00, 774.89it/s]
StandardConformerGenerator    : 100%|█████████| 200/200 [00:05<00:00, 39.12it/s]
Preparation                   : 100%|█████████| 200/200 [00:06<00:00, 29.85it/s]


Make sure the molecules in the dataset match the input molecules

In [6]:
# Compare the input molecules with those that ended up in the dataset via
# their non-isomeric SMILES (isomeric=False).
# NOTE(review): presumably non-isomeric so that stereo differences do not
# count as mismatches — confirm.
old_smiles = {
    Molecule.from_smiles(smiles).to_smiles(isomeric=False)
    for smiles in smiles_patterns
}
new_smiles = {molecule.to_smiles(isomeric=False) for molecule in dataset.molecules}

# Fail with the offending SMILES listed, rather than a bare AssertionError.
assert old_smiles == new_smiles, (
    "Dataset molecules differ from the input molecules: "
    f"missing from dataset {sorted(old_smiles - new_smiles)}, "
    f"unexpected in dataset {sorted(new_smiles - old_smiles)}"
)

Describe the molecules in the dataset

In [7]:
# Summarise the prepared dataset: molecule and record counts, conformers per
# molecule, molecular weights, and the distinct total charges present.
confs = np.array([len(mol.conformers) for mol in dataset.molecules])

print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
print("Number of conformers min mean max",
      confs.min(), f"{confs.mean():6.2f}", confs.max())

# One weight per molecule: the sum of the magnitudes of its atomic masses.
# Kept as a flat list (it used to be wrapped in an extra, accidental list) so
# NumPy reduces over a plain 1-D sequence.
masses = [
    sum(atom.mass.m for atom in molecule.atoms)
    for molecule in dataset.molecules
]

print(f'Mean molecular weight: {np.mean(masses):.2f}')
print(f'Max molecular weight: {np.max(masses):.2f}')
print("Charges:", sorted(set(molecule.total_charge.m for molecule in dataset.molecules)))

Number of unique molecules        200
Number of filtered molecules      0
Number of conformers              729
Number of conformers min mean max 1   3.65 9
Mean molecular weight: 117.62
Max molecular weight: 204.31
Charges: [0.0]


Describe the dataset

In [8]:
# Pretty-print the dataset metadata as a plain dictionary for review.
pprint(dataset.metadata.dict())

{'collection_type': 'OptimizationDataset',
 'creation_date': datetime.date(2024, 1, 31),
 'dataset_name': 'OpenFF BCC Refit Study COH v2.0',
 'elements': {'H', 'O', 'C'},
 'long_description': 'A data set curated for the initial stage of the on-going '
                     'OpenFF study which aims to co-optimize the AM1BCC bond '
                     'charge correction (BCC) parameters against an '
                     'experimental training set of density and enthalpy of '
                     'mixing data points and a QM training set of electric '
                     'field data.\n'
                     '\n'
                     'The initial data set is limited to only molecules '
                     'composed of C, O, H. This limited scope significantly '
                     'reduces the number of BCC parameters which must be '
                     'retrained, thus allowing for easier convergence of the '
                     'initial optimizations.\n'
                     '\n'
  

In [9]:
# Print every QC specification stored on the dataset: its key first, then all
# of its fields as a dictionary.
for spec, obj in dataset.qc_specifications.items():
    print(f"Spec: {spec}")
    pprint(obj.dict())

Spec: hf/6-31G*
{'basis': '6-31G*',
 'implicit_solvent': None,
 'keywords': {},
 'maxiter': 200,
 'method': 'hf',
 'program': 'psi4',
 'scf_properties': ['dipole',
                    'quadrupole',
                    'wiberg_lowdin_indices',
                    'mayer_indices'],
 'spec_description': 'The quantum chemistry specification used to generate the '
                     'original AM1BCCs.',
 'spec_name': 'hf/6-31G*',
 'store_wavefunction': 'orbitals_and_eigenvalues'}
Spec: pw6b95/aug-cc-pV(D+d)Z/uf
{'basis': 'aug-cc-pV(D+d)Z',
 'implicit_solvent': None,
 'keywords': {'dft_pruning_scheme': 'robust',
              'dft_radial_points': 99,
              'dft_spherical_points': 590},
 'maxiter': 200,
 'method': 'pw6b95',
 'program': 'psi4',
 'scf_properties': ['dipole',
                    'quadrupole',
                    'wiberg_lowdin_indices',
                    'mayer_indices'],
 'spec_description': 'The quantum chemistry specification used in the RESP2 '
                  

In [10]:
# Show the SCF properties requested by the "hf/6-31G*" specification.
pprint(dataset.qc_specifications['hf/6-31G*'].scf_properties)

[<SCFProperties.Dipole: 'dipole'>,
 <SCFProperties.Quadrupole: 'quadrupole'>,
 <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>,
 <SCFProperties.MayerIndices: 'mayer_indices'>]


In [11]:
# Write the submission artifacts: the dataset itself ("dataset.json.bz2"), the
# molecules as SMILES ("dataset.smi") and, best effort, a PDF overview.
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("dataset.smi", "smi")

try:
    dataset.visualize("dataset.pdf", columns=8)
except AttributeError as error:
    # Known failure, patched in QCSubmit >0.50.2. The PDF is optional, so carry
    # on, but say so instead of skipping silently. print() is used because
    # this notebook ignores all warnings (warnings.simplefilter("ignore")).
    print(f"Skipping dataset.pdf: dataset.visualize raised AttributeError: {error}")