In [1]:
import logging
import warnings
from pprint import pprint

import numpy as np
from openeye import oechem
from openff.qcsubmit.common_structures import QCSpec, PCMSettings
from openff.qcsubmit.factories import OptimizationDatasetFactory
from openff.qcsubmit.workflow_components import StandardConformerGenerator
from openff.toolkit.topology import Molecule
from qcelemental.models.results import WavefunctionProtocolEnum
from tqdm import tqdm



In [2]:
# Remove the OpenEye wrapper from the global toolkit registry.
# NOTE(review): presumably this is so the remaining registered toolkit(s) handle
# registry-dispatched work such as SMILES parsing and conformer generation — confirm.
# OpenEye must still be installed: a later cell calls `molecule.to_openeye()` and
# `oechem.OECalculateMolecularWeight` directly.
from openff.toolkit.utils import GLOBAL_TOOLKIT_REGISTRY, OpenEyeToolkitWrapper
GLOBAL_TOOLKIT_REGISTRY.deregister_toolkit(OpenEyeToolkitWrapper)

In [3]:
# Quieten the notebook output. Per the original author's note, the messages being
# hidden are the ones about undefined stereochemistry and charged molecules.
# Note that the warnings filter below is global: it ignores *every* Python warning,
# while the "openff.toolkit" logger is restricted to ERROR and above.
warnings.simplefilter("ignore")
logging.getLogger("openff.toolkit").setLevel(logging.ERROR)

# Dataset Preparation

Load in the SMILES patterns of the molecules to include:

In [4]:
# Read one SMILES pattern per line from "molecules.smi", skipping empty lines
# (including the empty string that follows a trailing newline).
with open("molecules.smi") as smiles_file:
    smiles_patterns = [line for line in smiles_file.read().split("\n") if line]

Load in the molecules to be optimized:

In [5]:
# Build one Molecule per SMILES pattern, with a progress bar, then report the count.
molecules = []
for smiles_pattern in tqdm(smiles_patterns):
    molecules.append(Molecule.from_smiles(smiles_pattern))

print(len(molecules))

 ... (more hidden) ...

39





Prepare the main dataset from the molecule list.

In [6]:
# Dipole perturbation vectors: one entry per Cartesian axis and sign, keyed
# "X-", "X+", "Y-", "Y+", "Z-", "Z+". Each value is an [x, y, z] list with a single
# non-zero component of magnitude `field_strength`.
# NOTE(review): presumably in the units Psi4 expects for PERTURB_DIPOLE — confirm.
#
# The ultra-fine DFT grid keywords used for the V1 dataset (added there because of
# occasional SCF failures, see http://forum.psicode.org/t/dft-scf-not-converging/1725/3)
# are intentionally not applied in this notebook.
field_strength = 0.01

external_field = {}
for axis_index, axis_label in enumerate("XYZ"):
    for sign_label, sign in (("-", -1.0), ("+", 1.0)):
        # Start from a fresh zero vector so the lists are independent objects.
        field_vector = [0.0, 0.0, 0.0]
        field_vector[axis_index] = sign * field_strength
        external_field[f"{axis_label}{sign_label}"] = field_vector


In [7]:
# Arguments common to every specification: the field-free reference and the six
# dipole-perturbed variants all use the same method, basis, description, and
# wavefunction storage protocol.
shared_spec_settings = dict(
    method="MP2",
    basis="aug-cc-pVTZ",
    spec_description=(
        "The quantum chemistry specification used to generate data for typed polarizabilities training."
    ),
    store_wavefunction=WavefunctionProtocolEnum.orbitals_and_eigenvalues,
)

qc_specifications = {}

# One specification per entry of `external_field`; the direction label
# ("X-", "X+", ...) is appended to the specification name.
for direction, field_vector in external_field.items():
    perturbed_name = f"MP2/aug-cc-pVTZ/{direction}"
    qc_specifications[perturbed_name] = QCSpec(
        spec_name=perturbed_name,
        keywords={
            "scf_type": "df",
            "mp2_type": "df",
            "E_CONVERGENCE": "1.0e-8",
            "PERTURB_H": True,
            "PERTURB_WITH": "DIPOLE",
            "PERTURB_DIPOLE": field_vector,
        },
        **shared_spec_settings,
    )

# Field-free reference specification.
# NOTE(review): unlike the perturbed specs, this one passes no `keywords`
# (no scf_type / mp2_type / E_CONVERGENCE) — confirm the program defaults give
# settings consistent with the perturbed calculations.
qc_specifications["MP2/aug-cc-pVTZ"] = QCSpec(
    spec_name="MP2/aug-cc-pVTZ",
    **shared_spec_settings,
)

In [8]:
# Conformer generation component added to the factory workflow: at most 5
# conformers per molecule, with an RMS cutoff of 0.1 and clear_existing=True.
conformer_generator = StandardConformerGenerator(
    max_conformers=5,
    rms_cutoff=0.1,
    clear_existing=True,
)

dataset_factory = OptimizationDatasetFactory(qc_specifications=qc_specifications)
dataset_factory.add_workflow_components(conformer_generator)

# Build the optimization dataset from the molecules loaded above.
dataset_description = (
    "A data set used for training typed polarizabilities using direct polarization.\n"
    "This data set only includes element C, H, N, and O."
)
dataset = dataset_factory.create_dataset(
    dataset_name="OpenFF RESP Polarizability Optimizations v1.1",
    tagline="Optimizations of ESP-fitting based direct polarizabilities.",
    description=dataset_description,
    molecules=molecules,
)

# Submission metadata.
# NOTE(review): the URL folder name says "single-point" while this is an
# optimization dataset — confirm the link points at the intended submission.
submission_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    "submissions/"
    "2021-10-01-OpenFF-resppol-mp2-single-point"
)
dataset.metadata.submitter = "willawang"
dataset.metadata.long_description_url = submission_url

 ... (more hidden) ...
 ... (more hidden) ...
 ... (more hidden) ...


Make sure the molecules in the dataset match the input molecules

In [9]:
# Compare the input molecules with the dataset contents using non-isomeric SMILES.
# The input patterns are round-tripped through Molecule so both sides are written
# by the same SMILES writer.
old_smiles = {Molecule.from_smiles(smiles).to_smiles(isomeric=False) for smiles in smiles_patterns}
new_smiles = {molecule.to_smiles(isomeric=False) for molecule in dataset.molecules}

# Report *which* molecules differ instead of failing with a bare AssertionError.
missing_smiles = old_smiles - new_smiles
unexpected_smiles = new_smiles - old_smiles
assert not missing_smiles and not unexpected_smiles, (
    "Dataset molecules do not match the input molecules: "
    f"missing from dataset = {sorted(missing_smiles)}, "
    f"not in input = {sorted(unexpected_smiles)}"
)

In [10]:
# Conformer count per dataset molecule.
confs = np.array([len(mol.conformers) for mol in dataset.molecules])

print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
print("Number of conformers min mean max", confs.min(), f"{confs.mean():6.2f}", confs.max())

# Molecular weights are computed with OpenEye on the converted molecules.
masses = [
    oechem.OECalculateMolecularWeight(molecule.to_openeye())
    for molecule in dataset.molecules
]
mass_array = np.array(masses)

print(f'Mean molecular weight: {mass_array.mean():.2f}')
print(f'Max molecular weight: {mass_array.max():.2f}')

# Dividing by the unit strips it, leaving plain numbers for the distinct total charges.
unique_charges = {m.total_charge / m.total_charge.unit for m in dataset.molecules}
print("Charges:", sorted(unique_charges))

Number of unique molecules        39
Number of filtered molecules      0
Number of conformers              105
Number of conformers min mean max 1   2.69 5
Mean molecular weight: 90.54
Max molecular weight: 158.24
Charges: [-1.0, 0.0, 1.0]


Describe the dataset

In [11]:
# Pretty-print the dataset metadata as a plain dictionary.
metadata_fields = dataset.metadata.dict()
pprint(metadata_fields)

{'collection_type': 'OptimizationDataset',
 'creation_date': datetime.date(2021, 10, 14),
 'dataset_name': 'OpenFF RESP Polarizability Optimizations v1.1',
 'elements': {'O', 'C', 'N', 'H'},
 'long_description': 'A data set used for training typed polarizabilities '
                     'using direct polarization.\n'
                     'This data set only includes element C, H, N, and O.',
 'long_description_url': HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-10-01-OpenFF-resppol-mp2-single-point', scheme='https', host='github.com', tld='com', host_type='domain', path='/openforcefield/qca-dataset-submission/tree/master/submissions/2021-10-01-OpenFF-resppol-mp2-single-point'),
 'short_description': 'Optimizations of ESP-fitting based direct '
                      'polarizabilities.',
 'submitter': 'willawang'}


In [12]:
# Print every QC specification stored on the dataset, one dictionary per spec name.
for spec_name, specification in dataset.qc_specifications.items():
    print("Spec:", spec_name)
    pprint(specification.dict())

Spec: MP2/aug-cc-pVTZ/X-
{'basis': 'aug-cc-pVTZ',
 'implicit_solvent': None,
 'keywords': {'E_CONVERGENCE': '1.0e-8',
              'PERTURB_DIPOLE': [-0.01, 0.0, 0.0],
              'PERTURB_H': True,
              'PERTURB_WITH': 'DIPOLE',
              'mp2_type': 'df',
              'scf_type': 'df'},
 'maxiter': 200,
 'method': 'MP2',
 'program': 'psi4',
 'scf_properties': ['dipole',
                    'quadrupole',
                    'wiberg_lowdin_indices',
                    'mayer_indices'],
 'spec_description': 'The quantum chemistry specification used to generate '
                     'data for typed polarizabilities training.',
 'spec_name': 'MP2/aug-cc-pVTZ/X-',
 'store_wavefunction': 'orbitals_and_eigenvalues'}
Spec: MP2/aug-cc-pVTZ/X+
{'basis': 'aug-cc-pVTZ',
 'implicit_solvent': None,
 'keywords': {'E_CONVERGENCE': '1.0e-8',
              'PERTURB_DIPOLE': [0.01, 0.0, 0.0],
              'PERTURB_H': True,
              'PERTURB_WITH': 'DIPOLE',
              'mp2_t

Export the dataset.

In [13]:
# Output files, all written to the current working directory.
dataset_archive_path = "dataset-v1.1.json.bz2"
smiles_output_path = "dataset-v1.1.smi"
overview_pdf_path = "dataset-v1.1.pdf"

# Export the dataset itself, then the molecules in "smi" format.
dataset.export_dataset(dataset_archive_path)
dataset.molecules_to_file(smiles_output_path, "smi")

# Visual overview of the dataset, laid out with 8 columns.
dataset.visualize(overview_pdf_path, columns=8)