# OpenFF NSP Optimization Set 1 S v1.0

This notebook prepares an optimization dataset for the sulfur subset of the NSP molecule set obtained from PubChem.

In [1]:
import qcportal
import pathlib
from pprint import pprint
from openff.toolkit import Molecule, ForceField
import numpy as np
import tqdm

from openff.qcsubmit.utils import get_symmetry_classes, get_symmetry_group
from openff.qcsubmit import workflow_components
from openff.qcsubmit.factories import OptimizationDatasetFactory
from openff.qcsubmit.utils.visualize import molecules_to_pdf
from qcportal.singlepoint import SinglepointDriver, QCSpecification

In [2]:
# Read every molecule from the input SMILES file. allow_undefined_stereo=True
# is passed so that entries without fully specified stereochemistry still load.
input_file = 'set1-S-smiles.smi'
molecules = Molecule.from_file(input_file, allow_undefined_stereo=True)
# Render the input molecules for visual inspection.
# NOTE: the export cell at the end of this notebook writes "dataset.pdf" again
# through dataset.visualize, so this rendering is replaced on export.
molecules_to_pdf(molecules, "dataset.pdf")

In [3]:
# Number of molecules read from the input file.
len(molecules)

647

In [4]:
# Factory with a single workflow step: conformer generation, capped at
# 10 conformers per molecule.
dataset_factory = OptimizationDatasetFactory()
conformer_generator = workflow_components.StandardConformerGenerator(max_conformers=10)
dataset_factory.add_workflow_components(conformer_generator)

# Free-text description stored with the dataset (two lines, each newline-terminated).
description = (
    "This optimization dataset looks at the coverage for NSP elements.\n"
    "This dataset uses the OpenFF default level of theory (B3LYP-D3BJ/DZVP), and WB97M-D3BJ/def2-TZVP.\n"
)

# Run the factory workflow over the input molecules to produce the dataset.
dataset = dataset_factory.create_dataset(
    molecules=molecules,
    dataset_name="OpenFF NSP Optimization Set 1 S v1.0",
    tagline="Molecules curated from Pubchem for Sulfur",
    description=description,
)

# The long-description URL is derived from the name of the current working
# directory, so this notebook is expected to be run from inside its
# submission folder.
submission_dir = pathlib.Path.cwd().name
dataset.metadata.submitter = "pavankum"
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    f"submissions/{submission_dir}"
)

Deduplication                 : 100%|████████| 647/647 [00:02<00:00, 254.84it/s]
[13:28:33] UFFTYPER: Unrecognized charge state for atom: 1
[13:28:34] UFFTYPER: Unrecognized charge state for atom: 1
[13:28:34] UFFTYPER: Unrecognized charge state for atom: 1
[13:28:34] UFFTYPER: Unrecognized charge state for atom: 12
[13:28:34] UFFTYPER: Unrecognized charge state for atom: 1
[13:28:34] UFFTYPER: Unrecognized charge state for atom: 10
[13:28:34] UFFTYPER: Unrecognized charge state for atom: 7
[13:28:34] UFFTYPER: Unrecognized charge state for atom: 1
[13:28:35] UFFTYPER: Unrecognized charge state for atom: 6
[13:28:35] UFFTYPER: Unrecognized charge state for atom: 1
StandardConformerGenerator    : 100%|█████████| 634/634 [04:17<00:00,  2.46it/s]
Preparation                   : 100%|█████████| 634/634 [01:14<00:00,  8.46it/s]


In [5]:
# Number of unique molecules in the generated dataset.
dataset.n_molecules

634

Add a new QC specification to the dataset, in addition to the `default` one, using `add_qc_spec`.

    Parameters:
        method: The name of the method to use, e.g. B3LYP-D3BJ
        basis: The name of the basis to use; can also be `None`
        program: The name of the program to execute the computation
        spec_name: The name the spec should be stored under
        spec_description: The description of the spec
        store_wavefunction: Which parts of the wavefunction should be saved
        overwrite: If there is a spec under this name already, overwrite it
        implicit_solvent: The implicit solvent settings, if one is to be used
        maxiter: The maximum number of SCF iterations that should be done
        scf_properties: The list of SCF properties that should be extracted from the calculation
        keywords: Program-specific computational keywords that should be passed to
            the program

In [6]:
# Attach a second QC specification to the dataset. The same label is used
# both as the spec name and as its description.
wb97m_spec_label = "WB97M-D3BJ/def2-TZVP"
wb97m_scf_properties = [
    'dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices',
    'lowdin_charges',
    'mulliken_charges',
]
dataset.add_qc_spec(
    method='WB97M-D3BJ',
    basis='def2-TZVP',
    program='psi4',
    spec_name=wb97m_spec_label,
    spec_description=wb97m_spec_label,
    store_wavefunction='none',
    implicit_solvent=None,
    maxiter=200,
    scf_properties=wb97m_scf_properties,
    keywords={},
)
        

In [7]:
# Show every QC specification currently attached to the dataset, keyed by spec name.
pprint(dataset.dict()['qc_specifications'])

{'WB97M-D3BJ/def2-TZVP': {'basis': 'def2-TZVP',
                          'implicit_solvent': None,
                          'keywords': {},
                          'maxiter': 200,
                          'method': 'WB97M-D3BJ',
                          'program': 'psi4',
                          'scf_properties': ['dipole',
                                             'quadrupole',
                                             'wiberg_lowdin_indices',
                                             'mayer_indices',
                                             'lowdin_charges',
                                             'mulliken_charges'],
                          'spec_description': 'WB97M-D3BJ/def2-TZVP',
                          'spec_name': 'WB97M-D3BJ/def2-TZVP',
                          'store_wavefunction': 'none'},
 'default': {'basis': 'DZVP',
             'implicit_solvent': None,
             'keywords': {},
             'maxiter': 200,
             'method': 'B3LYP

In [8]:
# Summarize the dataset for the README, then export it.

# Conformer count of each molecule in the dataset.
confs = np.array([len(mol.conformers) for mol in dataset.molecules])

print("* Number of unique molecules:", dataset.n_molecules)
print("* Number of filtered molecules:", dataset.n_filtered)
print("* Number of conformers:", confs.sum())
print(
    "* Number of conformers per molecule (min, mean, max): "
    f"{confs.min()}, {confs.mean():.2f}, {confs.max()}"
)

# One molecular weight per molecule (sum of its atomic masses), built once as
# a flat array instead of a nested list that is re-converted for every statistic.
masses = np.array(
    [
        sum(atom.mass.m for atom in molecule.atoms)
        for molecule in dataset.molecules
    ]
)
print(f"* Mean molecular weight: {masses.mean():.2f}")
print(f"* Min molecular weight: {masses.min():.2f}")
print(f"* Max molecular weight: {masses.max():.2f}")
print("* Charges:", sorted(set(m.total_charge.m for m in dataset.molecules)))


print("## Metadata")
# Sort the element symbols so the README line is identical on every run; the
# metadata collection itself does not come back in a stable order.
elements = sorted(dataset.metadata.dict()['elements'])
print(f"* Elements: {{{', '.join(elements)}}}")


# Specification fields reported in the README, in print order.
fields = [
    "basis",
    "implicit_solvent",
    "keywords",
    "maxiter",
    "method",
    "program",
]
for spec, obj in dataset.qc_specifications.items():
    od = obj.dict()
    print("* Spec:", spec)
    for field in fields:
        print(f"\t * {field}: {od[field]}")
    print("\t* SCF properties:")
    for field in od["scf_properties"]:
        print(f"\t\t* {field}")


# Export the dataset: serialized dataset, SMILES listing, and a PDF rendering.
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("dataset.smi", "smi")
dataset.visualize("dataset.pdf", columns=4)

* Number of unique molecules: 634
* Number of filtered molecules: 0
* Number of conformers: 3852
* Number of conformers per molecule (min, mean, max): 1, 6.08, 10
* Mean molecular weight: 256.48
* Min molecular weight: 87.14
* Max molecular weight: 590.93
* Charges: [-2.0, -1.0, 0.0, 1.0, 2.0]
## Metadata
* Elements: {C, Cl, O, Br, P, F, H, S, I, N}
* Spec: default
	 * basis: DZVP
	 * implicit_solvent: None
	 * keywords: {}
	 * maxiter: 200
	 * method: B3LYP-D3BJ
	 * program: psi4
	* SCF properties:
		* dipole
		* quadrupole
		* wiberg_lowdin_indices
		* mayer_indices
* Spec: WB97M-D3BJ/def2-TZVP
	 * basis: def2-TZVP
	 * implicit_solvent: None
	 * keywords: {}
	 * maxiter: 200
	 * method: WB97M-D3BJ
	 * program: psi4
	* SCF properties:
		* dipole
		* quadrupole
		* wiberg_lowdin_indices
		* mayer_indices
		* lowdin_charges
		* mulliken_charges
