In [1]:
from collections import Counter

import h5py
import numpy as np
from openff.qcsubmit.datasets import OptimizationDataset
from openff.qcsubmit.factories import OptimizationDatasetFactory
from openff.toolkit import Molecule, Quantity, unit

# Generating molecules

Pulling out the lowest energy conformer of each molecule to run a full geometry optimization on.

SPICE2 hdf5 file can be downloaded [here](https://zenodo.org/records/10975225).
It is not included with this submission because it is 35 GB; the extraction code is kept below (commented out) for provenance.

In [5]:
# # Extract the DES370k monomer subset

# des370k_results = {}
# with h5py.File("/Users/amcisaac/Documents/SPICE/SPICE-2.0.1.hdf5") as f:
#     for key in list(f.keys()):
#         group = f[key]
#         subset = group['subset'][()][0].decode('utf-8')

#         if subset == 'SPICE DES Monomers Single Points Dataset v1.1':
#             smiles = group['smiles'][()][0].decode('utf-8')
#             confs = group['conformations'][()]
#             energies = group['dft_total_energy'][()]
#             dEs = energies - min(energies)
#             des370k_results[smiles] = {'conformations':confs,'dft_total_energy':energies,'dEs':dEs}

In [6]:
# # Identify the lowest energy conformer for each molecule

# mols=[]
# for i,key in enumerate(list(des370k_results.keys())):
#     result = des370k_results[key]
#     idx_lowe = np.argmin(result['dEs'])
#     conf_lowe = result['conformations'][idx_lowe]

#     mol = Molecule.from_mapped_smiles(key,allow_undefined_stereo=True)
#     mol.add_conformer(Quantity(conf_lowe, unit.bohr)) # OpenFF converts to angstroms upon addition, then back to Bohr when adding to dataset

#     mols.append(mol)
    

In [7]:
# # Save as a dataset just to minimize converting between different ecosystems.
# # Could also save as SDF files
# dataset_factory_molsonly = OptimizationDatasetFactory()

# dataset_molsonly = dataset_factory_molsonly.create_dataset(
#     dataset_name="SPICE DES370k Monomers Molecules Placeholder",
#     tagline="SPICE DES370k Monomers Molecules Placeholder.",
#     description=(
#         "SPICE DES370k Monomers Molecules Placeholder"
#     ),
#     molecules=mols
# )

# dataset_molsonly.export_dataset("des370k_monomers_minEconf.json")

# Generate the actual dataset

In [8]:
# Reload the lowest-energy conformers from the placeholder dataset file
# ('des370k_monomers_minEconf.json') that the commented-out cells above export.
placeholder_dataset = OptimizationDataset.parse_file('des370k_monomers_minEconf.json')
des370k_monomer_lowestE_confs = list(placeholder_dataset.molecules)

In [9]:
# Human-readable metadata for the submission, kept in named values so the
# factory call below stays short.
DATASET_NAME = "SPICE DES370k Monomers Lowest E Conformer Optimization Dataset v4.0"
DATASET_TAGLINE = (
    "B3LYP-D3BJ/DZVP relaxation of the lowest energy conformer of each molecule "
    "in the DES370k monomer dataset."
)
DATASET_DESCRIPTION = (
    "A dataset containing the lowest energy conformer of all molecules from the "
    "`SPICE DES Monomers Single Points Dataset v1.1` dataset, "
    "optimized at the OpenFF default level of theory (B3LYP-D3BJ/DZVP). "
    "Detailed description on how the original dataset is generated can be found at "
    "https://github.com/openmm/qmdataset/tree/main/des370k."
)
SUBMISSION_URL = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/"
    "2025-01-08-SPICE-DES370k-Monomers-Lowest-E-Conformer-Optimization-Dataset-v4.0"
)

# Build the optimization dataset from the previously loaded conformers using a
# factory with its default settings.
dataset_factory = OptimizationDatasetFactory()
dataset = dataset_factory.create_dataset(
    dataset_name=DATASET_NAME,
    tagline=DATASET_TAGLINE,
    description=DATASET_DESCRIPTION,
    molecules=des370k_monomer_lowestE_confs,
)

# Attach submitter and submission-directory link to the metadata.
dataset.metadata.submitter = "amcisaac"
dataset.metadata.long_description_url = SUBMISSION_URL

Deduplication                 : 100%|███████| 374/374 [00:00<00:00, 3120.13it/s]




































Preparation                   : 100%|████████| 374/374 [00:01<00:00, 336.00it/s]


## Analyze dataset

In [10]:
# Per-molecule summary statistics; the later cells reuse these arrays when
# printing and when writing the long description.

# Number of conformers stored on each molecule in the dataset.
n_confs = np.array([mol.n_conformers for mol in dataset.molecules])

# Heavy-atom count per molecule, obtained through the RDKit representation.
n_heavy_atoms = np.array(
    [mol.to_rdkit().GetNumHeavyAtoms() for mol in dataset.molecules]
)

# Molecular weight per molecule: sum of the atomic mass magnitudes (`.m`).
masses = np.array(
    [sum(atom.mass.m for atom in mol.atoms) for mol in dataset.molecules]
)

# Set of element symbols present anywhere in the dataset.
elements = {atom.symbol for mol in dataset.molecules for atom in mol.atoms}

# Distinct total charges (in elementary-charge units), sorted and stringified
# so they can be joined directly into the description text.
unique_charges = [
    str(charge)
    for charge in sorted(
        {
            mol.total_charge.m_as(unit.elementary_charge)
            for mol in dataset.molecules
        }
    )
]

# Histogram of molecules by heavy-atom count. `Counter` comes from the
# notebook's import cell rather than being imported mid-cell.
print("# heavy atoms")
counts = Counter(n_heavy_atoms)
for n_heavy in sorted(counts):
    print(f"{str(n_heavy):>3}: {counts[n_heavy]}")


# heavy atoms
  1: 5
  2: 14
  3: 25
  4: 52
  5: 78
  6: 89
  7: 58
  8: 34
  9: 10
 10: 4
 11: 3
 12: 2


In [11]:
def print_field(od, field):
    """Print one entry of the dict `od` as an indented bullet: `  * field: value`."""
    print(f"  * {field}: {od[field]}")


# Headline statistics, formatted as markdown bullets.
summary_lines = [
    '* Name: {}'.format(dataset.dataset_name),
    '* Number of unique molecules: {}'.format(dataset.n_molecules),
    '* Number of conformers: {}'.format(dataset.n_records),
    '* Number of conformers (min, mean, max): {:.2f}, {:.2f}, {:.2f}'.format(
        min(n_confs), np.mean(n_confs), max(n_confs)
    ),
    '* Molecular weight (min, mean, max): {:.2f}, {:.2f}, {:.2f}'.format(
        min(masses), np.mean(masses), max(masses)
    ),
    '* Charges: {}'.format(' '.join(unique_charges)),
]
for summary_line in summary_lines:
    print(summary_line)

# Elements as recorded in the dataset metadata, wrapped in literal braces.
print("## Metadata")
element_names = dataset.metadata.dict()['elements']
print(f"* Elements: {{{', '.join(element_names)}}}")

# One bullet group per QC specification: selected fields, then SCF properties.
fields = ["basis", "implicit_solvent", "keywords", "maxiter", "method", "program"]
for spec_name, spec in dataset.qc_specifications.items():
    spec_dict = spec.dict()
    print("* Spec:", spec_name)
    for field in fields:
        print_field(spec_dict, field)
    print("  * SCF properties:")
    for scf_property in spec_dict["scf_properties"]:
        print(f"    * {scf_property}")

* Name: SPICE DES370k Monomers Lowest E Conformer Optimization Dataset v4.0
* Number of unique molecules: 374
* Number of conformers: 374
* Number of conformers (min, mean, max): 1.00, 1.00, 1.00
* Molecular weight (min, mean, max): 16.04, 95.89, 284.78
* Charges: -1.0 0.0 1.0
## Metadata
* Elements: {S, O, F, P, Br, N, H, C, Cl, I}
* Spec: default
  * basis: DZVP
  * implicit_solvent: None
  * keywords: {}
  * maxiter: 200
  * method: B3LYP-D3BJ
  * program: psi4
  * SCF properties:
    * dipole
    * quadrupole
    * wiberg_lowdin_indices
    * mayer_indices


## Update description based on analysis and export

In principle, we could analyze the molecules before adding them to the dataset, and just do the description once. I'm doing it after in order to make sure it's reflective of any changes made when adding molecules to the dataset.

In [12]:
# Template for the long description: the original summary paragraph followed
# by a bullet list whose placeholders are filled from the analysis above.
long_description_template = (
    "A dataset containing the lowest energy conformer of all molecules from the "
    "`SPICE DES Monomers Single Points Dataset v1.1` dataset, "
    "optimized at the OpenFF default level of theory (B3LYP-D3BJ/DZVP). "
    "Detailed description on how the original dataset is generated can be found at https://github.com/openmm/qmdataset/tree/main/des370k.\n\n"
    "Dataset information:\n"
    "* Number of unique molecules: {}\n"
    "* Number of conformers: {}\n"
    "* Number of conformers (min, mean, max): {:.2f}, {:.2f}, {:.2f}\n"
    "* Molecular weight (min, mean, max): {:.2f}, {:.2f}, {:.2f}\n"
    "* Charges: {}\n"
    "* Elements: {}\n"
    "* Submission directory: {}"
)

# Elements come from the dataset metadata and are wrapped in literal braces.
elements_text = f"{{{', '.join(dataset.metadata.dict()['elements'])}}}"
charges_text = ' '.join(unique_charges)

dataset.metadata.long_description = long_description_template.format(
    dataset.n_molecules,
    dataset.n_records,
    min(n_confs),
    np.mean(n_confs),
    max(n_confs),
    min(masses),
    np.mean(masses),
    max(masses),
    charges_text,
    elements_text,
    dataset.metadata.long_description_url,
)

# Keep the top-level description identical to the metadata long description.
dataset.description = dataset.metadata.long_description

In [13]:
# Show the top-level description so the formatted text can be checked by eye.
print(dataset.description)

A dataset containing the lowest energy conformer of all molecules from the `SPICE DES Monomers Single Points Dataset v1.1` dataset, optimized at the OpenFF default level of theory (B3LYP-D3BJ/DZVP). Detailed description on how the original dataset is generated can be found at https://github.com/openmm/qmdataset/tree/main/des370k.

Dataset information:
* Number of unique molecules: 374
* Number of conformers: 374
* Number of conformers (min, mean, max): 1.00, 1.00, 1.00
* Molecular weight (min, mean, max): 16.04, 95.89, 284.78
* Charges: -1.0 0.0 1.0
* Elements: {S, O, F, P, Br, N, H, C, Cl, I}
* Submission directory: https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2025-01-08-SPICE-DES370k-Monomers-Lowest-E-Conformer-Optimization-Dataset-v4.0


In [14]:
# Show the metadata long description; dataset.description was assigned from
# this value, so the two printouts should read the same.
print(dataset.metadata.long_description)

A dataset containing the lowest energy conformer of all molecules from the `SPICE DES Monomers Single Points Dataset v1.1` dataset, optimized at the OpenFF default level of theory (B3LYP-D3BJ/DZVP). Detailed description on how the original dataset is generated can be found at https://github.com/openmm/qmdataset/tree/main/des370k.

Dataset information:
* Number of unique molecules: 374
* Number of conformers: 374
* Number of conformers (min, mean, max): 1.00, 1.00, 1.00
* Molecular weight (min, mean, max): 16.04, 95.89, 284.78
* Charges: -1.0 0.0 1.0
* Elements: {S, O, F, P, Br, N, H, C, Cl, I}
* Submission directory: https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2025-01-08-SPICE-DES370k-Monomers-Lowest-E-Conformer-Optimization-Dataset-v4.0


In [18]:
# Write the submission artifacts; all three paths are relative to the working directory.
# Dataset file; presumably bz2-compressed JSON, judging by the extension (confirm against qcsubmit docs).
dataset.export_dataset("dataset.json.bz2")
# Molecule listing written with the 'smi' file format.
dataset.molecules_to_file('dataset.smi', 'smi')
# Visual overview of the dataset written to a PDF, with columns=8.
dataset.visualize("dataset.pdf", columns=8)