In [12]:
from qcportal import PortalClient
from openff.qcsubmit.results import OptimizationResultCollection,BasicResultCollection
from openff.qcsubmit.datasets import BasicDataset
from openff.qcsubmit.results.filters import ConnectivityFilter, RecordStatusEnum, RecordStatusFilter
from openff.qcsubmit.factories import BasicDatasetFactory
from openff.qcsubmit.common_structures import Metadata, QCSpec
import pathlib
import numpy as np

# Load optimization dataset

In [2]:
opt_ds = OptimizationResultCollection.parse_file("optimizations.json")

In [3]:
print(opt_ds.n_molecules,opt_ds.n_results)

63441 297934


# Set up single points

In [4]:
OPTIMIZATION_WHITELISTS = [
    "OpenFF Optimization Set 1",
    "SMIRNOFF Coverage Set 1",
    "OpenFF VEHICLe Set 1",
    "OpenFF Discrepancy Benchmark 1",
    "OpenFF Ehrman Informative Optimization v0.2",
    "Pfizer discrepancy optimization dataset 1",
    "FDA optimization dataset 1",
    "Kinase Inhibitors: WBO Distributions",
    "OpenFF Gen 2 Opt Set 1 Roche",
    "OpenFF Gen 2 Opt Set 2 Coverage",
    "OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy",
    "OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy",
    "OpenFF Gen 2 Opt Set 5 Bayer",
    "OpenFF Sandbox CHO PhAlkEthOH v1.0",
    "OpenFF Industry Benchmark Season 1 v1.1",
    "OpenFF Gen2 Optimization Dataset Protomers v1.0",
    "OpenFF Protein Capped 1-mers 3-mers Optimization Dataset v1.0",
    "OpenFF Iodine Chemistry Optimization Dataset v1.0",
    "XtalPi Shared Fragments OptimizationDataset v1.0",
    "XtalPi 20-percent Fragments OptimizationDataset v1.0",
    "OpenFF Torsion Benchmark Supplement v1.0",
    "OpenFF Torsion Multiplicity Optimization Training Coverage Supplement v1.0",
    "OpenFF Torsion Multiplicity Optimization Benchmarking Coverage Supplement v1.0",
    "OpenFF Iodine Fragment Opt v1.0",
    "OpenFF Sulfur Optimization Training Coverage Supplement v1.0",
    "OpenFF Sulfur Optimization Benchmarking Coverage Supplement v1.0",
    "OpenFF Lipid Optimization Training Supplement v1.0",
    "OpenFF Lipid Optimization Benchmark Supplement v1.0",
    "OpenFF Cresset Additional Coverage Optimizations v4.0",
    "OpenFF Protein PDB 4-mers v4.0"
]

IGNORE_IODINE = [
    "OpenFF Discrepancy Benchmark 1",
    "OpenFF Gen 2 Opt Set 2 Coverage",
    "OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy",
    "SMIRNOFF Coverage Set 1",
    "OpenFF Ehrman Informative Optimization v0.2",
    "FDA optimization dataset 1",
    "Kinase Inhibitors: WBO Distributions",

    # ---
    "OpenFF Gen 2 Torsion Set 2 Coverage 2",
    "OpenFF Gen 2 Torsion Set 4 eMolecules Discrepancy 2",
]

In [19]:
optimization_datasets = "\n".join([f" - {x}" for x in OPTIMIZATION_WHITELISTS])
iodine_datasets = "\n".join([f" - {x}" for x in IGNORE_IODINE])
description = (
    "Hessian single points for the final molecules in the OpenFF datasets listed below "
    "at the B3LYP-D3BJ/DZVP level of theory. "
    "These are used for calculating MSM starting points in force field fits. "
    "The molecules here include the S, H, O, Br, F, N, P, Cl, I, C elements "
    "and the charge states {-4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0}. "
    "They range from 16-1425 Da (mean 224) and 4-99 heavy atoms. "
    "Records were filtered for successful completion, no connectivity changes, "
    "a non-2D structure (where all Z-coordinates are 0), "
    "and whether RDKit can parse the molecule with valid valence. "
    f"The datasets included are:\n{optimization_datasets}"
    "\n\nAny molecules in the below datasets had all molecules containing iodine filtered out, as those records were problematic.\n"
    f"{iodine_datasets}"
)

print(description)

Hessian single points for the final molecules in the OpenFF datasets listed below at the B3LYP-D3BJ/DZVP level of theory. These are used for calculating MSM starting points in force field fits. The molecules here include the S, H, O, Br, F, N, P, Cl, I, C elements and the charge states {-4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0}. They range from 16-1425 Da (mean 224) and 4-99 heavy atoms. Records were filtered for successful completion, no connectivity changes, a non-2D structure (where all Z-coordinates are 0), and whether RDKit can parse the molecule with valid valence. The datasets included are:
 - OpenFF Optimization Set 1
 - SMIRNOFF Coverage Set 1
 - OpenFF VEHICLe Set 1
 - OpenFF Discrepancy Benchmark 1
 - OpenFF Ehrman Informative Optimization v0.2
 - Pfizer discrepancy optimization dataset 1
 - FDA optimization dataset 1
 - Kinase Inhibitors: WBO Distributions
 - OpenFF Gen 2 Opt Set 1 Roche
 - OpenFF Gen 2 Opt Set 2 Coverage
 - OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy
 - OpenFF 

In [6]:
from qcelemental.models import DriverEnum


from collections import defaultdict
import qcportal
from openff.toolkit.topology import Molecule
from openff.toolkit.typing.engines.smirnoff import ForceField
from openff.units import unit
from qcportal.record_models import BaseRecord, RecordStatusEnum

from openff.qcsubmit.common_structures import Metadata, MoleculeAttributes, QCSpec
from openff.qcsubmit.datasets import BasicDataset

import json
import tqdm
import pickle

for record, molecule in tqdm.tqdm(opt_ds.to_records):
    if record.id not in opt_ids:
        continue
    records_by_cmiles[
        molecule.to_smiles(isomeric=True, explicit_hydrogens=True, mapped=True)
    ].append((record, molecule))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 324572/324572 [10:01<00:00, 539.77it/s]


In [20]:
metadata = Metadata(
    submitter="lilyminium",
    long_description_url=(
        "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/"
        + str(pathlib.Path.cwd().name)
    )
)

dataset = BasicDataset(
    dataset_name="OpenFF Optimization Hessians 2019-07 to 2025-03 v4.0",
    description=description,
    dataset_tagline="Hessian single points OpenFF optimization datasets from 2019-07 to 2025-03.",
    driver=DriverEnum.hessian,
    metadata={} if metadata is None else metadata,
    qc_specifications=({"default": QCSpec()}),
)

# this is basically what `create_basic_dataset` does, but with a progress bar
for records in tqdm.tqdm(records_by_cmiles.values()):
    base_record, base_molecule = records[0]
    base_molecule._conformers = [m.conformers[0] for _, m in records]

    dataset.add_molecule(
        index=base_molecule.to_smiles(
            isomeric=True, explicit_hydrogens=False, mapped=False
        ),
        molecule=None,
        initial_molecules=[rec.final_molecule for rec, _ in records],
        attributes=MoleculeAttributes.from_openff_molecule(base_molecule),
        extras=base_record.extras,
        keywords=base_record.specification.keywords,
    )

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63446/63446 [19:28<00:00, 54.29it/s]


In [21]:
dataset._get_specifications()

{'default': QCSpecification(program='psi4', driver=<SinglepointDriver.hessian: 'hessian'>, method='b3lyp-d3bj', basis='dzvp', keywords={'maxiter': 200, 'scf_properties': [<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>]}, protocols=AtomicResultProtocols(wavefunction=<WavefunctionProtocolEnum.none: 'none'>, stdout=True, error_correction=ErrorCorrectionProtocol(default_policy=True, policies=None), native_files=<NativeFilesProtocolEnum.none: 'none'>))}

# Exporting dataset

In [22]:
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file('dataset.smi', 'smi')
dataset.visualize("dataset.pdf", columns=8)

print(dataset.qc_specifications)



{'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>, implicit_solvent=None, maxiter=200, scf_properties=[<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>], keywords={})}


# Dataset information

In [14]:
import numpy as np
from collections import Counter

In [8]:
print("n_molecules:", dataset.n_molecules)
print("n_conformers:", dataset.n_records)

n_molecules: 63446
n_conformers: 297934


In [21]:
print(
    "Number of conformers (min, mean, max):",
    n_confs.min(), n_confs.mean(), n_confs.max()
)
print("# heavy atoms")
counts = Counter(n_heavy_atoms)
for n_heavy in sorted(counts):
    print(f"{str(n_heavy):>3}: {counts[n_heavy]}")

Number of conformers (min, mean, max): 1 4.833672188004544 275
# heavy atoms
  1: 4
  2: 9
  3: 22
  4: 108
  5: 259
  6: 569
  7: 988
  8: 2162
  9: 3656
 10: 8039
 11: 8222
 12: 8491
 13: 6388
 14: 3611
 15: 2421
 16: 1801
 17: 1528
 18: 1472
 19: 1589
 20: 1549
 21: 1620
 22: 1601
 23: 1548
 24: 1311
 25: 1008
 26: 893
 27: 857
 28: 785
 29: 720
 30: 720
 31: 618
 32: 556
 33: 437
 34: 379
 35: 170
 36: 132
 37: 78
 38: 80
 39: 48
 40: 52
 41: 38
 42: 44
 43: 31
 44: 35
 45: 18
 46: 22
 47: 17
 48: 16
 49: 10
 50: 12
 51: 12
 52: 12
 53: 17
 54: 16
 55: 6
 56: 12
 57: 5
 58: 4
 59: 2
 60: 2
 61: 8
 62: 5
 63: 5
 64: 5
 65: 3
 66: 2
 67: 3
 68: 6
 69: 4
 70: 1
 72: 1
 74: 1
 75: 2
 76: 1
 78: 1
 82: 3
 83: 1
 85: 2
 86: 1
 90: 1
 93: 1
 95: 1
 98: 1
 99: 1


In [15]:
from openff.units import unit
unique_charges = set()
masses = []
elements = set()

n_confs = []
n_heavy_atoms = []
for mol in tqdm.tqdm(dataset.molecules):
    n_confs.append(mol.n_conformers)
    n_heavy_atoms.append(mol.to_rdkit().GetNumHeavyAtoms())
    unique_charges.add(mol.total_charge.m)
    mass = sum([atom.mass.m for atom in mol.atoms])
    masses.append(mass)
    els = set([atom.symbol for atom in mol.atoms])
    elements |= els
unique_charges

63446it [05:04, 208.28it/s]


{-4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0}

In [16]:
n_confs = np.array(n_confs)
print(
    "Number of conformers (min, mean, max):",
    n_confs.min(), n_confs.mean(), n_confs.max()
)
print("# heavy atoms")
counts = Counter(n_heavy_atoms)
for n_heavy in sorted(counts):
    print(f"{str(n_heavy):>3}: {counts[n_heavy]}")

Number of conformers (min, mean, max): 1 4.695867351763704 275
# heavy atoms
  1: 1
  2: 7
  3: 19
  4: 93
  5: 223
  6: 499
  7: 876
  8: 2022
  9: 3265
 10: 7315
 11: 7891
 12: 8140
 13: 6091
 14: 3374
 15: 2254
 16: 1698
 17: 1460
 18: 1420
 19: 1534
 20: 1507
 21: 1576
 22: 1574
 23: 1527
 24: 1285
 25: 998
 26: 881
 27: 852
 28: 780
 29: 712
 30: 710
 31: 605
 32: 551
 33: 430
 34: 371
 35: 167
 36: 125
 37: 78
 38: 77
 39: 46
 40: 51
 41: 38
 42: 41
 43: 27
 44: 33
 45: 16
 46: 21
 47: 16
 48: 16
 49: 10
 50: 11
 51: 10
 52: 10
 53: 16
 54: 14
 55: 6
 56: 11
 57: 5
 58: 4
 59: 1
 60: 2
 61: 8
 62: 5
 63: 4
 64: 5
 65: 3
 66: 1
 67: 3
 68: 4
 69: 3
 70: 1
 72: 1
 74: 1
 75: 2
 76: 1
 78: 1
 82: 3
 83: 1
 85: 2
 86: 1
 90: 1
 95: 1
 99: 1


In [17]:
masses = np.array(masses)
print("MW (min, mean, max):", masses.min(), masses.mean(), masses.max())

MW (min, mean, max): 16.042568 224.35978900176767 1425.3385582000012


In [18]:
print(elements)

{'S', 'H', 'O', 'Br', 'F', 'N', 'P', 'Cl', 'I', 'C'}


In [23]:
def print_field(od, field): print(f"  * {field}: {od[field]}")

fields = ["basis", "implicit_solvent", "keywords", "maxiter", "method", "program"]
for spec, obj in dataset.qc_specifications.items():
    od = obj.dict()
    print("* Spec:", spec)
    for field in fields:
        print_field(od, field)
    print("  * SCF properties:")
    for field in od["scf_properties"]:
        print(f"    * {field}")

* Spec: default
  * basis: DZVP
  * implicit_solvent: None
  * keywords: {}
  * maxiter: 200
  * method: B3LYP-D3BJ
  * program: psi4
  * SCF properties:
    * dipole
    * quadrupole
    * wiberg_lowdin_indices
    * mayer_indices
