In [1]:
import openff.qcsubmit
import openff.toolkit
import openeye
import qcelemental
import qcportal
import pyarrow
import pyarrow.dataset as ds
import numpy as np

import tqdm
import pathlib

from openff.units import unit

from openff.toolkit import Molecule
from openff.toolkit.utils import OpenEyeToolkitWrapper, ToolkitRegistry

from openff.qcsubmit.datasets import BasicDataset
from openff.qcsubmit.factories import BasicDatasetFactory
from qcelemental.models import DriverEnum

# Report the versions of the main packages used to build this dataset.
# The QCSubmit version print is disabled; 0.55.0 was noted beside it.
# print("OpenFF QCSubmit:", openff.qcsubmit.__version__) # 0.55.0
for package_label, package in (
    ("OpenFF Toolkit", openff.toolkit),
    ("OpenEye", openeye),
    ("QCElemental", qcelemental),
    ("QCPortal", qcportal),
    ("PyArrow", pyarrow),
):
    print(f"{package_label}:", package.__version__)

OpenFF Toolkit: 0.16.8
OpenEye: 2024.2.1
QCElemental: 0.28.0
QCPortal: 0.54.1
PyArrow: 20.0.0


## Load molecules

In [2]:
input_directory = pathlib.Path("input-structures")

# Index every SDF file by its molecule prefix, i.e. the file stem with the
# trailing "-<suffix>" removed. Grouping on the exact prefix (instead of
# globbing f"{pattern}-*.sdf" once per molecule) means that:
#   * a molecule named "abc" no longer also collects the files of a different
#     molecule named "abc-def";
#   * glob metacharacters such as "[" or "?" in a file name are not
#     interpreted as wildcards;
#   * the directory is listed once rather than once per molecule.
# Files are visited in sorted order, so each group is itself sorted.
conformer_files = {}
for sdf_path in sorted(input_directory.glob("*.sdf")):
    prefix, separator, _ = sdf_path.stem.rpartition("-")
    if separator:  # files without a "-<suffix>" part belong to no group
        conformer_files.setdefault(prefix, []).append(sdf_path)

# Molecules are enumerated through their "-0" file.
sdf_files = sorted(input_directory.glob("*-0.sdf"))
molecules = []
for sdf_file in tqdm.tqdm(sdf_files):
    pattern = sdf_file.stem.rsplit("-", maxsplit=1)[0]
    mol_files = conformer_files[pattern]
    if len(mol_files) == 1:  # only 1 conformer, skip
        continue
    # Each conformer file is loaded as its own Molecule.
    for mol_file in mol_files:
        mol = Molecule.from_file(mol_file, "SDF")
        molecules.append(mol)

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1409/1409 [00:13<00:00, 106.50it/s]


In [3]:
# Dataset factory configured with the gradient driver.
factory = BasicDatasetFactory(driver=DriverEnum.gradient)

In [4]:
# The description is assembled from paragraphs: the fragments of a paragraph
# are joined by a single space, and paragraphs are joined by a newline.
description_paragraphs = [
    [
        "A dataset of single-point calculations generated to train bond and angle parameters",
        "for a test case of torsion drives for the t17 and t18 torsions in Sage 2.2.1.",
        "Conformers were generated following the process in",
        "https://github.com/lilyminium/refit-t17-t18-torsions/tree/main/01_generate-singlepoints .",
    ],
    [
        "In short, for a particular molecular graph, a single conformer was generated.",
        "This molecule was assigned parameters from the Sage 2.2.1 force field.",
        "Additional torsion restraints of 1e5 kJ/mol were applied to restrain every torsion in the molecule.",
        "An MD simulation was then conducted at 500K and with 0.1 fs timestep.",
        "Frames were grouped by torsion similarity, and conformers were chosen from the biggest cluster.",
    ],
    [
        "This process was repeated for every molecule in which a t17 or t18 torsion has been driven.",
        "Some additional small molecules were also included.",
        "Molecules for which the process only generated a single conformer were excluded from the dataset.",
    ],
    [
        "This dataset is computed at the default OpenFF level of theory.",
        "The molecules here are all neutral and include the Br, C, Cl, F, H, I, N, O, S elements.",
        "They range from 30-307 Da (mean 162) and 2-21 heavy atoms.",
    ],
]
# The single trailing space after the final sentence is part of the text.
description = "\n".join(" ".join(fragments) for fragments in description_paragraphs) + " "
print(description)

A dataset of single-point calculations generated to train bond and angle parameters for a test case of torsion drives for the t17 and t18 torsions in Sage 2.2.1. Conformers were generated following the process in https://github.com/lilyminium/refit-t17-t18-torsions/tree/main/01_generate-singlepoints .
In short, for a particular molecular graph, a single conformer was generated. This molecule was assigned parameters from the Sage 2.2.1 force field. Additional torsion restraints of 1e5 kJ/mol were applied to restrain every torsion in the molecule. An MD simulation was then conducted at 500K and with 0.1 fs timestep. Frames were grouped by torsion similarity, and conformers were chosen from the biggest cluster.
This process was repeated for every molecule in which a t17 or t18 torsion has been driven. Some additional small molecules were also included. Molecules for which the process only generated a single conformer were excluded from the dataset.
This dataset is computed at the default 

In [5]:
# Short summary passed to the dataset as its tagline.
tagline = (
    "A dataset of single-point calculations generated to train bond and angle parameters "
    "for a test case of torsion drives for the t17 and t18 torsions in Sage 2.2.1. "
)

# Build the dataset from the loaded molecules.
dataset = factory.create_dataset(
    molecules=molecules,
    dataset_name="OpenFF CX3-CX4 singlepoints v4.0",
    description=description,
    tagline=tagline,
)

# The long-description URL ends with the name of the current working
# directory, so this cell depends on where the notebook is run from.
submission_directory = pathlib.Path(".").resolve().name
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    f"submissions/{submission_directory}"
)
dataset.metadata.submitter = "lilyminium"

Deduplication                 : 100%|██████| 2973/2973 [00:09<00:00, 327.20it/s]
Preparation                   : 100%|█████████| 365/365 [00:14<00:00, 25.16it/s]


## Dataset information

In [6]:
import numpy as np
from collections import Counter

In [7]:
# Report the dataset size; the record count is labelled as conformers.
size_summary = {
    "n_molecules": dataset.n_molecules,
    "n_conformers": dataset.n_records,
}
for size_label, size_value in size_summary.items():
    print(f"{size_label}:", size_value)

n_molecules: 365
n_conformers: 2938


In [8]:
from openff.units import unit
# Collect per-molecule statistics for the dataset summary: conformer counts,
# heavy-atom counts, total charges, molecular masses and the elements present.
unique_charges = set()
masses = []
elements = set()

n_confs = []
n_heavy_atoms = []
for mol in tqdm.tqdm(dataset.molecules):
    n_confs.append(mol.n_conformers)
    # Count heavy atoms (atomic number > 1) directly from the OpenFF atoms
    # rather than converting every molecule to RDKit just for this number.
    n_heavy_atoms.append(sum(1 for atom in mol.atoms if atom.atomic_number > 1))
    # Convert to explicit units before taking magnitudes, so the values are
    # elementary charges and daltons regardless of the stored unit.
    unique_charges.add(mol.total_charge.m_as(unit.elementary_charge))
    masses.append(sum(atom.mass.m_as(unit.dalton) for atom in mol.atoms))
    elements.update(atom.symbol for atom in mol.atoms)
# Display the distinct total charges found as the cell output.
unique_charges

365it [00:02, 147.11it/s]


{0.0}

In [9]:
# List the element symbols found in the dataset, alphabetically.
print(*sorted(elements), sep=", ")

Br, C, Cl, F, H, I, N, O, S


In [10]:
# Summarise the number of conformers per molecule.
n_confs = np.array(n_confs)
conformer_stats = (n_confs.min(), n_confs.mean(), n_confs.max())
print("Number of conformers (min, mean, max):", *conformer_stats)

# Tabulate how many molecules have each heavy-atom count, smallest first.
print("# heavy atoms")
counts = Counter(n_heavy_atoms)
for n_heavy, n_mols in sorted(counts.items()):
    print(f"{n_heavy!s:>3}: {n_mols}")

Number of conformers (min, mean, max): 2 8.04931506849315 10
# heavy atoms
  2: 1
  3: 2
  4: 6
  5: 10
  6: 18
  7: 21
  8: 25
  9: 32
 10: 37
 11: 44
 12: 53
 13: 43
 14: 31
 15: 16
 16: 8
 17: 5
 18: 6
 19: 4
 20: 2
 21: 1


In [11]:
# Summarise molecular weights, rounded to whole numbers for display.
masses = np.array(masses)
mw_summary = ", ".join(
    f"{mw_value:.0f}" for mw_value in (masses.min(), masses.mean(), masses.max())
)
print(f"MW (min, mean, max): {mw_summary}")

MW (min, mean, max): 30, 162, 307


In [12]:
# Write the dataset to disk in three forms: the bz2-compressed JSON export,
# a SMILES file of the molecules, and a PDF visualization laid out with
# columns=8.
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file('dataset.smi', 'smi')
dataset.visualize("dataset.pdf", columns=8)

# Show the raw QC specifications attached to the dataset.
print(dataset.qc_specifications)

{'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>, implicit_solvent=None, maxiter=200, scf_properties=[<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>], keywords={})}


In [13]:
def print_field(od, field):
    """Print one entry of a mapping as an indented bullet.

    Parameters
    ----------
    od
        Mapping to read from; it is indexed as ``od[field]``.
    field
        Key to look up; it is also used as the printed label.

    The line is written to stdout as ``  * <field>: <value>``.
    """
    value = od[field]
    print("  * {}: {}".format(field, value))

# Fields of each QC specification to report, in display order.
fields = ["basis", "implicit_solvent", "keywords", "maxiter", "method", "program"]

# Print every specification as a nested bullet list.
for spec_name, qc_spec in dataset.qc_specifications.items():
    spec_dict = qc_spec.dict()
    print("* Spec:", spec_name)
    for field_name in fields:
        print_field(spec_dict, field_name)
    # SCF properties are listed one per line under their own bullet.
    print("  * SCF properties:")
    for scf_property in spec_dict["scf_properties"]:
        print(f"    * {scf_property}")

* Spec: default
  * basis: DZVP
  * implicit_solvent: None
  * keywords: {}
  * maxiter: 200
  * method: B3LYP-D3BJ
  * program: psi4
  * SCF properties:
    * dipole
    * quadrupole
    * wiberg_lowdin_indices
    * mayer_indices
