In [1]:
import re
import tomllib
from pathlib import Path

import qcportal # avoid zstd disaster

import numpy as np
from openff.qcsubmit import workflow_components
from openff.qcsubmit.factories import TorsiondriveDatasetFactory
from openff.qcsubmit.workflow_components import TorsionIndexer
from openff.toolkit import Molecule, ForceField
from tqdm import tqdm

In [2]:
ff = ForceField("ff.offxml")
with open("test.toml", "rb") as f:
    config = tomllib.load(f)

In [3]:
junk = re.compile("[(,)]")

molecules = dict()
# not sure what this does, just increment to avoid overlap. without different sym_groups,
# I only get 58 dihedrals, but with it I get 59. 9 duplicate SMILES are still filtered out
# for sharing the central bond in the torsion, and there is 1 comment line in the input file.
# 69it from tqdm = 9 duplicates + 1 comment + 59 final records
sym_group = 0
with open("dataset.smi") as inp:
    for line in tqdm(inp, desc="Tagging torsions"):
        if line.startswith("#"):
            continue
        pid, cmiles, *rest = line.split()
        tors = tuple([int(junk.sub("", x)) for x in rest])
        mol = Molecule.from_mapped_smiles(cmiles, allow_undefined_stereo=True)
        labels = ff.label_molecules(mol.to_topology())[0]["ProperTorsions"]
        assert labels[tors].id == pid
        inchikey = mol.to_inchikey()
        if inchikey not in molecules:
            molecules[inchikey] = mol
        if not molecules[inchikey].properties.get("dihedrals"):
            molecules[inchikey].properties["dihedrals"] = TorsionIndexer()
        molecules[inchikey].properties["dihedrals"].add_torsion(tors, (-sym_group, sym_group), (-165, 180))
        sym_group += 1

Tagging torsions: 69it [00:02, 34.49it/s]


In [4]:
dataset_factory = TorsiondriveDatasetFactory()
dataset_factory.add_workflow_components(workflow_components.StandardConformerGenerator(max_conformers=10))

dataset = dataset_factory.create_dataset(
    dataset_name=config["name"],
    tagline=config["name"],
    description=config["short_description"],
    molecules=molecules.values(),
)

dataset.metadata.submitter = "ntBre"
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    "submissions/" + str(Path.cwd().name)
)

Deduplication                 : 100%|█████████| 58/58 [00:00<00:00, 2352.68it/s]
StandardConformerGenerator    :   0%|                    | 0/58 [00:00<?, ?it/s][18:51:32] UFFTYPER: Unrecognized charge state for atom: 9
[18:51:32] UFFTYPER: Unrecognized charge state for atom: 9
[18:51:32] UFFTYPER: Unrecognized charge state for atom: 11
[18:51:32] UFFTYPER: Unrecognized charge state for atom: 7
[18:51:33] UFFTYPER: Unrecognized charge state for atom: 1
[18:51:33] UFFTYPER: Unrecognized charge state for atom: 1
[18:51:33] UFFTYPER: Unrecognized charge state for atom: 3
StandardConformerGenerator    : 100%|███████████| 58/58 [00:01<00:00, 43.77it/s]
Preparation                   : 100%|███████████| 58/58 [00:01<00:00, 42.17it/s]


In [5]:
old_smiles = {m.to_smiles(isomeric=False) for m in molecules.values()}
new_smiles = {m.to_smiles(isomeric=False) for m in dataset.molecules}

assert not old_smiles.symmetric_difference(new_smiles)

In [6]:
confs = np.array([len(mol.conformers) for mol in dataset.molecules])

print("* Number of unique molecules:", dataset.n_molecules)
print("* Number of filtered molecules:", dataset.n_filtered)
print("* Number of conformers:", dataset.n_records)
print("* Number of conformers per molecule (min, mean, max): "
      f"{confs.min()}, {confs.mean():.2f}, {confs.max()}")

masses = [
    [
        sum([atom.mass.m for atom in molecule.atoms])
        for molecule in dataset.molecules
    ]
]
print(f'* Mean molecular weight: {np.mean(np.array(masses)):.2f}')
print(f'* Max molecular weight: {np.max(np.array(masses)):.2f}')
print("* Charges:", sorted(set(m.total_charge.m for m in dataset.molecules)))

* Number of unique molecules: 58
* Number of filtered molecules: 0
* Number of conformers: 59
* Number of conformers per molecule (min, mean, max): 1, 3.08, 10
* Mean molecular weight: 174.43
* Max molecular weight: 401.33
* Charges: [0.0, 1.0, 2.0]


In [7]:
print("## Metadata")
print(f"* Elements: {{{', '.join(dataset.metadata.dict()['elements'])}}}")

def print_field(od, field): print(f"\t* {field}: {od[field]}")

fields = ["basis", "implicit_solvent", "keywords", "maxiter", "method", "program"]
for spec, obj in dataset.qc_specifications.items():
    od = obj.dict()
    print("* Spec:", spec)
    for field in fields:
        print_field(od, field)
    print("\t* SCF properties:")
    for field in od["scf_properties"]:
        print(f"\t\t* {field}")

## Metadata
* Elements: {N, Br, H, P, Cl, O, C, S}
* Spec: default
	* basis: DZVP
	* implicit_solvent: None
	* keywords: {}
	* maxiter: 200
	* method: B3LYP-D3BJ
	* program: psi4
	* SCF properties:
		* dipole
		* quadrupole
		* wiberg_lowdin_indices
		* mayer_indices


In [8]:
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("output.smi", "smi")
dataset.visualize("dataset.pdf", columns=8)