In [1]:
import tomllib
from pathlib import Path

import qcportal # avoid zstd disaster

import numpy as np
from openff.qcsubmit import workflow_components
from openff.qcsubmit.factories import OptimizationDatasetFactory
from openff.toolkit import Molecule, ForceField
from tqdm import tqdm

In [2]:
# Read the run configuration from the TOML file (tomllib requires a binary
# handle) and load the force field from its OFFXML file.
with Path("opt.toml").open("rb") as config_file:
    config = tomllib.load(config_file)
ff = ForceField("ff.offxml")

In [3]:
# Build the list of input molecules from the training-set file.
#
# Each data row is whitespace-separated: the first column is a parameter id,
# the second is a SMILES string, and any further columns are ignored. Every
# molecule must be assigned the listed parameter id among the ProperTorsions
# labels produced by the force field; otherwise the cell stops with an
# AssertionError naming the offending row.
molecules = []
with open("train.opt.smi") as inp:
    for line_number, line in enumerate(tqdm(inp, desc="Loading molecules"), start=1):
        stripped = line.strip()
        # Skip blank lines and comments. A blank line would otherwise fail
        # the two-column unpacking below with an uninformative ValueError.
        if not stripped or stripped.startswith("#"):
            continue
        columns = stripped.split()
        if len(columns) < 2:
            raise ValueError(
                f"train.opt.smi line {line_number}: expected a parameter id "
                f"and a SMILES string, got {stripped!r}"
            )
        pid, cmiles, *_ = columns
        mol = Molecule.from_smiles(cmiles, allow_undefined_stereo=True)
        labels = ff.label_molecules(mol.to_topology())[0]["ProperTorsions"]
        # A set is enough: only membership of the expected id matters.
        pids = {p.id for p in labels.values()}
        assert pid in pids, (
            f"train.opt.smi line {line_number}: parameter {pid} is not "
            f"assigned to {cmiles}"
        )
        molecules.append(mol)

  mol = Molecule.from_smiles(cmiles, allow_undefined_stereo=True)
Loading molecules: 43it [00:01, 35.72it/s]


In [4]:
# Configure the factory with a single workflow component: conformer
# generation limited to 10 conformers per molecule with an RMS cutoff of 0.5.
conformer_generator = workflow_components.StandardConformerGenerator(
    max_conformers=10,
    rms_cutoff=0.5,
)
dataset_factory = OptimizationDatasetFactory()
dataset_factory.add_workflow_components(conformer_generator)

# The dataset name doubles as its tagline; both come from the config file.
dataset = dataset_factory.create_dataset(
    dataset_name=config["name"],
    tagline=config["name"],
    description=config["short_description"],
    molecules=molecules,
)

# The long-description URL points at the submission folder, which is taken
# to have the same name as the current working directory.
submission_dir = Path.cwd().name
dataset.metadata.submitter = config["submitter"]
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    "submissions/" + submission_dir
)

Deduplication                 : 100%|█████████| 43/43 [00:00<00:00, 1349.08it/s]
StandardConformerGenerator    :   0%|                    | 0/37 [00:00<?, ?it/s][12:19:33] UFFTYPER: Unrecognized charge state for atom: 9
[12:19:33] UFFTYPER: Unrecognized charge state for atom: 4
[12:19:33] UFFTYPER: Unrecognized charge state for atom: 3
StandardConformerGenerator    :   3%|▎           | 1/37 [00:00<00:18,  1.96it/s][12:19:33] UFFTYPER: Unrecognized charge state for atom: 27
[12:19:33] UFFTYPER: Unrecognized charge state for atom: 1
[12:19:33] UFFTYPER: Unrecognized charge state for atom: 0
[12:19:33] UFFTYPER: Unrecognized charge state for atom: 1
StandardConformerGenerator    : 100%|███████████| 37/37 [00:01<00:00, 31.00it/s]
Preparation                   : 100%|███████████| 37/37 [00:01<00:00, 31.25it/s]


In [5]:
def _non_isomeric_smiles(mols):
    """Return the set of non-isomeric SMILES strings for an iterable of molecules."""
    return {mol.to_smiles(isomeric=False) for mol in mols}


# Sanity check: the dataset must contain exactly the same molecules (compared
# by non-isomeric SMILES) as the input list, with none lost and none added.
old_smiles = _non_isomeric_smiles(molecules)
new_smiles = _non_isomeric_smiles(dataset.molecules)

assert old_smiles == new_smiles

In [6]:
# Print summary statistics of the dataset as markdown bullet points:
# molecule and conformer counts, molecular weights and total charges.

# Walk dataset.molecules once and reuse the result for every statistic below.
dataset_molecules = list(dataset.molecules)

confs = np.array([len(mol.conformers) for mol in dataset_molecules])

print("* Number of unique molecules:", dataset.n_molecules)
print("* Number of filtered molecules:", dataset.n_filtered)
print("* Number of conformers:", confs.sum())
print(
    "* Number of conformers per molecule (min, mean, max): "
    f"{confs.min()}, {confs.mean():.2f}, {confs.max()}"
)

# One molecular weight per molecule: the sum of its atomic masses
# (`.m` is the bare magnitude of the mass quantity).
masses = np.array(
    [
        sum(atom.mass.m for atom in molecule.atoms)
        for molecule in dataset_molecules
    ]
)
print(f'* Mean molecular weight: {masses.mean():.2f}')
print(f'* Max molecular weight: {masses.max():.2f}')
print("* Charges:", sorted({m.total_charge.m for m in dataset_molecules}))

* Number of unique molecules: 37
* Number of filtered molecules: 0
* Number of conformers: 185
* Number of conformers per molecule (min, mean, max): 1, 5.00, 10
* Mean molecular weight: 187.31
* Max molecular weight: 489.48
* Charges: [0.0, 1.0, 2.0]


In [7]:
# Report which elements the dataset covers. The symbols are sorted so the
# printed line is identical from run to run; the recorded output of the
# unsorted version lists them in no particular order.
print("## Metadata")
elements = sorted(dataset.metadata.dict()["elements"])
print(f"* Elements: {{{', '.join(elements)}}}")

def print_field(od, field):
    """Print one entry of a mapping as a tab-indented markdown bullet.

    Parameters
    ----------
    od
        Mapping that contains the key ``field``.
    field
        Key to look up; it is also used as the label of the printed line.

    Raises
    ------
    KeyError
        If ``field`` is missing from ``od`` (for a plain ``dict``).
    """
    value = od[field]
    print(f"\t* {field}: {value}")

# For every QC specification in the dataset, print its name, the selected
# top-level settings, and then each requested SCF property as a nested bullet.
fields = ["basis", "implicit_solvent", "keywords", "maxiter", "method", "program"]
for spec_name, qc_spec in dataset.qc_specifications.items():
    spec_dict = qc_spec.dict()
    print("* Spec:", spec_name)
    for field_name in fields:
        print_field(spec_dict, field_name)
    print("\t* SCF properties:")
    for scf_property in spec_dict["scf_properties"]:
        print(f"\t\t* {scf_property}")

## Metadata
* Elements: {Br, S, H, P, N, O, Cl, C}
* Spec: default
	* basis: DZVP
	* implicit_solvent: None
	* keywords: {}
	* maxiter: 200
	* method: B3LYP-D3BJ
	* program: psi4
	* SCF properties:
		* dipole
		* quadrupole
		* wiberg_lowdin_indices
		* mayer_indices


In [8]:
# Write the three output files into the current working directory: the
# dataset itself, its molecules in "smi" format, and a PDF overview drawn
# with 8 columns.
dataset_path = "dataset.json.bz2"
smiles_path = "output.smi"
overview_path = "dataset.pdf"

dataset.export_dataset(dataset_path)
dataset.molecules_to_file(smiles_path, "smi")
dataset.visualize(overview_path, columns=8)