In [None]:
import pathlib
from tqdm import tqdm
import numpy


from openff.qcsubmit.utils import get_symmetry_classes, get_symmetry_group
from openff.toolkit import Molecule
from openff.qcsubmit.factories import TorsiondriveDatasetFactory
from openff.qcsubmit.workflow_components.utils import TorsionIndexer, SingleTorsion

In [2]:
# Load every record of the TorsionNet500 SDF file into OpenFF Molecule objects.
# Undefined stereochemistry is tolerated so that no record is rejected at load time.
molecules = Molecule.from_file(
    "TorsionNet500_qm_opt_geometries.sdf",
    allow_undefined_stereo=True,
    file_format="sdf",
)

# The input file is expected to contain exactly 12000 records; on failure the
# actual count is reported as the assertion message.
n_loaded = len(molecules)
assert n_loaded == 12000, n_loaded

# NOTE(review): this only verifies that at least one InChIKey exists, which is
# always true once the count check above passes. If the intent was to pin the
# number of unique molecules, compare against the expected count — TODO confirm.
unique_inchikeys = {molecule.to_inchikey() for molecule in molecules}
assert len(unique_inchikeys) > 0

In [3]:
# Attach the torsion to drive to each molecule as a ``dihedrals`` property.
# The four atom indices are read from the ``TORSION_ATOMS_FRAGMENT`` SDF tag.
for molecule in tqdm(molecules):

    # split() with no argument tolerates repeated or trailing whitespace,
    # which split(" ") would turn into empty strings that int() rejects.
    torsion_atoms = tuple(
        int(index)
        for index in molecule.properties["TORSION_ATOMS_FRAGMENT"].split()
    )
    if len(torsion_atoms) != 4:
        raise ValueError(
            "Expected 4 atom indices in TORSION_ATOMS_FRAGMENT, "
            f"got {torsion_atoms!r}"
        )

    # The two middle atoms define the rotatable bond; computed once and reused
    # both for the symmetry lookup and as the indexer key.
    central_bond = (torsion_atoms[1], torsion_atoms[2])

    symmetry_classes = get_symmetry_classes(molecule)
    symmetry_group = get_symmetry_group(central_bond, symmetry_classes)

    molecule.properties["dihedrals"] = TorsionIndexer(
        torsions={
            central_bond: SingleTorsion(
                torsion1=torsion_atoms,
                # NOTE(review): confirm that `scan_range` is the field name
                # SingleTorsion actually declares — TODO verify against qcsubmit.
                scan_range=None,
                # Presumably degrees; the original author marked this value as
                # unconfirmed ("I think?") — TODO confirm the grid spacing.
                scan_increment=[15],
                symmetry_group1=symmetry_group,
            )
        }
    )

100%|██████████| 12000/12000 [00:03<00:00, 3015.87it/s]


In [4]:
# Torsiondrive dataset factory constructed with no arguments, i.e. whatever
# defaults TorsiondriveDatasetFactory ships with.
dataset_factory = TorsiondriveDatasetFactory()

In [None]:
# Build the dataset from the annotated molecules; the description field holds
# the URL of the upstream SDF file the molecules were read from.
dataset = dataset_factory.create_dataset(
    molecules=molecules,
    dataset_name="TorsionNet500 Re-optimization v1.0",
    tagline="TorsionNet500 re-optimized with OpenFF default spec",
    description="https://github.com/pfizer-opensource/TorsionNet/blob/main/data/TorsionNet500_qm_opt_geometries.sdf/",
    verbose=True,
)

# The long-description URL points at this submission's folder, whose name is
# taken from the current working directory (so run the notebook from there).
submission_dir = pathlib.Path.cwd().name
dataset.metadata.submitter = "mattwthompson"
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    f"submissions/{submission_dir}"
)

In [None]:
# Fail fast if the dataset ended up with no molecules; the assertion message
# shows (n_filtered, n_molecules) to help diagnose what was dropped.
assert dataset.n_molecules > 0, (dataset.n_filtered, dataset.n_molecules)

In [None]:
# Summarize the dataset for the README.
# Conformer count of every molecule in the dataset.
confs = numpy.array([len(mol.conformers) for mol in dataset.molecules])

print("* Number of unique molecules:", dataset.n_molecules)
# With multiple torsions per unique molecule, n_molecules * confs.mean() no
# longer equals the number of conformers. Instead, the number of dihedrals *
# confs.mean() should equal the number of conformers. The dataset contains one
# record per driven torsion (rather than combining multiple dihedrals into the
# same record), so n_records is the same as manually adding up len(dihedrals)
# for each record.
print("* Number of driven torsions:", dataset.n_records)
print("* Number of filtered molecules:", dataset.n_filtered)
print("* Number of conformers:", confs.sum())
print(
    "* Number of conformers per molecule (min, mean, max): "
    f"{confs.min()}, {confs.mean():.2f}, {confs.max()}"
)

# Molecular weight of each molecule: the sum of its atoms' masses, stored once
# as a flat array so the statistics below share a single conversion.
masses = numpy.array(
    [sum(atom.mass.m for atom in molecule.atoms) for molecule in dataset.molecules]
)
print(f"* Mean molecular weight: {masses.mean():.2f}")
print(f"* Min molecular weight: {masses.min():.2f}")
print(f"* Max molecular weight: {masses.max():.2f}")

# Distinct total charges present in the dataset, in ascending order.
charges = sorted({m.total_charge.m for m in dataset.molecules})
print("* Charges:", charges)


# Print the metadata section of the README: covered elements, then every QC
# specification with its main settings and requested SCF properties.
print("## Metadata")
# Sort before joining so the element list is printed in the same order on every
# run (the metadata's element collection is presumably an unordered set, whose
# iteration order is not reproducible across interpreter sessions).
elements = sorted(dataset.metadata.dict()["elements"])
print(f"* Elements: {{{', '.join(elements)}}}")


# Keys of each specification's dict() that are reported in the README.
fields = [
    "basis",
    "implicit_solvent",
    "keywords",
    "maxiter",
    "method",
    "program",
]
for spec, obj in dataset.qc_specifications.items():
    od = obj.dict()
    print("* Spec:", spec)
    for field in fields:
        print(f"\t * {field}: {od[field]}")
    print("\t* SCF properties:")
    for field in od["scf_properties"]:
        print(f"\t\t* {field}")


# Export the dataset: the dataset itself to dataset.json.bz2, the molecules
# to output.smi in "smi" format, and a visualization to dataset.pdf with
# columns=8.
# NOTE(review): later cells in this notebook call the same three methods again
# (same dataset.json.bz2 path; TorsionNet500.smi / TorsionNet500.pdf for the
# other two) — confirm which outputs are actually wanted.
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("output.smi", "smi")
dataset.visualize("dataset.pdf", columns=8)

In [None]:
# Write a second visualization of the dataset, this time to TorsionNet500.pdf
# and without an explicit `columns` argument.
dataset.visualize("TorsionNet500.pdf")

In [None]:
# Write the dataset's molecules in "smi" format to TorsionNet500.smi.
dataset.molecules_to_file("TorsionNet500.smi", "smi")

In [None]:
# Bare expression: displays the dataset metadata as the cell output for review.
dataset.metadata

In [None]:
# Export the dataset to dataset.json.bz2.
# NOTE(review): the summary cell earlier in this notebook already makes this
# exact call with the same path, so this repeats that export.
dataset.export_dataset("dataset.json.bz2")