In [1]:
import pathlib

import numpy

from openff.qcsubmit import workflow_components
from openff.qcsubmit.factories import TorsiondriveDatasetFactory

In [2]:
dataset_factory = TorsiondriveDatasetFactory()

In [3]:
dataset = dataset_factory.create_dataset(
    dataset_name="TorsionNet500 Re-optimization v1.0",
    molecules="TorsionNet500_qm_opt_geometries.sdf",
    tagline="TorsionNet500 re-optimized with OpenFF default spec",
    description="https://github.com/pfizer-opensource/TorsionNet/blob/main/data/TorsionNet500_qm_opt_geometries.sdf/",
)

dataset.metadata.submitter = "mattwthompson"
dataset.metadata.long_description_url = (
    "https://github.com/openforcefield/qca-dataset-submission/tree/master/"
    "submissions/" + str(pathlib.Path.cwd().name)
)

Deduplication                 : 100%|████| 12000/12000 [01:50<00:00, 108.38it/s]
Preparation                   : 100%|████████| 500/500 [00:04<00:00, 117.64it/s]


In [13]:
assert dataset.n_molecules > 0, (dataset.n_filtered, dataset.n_molecules)

AssertionError: (500, 0)

In [6]:
# summarize dataset for readme
confs = numpy.array([len(mol.conformers) for mol in dataset.molecules])

print("* Number of unique molecules:", dataset.n_molecules)
# With multiple torsions per unique molecule, n_molecules * confs.mean() no
# longer equals the number of conformers. instead, the number of dihedrals *
# confs.mean() should equal the number of conformers. The dataset contains one
# record per driven torsion (rather than combining multiple dihedrals into the
# same record), so n_records is the same as manually adding up len(dihedrals)
# for each record.
print("* Number of driven torsions:", dataset.n_records)
print("* Number of filtered molecules:", dataset.n_filtered)
print("* Number of conformers:", sum(confs))
print(
    "* Number of conformers per molecule (min, mean, max): "
    f"{confs.min()}, {confs.mean():.2f}, {confs.max()}"
)

masses = [
    [sum([atom.mass.m for atom in molecule.atoms]) for molecule in dataset.molecules]
]
print(f"* Mean molecular weight: {numpy.mean(numpy.array(masses)):.2f}")
print(f"* Min molecular weight: {numpy.min(numpy.array(masses)):.2f}")
print(f"* Max molecular weight: {numpy.max(numpy.array(masses)):.2f}")
print("* Charges:", sorted(set(m.total_charge.m for m in dataset.molecules)))


print("## Metadata")
print(f"* Elements: {{{', '.join(dataset.metadata.dict()['elements'])}}}")


fields = [
    "basis",
    "implicit_solvent",
    "keywords",
    "maxiter",
    "method",
    "program",
]
for spec, obj in dataset.qc_specifications.items():
    od = obj.dict()
    print("* Spec:", spec)
    for field in fields:
        print(f"\t * {field}: {od[field]}")
    print("\t* SCF properties:")
    for field in od["scf_properties"]:
        print(f"\t\t* {field}")


# export the dataset
dataset.export_dataset("dataset.json.bz2")
dataset.molecules_to_file("output.smi", "smi")
dataset.visualize("dataset.pdf", columns=8)

* Number of unique molecules: 0
* Number of driven torsions: 0
* Number of filtered molecules: 500
* Number of conformers: 0


ValueError: zero-size array to reduction operation minimum which has no identity

In [None]:
dataset.visualize("TorsionNet500.pdf")

Problematic atoms are:
Atom atomic num: 7, name: , idx: 12, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 9, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 13, aromatic: True, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 22, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 7, name: , idx: 8, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: True, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 9, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 13, aromatic: False, chiral: False



In [None]:
dataset.molecules_to_file("TorsionNet500.smi", "smi")

In [None]:
dataset.metadata

Metadata(submitter='Pavan Behara, Josh Horton, David Dotson', creation_date=datetime.date(2021, 11, 11), collection_type='DataSet', dataset_name='TorsionNet500 Single Points Dataset v1.0', short_description='TorsionNet500 geometries with openff default spec', long_description_url=HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-11-09-TorsionNet500-single-points', scheme='https', host='github.com', tld='com', host_type='domain', path='/openforcefield/qca-dataset-submission/tree/master/submissions/2021-11-09-TorsionNet500-single-points'), long_description='Data source: https://github.com/PfizerRD/TorsionNet/blob/main/data/', elements={'N', 'F', 'S', 'O', 'C', 'Cl', 'H'})

In [None]:
dataset.export_dataset("dataset.json.bz2")