In [1]:
from openff.qcsubmit.common_structures import QCSpec, SCFProperties
from openff.qcsubmit.factories import BasicDatasetFactory
import numpy as np
from qcelemental.models.results import WavefunctionProtocolEnum
from qcportal.models.common_models import DriverEnum
from openeye import oechem

In [2]:
# Configure the dataset factory: gradient driver with a single QC specification
# named 'spice_default' (wb97m-d3bj / def2-tzvppd, run with psi4).
factory = BasicDatasetFactory(
    driver=DriverEnum.gradient,
    qc_specifications={
        'spice_default': QCSpec(
            method='wb97m-d3bj',
            basis='def2-tzvppd',
            program='psi4',
            spec_name='spice_default',
            spec_description='SPICE quantum chemistry specification',
            store_wavefunction=WavefunctionProtocolEnum.orbitals_and_eigenvalues,
            implicit_solvent=None,
            maxiter=200,
            # SCF properties requested for this specification.
            scf_properties=[
                SCFProperties.Dipole,
                SCFProperties.Quadrupole,
                SCFProperties.WibergLowdinIndices,
                SCFProperties.MayerIndices,
                SCFProperties.MBISCharges,
            ],
            keywords=None,
        ),
    },
    # The same wavefunction protocol is passed at the factory level as on the spec.
    store_wavefunction=WavefunctionProtocolEnum.orbitals_and_eigenvalues,
)

In [3]:
# Build the dataset from the input file des-monomers.hdf5 using the factory above.
dataset = factory.create_dataset(
    dataset_name="SPICE DES Monomers Single Points Dataset v1.0",
    molecules="des-monomers.hdf5",
    tagline="QM dataset for ML",
    description="Data source: https://github.com/openmm/qmdataset/tree/main/des370k",
)

Problematic atoms are:
Atom atomic num: 6, name: , idx: 1, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 0, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 2, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 6, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 1, name: , idx: 13, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 15, name: , idx: 1, aromatic: False, chiral: True with bonds:
bond order: 2, chiral: False to atom atomic num: 8, name: , idx: 0, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 2, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 9, aromatic: False, chiral: False

Pr

In [4]:
# Summary statistics for the dataset: molecule/conformer counts, molecular
# weights and the set of total charges.
#
# Iterate `dataset.molecules` exactly once and reuse the result. In the recorded
# output of this cell the same "Problematic atoms" warnings appear once per pass
# over `dataset.molecules`, so each separate pass repeats the molecule
# construction work (and the warning noise).
molecules = list(dataset.molecules)

# Number of conformers stored on each molecule.
confs = np.array([len(mol.conformers) for mol in molecules])

print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
print("Number of conformers min mean max",
      confs.min(), "{:6.2f}".format(confs.mean()), confs.max())

# Molecular weight of each molecule, computed by OpenEye on the converted molecule.
masses = [
    oechem.OECalculateMolecularWeight(mol.to_openeye()) for mol in molecules
]

print(f'Mean molecular weight: {np.mean(masses):.2f}')
print(f'Max molecular weight: {np.max(masses):.2f}')
# Dividing by the quantity's own unit strips the unit so the charges print as plain numbers.
print("Charges:", sorted({m.total_charge/m.total_charge.unit for m in molecules}))

Problematic atoms are:
Atom atomic num: 6, name: , idx: 7, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 1, name: , idx: 8, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 9, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 13, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 15, name: , idx: 8, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 7, aromatic: False, chiral: False
bond order: 2, chiral: False to atom atomic num: 8, name: , idx: 9, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 10, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 12, aromatic: False, chiral: False



Number of unique molecules        374
Number of filtered molecules      0
Number of conformers              18700
Number of conformers min mean max 50  50.00 50


Problematic atoms are:
Atom atomic num: 6, name: , idx: 7, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 1, name: , idx: 8, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 9, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 13, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 15, name: , idx: 8, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 7, aromatic: False, chiral: False
bond order: 2, chiral: False to atom atomic num: 8, name: , idx: 9, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 10, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 12, aromatic: False, chiral: False



Mean molecular weight: 95.89
Max molecular weight: 284.78


Problematic atoms are:
Atom atomic num: 6, name: , idx: 7, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 1, name: , idx: 8, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 9, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 13, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 15, name: , idx: 8, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 7, aromatic: False, chiral: False
bond order: 2, chiral: False to atom atomic num: 8, name: , idx: 9, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 10, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 12, aromatic: False, chiral: False



Charges: [-1.0, 0.0, 1.0]


In [5]:
# Call the dataset's visualize method with the target file name des_monomers.pdf
# (presumably renders the dataset molecules into that PDF -- confirm against qcsubmit docs).
dataset.visualize("des_monomers.pdf")

Problematic atoms are:
Atom atomic num: 6, name: , idx: 7, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 4, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 1, name: , idx: 8, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 9, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 13, aromatic: False, chiral: False

Problematic atoms are:
Atom atomic num: 15, name: , idx: 8, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 7, aromatic: False, chiral: False
bond order: 2, chiral: False to atom atomic num: 8, name: , idx: 9, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 10, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 12, aromatic: False, chiral: False



In [6]:
# Write the dataset molecules to des_monomers.smi, requesting the "smi" file format.
dataset.molecules_to_file("des_monomers.smi", "smi")

In [7]:
# Record the submitters in the dataset metadata.
dataset.metadata.submitter = 'Josh Horton, Pavan Behara, David Dotson'

In [8]:
# Point the metadata's long-description URL at the submission directory in the
# qca-dataset-submission repository.
dataset.metadata.long_description_url = 'https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-11-15-QMDataset-DES-monomers-single-points'

In [9]:
# Display the metadata so the submitter and long_description_url values can be checked before export.
dataset.metadata

Metadata(submitter='Josh Horton, Pavan Behara, David Dotson', creation_date=datetime.date(2021, 11, 15), collection_type='DataSet', dataset_name='SPICE DES Monomers Single Points Dataset v1.0', short_description='QM dataset for ML', long_description_url=HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-11-15-QMDataset-DES-monomers-single-points', scheme='https', host='github.com', tld='com', host_type='domain', path='/openforcefield/qca-dataset-submission/tree/master/submissions/2021-11-15-QMDataset-DES-monomers-single-points'), long_description='Data source: https://github.com/openmm/qmdataset/tree/main/des370k', elements={'I', 'C', 'Br', 'P', 'Cl', 'H', 'S', 'O', 'F', 'N'})

In [10]:
# Export the dataset to dataset.json.bz2 (the .bz2 suffix suggests bzip2-compressed JSON -- confirm).
dataset.export_dataset("dataset.json.bz2")