In [1]:
from openff.qcsubmit.common_structures import QCSpec, SCFProperties
from openff.qcsubmit.factories import OptimizationDatasetFactory
from openff.qcsubmit import workflow_components
import numpy as np
from qcelemental.models.results import WavefunctionProtocolEnum
from qcportal.models.common_models import DriverEnum
from openeye import oechem
import logging

In [2]:
logging.getLogger("openff").setLevel(logging.ERROR)

In [3]:
factory = OptimizationDatasetFactory()
factory.add_workflow_components(workflow_components.StandardConformerGenerator(max_conformers=50))
factory.dict()

{'qc_specifications': {'default': {'method': 'B3LYP-D3BJ',
   'basis': 'DZVP',
   'program': 'psi4',
   'spec_name': 'default',
   'spec_description': 'Standard OpenFF optimization quantum chemistry specification.',
   'store_wavefunction': 'none',
   'implicit_solvent': None,
   'maxiter': 200,
   'scf_properties': ['dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices'],
   'keywords': None}},
 'driver': 'gradient',
 'priority': 'normal',
 'dataset_tags': ['openff'],
 'compute_tag': 'openff',
 'type': 'OptimizationDatasetFactory',
 'workflow': [{'type': 'StandardConformerGenerator',
   'rms_cutoff': None,
   'max_conformers': 50,
   'clear_existing': True}],
 'optimization_program': {'program': 'geometric',
  'coordsys': 'dlc',
  'enforce': 0.0,
  'epsilon': 1e-05,
  'reset': True,
  'qccnv': False,
  'molcnv': False,
  'check': 0,
  'trust': 0.1,
  'tmax': 0.3,
  'maxiter': 300,
  'convergence_set': 'GAU',
  'constraints': {}}}

In [4]:
dataset = factory.create_dataset(dataset_name="OpenFF multiplicity correction optimization set v1.0", 
                                 molecules="dataset.smi", tagline="QM Dataset for fitting multiplicity corrected torsions", 
                                 description="Data source: https://github.com/openforcefield/qca-dataset-submission/blob/master/submissions/2022-04-29-OpenFF-multiplicity-correction-optimization-data-v1.0/")

Deduplication                 : 100%|█████████| 99/99 [00:00<00:00, 1650.07it/s]
RDKit ERROR: [21:39:45] UFFTYPER: Unrecognized charge state for atom: 4
RDKit ERROR: [21:39:45] UFFTYPER: Unrecognized charge state for atom: 1
RDKit ERROR: [21:39:45] UFFTYPER: Unrecognized charge state for atom: 6
[21:39:45] UFFTYPER: Unrecognized charge state for atom: 4
[21:39:45] UFFTYPER: Unrecognized charge state for atom: 1
[21:39:45] UFFTYPER: Unrecognized charge state for atom: 6
RDKit ERROR: [21:39:46] UFFTYPER: Unrecognized charge state for atom: 6
[21:39:46] UFFTYPER: Unrecognized charge state for atom: 6
RDKit ERROR: [21:39:46] UFFTYPER: Unrecognized charge state for atom: 1
[21:39:46] UFFTYPER: Unrecognized charge state for atom: 1
RDKit ERROR: [21:39:46] UFFTYPER: Unrecognized charge state for atom: 11
RDKit ERROR: [21:39:46] UFFTYPER: Unrecognized charge state for atom: 2
[21:39:46] UFFTYPER: Unrecognized charge state for atom: 11
[21:39:46] UFFTYPER: Unrecognized charge state for atom: 2


StandardConformerGenerator    :  63%|██████▉    | 62/99 [00:03<00:01, 27.12it/s]RDKit ERROR: [21:39:48] UFFTYPER: Unrecognized charge state for atom: 2
[21:39:48] UFFTYPER: Unrecognized charge state for atom: 2
RDKit ERROR: [21:39:48] UFFTYPER: Unrecognized charge state for atom: 1
RDKit ERROR: [21:39:48] UFFTYPER: Unrecognized charge state for atom: 5
RDKit ERROR: [21:39:48] UFFTYPER: Unrecognized charge state for atom: 1
[21:39:48] UFFTYPER: Unrecognized charge state for atom: 1
[21:39:48] UFFTYPER: Unrecognized charge state for atom: 5
[21:39:48] UFFTYPER: Unrecognized charge state for atom: 1
StandardConformerGenerator    :  72%|███████▉   | 71/99 [00:03<00:00, 30.97it/s]RDKit ERROR: [21:39:48] UFFTYPER: Unrecognized charge state for atom: 2
[21:39:48] UFFTYPER: Unrecognized charge state for atom: 2
RDKit ERROR: [21:39:48] UFFTYPER: Unrecognized charge state for atom: 4
RDKit ERROR: [21:39:48] UFFTYPER: Unrecognized charge state for atom: 1
[21:39:48] UFFTYPER: Unrecognized charge 

Preparation                   : 100%|███████████| 99/99 [00:02<00:00, 42.83it/s]


In [5]:
#issue: 26 molecules are filtered
#need to fix this 

confs = np.array([len(mol.conformers) for mol in dataset.molecules])

print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
print("Number of conformers min mean max",
      confs.min(), "{:6.2f}".format(confs.mean()), confs.max())

masses = []
for molecule in dataset.molecules:
    oemol = molecule.to_openeye()
    mass = oechem.OECalculateMolecularWeight(oemol)
    masses.append(mass)

print(f'Mean molecular weight: {np.mean(np.array(masses)):.2f}')
print(f'Max molecular weight: {np.max(np.array(masses)):.2f}')
print("Charges:", sorted(set(m.total_charge/m.total_charge.unit for m in dataset.molecules)))

Number of unique molecules        99
Number of filtered molecules      0
Number of conformers              400
Number of conformers min mean max 1   4.04 50
Mean molecular weight: 172.23
Max molecular weight: 317.32
Charges: [-1.0, 0.0, 1.0]


In [6]:
dataset.visualize("dataset.pdf")

In [7]:
dataset.metadata.submitter = 'Jessica Maat'

In [8]:
dataset.metadata.long_description_url = 'https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2022-04-29-OpenFF-multiplicity-correction-optimization-set-v1.0'

In [9]:
dataset.metadata

Metadata(submitter='Jessica Maat', creation_date=datetime.date(2022, 5, 24), collection_type='OptimizationDataset', dataset_name='OpenFF multiplicity correction optimization set v1.0', short_description='QM Dataset for fitting multiplicity corrected torsions', long_description_url=HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2022-04-29-OpenFF-multiplicity-correction-optimization-set-v1.0', scheme='https', host='github.com', tld='com', host_type='domain', port='443', path='/openforcefield/qca-dataset-submission/tree/master/submissions/2022-04-29-OpenFF-multiplicity-correction-optimization-set-v1.0'), long_description='Data source: https://github.com/openforcefield/qca-dataset-submission/blob/master/submissions/2022-04-29-OpenFF-multiplicity-correction-optimization-data-v1.0/', elements={'N', 'O', 'C', 'S', 'H', 'P'})

In [10]:
dataset.export_dataset("dataset.json.bz2")