In [4]:
from qcportal import FractalClient
from qcportal.models.records import RecordStatusEnum
from openff.qcsubmit.common_structures import QCSpec, SCFProperties
from openff.qcsubmit.factories import OptimizationDatasetFactory
from openff.qcsubmit import workflow_components
import numpy as np
from qcelemental.models.results import WavefunctionProtocolEnum
from qcportal.models.common_models import DriverEnum
from openeye import oechem
from openff.qcsubmit.results.filters import (
    ConformerRMSDFilter, ConnectivityFilter,
    ElementFilter,
    HydrogenBondFilter,
    SMARTSFilter,
    RecordStatusFilter,
    ResultRecordFilter,
)

from openff.qcsubmit.results import (
    OptimizationResultCollection,)


In [5]:
# Filters applied to every record pulled from the server.
default_filters = [
    RecordStatusFilter(status=RecordStatusEnum.complete),
    ConnectivityFilter(tolerance=1.2),
    # The elements supported by SMIRNOFF
    ElementFilter(
        allowed_elements=["H", "C", "N", "O", "S", "P", "F", "Cl", "Br", "I"]
    ),
]

# Names of the existing optimization datasets that are mined for molecules.
source_datasets = [
    "OpenFF Optimization Set 1",
    "OpenFF Discrepancy Benchmark 1",
    "OpenFF Gen 2 Opt Set 1 Roche",
    "OpenFF Gen 2 Opt Set 2 Coverage",
    "OpenFF Gen 2 Opt Set 3 Pfizer Discrepancy",
    "OpenFF Gen 2 Opt Set 4 eMolecules Discrepancy",
    "OpenFF Gen 2 Opt Set 5 Bayer",
    "SMIRNOFF Coverage Set 1",
]

# Pull down the optimization datasets
client = FractalClient()
optimization_set = OptimizationResultCollection.from_server(
    client=client,
    datasets=source_datasets,
)

# SMARTS pattern: an atom with atomic number 53 (iodine) bonded to any atom.
smarts_to_include = ["[#53:1]~[*:2]"]

# Apply the default filters, then the SMARTS filter built from the pattern above.
optimization_set = optimization_set.filter(
    *default_filters,
    SMARTSFilter(smarts_to_include=smarts_to_include),
)
                                           

1


In [7]:
# Number of filtered entries stored under the QCArchive server address.
qcarchive_address = 'https://api.qcarchive.molssi.org:443/'
len(optimization_set.entries[qcarchive_address])

293

In [9]:
# Convert the filtered collection into its records. Each returned item is
# indexed with [1] in the next cell to obtain the molecule
# (presumably (record, molecule) pairs -- confirm against the qcsubmit docs).
records_and_molecules = optimization_set.to_records()

In [11]:
# Keep only the second element of each item returned by to_records().
molecules = [pair[1] for pair in records_and_molecules]

In [12]:
# Workflow step that generates conformers, configured with max_conformers=50.
conformer_generator = workflow_components.StandardConformerGenerator(
    max_conformers=50
)

# Optimization dataset factory with default settings plus the conformer step.
factory = OptimizationDatasetFactory()
factory.add_workflow_components(conformer_generator)

# Display the complete factory configuration for the record.
factory.dict()

{'qc_specifications': {'default': {'method': 'B3LYP-D3BJ',
   'basis': 'DZVP',
   'program': 'psi4',
   'spec_name': 'default',
   'spec_description': 'Standard OpenFF optimization quantum chemistry specification.',
   'store_wavefunction': 'none',
   'implicit_solvent': None,
   'maxiter': 200,
   'scf_properties': ['dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices'],
   'keywords': None}},
 'driver': 'gradient',
 'priority': 'normal',
 'dataset_tags': ['openff'],
 'compute_tag': 'openff',
 'type': 'OptimizationDatasetFactory',
 'workflow': [{'type': 'StandardConformerGenerator',
   'rms_cutoff': None,
   'max_conformers': 50,
   'clear_existing': True}],
 'optimization_program': {'program': 'geometric',
  'coordsys': 'dlc',
  'enforce': 0.0,
  'epsilon': 1e-05,
  'reset': True,
  'qccnv': False,
  'molcnv': False,
  'check': 0,
  'trust': 0.1,
  'tmax': 0.3,
  'maxiter': 300,
  'convergence_set': 'GAU',
  'constraints': {}}}

In [13]:
# Build the dataset from the selected molecules using the configured factory.
dataset = factory.create_dataset(
    dataset_name="OpenFF Iodine Chemistry Optimization Dataset v1.0",
    molecules=molecules,
    tagline="Optimization set created from Gen1 and Gen2 molecules containing iodine",
    description="Molecules containing iodine with the correct auxiliary basis set for DZVP",
)

Deduplication                 : 100%|█████████| 293/293 [00:05<00:00, 52.56it/s]
StandardConformerGenerator    : 100%|███████████| 68/68 [00:05<00:00, 11.64it/s]
Preparation                   : 100%|███████████| 68/68 [00:04<00:00, 15.89it/s]


In [14]:
# Conformer count for every molecule in the dataset.
confs = np.array([len(mol.conformers) for mol in dataset.molecules])
n_confs_min, n_confs_mean, n_confs_max = confs.min(), confs.mean(), confs.max()

# Headline counts reported by the dataset itself.
print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
print(
    "Number of conformers min mean max",
    n_confs_min,
    "{:6.2f}".format(n_confs_mean),
    n_confs_max,
)

# Molecular weight of each molecule, computed with OpenEye on the converted molecule.
masses = [
    oechem.OECalculateMolecularWeight(molecule.to_openeye())
    for molecule in dataset.molecules
]

print(f'Mean molecular weight: {np.mean(np.array(masses)):.2f}')
print(f'Max molecular weight: {np.max(np.array(masses)):.2f}')

# Distinct total charges, with the unit divided out so plain numbers are printed.
unique_charges = {m.total_charge / m.total_charge.unit for m in dataset.molecules}
print("Charges:", sorted(unique_charges))

Number of unique molecules        68
Number of filtered molecules      0
Number of conformers              250
Number of conformers min mean max 1   3.68 22
Mean molecular weight: 318.83
Max molecular weight: 440.19
Charges: [0.0]


In [15]:
# Ask the dataset to write its visualization to dataset.pdf
# (presumably a depiction of the molecules -- confirm against the qcsubmit docs).
dataset.visualize("dataset.pdf")

In [16]:
# Record who is submitting this dataset in its metadata.
dataset.metadata.submitter = 'Pavan Behara'

In [17]:
# Point the metadata at the submission directory in the qca-dataset-submission repository.
dataset.metadata.long_description_url = 'https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2022-07-27-OpenFF-iodine-optimization-set'

In [18]:
# Display the full metadata so the submitter and URL set above can be reviewed.
dataset.metadata

Metadata(submitter='Pavan Behara', creation_date=datetime.date(2022, 7, 27), collection_type='OptimizationDataset', dataset_name='OpenFF Iodine Chemistry Optimization Dataset v1.0', short_description='Optimization set created from Gen1 and Gen2 molecules containing iodine', long_description_url=HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2022-07-27-OpenFF-iodine-optimization-set', scheme='https', host='github.com', tld='com', host_type='domain', port='443', path='/openforcefield/qca-dataset-submission/tree/master/submissions/2022-07-27-OpenFF-iodine-optimization-set'), long_description='Molecules containing iodine with the correct auxiliary basis set for DZVP', elements={'C', 'F', 'O', 'H', 'Br', 'Cl', 'N', 'I', 'S'})

In [19]:
# Export the finished dataset to dataset.json.bz2
# (presumably bz2-compressed JSON, going by the file extension -- confirm).
dataset.export_dataset("dataset.json.bz2")