In [2]:
from openff.qcsubmit.datasets import load_dataset, OptimizationDataset
from openff.qcsubmit.common_structures import QCSpec

In [8]:
# Load the previously exported single-point dataset from its bz2-compressed JSON file.
esp_50k_I = load_dataset('esp_50k_I_singlepoint_dataset.json.bz2')

In [9]:
# Report the molecule count and the record count of the loaded dataset, one per line.
print(esp_50k_I.n_molecules, esp_50k_I.n_records, sep="\n")

5604
6043


In [10]:
# Display the name stored on the loaded source dataset.
esp_50k_I.dataset_name

'ESP 50k opt Iodines'

In [11]:
# Inspect the source dataset's settings and metadata only. The per-molecule
# `dataset` field is excluded so the cell output is not flooded with every entry
# (same pattern as the later `opt_dataset.dict(exclude={"dataset"})` cells).
esp_50k_I.dict(exclude={"dataset"})

{'qc_specifications': {'wb97x-d/def2-tzvpp': {'method': 'wb97x-d',
   'basis': 'def2-tzvpp',
   'program': 'psi4',
   'spec_name': 'wb97x-d/def2-tzvpp',
   'spec_description': 'wb97x-d/def2-tzvpp gas',
   'store_wavefunction': 'none',
   'implicit_solvent': None,
   'maxiter': 200,
   'scf_properties': ['dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices'],
   'keywords': {}}},
 'driver': 'energy',
 'priority': 'normal',
 'dataset_tags': ['openff'],
 'compute_tag': 'openff',
 'dataset_name': 'ESP 50k opt Iodines',
 'dataset_tagline': 'ESP DATASET',
 'type': 'DataSet',
 'description': 'A combined dataset of the recap 50k molecules filtered for Cls and Brs which are then converted to I',
 'metadata': {'submitter': 'nca121',
  'creation_date': datetime.date(2024, 10, 15),
  'collection_type': 'DataSet',
  'dataset_name': 'ESP 50k opt Iodines',
  'short_description': 'ESP DATASET',
  'long_description_url': None,
  'long_description': 'A combined dataset of the reca

In [3]:
# Create the empty optimisation dataset that the source entries are copied into.
# The description is built from adjacent string literals instead of a backslash
# continuation inside a single literal: the continuation kept the next line's
# indentation as part of the text, leaving a run of spaces in the stored
# description ("... from Lily         and replacing ...").
opt_dataset = OptimizationDataset(
    dataset_name="Kuano charge dataset add Iodines v1.0",
    description=(
        "An optimisation dataset made by filtering the 50K recap set from Simon "
        "and the Br sub set from Lily and replacing all the Br and Cl with Iodines, "
        "optimised with AIMNET2 wb97m-d3"
    ),
    dataset_tagline="Kuano charge dataset with Iodines",
)

In [4]:
# Remove whatever QC specifications are currently registered on the dataset,
# then register a single AIMNET2 specification (no basis set is given).
opt_dataset.clear_qcspecs()

aimnet2_spec = {
    "method": "wb97m-d3",
    "basis": None,
    "program": "AIMNET2",
    "spec_name": "aimnet2",
    "spec_description": "The wb97m-d3 aimnet2 model",
}
opt_dataset.add_qc_spec(**aimnet2_spec)

In [5]:
# Display the dataset as a dictionary to check the registered QC specification and metadata.
opt_dataset.dict()

{'qc_specifications': {'aimnet2': {'method': 'wb97m-d3',
   'basis': None,
   'program': 'aimnet2',
   'spec_name': 'aimnet2',
   'spec_description': 'The wb97m-d3 aimnet2 model',
   'store_wavefunction': 'none',
   'implicit_solvent': None,
   'maxiter': 200,
   'scf_properties': ['dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices'],
   'keywords': {}}},
 'driver': 'deferred',
 'priority': 'normal',
 'dataset_tags': ['openff'],
 'compute_tag': 'openff',
 'dataset_name': 'Kuano charge dataset add Iodines v1.0',
 'dataset_tagline': 'Kuano charge dataset with Iodines',
 'type': 'OptimizationDataset',
 'description': 'An optimisation dataset made by filtering the 50K recap set from Simon and the Br sub set from Lily         and replacing all the Br and Cl with Iodines, optimised with AIMNET2 wb97m-d3',
 'metadata': {'submitter': 'nca121',
  'creation_date': datetime.date(2024, 10, 17),
  'collection_type': 'OptimizationDataset',
  'dataset_name': 'Kuano charge dat

In [12]:
import tqdm

# Copy every entry of the single-point dataset into the optimisation dataset,
# carrying over its index, molecule, extras, keywords and attributes.
# tqdm wraps the iteration to show a progress bar.
for record_index, record in tqdm.tqdm(esp_50k_I.dataset.items()):
    transferred_fields = {
        "index": record_index,
        "molecule": record.get_off_molecule(),
        "extras": record.extras,
        "keywords": record.keywords,
        "attributes": record.attributes,
    }
    opt_dataset.add_molecule(**transferred_fields)

100%|██████████| 5606/5606 [00:59<00:00, 94.30it/s] 


In [13]:
# Sanity check: molecule count after the copy (compare with the source dataset's count).
opt_dataset.n_molecules

5604

In [14]:
# Sanity check: record count after the copy (compare with the source dataset's count).
opt_dataset.n_records

6043

In [15]:
# Show the dataset settings and metadata, leaving out the "dataset" field to keep the output short.
opt_dataset.dict(exclude={"dataset"})

{'qc_specifications': {'aimnet2': {'method': 'wb97m-d3',
   'basis': None,
   'program': 'aimnet2',
   'spec_name': 'aimnet2',
   'spec_description': 'The wb97m-d3 aimnet2 model',
   'store_wavefunction': 'none',
   'implicit_solvent': None,
   'maxiter': 200,
   'scf_properties': ['dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices'],
   'keywords': {}}},
 'driver': 'deferred',
 'priority': 'normal',
 'dataset_tags': ['openff'],
 'compute_tag': 'openff',
 'dataset_name': 'Kuano charge dataset add Iodines v1.0',
 'dataset_tagline': 'Kuano charge dataset with Iodines',
 'type': 'OptimizationDataset',
 'description': 'An optimisation dataset made by filtering the 50K recap set from Simon and the Br sub set from Lily         and replacing all the Br and Cl with Iodines, optimised with AIMNET2 wb97m-d3',
 'metadata': {'submitter': 'nca121',
  'creation_date': datetime.date(2024, 10, 17),
  'collection_type': 'OptimizationDataset',
  'dataset_name': 'Kuano charge dat

In [None]:
from qcelemental.models.procedures import OptimizationProtocols

# Keep only the final structure of each optimisation trajectory to save space.
final_only_protocols = OptimizationProtocols(trajectory="final")
opt_dataset.protocols = final_only_protocols

In [None]:
# Re-inspect the settings (without the "dataset" field) after assigning the protocols.
opt_dataset.dict(exclude={"dataset"})

In [None]:
# Write the finished dataset to disk.
# NOTE(review): presumably the ".json.xz" extension selects xz compression — confirm against the export_dataset docs.
opt_dataset.export_dataset("opt_dataset.json.xz")