In [9]:
import tqdm
from openff.qcsubmit.datasets import load_dataset, OptimizationDataset
from openff.qcsubmit.common_structures import QCSpec
from qcelemental.models.procedures import OptimizationProtocols

In [2]:
# Load the two input datasets. Both paths are relative, so the files are
# expected in the directory the notebook is run from.
# The loads are kept as separate statements so that a failure on the second
# file does not discard the already-loaded first dataset.
dataset_50k = load_dataset('50k-dataset.json.xz')
br_dataset = load_dataset('br-sub-dataset.json')

In [3]:
# Size of the 50k set: molecule count, then record count, one value per line.
print(dataset_50k.n_molecules, dataset_50k.n_records, sep="\n")

49527
65116


In [4]:
# Size of the Br set: molecule count, then record count, one value per line.
print(br_dataset.n_molecules, br_dataset.n_records, sep="\n")

610
650


In [5]:
# Merge the two inputs with the dataset `+` operator.
# The sizes printed afterwards (50137 molecules / 65766 records) are exactly
# the sums of the two inputs, which suggests the sets do not overlap.
# NOTE(review): the merged object reports the name
# 'OpenFF ESP Fragment Conformers v1.0' -- presumably inherited from the
# left-hand operand; confirm before publishing it under that name.
combined_dataset = dataset_50k + br_dataset

In [6]:
# Size of the merged set: molecule count, then record count, one value per line.
print(combined_dataset.n_molecules, combined_dataset.n_records, sep="\n")

50137
65766


In [7]:
# Write the merged dataset to a file in the working directory.
# NOTE(review): the `.xz` suffix presumably selects compressed output, as with
# the compressed input file loaded earlier -- confirm against the qcsubmit docs.
# Re-running this cell targets the same file name again.
combined_dataset.export_dataset('combined_dataset.json.xz')

In [8]:
# Check which name the merged dataset carries (displayed as the cell output).
combined_dataset.dataset_name

'OpenFF ESP Fragment Conformers v1.0'

In [10]:
# Inspect the dataset-level settings of the merged dataset. The `dataset`
# field (the per-entry data) is excluded so only the settings are shown.
combined_dataset.dict(exclude={'dataset'})

{'qc_specifications': {'HF/6-31G*': {'method': 'hf',
   'basis': '6-31G*',
   'program': 'psi4',
   'spec_name': 'HF/6-31G*',
   'spec_description': 'The standard HF/6-31G* basis used to derive RESP style charges.',
   'store_wavefunction': 'orbitals_and_eigenvalues',
   'implicit_solvent': None,
   'maxiter': 200,
   'scf_properties': ['dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices'],
   'keywords': {}}},
 'driver': 'energy',
 'priority': 'normal',
 'dataset_tags': ['openff'],
 'compute_tag': 'openff',
 'dataset_name': 'OpenFF ESP Fragment Conformers v1.0',
 'dataset_tagline': 'HF/6-31G* conformers of diverse fragments.',
 'type': 'DataSet',
 'description': 'A dataset that contains a diverse set of fragments generated from the Enamine 10K and 50K diversity libraries, the curated ZINC and ChEMBL (eps=78) molecules provided by Riniker and Bleiziffer and the `OpenFF Industry Benchmark Season 1 Public` molecule set.\n\nThe fragments were generated using the `R

In [13]:
# Create a new, empty optimisation dataset with its own name, description and
# tagline instead of reusing the metadata carried by `combined_dataset`.
# All other settings are left at the OptimizationDataset defaults.
combined_opt_dataset = OptimizationDataset(
    dataset_name="Kuano charge dataset initial v1.0",
    description="An optimisation dataset made by combining the 50K recap set from Simon and the Br sub set from Lily, optimised with AIMNET2 wb97m-d3",
    dataset_tagline="Kuano initial charge dataset"
)

In [16]:
# Replace the dataset's QC specifications with a single AIMNet2 one.
# `clear_qcspecs()` is called first; the settings dump that follows lists only
# the 'aimnet2' specification, so presumably this removes any default spec.
combined_opt_dataset.clear_qcspecs()
# No basis set is given (basis=None).
# The program is passed as "AIMNET2" but appears as 'aimnet2' in the settings
# dump that follows.
combined_opt_dataset.add_qc_spec(
    method="wb97m-d3",
    basis=None,
    program="AIMNET2",
    spec_name="aimnet2",
    spec_description="The wb97m-d3 aimnet2 model"
)

In [17]:
# Dump the complete settings of the new dataset to verify the QC
# specification. Nothing is excluded here; in notebook order no molecules
# have been added yet at this point.
combined_opt_dataset.dict()

{'qc_specifications': {'aimnet2': {'method': 'wb97m-d3',
   'basis': None,
   'program': 'aimnet2',
   'spec_name': 'aimnet2',
   'spec_description': 'The wb97m-d3 aimnet2 model',
   'store_wavefunction': 'none',
   'implicit_solvent': None,
   'maxiter': 200,
   'scf_properties': ['dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices'],
   'keywords': {}}},
 'driver': 'deferred',
 'priority': 'normal',
 'dataset_tags': ['openff'],
 'compute_tag': 'openff',
 'dataset_name': 'Kuano charge dataset initial v1.0',
 'dataset_tagline': 'Kuano initial charge dataset',
 'type': 'OptimizationDataset',
 'description': 'An optimisation dataset made by combining the 50K recap set from Simon and the Br sub set from Lily, optimised with AIMNET2 wb97m-d3',
 'metadata': {'submitter': 'njh238',
  'creation_date': datetime.date(2024, 6, 20),
  'collection_type': 'OptimizationDataset',
  'dataset_name': 'Kuano charge dataset initial v1.0',
  'short_description': 'Kuano initial charg

In [20]:
# Copy every entry of the merged dataset into the optimisation dataset,
# reusing each entry's index, molecule, extras, keywords and attributes.
# `tqdm` is imported in the notebook's first cell; the loop takes a few
# minutes, hence the progress bar.
for index, entry in tqdm.tqdm(combined_dataset.dataset.items()):
    combined_opt_dataset.add_molecule(
        index=index,
        molecule=entry.get_off_molecule(),
        extras=entry.extras,
        keywords=entry.keywords,
        attributes=entry.attributes,
    )

# Guard against entries being dropped or merged during the copy: the new
# dataset must report exactly the same sizes as the one it was built from.
assert combined_opt_dataset.n_molecules == combined_dataset.n_molecules, (
    "molecule count changed during copy: "
    f"{combined_dataset.n_molecules} -> {combined_opt_dataset.n_molecules}"
)
assert combined_opt_dataset.n_records == combined_dataset.n_records, (
    "record count changed during copy: "
    f"{combined_dataset.n_records} -> {combined_opt_dataset.n_records}"
)

100%|██████████| 50209/50209 [03:03<00:00, 273.12it/s]


In [21]:
# Sanity check: should equal the molecule count printed for `combined_dataset` (50137).
combined_opt_dataset.n_molecules

50137

In [22]:
# Sanity check: should equal the record count printed for `combined_dataset` (65766).
combined_opt_dataset.n_records

65766

In [23]:
# Re-inspect the dataset-level settings after the entries were added; the
# `dataset` field (the per-entry data) is excluded from the dump.
combined_opt_dataset.dict(exclude={"dataset"})

{'qc_specifications': {'aimnet2': {'method': 'wb97m-d3',
   'basis': None,
   'program': 'aimnet2',
   'spec_name': 'aimnet2',
   'spec_description': 'The wb97m-d3 aimnet2 model',
   'store_wavefunction': 'none',
   'implicit_solvent': None,
   'maxiter': 200,
   'scf_properties': ['dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices'],
   'keywords': {}}},
 'driver': 'deferred',
 'priority': 'normal',
 'dataset_tags': ['openff'],
 'compute_tag': 'openff',
 'dataset_name': 'Kuano charge dataset initial v1.0',
 'dataset_tagline': 'Kuano initial charge dataset',
 'type': 'OptimizationDataset',
 'description': 'An optimisation dataset made by combining the 50K recap set from Simon and the Br sub set from Lily, optimised with AIMNET2 wb97m-d3',
 'metadata': {'submitter': 'njh238',
  'creation_date': datetime.date(2024, 6, 20),
  'collection_type': 'OptimizationDataset',
  'dataset_name': 'Kuano charge dataset initial v1.0',
  'short_description': 'Kuano initial charg

In [24]:
# Set the trajectory protocol to "final" so that only the final structure of
# each optimisation is saved, to save space.
# `OptimizationProtocols` is imported in the notebook's first cell.
combined_opt_dataset.protocols = OptimizationProtocols(trajectory="final")

In [26]:
# Inspect the dataset-level settings once more after changing the protocols;
# the `dataset` field (the per-entry data) is excluded from the dump.
combined_opt_dataset.dict(exclude={"dataset"})

{'qc_specifications': {'aimnet2': {'method': 'wb97m-d3',
   'basis': None,
   'program': 'aimnet2',
   'spec_name': 'aimnet2',
   'spec_description': 'The wb97m-d3 aimnet2 model',
   'store_wavefunction': 'none',
   'implicit_solvent': None,
   'maxiter': 200,
   'scf_properties': ['dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices'],
   'keywords': {}}},
 'driver': 'deferred',
 'priority': 'normal',
 'dataset_tags': ['openff'],
 'compute_tag': 'openff',
 'dataset_name': 'Kuano charge dataset initial v1.0',
 'dataset_tagline': 'Kuano initial charge dataset',
 'type': 'OptimizationDataset',
 'description': 'An optimisation dataset made by combining the 50K recap set from Simon and the Br sub set from Lily, optimised with AIMNET2 wb97m-d3',
 'metadata': {'submitter': 'njh238',
  'creation_date': datetime.date(2024, 6, 20),
  'collection_type': 'OptimizationDataset',
  'dataset_name': 'Kuano charge dataset initial v1.0',
  'short_description': 'Kuano initial charg

In [27]:
# Write the finished optimisation dataset to a file in the working directory.
# Re-running this cell targets the same file name again.
combined_opt_dataset.export_dataset("combined_opt_dataset.json.xz")