In [1]:
from pprint import pprint

import numpy as np
from openeye import oechem
from openff.qcsubmit.common_structures import QCSpec
from openff.qcsubmit.factories import BasicDatasetFactory
from openff.toolkit.topology import Molecule

In [2]:
# Build a dataset factory with its default settings; later cells use it to
# create the dataset.
factory = BasicDatasetFactory()
# Bare last expression so the notebook displays the factory's current settings
# (QC specification, driver, priority, tags, workflow) as a plain dict.
factory.dict()

{'qc_specifications': {'default': {'method': 'B3LYP-D3BJ',
   'basis': 'DZVP',
   'program': 'psi4',
   'spec_name': 'default',
   'spec_description': 'Standard OpenFF optimization quantum chemistry specification.',
   'store_wavefunction': 'none',
   'implicit_solvent': None,
   'maxiter': 200,
   'scf_properties': ['dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices'],
   'keywords': None}},
 'driver': 'energy',
 'priority': 'normal',
 'dataset_tags': ['openff'],
 'compute_tag': 'openff',
 'type': 'BasicDatasetFactory',
 'workflow': []}

In [4]:
# Create the single-point dataset from the molecules stored in dipeptides.hdf5.
# NOTE(review): the tagline and description are still placeholder strings and
# end up verbatim in the dataset metadata; replace them before submission.
dataset = factory.create_dataset(
    dataset_name="ML_Single_Point_Dataset",
    molecules="dipeptides.hdf5",
    tagline="XXXXXXXXX",
    description="XXXXXXXX",
)

Deduplication                 : 100%|█████████| 677/677 [00:12<00:00, 54.93it/s]






Preparation                   : 100%|█████████| 677/677 [02:57<00:00,  3.81it/s]


In [5]:
# Conformer count per molecule, used for the min/mean/max summary line below.
confs = np.array([len(mol.conformers) for mol in dataset.molecules])
mean_confs = f"{confs.mean():6.2f}"

print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
print("Number of conformers min mean max", confs.min(), mean_confs, confs.max())

# Molecular weight of every molecule, computed by OpenEye on the converted molecule.
masses = [
    oechem.OECalculateMolecularWeight(mol.to_openeye())
    for mol in dataset.molecules
]

print(f'Mean molecular weight: {np.mean(masses):.2f}')
print(f'Max molecular weight: {np.max(masses):.2f}')

# Dividing each total charge by its own unit leaves plain numbers, so the
# distinct values can be collected in a set and sorted for display.
charges = {mol.total_charge / mol.total_charge.unit for mol in dataset.molecules}
print("Charges:", sorted(charges))

Number of unique molecules        677
Number of filtered molecules      0
Number of conformers              33850
Number of conformers min mean max 50  50.00 50
Mean molecular weight: 313.72
Max molecular weight: 445.51
Charges: [-2.0, -1.0, 0.0, 1.0, 2.0]


In [7]:
# Write a visual overview of the dataset to dipeptides.pdf.
# NOTE(review): presumably one depiction per molecule; confirm the layout
# options against the QCSubmit documentation.
dataset.visualize("dipeptides.pdf")

In [8]:
# Dump the dataset's molecules to dipeptides.smi; the second argument selects
# the "smi" output format (presumably SMILES strings, confirm in QCSubmit docs).
dataset.molecules_to_file("dipeptides.smi", "smi")

In [9]:
# Display the metadata recorded on the dataset (submitter, creation date,
# dataset name, short/long descriptions, elements) as a final sanity check.
# NOTE(review): the descriptions shown here are still the placeholder text
# passed to create_dataset.
dataset.metadata

Metadata(submitter='joshua', creation_date=datetime.date(2021, 9, 20), collection_type='DataSet', dataset_name='ML_Single_Point_Dataset', short_description='XXXXXXXXX', long_description_url=None, long_description='XXXXXXXX', elements={'C', 'N', 'O', 'S', 'H'})

In [10]:
# Serialize the finished dataset to disk for submission.
# NOTE(review): the file name suggests bz2-compressed JSON, presumably inferred
# from the extension; confirm against the QCSubmit documentation.
dataset.export_dataset("dataset.json.bz2")