In [1]:
from openff.qcsubmit.common_structures import QCSpec, SCFProperties
from openff.qcsubmit.factories import BasicDatasetFactory
import numpy as np
from qcelemental.models.results import WavefunctionProtocolEnum
from qcportal.models.common_models import DriverEnum
from openeye import oechem
import logging
# Raise the "openff" logger threshold to ERROR so lower-severity messages
# from that logger hierarchy are suppressed in the notebook output.
logging.getLogger("openff").setLevel(logging.ERROR)

In [2]:
import qcportal as ptl
from collections import Counter
# Build the client from the local QCPortal configuration file, then fetch the
# handle of the original DES370K single-points dataset.
source_dataset_name = "SPICE DES370K Single Points Dataset v1.0"
client = ptl.FractalClient.from_file()
ds = client.get_collection("Dataset", source_dataset_name)

In [3]:
%%time
# Use the first row of the dataset's record listing as the specification
# (driver / program / method / basis / keywords) to query records with.
first_listing_row = ds.list_records().iloc[0]
spec = first_listing_row.to_dict()
spec

CPU times: user 26.3 ms, sys: 5.13 ms, total: 31.4 ms
Wall time: 28.2 ms


{'driver': 'gradient',
 'program': 'psi4',
 'method': 'wb97m-d3bj',
 'basis': 'def2-tzvppd',
 'keywords': 'spice_default_no_mbis',
 'name': 'WB97M-D3BJ/def2-tzvppd-spice_default'}

In [3]:
%%time
# Pull all records matching the chosen specification. The recorded wall time
# for this cell was over an hour, so avoid re-running it needlessly.
recs = ds.get_records(
    **{field: spec[field] for field in ("method", "basis", "program", "keywords")}
)

CPU times: user 3min 12s, sys: 25.8 s, total: 3min 38s
Wall time: 1h 8min 45s


In [10]:
from collections import defaultdict
import pandas as pd

# Gather every record whose status is ERROR and persist the record ids, so the
# failed calculations can be looked up again without repeating the slow
# `get_records` query.
#
# The original loop body began with `elif` and no preceding `if`, which is a
# SyntaxError; it is a plain filter on the record status. Iterating the
# "record" column once also avoids building a row Series per `.iloc` lookup.
err_recs = [record for record in recs["record"] if record.status == 'ERROR']
num_error = len(err_recs)

with open('failed_record_ids_SPICE_DES370K_Single_Points_Dataset_v1.0.txt', 'w') as f:
    # One record id per line; the file is read back with `splitlines()`.
    f.writelines("%s\n" % item.id for item in err_recs)

# Summarise instead of echoing thousands of ids into the cell output.
print(f"Wrote {num_error} errored record ids")

In [4]:
# Reload the saved ids of the errored records, one id per line.
failed_ids_path = 'failed_record_ids_SPICE_DES370K_Single_Points_Dataset_v1.0.txt'
with open(failed_ids_path, 'r') as f:
    failed_ids_text = f.read()
errored_ids = failed_ids_text.splitlines()

In [5]:
# For each errored record id, look up the result record and then the molecule
# it refers to. Each lookup takes the first hit of its query.
molecules = []
for rec_id in errored_ids:
    record_fields = client.query_results(rec_id)[0].dict()
    molecule_hits = client.query_molecules(record_fields['molecule'])
    molecules.append(molecule_hits[0])

In [6]:
# Number of molecules retrieved, one per errored record id.
len(molecules)

3637

In [7]:
from openff.toolkit.topology import Molecule
import json

# Convert each retrieved molecule to an OpenFF Molecule by round-tripping it
# through its JSON serialisation into a plain dict.
offmols = [
    Molecule.from_qcschema(json.loads(mol.json()))
    for mol in molecules
]

In [8]:
# The single QC specification for the supplement. The SCF property list holds
# only dipole, quadrupole, Wiberg-Lowdin and Mayer indices.
spice_spec = QCSpec(
    method='wb97m-d3bj',
    basis='def2-tzvppd',
    program='psi4',
    spec_name='spice_default',
    spec_description='SPICE quantum chemistry specification',
    store_wavefunction=None,
    implicit_solvent=None,
    maxiter=200,
    scf_properties=[
        SCFProperties.Dipole,
        SCFProperties.Quadrupole,
        SCFProperties.WibergLowdinIndices,
        SCFProperties.MayerIndices,
    ],
    keywords={'wcombine': False},
)

# Gradient-driver factory carrying just that one specification.
factory = BasicDatasetFactory(
    driver=DriverEnum.gradient,
    qc_specifications={'spice_default': spice_spec},
    store_wavefunction=None,
)

In [9]:
# Build the supplement dataset from the molecules whose original records
# errored. The description literal is kept on a single line so the stored
# text is unchanged.
dataset = factory.create_dataset(
    dataset_name="SPICE DES370K Single Points Dataset Supplement v1.0",
    molecules=offmols,
    tagline="QM dataset for ML",
    description="For the molecules that failed MBIS convergence on the original submission this is a supplement, excluding the MBIS charge calculation in scf_properties, to make use of forces and energies. Original submission of DES370K: https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-11-08-QMDataset-DES370K-single-points. Main data source: https://github.com/openmm/qmdataset/tree/main/des370k",
)

Deduplication                 : 100%|███████| 3637/3637 [02:49<00:00, 21.49it/s]
Preparation                   : 100%|███████████| 93/93 [00:08<00:00, 10.59it/s]


In [10]:
# Conformer count for every molecule in the prepared dataset.
confs = np.array([len(mol.conformers) for mol in dataset.molecules])
conf_mean_text = "{:6.2f}".format(confs.mean())

print("Number of unique molecules       ", dataset.n_molecules)
print("Number of filtered molecules     ", dataset.n_filtered)
print("Number of conformers             ", dataset.n_records)
print("Number of conformers min mean max", confs.min(), conf_mean_text, confs.max())

# Molecular weight of each molecule, computed with OpenEye on the converted
# OEMol.
masses = [
    oechem.OECalculateMolecularWeight(molecule.to_openeye())
    for molecule in dataset.molecules
]
mass_array = np.array(masses)

print(f'Mean molecular weight: {np.mean(mass_array):.2f}')
print(f'Max molecular weight: {np.max(mass_array):.2f}')

# Distinct total charges, with the unit divided out so plain numbers print.
unique_charges = {m.total_charge/m.total_charge.unit for m in dataset.molecules}
print("Charges:", sorted(unique_charges))

Number of unique molecules        93
Number of filtered molecules      0
Number of conformers              3631
Number of conformers min mean max 1  39.04 2677
Mean molecular weight: 101.83
Max molecular weight: 304.85
Charges: [-1.0, 0.0, 1.0]


In [11]:
# Call the dataset's visualize method with a PDF filename.
# NOTE(review): presumably writes depictions of the dataset molecules to that
# file — confirm against the QCSubmit documentation.
dataset.visualize("des370k_supplement.pdf")

In [12]:
# Write the dataset's molecules to a file, passing "smi" as the format.
# NOTE(review): presumably one SMILES per unique molecule — confirm.
dataset.molecules_to_file("des370k_supplement.smi", "smi")

In [13]:
# Record the submitters' names in the dataset metadata.
dataset.metadata.submitter = 'Josh Horton, Pavan Behara, David Dotson'

In [14]:
# Point the metadata's long-description URL at this supplement's directory in
# the qca-dataset-submission repository.
dataset.metadata.long_description_url = 'https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2022-02-18-QMDataset-DES370K-single-points-supplement'

In [15]:
# Display the full metadata object for a final visual check before export.
dataset.metadata

Metadata(submitter='Josh Horton, Pavan Behara, David Dotson', creation_date=datetime.date(2022, 2, 19), collection_type='DataSet', dataset_name='SPICE DES370K Single Points Dataset Supplement v1.0', short_description='QM dataset for ML', long_description_url=HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2022-02-18-QMDataset-DES370K-single-points-supplement', scheme='https', host='github.com', tld='com', host_type='domain', port='443', path='/openforcefield/qca-dataset-submission/tree/master/submissions/2022-02-18-QMDataset-DES370K-single-points-supplement'), long_description='For the molecules that failed MBIS convergence on the original submission this is a supplement, excluding the MBIS charge calculation in scf_properties, to make use of forces and energies. Original submission of DES370K: https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-11-08-QMDataset-DES370K-single-points. Main data source: https://g

In [16]:
# Export the finished dataset to disk.
# NOTE(review): the ".json.bz2" extension suggests bz2-compressed JSON —
# confirm how export_dataset picks the format.
dataset.export_dataset("dataset.json.bz2")