In [1]:
import openff.qcsubmit
import qcportal as ptl
import numpy as np

from openff.toolkit.topology import Molecule
from openff.qcsubmit.datasets import BasicDataset

In [2]:
# Connect to the QCFractal server with the client's default settings
# (no address is passed to FractalClient).
client = ptl.FractalClient()

# Keep the version as the LAST expression of the cell: Jupyter only echoes the
# final expression, so placing it first silently discarded the value.
openff.qcsubmit.__version__

In [3]:
import pickle
import json
def download_torsiondrive_data(dataset_name, spec_name="MP2/heavy-aug-cc-pVTZ",
                               output_path='torsiondrive_data.pickle', fractal_client=None):
    """Download the completed TorsionDrive records of one dataset for one specification.

    Parameters
    ----------
    dataset_name : str
        Name of the "TorsionDriveDataset" collection to load.
    spec_name : str, optional
        Key looked up in every entry's ``object_map`` to find its record id.
        Defaults to the reference specification "MP2/heavy-aug-cc-pVTZ".
    output_path : str or None, optional
        File the result dictionary is pickled to. Pass ``None`` to skip writing.
    fractal_client : optional
        Client used for all queries. When ``None`` the notebook-level ``client``
        variable is used, so existing one-argument calls keep working.

    Returns
    -------
    dict
        Maps each entry index whose record status is 'COMPLETE' to a dict with the
        keys 'initial_molecules', 'final_molecules', 'final_energies',
        'final_gradients' (numpy arrays keyed like the final results),
        'keywords' and 'attributes'.

    Raises
    ------
    KeyError
        If an entry of the dataset has no record id stored under ``spec_name``.
    """
    # Resolve the client late so the global is only required when none is passed in.
    qc_client = client if fractal_client is None else fractal_client

    # load dataset from the qcfractal server
    ds = qc_client.get_collection("TorsionDriveDataset", dataset_name)
    print(f"Loading TorsionDrive Scans from [ {dataset_name} ] spec [{spec_name}]")
    print(f"Found {len(ds.df)} data entries")

    # load torsiondrive record ids from the dataset
    map_record_id_entry_index = {}
    for entry_index in ds.df.index:
        data_entry = ds.get_entry(entry_index)
        try:
            td_record_id = data_entry.object_map[spec_name]
        except KeyError:
            # Same exception type as before, but say which entry/spec was missing.
            raise KeyError(
                f"Entry {entry_index!r} has no TorsionDrive record for spec {spec_name!r}; "
                f"available specs: {list(data_entry.object_map)}"
            ) from None
        map_record_id_entry_index[td_record_id] = entry_index, data_entry.attributes
    print(f"Found {len(map_record_id_entry_index)} torsiondrive records")

    # query all torsiondrive records at the same time
    # NOTE(review): a single query may be capped by a server-side limit for very
    # large datasets - confirm, and batch the ids if that applies.
    td_record_ids = list(map_record_id_entry_index.keys())
    # Do not send an empty id filter to the server; there is nothing to fetch.
    td_records = qc_client.query_procedures(id=td_record_ids) if td_record_ids else []

    torsiondrive_data = {}
    for i, td_record in enumerate(td_records, 1):
        entry_index, attributes = map_record_id_entry_index[td_record.id]
        print(f"{i:5d} : {entry_index:50s} status {td_record.status}")
        # Only finished scans are kept; every other status is reported and skipped.
        if td_record.status == 'COMPLETE':
            torsiondrive_data[entry_index] = {
                'initial_molecules': qc_client.query_molecules(td_record.initial_molecule),
                'final_molecules': td_record.get_final_molecules(),
                'final_energies': td_record.get_final_energies(),
                'final_gradients': {
                    gid: np.array(res.return_result)
                    for gid, res in td_record.get_final_results().items()
                },
                'keywords': td_record.keywords.dict(),
                'attributes': attributes,
            }
    print(f'Downloaded torsion drive data for {len(torsiondrive_data)} completed entries')

    # save as pickle file
    if output_path is not None:
        with open(output_path, 'wb') as pfile:
            pickle.dump(torsiondrive_data, pfile)
    return torsiondrive_data

In [4]:
# Fetch the completed torsiondrive scans of the benchmarking set; the helper
# also pickles the result to disk.
dataset_name = 'OpenFF Theory Benchmarking Set v1.0'
torsiondrive_data = download_torsiondrive_data(dataset_name=dataset_name)

Loading TorsionDrive Scans from [ OpenFF Theory Benchmarking Set v1.0 ] spec [MP2/heavy-aug-cc-pVTZ]
Found 59 data entries
Found 59 torsiondrive records
    1 : [H]c1c(c(c(c(c1[H])[H])[N-][S:3](=[O:4])(=O)[c:2]2[c:1](c(c(c(c2[H])[H])[H])[H])[H])[H])[H] status COMPLETE
    2 : [H]c1c(c([c:1](c(c1[H])[H])[N-:2][S:3](=[O:4])(=O)c2c(c(c(c(c2[H])[H])[H])[H])[H])[H])[H] status COMPLETE
    3 : [H]c1c([c:1]([c:2](c(c1[H])[H])[N-:3][S:4](=O)(=O)c2c(c(c(c(c2[H])[H])[H])[H])[H])[H])[H] status COMPLETE
    4 : [H]c1c(c([n:1][c:2](c1[H])[N-:3][C:4]2=NC(=C(O2)[H])[H])[H])[H] status COMPLETE
    5 : [H]c1c(c(n[c:4](c1[H])[N-:3][C:2]2=NC(=C([O:1]2)[H])[H])[H])[H] status COMPLETE
    6 : [H]c1c(c([n+](c(c1[H])C2=NC(=[C:4]([N:3]2[C:2]([H:1])([H])[H])[H])[H])[H])[H])[H] status COMPLETE
    7 : [H]c1c(c([n+:1]([c:2](c1[H])[C:3]2=[N:4]C(=C(N2C([H])([H])[H])[H])[H])[H])[H])[H] status COMPLETE
    8 : [H]c1c([c:1]([c:2](c(c1[H])[H])[c:3]2[c:4](c(c(c(c2[N+]([H])([H])[H])[H])[H])[H])[H])[H])[H] status COMPLET

In [5]:
from openff.qcsubmit.factories import BasicDatasetFactory
from qcportal.models.common_models import DriverEnum

# Create the dataset factory with its driver set to DriverEnum.energy.
factory = BasicDatasetFactory(driver=DriverEnum.energy)

In [6]:
# Display the factory's configuration, including its 'default' QC specification.
factory

BasicDatasetFactory(qc_specifications={'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>, implicit_solvent=None, maxiter=200, scf_properties=[<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>], keywords=None)}, driver=<DriverEnum.energy: 'energy'>, priority='normal', dataset_tags=['openff'], compute_tag='openff', type='BasicDatasetFactory', workflow=[])

In [7]:
# Name and descriptive text of the single-point dataset that is created next.
# Note: `dataset_name` is rebound here; it previously held the torsiondrive set name.
dataset_name = "OpenFF Theory Benchmarking Single Point Energies v1.0"
description = "A basic dataset for benchmarking different levels of theory"
tagline = "Single point energies at final geometries of torsiondrives calculated at reference spec MP2/heavy-aug-cc-pVTZ"
spec_description = description

In [8]:
# Start from an empty dataset (no molecules); entries are attached manually later.
# NOTE(review): the factory's 'default' spec is NOT renamed here - it stays in the
# dataset under the name 'default'. Confirm that is intended.
dataset = factory.create_dataset(
    dataset_name=dataset_name,
    molecules=[],
    description=description,
    tagline=tagline,
)

Deduplication                 : 0it [00:00, ?it/s]
Preparation                   : 0it [00:00, ?it/s]


In [9]:
# Levels of theory to benchmark, as (method, basis) pairs in registration order.
# Each pair is registered under the spec name "<method>/<basis>".
SINGLE_POINT_LEVELS = [
    ('B3LYP-D3BJ', 'DEF2-TZVP'),
    ('B3LYP-D3BJ', 'DEF2-TZVPD'),
    ('B3LYP-D3BJ', 'DEF2-TZVPP'),
    ('B3LYP-D3BJ', 'DEF2-TZVPPD'),
    ('B3LYP-D3BJ', 'DEF2-QZVP'),
    ('B3LYP-D3BJ', '6-31+G**'),
    ('B3LYP-D3BJ', '6-311+G**'),
    ('B97-D3BJ', 'TZVP'),
    ('M05-2X-D3', 'DZVP'),
    ('M06-2X-D3', 'DZVP'),
    ('M08-HX-D3', 'DZVP'),
    ('WB97X-D3BJ', 'DZVP'),
    ('WB97M-D3BJ', 'DZVP'),
    ('WB97M-V', 'DZVP'),
    ('PW6B95-D3BJ', 'DZVP'),
    ('PW6B95-D3', 'DZVP'),
    ('B3LYP-D3MBJ', 'DZVP'),
    ('MP2', 'aug-cc-pVTZ'),
    ('DSD-BLYP-D3BJ', 'heavy-aug-cc-pVTZ'),
]

# One add_qc_spec call per level; everything except method/basis/name is shared.
for sp_method, sp_basis in SINGLE_POINT_LEVELS:
    dataset.add_qc_spec(
        basis=sp_basis,
        implicit_solvent=None,
        keywords=None,
        method=sp_method,
        program='psi4',
        spec_description='Single point calculation',
        spec_name=f'{sp_method}/{sp_basis}',
        store_wavefunction='none',
    )

# The composite reference spec does not follow the "<method>/<basis>" pattern:
# the whole recipe is carried in the method string, basis is None, and it is the
# only spec with its own keywords - so it is registered explicitly.
dataset.add_qc_spec(
    basis=None,
    implicit_solvent=None,
    keywords={"scf_type": "df", "mp2_type": "df", "guess": "sad", "cc_type": "df", "freeze_core": True},
    method="mp2/heavy-aug-cc-pv[tq]z + D:ccsd(t)/heavy-aug-cc-pvdz",
    program='psi4',
    spec_description='Single point calculation',
    spec_name='DF-CCSD(T)/CBS',
    store_wavefunction='none',
)

In [10]:
# Display every QC specification now registered on the dataset
# (the factory's 'default' spec plus the ones added above).
dataset.qc_specifications

{'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>, implicit_solvent=None, maxiter=200, scf_properties=[<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>], keywords=None),
 'B3LYP-D3BJ/DEF2-TZVP': QCSpec(method='B3LYP-D3BJ', basis='DEF2-TZVP', program='psi4', spec_name='B3LYP-D3BJ/DEF2-TZVP', spec_description='Single point calculation', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>, implicit_solvent=None, maxiter=200, scf_properties=[<SCFProperties.Dipole: 'dipole'>, <SCFProperties.Quadrupole: 'quadrupole'>, <SCFProperties.WibergLowdinIndices: 'wiberg_lowdin_indices'>, <SCFProperties.MayerIndices: 'mayer_indices'>], keywords=None),
 'B3LYP-D3BJ/DEF2-TZVPD': QCSpec

In [11]:
from openff.qcsubmit.datasets import DatasetEntry
from openff.toolkit.topology import Molecule as OFFMolecule

# Turn every downloaded torsiondrive into one dataset entry whose conformers are
# the final (optimized) geometries of the scan.
# NOTE(review): entries are indexed by their position `idx`, not by the
# torsiondrive entry name - confirm that this naming is intended.
for idx, td_record in enumerate(torsiondrive_data.values()):
    data_entry = DatasetEntry(
        index=idx,
        # One molecule per grid point, in the order stored in 'final_molecules'.
        initial_molecules=list(td_record['final_molecules'].values()),
        attributes=td_record["attributes"],
        extras={},
        keywords={"scf_type": "df", "mp2_type": "df", "guess": "sad"},
    )
    dataset.dataset[idx] = data_entry



In [12]:
# Fill in the submission metadata of the dataset.
dataset.metadata.submitter = 'pavankum'
# Same value as the `dataset_name` the dataset was created with.
dataset.metadata.dataset_name = 'OpenFF Theory Benchmarking Single Point Energies v1.0'
dataset.metadata.long_description = 'Single point calculations of torsiondrive final geometries from reference spec (MP2/heavy-aug-cc-pVTZ) calculation with different basis sets and functionals'
# Link to the submission folder in the qca-dataset-submission repository.
dataset.metadata.long_description_url = 'https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-09-06-theory-bm-single-points'

In [13]:
# Display the metadata to check the fields assigned above.
dataset.metadata

Metadata(submitter='pavankum', creation_date=datetime.date(2021, 9, 9), collection_type='DataSet', dataset_name='OpenFF Theory Benchmarking Single Point Energies v1.0', short_description='Single point energies at final geometries of torsiondrives calculated at reference spec MP2/heavy-aug-cc-pVTZ', long_description_url=HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2021-09-06-theory-bm-single-points', scheme='https', host='github.com', tld='com', host_type='domain', path='/openforcefield/qca-dataset-submission/tree/master/submissions/2021-09-06-theory-bm-single-points'), long_description='Single point calculations of torsiondrive final geometries from reference spec (MP2/heavy-aug-cc-pVTZ) calculation with different basis sets and functionals', elements=set())

In [14]:
# Sanity check: molecule count reported by the dataset.
dataset.n_molecules

39

In [15]:
# Sanity check: record count reported by the dataset.
dataset.n_records

1416

In [16]:
# Sanity check: number of QC specifications (includes the factory's 'default' spec).
dataset.n_qc_specs

21

In [17]:
# Export the finished dataset to "dataset.json.bz2"; the path is relative, so the
# file lands in the notebook's current working directory.
dataset.export_dataset("dataset.json.bz2")