This dataset consolidates the theory-benchmark torsiondrive datasets. We reuse a previously submitted dataset JSON to preserve the exact inputs and trigger task deduplication at the archive level, while combining multiple existing and new QC specifications.

In [57]:
import json
import os

import numpy as np
import tqdm
from openeye import oechem
from openforcefield.topology import Molecule as OFFMolecule
from qcsubmit import workflow_components
from qcsubmit.common_structures import TorsionIndexer
from qcsubmit.datasets import TorsiondriveDataset
from qcsubmit.datasets import load_dataset
from qcsubmit.factories import TorsiondriveDatasetFactory

In [58]:
# Conformer generation step, capped at 5 conformers (max_conformers=5).
# TODO: also filter out methyl-group rotations once qcsubmit supports that feature.
conformer_generator = workflow_components.StandardConformerGenerator(max_conformers=5)

# Torsiondrive factory with the SCF properties to request and the conformer
# generator registered as a workflow component.
factory = TorsiondriveDatasetFactory()
factory.scf_properties = [
    'dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices',
]
factory.add_workflow_component(conformer_generator)

In [59]:
# Write the factory settings configured above to a YAML file so the
# configuration is kept alongside the dataset.
factory.export_settings("theory-bm-set_settings.yaml")

In [60]:
# Build the dataset from `selected_mol2s`, which contains the mol2 files of the
# additional charged molecules, using the factory configured earlier.
dataset = factory.create_dataset(
    dataset_name="OpenFF Theory Benchmarking Set v1.0",
    molecules='selected_mol2s',
    description="A torsiondrive dataset for theory benchmarking",
    tagline="Torsiondrives for theory benchmarking",
    verbose=False)

# Register one psi4 QC specification per basis set; the method is the same for all.
method = "B3LYP-D3BJ"
basis_sets = ['def2-tzvp', 'def2-tzvpd', 'def2-tzvpp', 'def2-tzvppd', 'def2-qzvp', '6-31+g**', '6-311+g**']
for basis in basis_sets:
    # The spec name uses the upper-cased basis; the basis itself is passed as written.
    spec_name = f"{method}/{basis.upper()}"
    # No trailing whitespace here: this text is passed on to the dataset as-is.
    spec_description = f"A torsiondrive dataset for benchmarking {spec_name}"
    print(f'spec_name: {spec_name}, spec_description: {spec_description}')
    dataset.add_qc_spec(method=method,
                        basis=basis,
                        program="psi4",
                        spec_name=spec_name,
                        spec_description=spec_description)

Deduplication                 : 100%|███████████| 9/9 [00:00<00:00, 1075.52it/s]


spec_name: B3LYP-D3BJ/DEF2-TZVP , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/DEF2-TZVP 
spec_name: B3LYP-D3BJ/DEF2-TZVPD , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/DEF2-TZVPD 
spec_name: B3LYP-D3BJ/DEF2-TZVPP , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/DEF2-TZVPP 
spec_name: B3LYP-D3BJ/DEF2-TZVPPD , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/DEF2-TZVPPD 
spec_name: B3LYP-D3BJ/DEF2-QZVP , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/DEF2-QZVP 
spec_name: B3LYP-D3BJ/6-31+G** , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/6-31+G** 
spec_name: B3LYP-D3BJ/6-311+G** , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/6-311+G** 




In [61]:
# Report the molecule and record counts of the newly created dataset
# (before it is combined with the previously submitted one).
print(f'Number of additional charged molecules: {dataset.n_molecules}')
print(f'Number of torsions generated from the additional set: {dataset.n_records}')

Number of additional charged molecules: 9
Number of torsions generated from the additional set: 23


In [62]:
# Load the previously submitted dataset from its JSON export.
# NOTE(review): the file name contains a space ("dataset 2.json") — confirm this
# is the intended file and not an accidental copy.
first_dataset = load_dataset("dataset 2.json")



In [63]:
# Inspect the dataset-level settings of the loaded dataset. The "dataset" field
# is excluded from the dump (presumably the per-molecule entries — TODO confirm).
first_dataset.dict(exclude={"dataset"})

{'qc_specifications': {'default': {'method': 'B3LYP-D3BJ',
   'basis': 'DZVP',
   'program': 'psi4',
   'spec_name': 'default',
   'spec_description': 'Standard OpenFF optimization quantum chemistry specification.',
   'store_wavefunction': 'none',
   'implicit_solvent': None}},
 'dataset_name': 'OpenFF Theory Benchmarking Set B3LYP-D3BJ def2-TZVP v1.0',
 'dataset_tagline': 'Torsiondrives for benchmarking B3LYP-D3BJ/def2-TZVP',
 'dataset_type': 'TorsiondriveDataset',
 'maxiter': 200,
 'driver': <DriverEnum.gradient: 'gradient'>,
 'scf_properties': ['dipole',
  'quadrupole',
  'wiberg_lowdin_indices',
  'mayer_indices'],
 'priority': 'normal',
 'description': 'A torsiondrive dataset for benchmarking B3LYP-D3BJ/def2-TZVP',
 'dataset_tags': ['openff'],
 'compute_tag': 'openff',
 'metadata': {'submitter': 'hyesu',
  'creation_date': datetime.date(2020, 7, 30),
  'collection_type': 'TorsiondriveDataset',
  'dataset_name': 'OpenFF Theory Benchmarking Set B3LYP-D3BJ def2-TZVP v1.0',
  'short_

In [64]:
# Combine the newly generated dataset with the previously submitted one using
# the dataset `+` operator.
# NOTE(review): how `+` merges overlapping entries and QC specifications is not
# visible here — confirm against the qcsubmit documentation.
all_data = dataset + first_dataset

In [65]:
# Report the molecule and record counts of the combined dataset.
print(f'Number of total molecules: {all_data.n_molecules}')
print(f'Number of total 1-D torsions: {all_data.n_records}')

Number of total molecules: 40
Number of total 1-D torsions: 59


In [66]:
# Molecular weight of every molecule in the combined dataset, computed with
# OpenEye on the molecule's OpenEye representation.
# (`oechem` and `np` are imported in the notebook's import cell.)
masses = [
    oechem.OECalculateMolecularWeight(molecule.to_openeye())
    for molecule in all_data.molecules
]
# numpy reductions accept a plain list, so no explicit array conversion is needed.
print(f'Mean molecular weight: {np.mean(masses):.2f}')
print(f'Max molecular weight: {np.amax(masses):.2f}')

Mean molecular weight: 168.00
Max molecular weight: 233.29


In [67]:
# Display the `elements` entry of the combined dataset's metadata.
all_data.metadata.elements

{'C', 'Cl', 'F', 'H', 'N', 'O', 'P', 'S'}

In [68]:
# Point the metadata at this submission's folder in the qca-dataset-submission
# repository and record the submitter name.
all_data.metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2020-12-18-OpenFF-Theory-Benchmarking-Set-v1.0"
all_data.metadata.submitter = 'hyejang'

In [69]:
# Export the combined dataset to "dataset.json.bz2"
# (presumably bz2-compressed JSON, judging by the extension — TODO confirm).
all_data.export_dataset("dataset.json.bz2")

In [70]:
# Write the molecules of the combined dataset to a file in "smi" format.
all_data.molecules_to_file("theory-bm-set-extended.smi", "smi")

In [71]:
# Export the molecules of the combined dataset to a PDF with the torsions
# highlighted, rendered with the OpenEye toolkit.
all_data.visualize("theory-bm-set-extended.pdf", toolkit='openeye')