In [1]:
import qcsubmit
# Display the installed qcsubmit version as the cell output.
qcsubmit.__version__

'0.1.1'

In [2]:
import qcportal as ptl
import numpy as np
from openforcefield.topology import Molecule
from qcsubmit.results import OptimizationCollectionResult
from qcsubmit.datasets import OptimizationDataset

# Module-level QCPortal client, created with no arguments; the download helper
# below reads this global directly.
# NOTE(review): presumably this targets the public QCFractal server — confirm.
client = ptl.FractalClient()

In [3]:
import pickle
import json
def download_torsiondrive_data(dataset_name, output_path='torsiondrive_data.pickle'):
    """Download the completed TorsionDrive records of a dataset and pickle them.

    The module-level ``client`` is used to fetch the "TorsionDriveDataset"
    collection called ``dataset_name``. Records are looked up under the first
    specification returned by ``list_specifications()``.

    Parameters
    ----------
    dataset_name : str
        Name of the TorsionDriveDataset collection to load.
    output_path : str or None, optional
        File the resulting dictionary is pickled to. Defaults to
        'torsiondrive_data.pickle'. Pass ``None`` to skip writing a file.

    Returns
    -------
    dict
        Maps each dataset entry index to a dict with the keys
        'initial_molecules', 'final_molecules', 'final_energies',
        'final_gradients', 'keywords' and 'attributes'. Only records whose
        status is 'COMPLETE' are included.
    """
    # load dataset from public qcfractal server
    ds = client.get_collection("TorsionDriveDataset", dataset_name)
    spec_name = ds.list_specifications().index[0]
    print(f"Loading TorsionDrive Scans from [ {dataset_name} ] spec [{spec_name}]")
    print(f"Found {len(ds.df)} data entries")
    # load torsiondrive record ids from the dataset
    map_record_id_entry_index = {}
    for entry_index in ds.df.index:
        data_entry = ds.get_entry(entry_index)
        td_record_id = data_entry.object_map[spec_name]
        map_record_id_entry_index[td_record_id] = entry_index, data_entry.attributes
    print(f"Found {len(map_record_id_entry_index)} torsiondrive records")
    # query all torsiondrive records at the same time
    td_record_ids = list(map_record_id_entry_index.keys())
    torsiondrive_data = {}
    n_returned = 0
    for i, td_record in enumerate(client.query_procedures(id=td_record_ids), 1):
        n_returned = i
        entry_index, attributes = map_record_id_entry_index[td_record.id]
        print(f"{i:5d} : {entry_index:50s} status {td_record.status}")
        if td_record.status == 'COMPLETE':
            torsiondrive_data[entry_index] = {
                'initial_molecules': client.query_molecules(td_record.initial_molecule),
                'final_molecules': td_record.get_final_molecules(),
                'final_energies': td_record.get_final_energies(),
                # store each gradient as a numpy array, keyed like the final results
                'final_gradients': {gid: np.array(res.return_result) for gid, res in td_record.get_final_results().items()},
                'keywords': td_record.keywords.dict(),
                'attributes': attributes,
            }
    # Do not silently accept a short answer from the bulk query.
    # NOTE(review): presumably caused by a server-side query limit — confirm.
    if n_returned < len(td_record_ids):
        print(f"WARNING: only {n_returned} of {len(td_record_ids)} requested torsiondrive records were returned")
    print(f'Downloaded torsion drive data for {len(torsiondrive_data)} completed entries')
    # save as pickle file (skipped when the caller passes output_path=None)
    if output_path is not None:
        with open(output_path, 'wb') as pfile:
            pickle.dump(torsiondrive_data, pfile)
    return torsiondrive_data

In [4]:
# Source TorsionDrive collection to pull records from; the helper also
# pickles the downloaded data to disk.
dataset_name='OpenFF Theory Benchmarking Set B3LYP-D3BJ def2-TZVPD v1.0'
torsiondrive_data= download_torsiondrive_data(dataset_name)

Loading TorsionDrive Scans from [ OpenFF Theory Benchmarking Set B3LYP-D3BJ def2-TZVPD v1.0 ] spec [default]
Found 36 data entries
Found 36 torsiondrive records
    1 : 0                                                  status COMPLETE
    2 : 1                                                  status COMPLETE
    3 : 2                                                  status COMPLETE
    4 : 3                                                  status COMPLETE
    5 : 4                                                  status COMPLETE
    6 : 5                                                  status COMPLETE
    7 : 6                                                  status COMPLETE
    8 : 7                                                  status COMPLETE
    9 : 8                                                  status COMPLETE
   10 : 9                                                  status COMPLETE
   11 : 10                                                 status COMPLETE
   12 : 11    

In [5]:
from qcsubmit.factories import OptimizationDatasetFactory
# Factory created with no arguments; it is used below to build the new dataset.
factory = OptimizationDatasetFactory()

In [6]:
# Metadata for the NEW optimization dataset. Note that `dataset_name` is
# rebound here: from this cell on it no longer refers to the source
# TorsionDrive collection used for the download.
dataset_name="OpenFF Theory Benchmarking Set WB97X-V def2-TZVPD v1.0"
description="An optimization dataset for benchmarking wb97x-v/def2-TZVPD"
tagline="Optimized geometries for benchmarking wb97x-v/def2-TZVPD"
# The QC specification reuses the dataset description verbatim.
spec_description=description

In [7]:
# Start from an empty dataset (no molecules are passed to the factory).
dataset = factory.create_dataset(
    dataset_name=dataset_name,
    molecules=[],
    description=description,
    tagline=tagline,
)
# Replace the dataset's existing 'default' QC specification with
# wb97x-v/def2-TZVPD run in psi4, registered under the same spec name.
dataset.qc_specifications.pop('default')
dataset.add_qc_spec(
    program="psi4",
    method="wb97x-v",
    basis="def2-TZVPD",
    spec_name="default",
    spec_description=spec_description,
)

Deduplication                 : 0it [00:00, ?it/s]
Preparation                   : 0it [00:00, ?it/s]


In [8]:
from qcsubmit.datasets import DatasetEntry
from qcsubmit.constraints import Constraints
from openforcefield.topology import Molecule as OFFMolecule

# Build one dataset entry per downloaded torsiondrive: every final molecule
# of the scan becomes an initial molecule, and the scanned dihedral is frozen.
for idx, (entry_index, td_record) in enumerate(torsiondrive_data.items()):
    attributes = td_record["attributes"]
    # only the first dihedral listed in the torsiondrive keywords is used
    dihedrals = list(td_record['keywords']['dihedrals'][0])
    initial_molecules = list(td_record['final_molecules'].values())

    data_entry = DatasetEntry(
        index=idx,
        initial_molecules=initial_molecules,
        attributes=attributes,
        extras={},
        keywords={},
    )
    data_entry.add_constraint(
        constraint='freeze',
        constraint_type='dihedral',
        indices=dihedrals,
    )
    # entries are keyed by their running position, not by the source entry index
    dataset.dataset[idx] = data_entry

In [9]:
# Sanity check: show the dataset's molecule count as the cell output.
dataset.n_molecules

31

In [10]:
# Sanity check: show the dataset's record count as the cell output.
dataset.n_records

864

In [11]:
# Fill in submission metadata: link to the submission folder and the submitter name.
dataset.metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2020-11-25-theory-bm-set-wb97x-v-def2-tzvpd"
dataset.metadata.submitter = 'hyejang'

In [12]:
# Export the dataset to "dataset.json.bz2" in the current working directory.
dataset.export_dataset("dataset.json.bz2")

In [13]:
# Write the dataset's molecules to a file in "smi" format.
dataset.molecules_to_file("theory-bm-set-curated.smi", "smi")
# export the molecules to pdf with torsions highlighted
# NOTE(review): toolkit='openeye' presumably requires an OpenEye installation/licence — confirm.
dataset.visualize("theory-bm-set-curated.pdf", toolkit='openeye')