In [1]:
import os, json, tqdm
from qcsubmit.factories import TorsiondriveDatasetFactory
from qcsubmit.datasets import TorsiondriveDataset
from qcsubmit import workflow_components 
from qcsubmit.common_structures import TorsionIndexer 
from openforcefield.topology import Molecule as OFFMolecule

In [2]:
# Build the torsiondrive factory and choose the SCF properties to request.
factory = TorsiondriveDatasetFactory()
factory.scf_properties = [
    'dipole',
    'quadrupole',
    'wiberg_lowdin_indices',
    'mayer_indices',
]

# Conformer generation step, capped at 5 conformers.
# TODO: also filter out methyl group rotations once qcsubmit supports that feature.
conformer_generator = workflow_components.StandardConformerGenerator(
    max_conformers=5,
)
factory.add_workflow_component(conformer_generator)

In [3]:
# Write the factory settings configured above to a YAML file next to the notebook.
factory.export_settings("theory-bm-set_settings.yaml")

In [4]:
# Create the dataset from 'selected_mol2s', which contains mol2 files of the
# additional charged molecules.
dataset = factory.create_dataset(
    dataset_name="OpenFF Theory Benchmarking Set v1.0",
    molecules='selected_mol2s',
    description="A torsiondrive dataset for theory benchmarking",
    tagline="Torsiondrives for theory benchmarking",
    verbose=False)

# Remove the 'default' spec; one explicitly named spec per basis set is added below.
dataset.qc_specifications.pop('default')

# One psi4 specification per basis set, all with the same method.
method = "B3LYP-D3BJ"
basis_sets = ['dzvp', 'def2-tzvp', 'def2-tzvpd', 'def2-tzvpp', 'def2-tzvppd', 'def2-qzvp', '6-31+gss', '6-311+gss']
for basis in basis_sets:
    # Build the strings without trailing whitespace: whatever is passed as
    # spec_name is what add_qc_spec receives, including any stray space.
    spec_name = f"{method}/{basis.upper()}"
    spec_description = f"A torsiondrive dataset for benchmarking {spec_name}"
    print(f'spec_name: {spec_name}, spec_description: {spec_description}')
    dataset.add_qc_spec(method=method,
                        basis=basis,
                        program="psi4",
                        spec_name=spec_name,
                        spec_description=spec_description)

Deduplication                 : 100%|████████████| 9/9 [00:00<00:00, 629.99it/s]
spec_name: B3LYP-D3BJ/DZVP , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/DZVP 
spec_name: B3LYP-D3BJ/DEF2-TZVP , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/DEF2-TZVP 
spec_name: B3LYP-D3BJ/DEF2-TZVPD , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/DEF2-TZVPD 
spec_name: B3LYP-D3BJ/DEF2-TZVPP , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/DEF2-TZVPP 
spec_name: B3LYP-D3BJ/DEF2-TZVPPD , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/DEF2-TZVPPD 
spec_name: B3LYP-D3BJ/DEF2-QZVP , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/DEF2-QZVP 
spec_name: B3LYP-D3BJ/6-31+GSS , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/6-31+GSS 
spec_name: B3LYP-D3BJ/6-311+GSS , spec_description: A torsiondrive dataset for benchmarking B3LYP-D3BJ/6-311+GSS 


In [5]:
# Report the dataset size at this point (only the molecules from the factory run).
print(
    f'Number of additional charged molecules: {dataset.n_molecules}',
    f'Number of torsions generated from the additional set: {dataset.n_records}',
    sep='\n',
)

Number of additional charged molecules: 9
Number of torsions generated from the additional set: 23


In [6]:
# Load the pre-selected torsions. The encoding is pinned to UTF-8 so that
# decoding the JSON file does not depend on the platform's default encoding.
with open('input_torsions.json', encoding='utf-8') as infile:
    selected_torsions = json.load(infile)

In [7]:
# Add every torsion loaded from input_torsions.json to the dataset.
# `output` collects one "<idx>: <n_conformers>" string per torsion for manual inspection.
output = []
for idx, (canonical_torsion_index, torsion_data) in enumerate(tqdm.tqdm(selected_torsions.items())):
    attributes = torsion_data["attributes"]
    torsion_atom_indices = torsion_data["atom_indices"]
    # TODO: OFFMolecule.from_qcschema(torsion_data, client=client) did not work
    # for some reason; needs investigation.
    molecule = OFFMolecule.from_qcschema(torsion_data)
    # Same conformer count as the max_conformers=5 used for the factory's generator.
    molecule.generate_conformers(n_conformers=5)
    output.append(f'{idx}: {molecule.n_conformers}')
    # NOTE(review): the entry index is the enumeration counter, not
    # canonical_torsion_index -- confirm this is the intended key.
    dataset.add_molecule(index=idx, molecule=molecule, attributes=attributes, dihedrals=torsion_atom_indices)

100%|██████████| 36/36 [00:05<00:00,  6.00it/s]


In [8]:
# Report the dataset size after the torsions from input_torsions.json were added.
print(
    f'Number of total molecules: {dataset.n_molecules}',
    f'Number of total 1-D torsions: {dataset.n_records}',
    sep='\n',
)

Number of total molecules: 40
Number of total 1-D torsions: 59


In [9]:
from openeye import oechem
import numpy as np

# Molecular weight of every molecule in the dataset, computed with OpenEye.
masses = [
    oechem.OECalculateMolecularWeight(entry.to_openeye())
    for entry in dataset.molecules
]

# Convert once and reuse the array for both summary statistics.
mass_array = np.array(masses)
print(f'Mean molecular weight: {np.mean(mass_array):.2f}')
print(f'Max molecular weight: {np.amax(mass_array):.2f}')

Mean molecular weight: 168.00
Max molecular weight: 233.29


In [10]:
# Display the elements recorded in the dataset metadata (shown as the cell output).
dataset.metadata.elements

{'C', 'Cl', 'F', 'H', 'N', 'O', 'P', 'S'}

In [11]:
# Fill in the submission metadata: who submitted it and where the long description lives.
dataset.metadata.submitter = 'hyejang'
dataset.metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2020-12-14-theory-bm-torsiondrive-set-v1.0"

In [12]:
# Export the dataset to "dataset.json.bz2"; the plain "dataset.json" copy is
# written by a separate cell.
dataset.export_dataset("dataset.json.bz2")

In [13]:
# Also export the dataset to "dataset.json".
dataset.export_dataset("dataset.json")

In [14]:
# Write the dataset molecules to a file, passing "smi" as the file type.
dataset.molecules_to_file("theory-bm-set-curated.smi", "smi")

In [15]:
# Export the molecules to a PDF with the torsions highlighted, using the
# OpenEye toolkit for rendering.
dataset.visualize("theory-bm-set-curated.pdf", toolkit='openeye')