Using QCSubmit to make an OptimizationDataset from the starting PDBs.

In [1]:
from qcsubmit.datasets import OptimizationDataset
from qcsubmit.serializers import deserialize
from qcsubmit.constraints import Constraints
from qcsubmit.factories import OptimizationDatasetFactory
from openforcefield.topology import Molecule



In [2]:
# create the dataset, giving it a name, a short tagline and a longer description
dataset = OptimizationDataset(
    dataset_name="OpenFF-Disaccharides-v1.0",
    dataset_tagline="Constrained Disaccharide Optimizations",
    description="Constrained Disaccharide Optimizations for force field development generated by mdgx.",
)

In [3]:
# record who is submitting the dataset in its metadata
dataset.metadata.submitter = "David Cerutti"

In [4]:
# display the dataset so the settings chosen so far can be checked
dataset

OptimizationDataset(qc_specifications={'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>)}, dataset_name='OpenFF-Disaccharides-v1.0', dataset_tagline='Constrained Disaccharide Optimizations', dataset_type='OptimizationDataset', maxiter=200, driver=<DriverEnum.gradient: 'gradient'>, scf_properties=['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices'], priority='normal', description='Constrained Disaccharide Optimizations for force field development generated by mdgx.', dataset_tags=['openff'], compute_tag='openff', metadata=Metadata(submitter='David Cerutti', creation_date=datetime.date(2020, 8, 11), collection_type='OptimizationDataset', dataset_name='OpenFF-Disaccharides-v1.0', short_description='Constrained Disaccharide Optimizations', long_description_url=None, long_description='Constrained D

Now let's look at how we could make a dataset entry using the input PDBs and a constraints JSON file.
Here all of the data is in the Input_data folder, which contains an input_pdb file that can be used to generate the CMILES data we need. We also have a constraints JSON file and the dataset.json file, which in this case we use to extract the starting geometry.

It would probably be better to supply the constraints file and a final geometry PDB file.

## IMPORTANT
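The geometry in the JSON is in angstroms, not bohr, and must be converted! The function below handles this by attaching angstrom units to the coordinates when it builds the conformer.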


In [5]:
# lets make a function to process each molecule
import os
from simtk import unit
import numpy as np

def add_molecule_to_dataset(dataset, folder_path):
    """
    For the given folder path process the input molecule into the optimization dataset with valid constraints.

    The folder must contain:

    * ``constraints.json`` - the constraints, read from ``["keywords"]["constraints"]``;
    * a ``.pdb`` file - used to build the molecule (its own conformer is discarded);
    * one other ``.json`` file (no "constraints" in its name) - supplies the entry
      index and the starting geometry, read from the first value of ``["dataset"]``.

    Parameters
    ----------
    dataset
        The optimization dataset to add the entry to. It is modified in place.
    folder_path : str
        Path to the folder holding the input files for a single molecule.

    Raises
    ------
    FileNotFoundError
        If the folder has no ``.pdb`` file or no geometry ``.json`` file.
    ValueError
        If the geometry ``.json`` file has an empty ``"dataset"`` section.
    """
    print(folder_path)

    factory = OptimizationDatasetFactory()

    # get the constraints
    constraint_data = deserialize(os.path.join(folder_path, "constraints.json"))
    # create a new constraint
    constraint = Constraints()
    for const_type, constraints in constraint_data["keywords"]["constraints"].items():
        if const_type == "freeze":
            for constraint_info in constraints:
                constraint.add_freeze_constraint(
                    constraint_type=constraint_info["type"],
                    indices=constraint_info["indices"],
                )
        else:
            # every non-"freeze" group is treated as a "set" constraint with a target value
            for constraint_info in constraints:
                constraint.add_set_constraint(
                    constraint_type=constraint_info["type"],
                    indices=constraint_info["indices"],
                    value=constraint_info["value"],
                )

    # get the molecule data
    mol = None
    mol_data = None
    # os.listdir returns entries in arbitrary order; sort so the choice of file is
    # reproducible if a folder ever holds more than one candidate
    for file in sorted(os.listdir(folder_path)):
        if file.endswith(".pdb"):
            # get the input molecule
            mol = Molecule.from_file(os.path.join(folder_path, file))
            # remove the input conformer, the starting geometry comes from the json instead
            mol._conformers = []
        elif file.endswith(".json") and "constraints" not in file:
            # this has the coords and name
            mol_data = deserialize(os.path.join(folder_path, file))

    # fail with a clear message rather than a NameError further down
    if mol is None:
        raise FileNotFoundError("No .pdb file found in {}".format(folder_path))
    if mol_data is None:
        raise FileNotFoundError("No geometry .json file found in {}".format(folder_path))
    if not mol_data["dataset"]:
        raise ValueError("The geometry .json file in {} contains no dataset entries".format(folder_path))

    # now we should have everything we need
    # make the new geometry and get the name, only one mol per json
    mol_json = next(iter(mol_data["dataset"].values()))
    index = mol_json["index"]
    # the json geometry is a flat list of coordinates in angstroms
    coordinates = np.array(mol_json["initial_molecules"][0]["geometry"]).reshape((mol.n_atoms, 3))
    mol.add_conformer(unit.Quantity(coordinates, unit.angstrom))
    assert mol.n_conformers == 1

    # now add the molecule
    attributes = factory.create_cmiles_metadata(mol)
    dataset.add_molecule(index=index, molecule=mol, attributes=attributes, constraints=constraint)
    

In [6]:
# now for each folder lets add the molecule to the dataset
# os.listdir returns names in arbitrary order, so sort to keep the entry order reproducible
for folder in sorted(os.listdir("Input_data")):
    folder_path = os.path.join("Input_data", folder)
    # skip notebook checkpoint folders and any stray files that are not molecule folders
    if "ipynb" not in folder and os.path.isdir(folder_path):
        add_molecule_to_dataset(dataset, folder_path)

dataset

Input_data/DAllpa1-2DAllpa1-OME
Input_data/DAllpa1-2DAllpb1-OME


OptimizationDataset(qc_specifications={'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>)}, dataset_name='OpenFF-Disaccharides-v1.0', dataset_tagline='Constrained Disaccharide Optimizations', dataset_type='OptimizationDataset', maxiter=200, driver=<DriverEnum.gradient: 'gradient'>, scf_properties=['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices'], priority='normal', description='Constrained Disaccharide Optimizations for force field development generated by mdgx.', dataset_tags=['openff'], compute_tag='openff', metadata=Metadata(submitter='David Cerutti', creation_date=datetime.date(2020, 8, 11), collection_type='OptimizationDataset', dataset_name='OpenFF-Disaccharides-v1.0', short_description='Constrained Disaccharide Optimizations', long_description_url=None, long_description='Constrained D

In [7]:
# sanity check: how many molecules the dataset now holds
dataset.n_molecules

2

In [8]:
# sanity check: how many records the dataset now holds
dataset.n_records

2

In [9]:
# the elements recorded in the dataset metadata
dataset.metadata.elements

{'C', 'H', 'O'}

In [108]:
# export the finished dataset to a file named dataset.json
dataset.export_dataset("dataset.json")

In [14]:
# build a molecule from a mapped SMILES string and give it a single generated conformer;
# it is used to look up the matching dataset entry
mol = Molecule.from_mapped_smiles("[H:32][C@:31]1([C@:33]([C@@:35]([O:42][C@:27]([C@:29]1([H:30])[O:47][H:48])([H:28])[O:26][C@@:8]2([C@:10]([C@:12]([C@@:14]([O:21][C@:6]2([H:7])[O:5][C:2]([H:1])([H:3])[H:4])([H:15])[C:16]([H:17])([H:18])[O:19][H:20])([H:13])[O:22][H:23])([H:11])[O:24][H:25])[H:9])([H:36])[C:37]([H:38])([H:39])[O:40][H:41])([H:34])[O:43][H:44])[O:45][H:46]")
mol.generate_conformers(n_conformers=1)

In [15]:
# display the molecule built from the mapped SMILES
mol

NGLWidget()

In [16]:
# look up which dataset entries correspond to this molecule
entry = dataset.get_molecule_entry(mol)

In [17]:
# show the result of the lookup
entry

['OME_2NA_0NA']