Using QCSubmit to make an OptimizationDataset from the starting PDBS and JSON.

In [50]:
from qcsubmit.datasets import OptimizationDataset
from qcsubmit.serializers import deserialize
from qcsubmit.constraints import Constraints
from qcsubmit.factories import OptimizationDatasetFactory
from openforcefield.topology import Molecule
from openforcefield.utils.toolkits import RDKitToolkitWrapper

In [51]:
# make the dataset and set the name and info
dataset = OptimizationDataset(dataset_name="OpenFF-Disaccharides-v1.0", dataset_tagline="Constrained Disaccharide Optimizations", description="Constrained Disaccharide Optimizations for force field development generated by mdgx.")

In [52]:
dataset.metadata.submitter = "David Cerutti"

In [53]:
dataset

OptimizationDataset(qc_specifications={'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>)}, dataset_name='OpenFF-Disaccharides-v1.0', dataset_tagline='Constrained Disaccharide Optimizations', dataset_type='OptimizationDataset', maxiter=200, driver=<DriverEnum.gradient: 'gradient'>, scf_properties=['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices'], priority='normal', description='Constrained Disaccharide Optimizations for force field development generated by mdgx.', dataset_tags=['openff'], compute_tag='openff', metadata=Metadata(submitter='David Cerutti', creation_date=datetime.date(2020, 8, 24), collection_type='OptimizationDataset', dataset_name='OpenFF-Disaccharides-v1.0', short_description='Constrained Disaccharide Optimizations', long_description_url=None, long_description='Constrained D

Now lets look at how we could make a dataset entry using the input JSON which contains the conectivity, atoms, constraints and coordinates in bohr.
Here all of the data is in the First1000 folder which contains a folder for each molecule which has the starting geometry in JSON format. 

## Steps:
- Extract the molecule information from a JSON sybmols, conectivity, coords
- Build an openFF molecule
- Write out a PDB file
- Read the file back to triger seterochem and bonding checks in openeye
- add all geometries
- add to the dataset

It would probably be better to supply the input PDB files to ensure correct CMILES.



In [67]:
# lets make a function to process each molecule
import os
from simtk import unit
import numpy as np

def extract_molecule_data(molecule_data):
    """
    Given the molecule data dict extract only the inportant bits.
    Here we assume there is one conformation per JSON.
    """
    name = list(molecule_data["dataset"].keys())[0]
    molecule = molecule_data["dataset"][name]["initial_molecules"][0]
    return name, molecule

def build_off_mol(name, molecule_data):
    """
    Build an off molecule from the name and input data.
    """
    atom_map = {"H": 1, "C": 6, "O": 8, "N":7}
    off_mol = Molecule()
    off_mol.name = name
    # add atoms
    for atom in molecule_data["symbols"]:
        off_mol.add_atom(atom_map[atom], 0, False)
    # add bonds
    for bond in molecule_data["connectivity"]:
        off_mol.add_bond(*bond, False)
    # add geometry
    conformer = unit.Quantity(np.array(molecule_data["geometry"]).reshape(-1, 3), unit.bohr)
    off_mol.add_conformer(conformer)
    
    return off_mol
    

def add_molecule_to_dataset(dataset, folder_path):
    """
    For the given folder path process the input molecule into the optimization dataset with valid constraints.
    """
    print(folder_path)
    molecules = []
    factory = OptimizationDatasetFactory()
    
    for file in os.listdir(folder_path):
        if file.endswith(".json"):
            # get the input data
            molecule_data = deserialize(os.path.join(folder_path, file))
            name, mol_data = extract_molecule_data(molecule_data)
            molecules.append(build_off_mol(name, mol_data)) 
#             # create a new constraint
#             constraint = Constraints()
#             for const_type, constraints in constraint_data["keywords"]["constraints"].items():
#                 if const_type == "freeze":
#                     for constraint_info in constraints:
#                         constraint.add_freeze_constraint(constraint_type=constraint_info["type"], indices=constraint_info["indices"])
#                 else:
#                     for constraint_info in constraints:
#                         constraint.add_set_constraint(constraint_type=constraint_info["type"], indices=constraint_info["indices"], value=constraint_info["value"])
            
    # get the molecule data as pdb
    molecule = molecules.pop()
    file_name = os.path.join(folder_path, f"{molecule.name}.pdb")
    molecule.to_file(file_name, "pdb", toolkit_registry=RDKitToolkitWrapper())
    # now read back
    final_mol = Molecule.from_file(file_name)
    # get the CMILES
    attributes = factory.create_cmiles_metadata(final_mol)
    # now add all conformers
    for conformer in molecules:
        final_mol.add_conformer(conformer._conformers[0])
    dataset.add_molecule(index=final_mol.name, molecule=final_mol, attributes=attributes)

            
            
#         elif file.endswith(".json") and "constraints" not in file:
#             # this has the coords and name
#             mol_data = deserialize(os.path.join(folder_path, file))
            
    # now we should have everything we need
    # make the new geometry and get the name, only one mol per json
#     for mol_json in mol_data["dataset"].values():
#         index = mol_json["index"]
#         conformer = unit.Quantity(np.array(mol_json["initial_molecules"][0]["geometry"]).reshape((mol.n_atoms, 3)), unit.angstrom)
#         mol.add_conformer(conformer)
#         break
#     assert mol.n_conformers == 1
#     # now add the molecule
#     attributes = factory.create_cmiles_metadata(mol)
#     dataset.add_molecule(index=index, molecule=mol, attributes=attributes, constraints=constraint)
    

In [68]:
# now for each folder lets add the molecule to the dataset
for folder in os.listdir("First1000"):
    if "ipynb" not in folder and os.path.isdir(os.path.join("First1000", folder)):
        add_molecule_to_dataset(dataset, os.path.join("First1000",folder))
    
dataset

First1000/All_Sor
First1000/All_Psi
First1000/All_Xyl
First1000/All_Glc
First1000/All_Alt
First1000/All_Fuc
First1000/All_Tag
First1000/All_Gal
First1000/All_Qui
First1000/All_Ara
First1000/All_All
First1000/All_Lyx
First1000/All_Rha
First1000/All_Tal
First1000/All_Man
First1000/All_Fru
First1000/All_Rib
First1000/All_Gul


OptimizationDataset(qc_specifications={'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>)}, dataset_name='OpenFF-Disaccharides-v1.0', dataset_tagline='Constrained Disaccharide Optimizations', dataset_type='OptimizationDataset', maxiter=200, driver=<DriverEnum.gradient: 'gradient'>, scf_properties=['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices'], priority='normal', description='Constrained Disaccharide Optimizations for force field development generated by mdgx.', dataset_tags=['openff'], compute_tag='openff', metadata=Metadata(submitter='David Cerutti', creation_date=datetime.date(2020, 8, 24), collection_type='OptimizationDataset', dataset_name='OpenFF-Disaccharides-v1.0', short_description='Constrained Disaccharide Optimizations', long_description_url=None, long_description='Constrained D

In [69]:
dataset.n_molecules

18

In [70]:
dataset.n_records

966

In [59]:
dataset.metadata.elements

{'C', 'H', 'O'}

In [71]:
dataset.export_dataset("dataset.json")

In [72]:
dataset.visualize("Disaccharide.pdf")

In [74]:
dataset.molecules_to_file("Disaccharide.smi", "smi")

In [60]:
mol = Molecule.from_mapped_smiles("[H:42][C@@:41]1([C@@:37]([C@:30]([O:29][C@@:27]([C@@:45]1([H:46])[O:47][H:48])([H:28])[O:26][C@:16]2([C@:12]([C@@:6]([O:25][C@@:18]2([H:19])[C:20]([H:21])([H:22])[O:23][H:24])([C:7]([H:8])([H:9])[O:10][H:11])[O:5][C:2]([H:1])([H:3])[H:4])([H:13])[O:14][H:15])[H:17])([H:31])[C:32]([H:33])([H:34])[O:35][H:36])([H:38])[O:39][H:40])[O:43][H:44]")
mol.generate_conformers(n_conformers=1)

In [61]:
mol

NGLWidget()

In [62]:
entry = dataset.get_molecule_entry(mol)

In [63]:
entry

['OME_4bU_0nA']

In [64]:
molecules= list(dataset.molecules)

In [65]:
molecules[6]

NGLWidget(max_frame=47)

In [66]:
for molecule in dataset.molecules:
    print(dataset.get_molecule_entry(molecule))

['OME_4bU_0nA']
['OME_4pU_0NB']
['OME_2xU_0NB']
['OME_3GB_0NB']
['OME_2EB_0NB']
['OME_3fA_0NB']
['OME_4JD_0nB']
['OME_2LB_0nB']
['OME_4QB_0nB']
['OME_3AU_0NB']
['OME_2nA_0NA']
['OME_2dU_0NA']
['OME_3HB_0NB']
['OME_4tA_0NB']
['OME_4MA_0nA']
['OME_3CU_0nB']
['OME_2rU_0NB']
['OME_2KB_0NB']
