# QCSubmit protein prep

Instructions from David Cerutti
The way it's set up, we are scanning phi and psi of the central residue with a random selection of ALA, GLY, SER, or VAL to the N- or C-terminus, and the customary ACE and NME blocking groups outside of that.  We are targeting B3LYP, def2-dzvpp with Becke-Johnson D3 corrections for the optimization and thereafter to use B3LYP, ma-def2-tzvpp for the single point calculation getting the energy and gradient.  That is "minimally augmented" def2-triple zeta / PP.  These calculations on tetrapeptides of this sort tend to take about 4-8 hours on one CPU with ORCA, so we'll see how the QC infrastructure does with them.

This notebook creates the full dataset as version 2, since we have already run the initial test.

Here every molecule mol2 file has a corresponding frozen-dihedral selection file (`.frz`), and both must be read and used when making the dataset.

The mol2 files also have incorrect bond orders, so we MUST RE-SAVE EACH FILE AS PDB SO WE CAN GET THE CORRECT BOND ORDERS!

In [1]:
from qcsubmit.factories import OptimizationDatasetFactory
from qcsubmit.datasets import OptimizationDataset
import os
from openforcefield.topology import Molecule
from openforcefield.utils.toolkits import RDKitToolkitWrapper


In [2]:
# now we need to loop through the folders and load the molecules and the frozen dihedrals list
# let's make sure that the dihedrals are the same in every file
def gather_restraints(folder):
    """
    Create a set of all of the restraints in the given folder.

    Every file in ``folder`` whose name ends with ``.frz`` is read. The first
    line of each file is skipped and, for every remaining non-blank line, the
    first four whitespace-separated fields are converted to ``int`` and stored
    as a tuple.

    Parameters
    ----------
    folder : str
        Path of the directory to scan.

    Returns
    -------
    set of tuple of int
        The unique index tuples found across all ``.frz`` files in the folder.

    Raises
    ------
    ValueError
        If one of the first four fields of a data line is not an integer.
    """
    constraints = set()
    for file in os.listdir(folder):
        if not file.endswith(".frz"):
            continue
        # this is a constraint file
        with open(os.path.join(folder, file)) as cons_data:
            # skip the first line (presumably a header); the default keeps an
            # empty file from raising StopIteration
            next(cons_data, None)
            for line in cons_data:
                fields = line.split()
                if not fields:
                    # a blank line (e.g. trailing newline) must not be stored
                    # as an empty constraint tuple
                    continue
                constraints.add(tuple(int(x) for x in fields[:4]))

    return constraints


                    

In [3]:
# we need a function to get all of the conformers from the folder and make an openff molecule
def gather_conformers(folder):
    """
    Build a single multi-conformer molecule from the mol2 files in a folder.

    Each ``*.mol2`` file is loaded, written back out next to it as a ``.pdb``
    file using the RDKit toolkit wrapper, and then re-read from that PDB file.
    The last molecule read becomes the returned molecule and the first
    conformer of every other molecule is added to it.

    Note that this writes one ``.pdb`` file into ``folder`` per mol2 file, and
    raises ``IndexError`` when the folder contains no mol2 files.
    """
    loaded = []
    for entry in os.listdir(folder):
        if not entry.endswith(".mol2"):
            continue
        # everything before the first dot is used as the PDB file stem
        stem = entry.split(".")[0]
        pdb_path = os.path.join(folder, stem + ".pdb")
        original = Molecule.from_file(os.path.join(folder, entry))
        # fix issue with openeye reordering when writing files.
        original.to_file(pdb_path, "pdb", RDKitToolkitWrapper())
        # read the freshly written PDB back in and keep that version
        loaded.append(Molecule.from_file(pdb_path))

    # condense all conformers onto the last molecule that was read
    combined = loaded.pop()
    for other in loaded:
        combined.add_conformer(other.conformers[0])

    return combined

In [11]:
# now we need to loop over all of the folders and for each one gather the molecule and all of its conformers
# and all of the constraints and add them to the dataset.
protein_dataset = OptimizationDataset(dataset_name="OpenFF Protein Fragments v2.0", dataset_tagline="Constrained optimization of various protein fragments.")
factory = OptimizationDatasetFactory()
for folder in os.listdir("Input_files"):
    folder_path = os.path.join("Input_files", folder)
    if not os.path.isdir(folder_path):
        # report and skip anything at the top level that is not a folder
        print(folder)
        continue
    for combination in os.listdir(folder_path):
        combination_path = os.path.join(folder_path, combination)
        print("collecting from folder", combination_path)
        # skip the tarballs, and also any other entry that is not a directory:
        # the gather_* helpers call os.listdir on this path and would fail on a plain file
        if ".tar" in combination_path or not os.path.isdir(combination_path):
            continue
        txt_constraints = gather_restraints(combination_path)
        conformers = gather_conformers(combination_path)
        # create the data needed for the optimization
        index = combination.lower()
        attributes = factory.create_cmiles_metadata(conformers)
        # make the constraints for geometric: one frozen dihedral per index tuple
        geometric_constraints = [
            {"type": "dihedral", "indices": list(constraint)}
            for constraint in txt_constraints
        ]
        keywords = {"constraints": {"freeze": geometric_constraints}}
        protein_dataset.add_molecule(index=index, molecule=conformers, attributes=attributes, keywords=keywords)


protein_dataset

.DS_Store
collecting from folder Input_files/CYS/GLY_CYS_ALA
collecting from folder Input_files/CYS/GLY_CYS_VAL
collecting from folder Input_files/CYS/GLY_CYS_GLY
collecting from folder Input_files/CYS/GLY_CYS_SER
collecting from folder Input_files/CYS/SER_CYS_VAL
collecting from folder Input_files/CYS/SER_CYS_GLY
collecting from folder Input_files/CYS/SER_CYS_ALA
collecting from folder Input_files/CYS/SER_CYS_SER
collecting from folder Input_files/CYS/VAL_CYS_VAL
collecting from folder Input_files/CYS/ConfCYS.tar.gz
collecting from folder Input_files/CYS/VAL_CYS_GLY
collecting from folder Input_files/CYS/ALA_CYS_SER
collecting from folder Input_files/CYS/VAL_CYS_ALA
collecting from folder Input_files/CYS/ALA_CYS_VAL
collecting from folder Input_files/CYS/ALA_CYS_GLY
collecting from folder Input_files/CYS/VAL_CYS_SER
collecting from folder Input_files/CYS/ALA_CYS_ALA
collecting from folder Input_files/ASP/GLY_ASP_ALA




collecting from folder Input_files/ASP/GLY_ASP_GLY




collecting from folder Input_files/ASP/GLY_ASP_VAL




collecting from folder Input_files/ASP/GLY_ASP_SER




collecting from folder Input_files/ASP/VAL_ASP_GLY
collecting from folder Input_files/ASP/VAL_ASP_VAL




collecting from folder Input_files/ASP/VAL_ASP_ALA




collecting from folder Input_files/ASP/ConfASP.tar.gz
collecting from folder Input_files/ASP/ALA_ASP_SER




collecting from folder Input_files/ASP/ALA_ASP_GLY
collecting from folder Input_files/ASP/ALA_ASP_VAL
collecting from folder Input_files/ASP/ALA_ASP_ALA




collecting from folder Input_files/ASP/VAL_ASP_SER




collecting from folder Input_files/ASP/SER_ASP_GLY
collecting from folder Input_files/ASP/SER_ASP_VAL
collecting from folder Input_files/ASP/SER_ASP_ALA
collecting from folder Input_files/ASP/SER_ASP_SER




collecting from folder Input_files/ARG/ConfARG.tar.gz
collecting from folder Input_files/ARG/GLY_ARG_SER




collecting from folder Input_files/ARG/GLY_ARG_VAL




collecting from folder Input_files/ARG/GLY_ARG_GLY




collecting from folder Input_files/ARG/GLY_ARG_ALA




collecting from folder Input_files/ARG/VAL_ARG_SER




collecting from folder Input_files/ARG/ALA_ARG_ALA




collecting from folder Input_files/ARG/ALA_ARG_VAL




collecting from folder Input_files/ARG/ALA_ARG_GLY
collecting from folder Input_files/ARG/ALA_ARG_SER




collecting from folder Input_files/ARG/VAL_ARG_ALA
collecting from folder Input_files/ARG/VAL_ARG_VAL




collecting from folder Input_files/ARG/VAL_ARG_GLY
collecting from folder Input_files/ARG/SER_ARG_SER




collecting from folder Input_files/ARG/SER_ARG_ALA
collecting from folder Input_files/ARG/SER_ARG_VAL
collecting from folder Input_files/ARG/SER_ARG_GLY
collecting from folder Input_files/GLU/GLY_GLU_GLY
collecting from folder Input_files/GLU/GLY_GLU_VAL
collecting from folder Input_files/GLU/GLY_GLU_ALA
collecting from folder Input_files/GLU/ConfGLU.tar.gz
collecting from folder Input_files/GLU/VAL_GLU_ALA




collecting from folder Input_files/GLU/VAL_GLU_GLY
collecting from folder Input_files/GLU/VAL_GLU_VAL




collecting from folder Input_files/GLU/ALA_GLU_ALA
collecting from folder Input_files/GLU/ALA_GLU_GLY
collecting from folder Input_files/GLU/ALA_GLU_VAL
collecting from folder Input_files/GLN/ALA_GLN_GLY
collecting from folder Input_files/GLN/ALA_GLN_VAL
collecting from folder Input_files/GLN/ALA_GLN_ALA
collecting from folder Input_files/GLN/VAL_GLN_SER
collecting from folder Input_files/GLN/VAL_GLN_GLY
collecting from folder Input_files/GLN/VAL_GLN_VAL
collecting from folder Input_files/GLN/VAL_GLN_ALA
collecting from folder Input_files/GLN/ALA_GLN_SER
collecting from folder Input_files/GLN/SER_GLN_SER
collecting from folder Input_files/GLN/SER_GLN_GLY
collecting from folder Input_files/GLN/SER_GLN_VAL
collecting from folder Input_files/GLN/SER_GLN_ALA
collecting from folder Input_files/GLN/GLY_GLN_SER
collecting from folder Input_files/GLN/ConfGLN.tar.gz
collecting from folder Input_files/GLN/GLY_GLN_ALA
collecting from folder Input_files/GLN/GLY_GLN_GLY
collecting from folder Input

OptimizationDataset(qc_specifications={'default': QCSpec(method='B3LYP-D3BJ', basis='DZVP', program='psi4', spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', store_wavefunction=<WavefunctionProtocolEnum.none: 'none'>)}, dataset_name='OpenFF Protein Fragments v2.0', dataset_tagline='Constrained optimization of various protein fragments.', dataset_type='OptimizationDataset', maxiter=200, driver=<DriverEnum.gradient: 'gradient'>, scf_properties=['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices'], priority='normal', description='An optimization dataset using geometric.', dataset_tags=['openff'], compute_tag='openff', metadata=Metadata(submitter='joshuahorton', creation_date=datetime.date(2020, 8, 12), collection_type='OptimizationDataset', dataset_name='OpenFF Protein Fragments v2.0', short_description='Constrained optimization of various protein fragments.', long_description_url=None, long_description='An optimization dat

In [12]:
# display the dataset's n_molecules value (185 in the recorded run)
protein_dataset.n_molecules

185

In [13]:
# display the dataset's n_records value (6716 in the recorded run)
protein_dataset.n_records

6716

In [21]:
# display the element symbols stored in the dataset metadata (C, H, N, O, S in the recorded run)
protein_dataset.metadata.elements

{'C', 'H', 'N', 'O', 'S'}

In [22]:
# presumably writes an overview of the dataset molecules to protein_dataset.pdf in 3 columns - confirm against the qcsubmit docs
protein_dataset.visualize("protein_dataset.pdf", columns=3)

In [23]:
# manual provenance fix
# openeye is imported here only so its version can be recorded below
import openeye
# NOTE(review): this URL points at the "2020-07-06-OpenFF-Protein-Fragments-Initial" folder while the
# dataset is named "OpenFF Protein Fragments v2.0" - confirm this is the intended submission folder
protein_dataset.metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/2020-07-06-OpenFF-Protein-Fragments-Initial"
# replace the provenance with the one reported by the factory, then add the openeye version by hand;
# the order matters because the assignment below would discard an earlier "openeye" entry
protein_dataset.provenance = factory.provenance()
protein_dataset.provenance["openeye"] = openeye.__version__

In [24]:
# presumably serialises the finished dataset to protein_dataset.json - confirm against the qcsubmit docs
protein_dataset.export_dataset("protein_dataset.json")

In [25]:
# NOTE(review): this calls a private (underscore-prefixed) method; its output is not captured in this export
protein_dataset._get_missing_basis_coverage()