# QCSubmit protein prep

Instructions from David Cerutti
The way it's set up, we are scanning phi and psi of the central residue with a random selection of ALA, GLY, SER, or VAL to the N- or C-terminus, and the customary ACE and NME blocking groups outside of that.  We are targeting B3LYP, def2-dzvpp with Becke-Johnson D3 corrections for the optimization and thereafter to use B3LYP, ma-def2-tzvpp for the single point calculation getting the energy and gradient.  That is "minimally augmented" def2-triple zeta / PP.  These calculations on tetrapeptides of this sort tend to take about 4-8 hours on one CPU with ORCA, so we'll see how the QC infrastructure does with them.

This first notebook creates version 1 of the dataset, with only one central residue, for the frozen (constrained) optimizations. Once those are complete, we can use QCSubmit to pull the results and create a single-point dataset.

Here every molecule mol2 file has a corresponding frozen-dihedral selection file (`.frz`), and both must be read and used when making the dataset.

In [57]:
from qcsubmit.factories import OptimizationDatasetFactory
from qcsubmit.datasets import OptimizationDataset
import os
from openforcefield.topology import Molecule


In [58]:
# now we need to loop through the folders and load the molecules and the frozen dihedrals list
# let's make sure that the dihedrals are the same in every file
def gather_restraints(folder):
    """
    Create a set of all of the restraints in the given folder.

    Every file in ``folder`` whose name ends with ``.frz`` is read. The first
    line of each file is skipped, and the first four whitespace-separated
    fields of every remaining non-blank line are parsed as integers.

    Parameters
    ----------
    folder : str
        Path of the directory to scan for ``.frz`` files.

    Returns
    -------
    set of tuple of int
        The unique 4-tuples found across all ``.frz`` files in the folder.
        Empty if the folder holds no ``.frz`` files.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than four fields, or if one of its
        first four fields is not an integer.
    """
    constraints = set()
    for file_name in os.listdir(folder):
        if not file_name.endswith(".frz"):
            continue
        # this is a constraint file
        path = os.path.join(folder, file_name)
        with open(path) as cons_data:
            # the first line is never parsed (presumably a header line)
            next(cons_data, None)
            for line_number, line in enumerate(cons_data, start=2):
                fields = line.split()
                if not fields:
                    # a blank (e.g. trailing) line would otherwise add an
                    # empty tuple, i.e. a constraint with no atom indices
                    continue
                if len(fields) < 4:
                    raise ValueError(
                        "Expected at least 4 atom indices in {} line {}, got: {!r}".format(
                            path, line_number, line.strip()
                        )
                    )
                constraints.add(tuple(int(x) for x in fields[:4]))

    return constraints


                    

In [59]:
# we need a function to get all of the conformers from the folder and make an openff molecule
def gather_conformers(folder):
    """
    Create a molecule from all of the conformers in the folder.

    Every file in ``folder`` whose name ends with ``.mol2`` is loaded with
    ``Molecule.from_file``. The molecule from the first file (in sorted
    file-name order) is kept, and the first conformer of each remaining
    molecule is added to it.

    NOTE(review): this assumes every mol2 file in the folder describes the
    same molecule with the same atom ordering; that is not checked here.

    Parameters
    ----------
    folder : str
        Path of the directory holding the ``.mol2`` conformer files.

    Returns
    -------
    Molecule
        A single molecule carrying one conformer per ``.mol2`` file.

    Raises
    ------
    ValueError
        If the folder contains no ``.mol2`` files.
    """
    # os.listdir returns entries in arbitrary order; sort so the conformer
    # order is reproducible between runs and machines
    mol2_files = sorted(
        file_name for file_name in os.listdir(folder) if file_name.endswith(".mol2")
    )
    if not mol2_files:
        raise ValueError("No .mol2 files found in folder {!r}".format(folder))

    molecules = [
        Molecule.from_file(os.path.join(folder, file_name)) for file_name in mol2_files
    ]

    # now we need to condense all of the conformers down to one molecule.
    return_mol, *others = molecules
    for molecule in others:
        return_mol.add_conformer(molecule.conformers[0])

    return return_mol

In [60]:
# now we need to loop over all of the folders and for each one gather the molecule and all of its conformers
# and all of the constraints and add them to the dataset.
# NOTE(review): the last cell of this notebook raises KeyError: 'def2-dzvpp' for this basis name -
# confirm the spelling is one the QC program recognises.
protein_dataset = OptimizationDataset(
    dataset_name="OpenFF Protein Fragments v1.0",
    dataset_tagline="Constrained optimization of various protein fragments.",
    basis="def2-dzvpp",
)
factory = OptimizationDatasetFactory()
# os.listdir order is arbitrary; sort so entries are added in a reproducible order
for folder in sorted(os.listdir(".")):
    # version 1 only uses entries whose name contains "_ALA_"; skip plain files that
    # happen to match, since the gather functions expect a directory
    if "_ALA_" not in folder or not os.path.isdir(folder):
        continue
    txt_constraints = gather_restraints(folder)
    conformers = gather_conformers(folder)
    # create the data needed for the optimization
    index = folder.lower()
    attributes = factory.create_cmiles_metadata(conformers)
    # make the constraints for geometric: one frozen dihedral per 4-tuple of atom indices
    geometric_constraints = [
        {"type": "dihedral", "indices": list(constraint)}
        for constraint in sorted(txt_constraints)
    ]
    keywords = {"constraints": {"freeze": geometric_constraints}}
    protein_dataset.add_molecule(index=index, molecule=conformers, attributes=attributes, keywords=keywords)
        
        
protein_dataset

OptimizationDataset(dataset_name='OpenFF Protein Fragments v1.0', dataset_tagline='Constrained optimization of various protein fragments.', dataset_type='OptimizationDataset', method='B3LYP-D3BJ', basis='def2-dzvpp', program='psi4', maxiter=200, driver=<DriverEnum.gradient: 'gradient'>, scf_properties=['dipole', 'quadrupole', 'wiberg_lowdin_indices', 'mayer_indices'], spec_name='default', spec_description='Standard OpenFF optimization quantum chemistry specification.', priority='normal', description='An optimization dataset using geometric.', dataset_tags=['openff'], compute_tag='openff', metadata=Metadata(submitter='joshuahorton', creation_date=datetime.date(2020, 7, 6), collection_type='OptimizationDataset', dataset_name='OpenFF Protein Fragments v1.0', short_description='Constrained optimization of various protein fragments.', long_description_url=None, long_description='An optimization dataset using geometric.', elements={'O', 'C', 'H', 'N'}), provenance={}, dataset={'gly_ala_ser':

In [61]:
protein_dataset.n_molecules

16

In [62]:
protein_dataset.n_records

576

In [63]:
protein_dataset.visualize("protein_dataset.pdf", columns=3)

In [68]:
# manual provenance fix
import openeye

# assemble the full provenance record first (factory provenance plus the
# openeye version), then attach it to the dataset in one assignment
provenance_record = factory.provenance()
provenance_record["openeye"] = openeye.__version__
protein_dataset.provenance = provenance_record

# point the dataset metadata at the submission folder for the long description
protein_dataset.metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/2020-07-06-OpenFF-Protein-Fragments-Initial"

In [69]:
protein_dataset.export_dataset("protein_dataset.json")

In [70]:
# NOTE(review): when this notebook was run, this call raised KeyError: 'def2-dzvpp'
# (see the captured output below). Presumably the basis name given to the dataset is
# not in the coverage lookup - confirm the basis spelling before submitting.
protein_dataset._get_missing_basis_coverage()

KeyError: 'def2-dzvpp'