# QCSubmit protein prep

Instructions from David Cerutti
The way it's set up, we scan phi and psi of the central residue, with a randomly selected ALA, GLY, SER, or VAL residue on the N- or C-terminal side, and the customary ACE and NME blocking groups outside of that.

This notebook will go through the files supplied and extract all of the starting conformations and the restraints built by David using mdgx.

Every molecule mol2 file has a corresponding frozen-dihedral selection file, and both must be read and used when making the dataset.

The mol2 files also have incorrect bond orders, so we must re-save each file as PDB and load it again so that OpenEye can perceive the correct bond orders.

In [19]:
from qcsubmit.factories import OptimizationDatasetFactory
from qcsubmit.datasets import OptimizationDataset
import os
from openforcefield.topology import Molecule
from openforcefield.utils.toolkits import RDKitToolkitWrapper


In [20]:
# now we need to loop through the folders and load the molecules and frozen dihedrals list
# let's make sure that the dihedrals are the same in every file
def gather_restraints(folder):
    """
    Create a set of all of the restraints in the given folder.

    Every file in ``folder`` whose name ends with ``.frz`` is read. The first
    line of each file is skipped; for every remaining non-blank line the first
    four whitespace-separated fields are parsed as integers and reduced by one
    (presumably converting 1-based atom numbers to 0-based indices - confirm
    against the mdgx output format).

    Parameters
    ----------
    folder : str
        Path of the directory to scan for ``.frz`` files.

    Returns
    -------
    set of tuple of int
        The unique index tuples found across all ``.frz`` files in the folder.
        The set is empty when the folder holds no ``.frz`` files.

    Raises
    ------
    ValueError
        If one of the first four fields of a data line is not an integer.
    """
    constraints = set()
    for file_name in os.listdir(folder):
        if not file_name.endswith(".frz"):
            continue
        # this is a constraint file
        with open(os.path.join(folder, file_name)) as cons_data:
            # the first line is not a constraint entry
            next(cons_data, None)
            for line in cons_data:
                fields = line.split()
                if not fields:
                    # a blank (e.g. trailing) line would otherwise add an
                    # empty tuple to the set of constraints
                    continue
                constraints.add(tuple(int(x) - 1 for x in fields[:4]))

    return constraints


                    

In [21]:
# we need a function to get all of the conformers from the folder and make an openff molecule
def gather_conformers(folder):
    """
    Create a single molecule from all of the conformers in the folder.

    Each ``.mol2`` file in ``folder`` is loaded, written back next to it as a
    PDB file using the RDKit toolkit wrapper, and loaded again from that PDB.
    The first conformer of every reloaded molecule is then collected onto one
    molecule. Files are processed in sorted name order so that the conformer
    order is reproducible between runs.

    Parameters
    ----------
    folder : str
        Path of the directory holding the ``.mol2`` conformer files. A
        ``.pdb`` file is written into this directory for every ``.mol2`` file.

    Returns
    -------
    Molecule
        The molecule loaded from the first ``.mol2`` file (in sorted order)
        with the first conformer of every other file added to it.

    Raises
    ------
    ValueError
        If the folder contains no ``.mol2`` files.
    """
    molecules = []
    # os.listdir returns entries in arbitrary order, so sort for reproducibility
    for file_name in sorted(os.listdir(folder)):
        if not file_name.endswith(".mol2"):
            continue
        # this is one of the molecules
        mol2 = Molecule.from_file(os.path.join(folder, file_name))
        # strip only the final extension so names containing dots cannot collide
        name = os.path.splitext(file_name)[0]
        pdb_path = os.path.join(folder, name + ".pdb")
        # now save again in PDB format
        # fix issue with openeye reordering when writing files.
        mol2.to_file(pdb_path, "pdb", RDKitToolkitWrapper())
        # load up the new molecule
        molecules.append(Molecule.from_file(pdb_path))

    if not molecules:
        raise ValueError("no .mol2 files found in folder: {}".format(folder))

    # now we need to condense all of the conformers down to one molecule.
    return_mol = molecules[0]
    for molecule in molecules[1:]:
        return_mol.add_conformer(molecule.conformers[0])

    return return_mol

In [22]:
# Loop over every folder under Input_files and, for each combination sub-folder inside it,
# gather the molecule with all of its conformers plus the frozen dihedrals and add them to the dataset.
protein_dataset = OptimizationDataset(
    dataset_name="OpenFF Protein Peptide Fragments constrained v1.0",
    dataset_tagline="Constrained optimization of various protein peptide fragments.",
    description="Constrained optimizations of protein peptide fragments sampling the phi and psi torsions of the central residue.",
)
factory = OptimizationDatasetFactory()
# os.listdir order is arbitrary; sort so entries are added in a reproducible order
for folder in sorted(os.listdir("Input_files")):
    folder_path = os.path.join("Input_files", folder)
    if not os.path.isdir(folder_path):
        continue
    for combination in sorted(os.listdir(folder_path)):
        combination_path = os.path.join(folder_path, combination)
        # skip tar archives, and any stray file that cannot be listed as a folder
        if ".tar" in combination_path or not os.path.isdir(combination_path):
            continue
        txt_constraints = gather_restraints(combination_path)
        conformers = gather_conformers(combination_path)
        # create the data needed for the optimization
        index = combination.lower()
        attributes = factory.create_cmiles_metadata(conformers)
        # make the constraints for geometric; sorted so the freeze list is stable between runs
        geometric_constraints = [
            {"type": "dihedral", "indices": list(constraint)}
            for constraint in sorted(txt_constraints)
        ]
        keywords = {"constraints": {"freeze": geometric_constraints}}
        protein_dataset.add_molecule(index=index, molecule=conformers, attributes=attributes, keywords=keywords)
        
        



In [23]:
# export the molecules held in the dataset to fragments.smi using the "smi" file type
protein_dataset.molecules_to_file("fragments.smi", "smi")

In [24]:
# display the dataset's n_molecules value
protein_dataset.n_molecules

185

In [25]:
# display the dataset's n_records value
protein_dataset.n_records

6716

In [26]:
# display the set of elements recorded in the dataset metadata
protein_dataset.metadata.elements

{'C', 'H', 'N', 'O', 'S'}

In [27]:
# write a visualisation of the dataset to protein_dataset.pdf
# NOTE(review): columns=3 presumably sets the number of molecules drawn per row - confirm against qcsubmit
protein_dataset.visualize("protein_dataset.pdf", columns=3)

In [28]:
# manual provenance fix
import openeye

# record the submitter and the location of the long description in the metadata
protein_dataset.metadata.submitter = "jthorton"
protein_dataset.metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2020-08-12-OpenFF-Protein-Fragments-version2"

# start from the provenance reported by the factory, then add the openeye version by hand
protein_dataset.provenance = factory.provenance()
protein_dataset.provenance["openeye"] = openeye.__version__

In [29]:
# display the full dataset metadata to check the values set above
protein_dataset.metadata

Metadata(submitter='jthorton', creation_date=datetime.date(2020, 10, 30), collection_type='OptimizationDataset', dataset_name='OpenFF Protein Peptide Fragments constrained v1.0', short_description='Constrained optimization of various protein peptide fragments.', long_description_url=HttpUrl('https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2020-08-12-OpenFF-Protein-Fragments-version2', scheme='https', host='github.com', tld='com', host_type='domain', path='/openforcefield/qca-dataset-submission/tree/master/submissions/2020-08-12-OpenFF-Protein-Fragments-version2'), long_description='Constrained optimizations of protein peptide fragments sampling the phi and psi torsions of the central residue.', elements={'H', 'C', 'S', 'O', 'N'})

In [30]:
# write the finished dataset to dataset.json.xz
protein_dataset.export_dataset("dataset.json.xz")

In [17]:
# call the dataset's private basis-coverage check
# NOTE(review): the cell counter (In [17]) is lower than the cells above, so this was
# presumably run before they were last re-executed - re-run to confirm the result still holds
protein_dataset._get_missing_basis_coverage()