# Protein Fragments Unconstrained Optimization - Preparation

In [1]:
import os

from qcsubmit.factories import OptimizationDataset, OptimizationDatasetFactory
from openforcefield.topology import Molecule
from openforcefield.utils.toolkits import RDKitToolkitWrapper, UndefinedStereochemistryError



We'll use the same artifacts produced for submitting Optimizations in "2020-08-12-OpenFF-Protein-Fragments-version2".

In [3]:
!tar -xzf ../2020-08-12-OpenFF-Protein-Fragments-version2/Input_files.tar.gz -C .

## Preparation steps

In [2]:
# we need a function to get all of the conformers from the folder and make an openff molecule
def gather_conformers(folder):
    """
    create a molecule from all of the conformers in the folder.
    """
    molecules = []
    for file in os.listdir(folder):
        if file.endswith(".mol2") and not file.startswith('.'):
            # this is one of the molecules
            try:
                mol2 = Molecule.from_file(os.path.join(folder, file))
            except UndefinedStereochemistryError:
                # if we get undefined stereochemistry, we skip this conformer
                print(f"Undefined stereochemistry for {folder}/{file}; skipping")
                continue
            # now save again in PDB format
            name = file.split(".")[0]
            # fix issue with openeye reordering when writing files.
            mol2.to_file(os.path.join(folder, name + ".pdb"), "pdb", RDKitToolkitWrapper())
            # load up the new molecule
            pdb = Molecule.from_file(os.path.join(folder, name + ".pdb"))
            molecules.append(pdb)
            
    # now we need to condense all of the conformers down to one molecule.
    return_mol = molecules.pop()
    for molecule in molecules:
        return_mol.add_conformer(molecule.conformers[0])
        
    return return_mol

In [3]:
# now we need to loop over all of the folders and for each one gather the molecule and all of its conformers
# and all of the constraints and add them to the dataset.
protein_dataset = OptimizationDataset(dataset_name="OpenFF Protein Peptide Fragments unconstrained v1.0", 
                                      dataset_tagline="Unconstrained optimization of various protein fragments.")
factory = OptimizationDatasetFactory()

for folder in os.listdir("Input_files"):
    folder_path = os.path.join("Input_files", folder)
    
    if not os.path.isdir(folder_path):
        print(f"Skipping {folder}")
        continue
        
    for combination in os.listdir(folder_path):
        combination_path = os.path.join(folder_path, combination)
        
        if not os.path.isdir(combination_path):
            print(f"Skipping {combination}")
            continue
                
        print("collecting from folder", combination_path)
        
        conformers = gather_conformers(combination_path)
        
        # create the data needed for the optimization
        index = combination.lower()
        attributes = factory.create_cmiles_metadata(conformers)
        
        protein_dataset.add_molecule(index=index, molecule=conformers, attributes=attributes)

Skipping ._.DS_Store
collecting from folder Input_files/GLN/VAL_GLN_GLY
collecting from folder Input_files/GLN/ALA_GLN_GLY
collecting from folder Input_files/GLN/GLY_GLN_SER
collecting from folder Input_files/GLN/SER_GLN_VAL
collecting from folder Input_files/GLN/GLY_GLN_ALA
collecting from folder Input_files/GLN/ALA_GLN_VAL
collecting from folder Input_files/GLN/GLY_GLN_GLY
Skipping ConfGLN.tar.gz
collecting from folder Input_files/GLN/SER_GLN_SER
collecting from folder Input_files/GLN/ALA_GLN_SER
collecting from folder Input_files/GLN/SER_GLN_GLY
collecting from folder Input_files/GLN/SER_GLN_ALA
collecting from folder Input_files/GLN/VAL_GLN_VAL
collecting from folder Input_files/GLN/ALA_GLN_ALA
collecting from folder Input_files/GLN/GLY_GLN_VAL
collecting from folder Input_files/GLN/VAL_GLN_SER
Undefined stereochemistry for Input_files/GLN/VAL_GLN_SER/Conf524.mol2; skipping
Skipping ._ConfGLN.tar.gz
collecting from folder Input_files/GLN/VAL_GLN_ALA
Skipping ._GLN
Skipping ._ASN
co



Skipping ._VAL_ARG_GLY
Skipping ._GLY_ARG_VAL
collecting from folder Input_files/ARG/GLY_ARG_GLY




collecting from folder Input_files/ARG/SER_ARG_ALA




Skipping ._GLY_ARG_GLY
Skipping ._ALA_ARG_GLY
collecting from folder Input_files/ARG/ALA_ARG_ALA




Skipping ._GLY_ARG_SER
collecting from folder Input_files/ARG/VAL_ARG_VAL




Skipping ._ConfARG.tar.gz
collecting from folder Input_files/ARG/SER_ARG_SER




collecting from folder Input_files/ARG/GLY_ARG_VAL




collecting from folder Input_files/ARG/ALA_ARG_GLY




Skipping ._VAL_ARG_VAL
Skipping ._VAL_ARG_ALA
Skipping ._GLY_ARG_ALA
Skipping ._ALA_ARG_VAL
collecting from folder Input_files/ARG/SER_ARG_VAL
Undefined stereochemistry for Input_files/ARG/SER_ARG_VAL/Conf508.mol2; skipping




collecting from folder Input_files/ARG/GLY_ARG_ALA
Skipping ._SER_ARG_SER
Skipping ConfARG.tar.gz
collecting from folder Input_files/ARG/SER_ARG_GLY
collecting from folder Input_files/ARG/ALA_ARG_SER
Undefined stereochemistry for Input_files/ARG/ALA_ARG_SER/Conf515.mol2; skipping
Skipping ._ALA_ARG_SER
Skipping ._VAL_ARG_SER
collecting from folder Input_files/ARG/ALA_ARG_VAL




collecting from folder Input_files/ARG/VAL_ARG_ALA
Skipping ._ALA_ARG_ALA
collecting from folder Input_files/ARG/VAL_ARG_SER
collecting from folder Input_files/ARG/VAL_ARG_GLY
collecting from folder Input_files/GLU/GLY_GLU_GLY




Skipping ._ConfGLU.tar.gz
collecting from folder Input_files/GLU/GLY_GLU_ALA




collecting from folder Input_files/GLU/VAL_GLU_GLY




collecting from folder Input_files/GLU/VAL_GLU_ALA
Undefined stereochemistry for Input_files/GLU/VAL_GLU_ALA/Conf467.mol2; skipping




collecting from folder Input_files/GLU/GLY_GLU_VAL
Undefined stereochemistry for Input_files/GLU/GLY_GLU_VAL/Conf517.mol2; skipping
collecting from folder Input_files/GLU/ALA_GLU_ALA




collecting from folder Input_files/GLU/VAL_GLU_VAL
Undefined stereochemistry for Input_files/GLU/VAL_GLU_VAL/Conf488.mol2; skipping




collecting from folder Input_files/GLU/ALA_GLU_VAL
Skipping ConfGLU.tar.gz
collecting from folder Input_files/GLU/ALA_GLU_GLY
Skipping ._GLU
Skipping ConfASP.tar.gz
collecting from folder Input_files/ASP/SER_ASP_VAL




collecting from folder Input_files/ASP/VAL_ASP_VAL




collecting from folder Input_files/ASP/ALA_ASP_VAL
collecting from folder Input_files/ASP/VAL_ASP_ALA
Skipping ._ConfASP.tar.gz
Skipping ._VAL_ASP_GLY
collecting from folder Input_files/ASP/ALA_ASP_ALA
Skipping ._GLY_ASP_ALA
collecting from folder Input_files/ASP/GLY_ASP_VAL
Skipping ._SER_ASP_GLY
Skipping ._ALA_ASP_ALA
collecting from folder Input_files/ASP/ALA_ASP_GLY
Skipping ._GLY_ASP_SER
Skipping ._ALA_ASP_VAL
Skipping ._GLY_ASP_GLY
collecting from folder Input_files/ASP/SER_ASP_ALA




collecting from folder Input_files/ASP/GLY_ASP_ALA
Skipping ._ALA_ASP_GLY
Skipping ._GLY_ASP_VAL
collecting from folder Input_files/ASP/GLY_ASP_GLY




collecting from folder Input_files/ASP/SER_ASP_SER




collecting from folder Input_files/ASP/VAL_ASP_GLY
collecting from folder Input_files/ASP/GLY_ASP_SER




Skipping ._VAL_ASP_ALA
Skipping ._SER_ASP_ALA
collecting from folder Input_files/ASP/SER_ASP_GLY
Skipping ._ALA_ASP_SER
collecting from folder Input_files/ASP/VAL_ASP_SER
collecting from folder Input_files/ASP/ALA_ASP_SER
Skipping ._HIE
Skipping ._CYS
collecting from folder Input_files/GLY/VAL_GLY_ALA
Skipping ConfGLY.tar.gz
Skipping ._GLY_GLY_SER
collecting from folder Input_files/GLY/SER_GLY_ALA
Skipping ._GLY_GLY_ALA
collecting from folder Input_files/GLY/VAL_GLY_VAL
Skipping ._SER_GLY_GLY
collecting from folder Input_files/GLY/SER_GLY_VAL
Skipping ._ALA_GLY_SER
collecting from folder Input_files/GLY/VAL_GLY_SER
collecting from folder Input_files/GLY/GLY_GLY_VAL
Skipping ._ALA_GLY_VAL
Skipping ._ALA_GLY_ALA
Skipping ._SER_GLY_ALA
Skipping ._GLY_GLY_VAL
collecting from folder Input_files/GLY/SER_GLY_SER
Skipping ._ALA_GLY_GLY
Skipping ._VAL_GLY_GLY
collecting from folder Input_files/GLY/ALA_GLY_GLY
collecting from folder Input_files/GLY/GLY_GLY_ALA
collecting from folder Input_files/

In [4]:
protein_dataset.n_molecules

185

In [5]:
protein_dataset.n_records

6709

In [6]:
protein_dataset.metadata.elements

{'C', 'H', 'N', 'O', 'S'}

In [7]:
protein_dataset.visualize("protein_dataset.pdf", columns=3)

In [8]:
# manual provenance fix
import openeye
protein_dataset.metadata.long_description_url = "https://github.com/openforcefield/qca-dataset-submission/tree/master/submissions/2020-10-27-OpenFF-Protein-Fragments-Unconstrained"
protein_dataset.metadata.submitter = 'dotsdl'
protein_dataset.provenance = factory.provenance()
protein_dataset.provenance["openeye"] = openeye.__version__

In [9]:
protein_dataset.export_dataset("dataset.json")

In [10]:
protein_dataset._get_missing_basis_coverage()

In [12]:
# compress dataset
! bzip2 dataset.json

In [13]:
protein_dataset.molecules_to_file('molecules.smi', 'smi')