In [None]:
#!/usr/bin/env python3

########################################################################
# File: problem18.ipynb

# Author: Nicholas Chan
# History: 11/15/2021 Created
########################################################################

# Assignment 5: problem 18
<br>
For this assignment, we were to find all the peptides whose theoretical spectrum matches an experimental spectrum <br>
given as input. This experimental spectrum was assumed to be ideal. To accomplish this, a branch and bound algorithm like <br>
the one our textbook mentioned could be utilized. This branch and bound algorithm generates peptides starting from a list with <br>
an empty peptide, which is continuously expanded 1 amino acid at a time until peptides with masses equal to <br>
the parent mass of the experimental spectrum are obtained. These peptides then have their cyclopeptide spectra <br>
checked against the experimental spectrum to determine whether they are indeed possible peptide encodings for the experimental spectrum. <br>

# TheoreticalSpectrum Class
<br>
The TheoreticalSpectrum class creates an object which stores an ideal experimental spectrum (expSpec) <br>
in the form of an int list. Using expSpec, a branch and bound algorithm which <br>
branches peptides by amino acids consistent with expSpec and bounds (discards) peptides which <br>
are not consistent can be used to determine the possible peptides that may encode the cyclopeptide <br>
described by expSpec. A peptide is consistent if the spectrum of its cyclic form <br>
matches expSpec. To get to that point, however, the total mass of the growing peptide must be <br>
contained in the spectrum (this part is a sort of consistency check within the branching step).

In [4]:
class TheoreticalSpectrum:
    '''
    TheoreticalSpectrum objects store an ideal experimental spectrum given as input as well as its
    parent mass. TheoreticalSpectrum objects can run cyclopeptideSeq to find the potential peptides
    which encode for the ideal experimental spectrum (which is also cyclic). These
    encoding peptides are represented by tuples of ints, each int corresponding to an amino acid mass.
    These peptides are also in their linear forms on output.
    '''
    # Amino acid to its integer mass table.
    # Note that I/L (113) and K/Q (128) share a mass, so only 18 distinct masses exist.
    AA = {'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101, 'C': 103,
          'I': 113, 'L': 113, 'N': 114, 'D': 115, 'K': 128, 'Q': 128, 'E': 129,
          'M': 131, 'H': 137, 'F': 147, 'R': 156, 'Y': 163, 'W': 186}

    def __init__(self, expSpec:"list[int]"):
        '''
        Store the experimental spectrum and derive its parent mass.

        expSpec: iterable of int masses describing an ideal experimental spectrum.
        '''
        self.expSpec = list(expSpec) # List of ints representing an ideal experimental spectrum
        # Parent mass is the largest mass in an ideal spectrum; default=0 keeps an
        # empty spectrum from crashing (such a spectrum simply yields no peptides).
        self.parentMass = max(self.expSpec, default=0)

    def findCircSpec(self, seq:"list[int]"):
        '''
        Given a list of linear peptide aa masses, return its circular theoretical spectrum
        as a sorted list of ints.

        The spectrum holds 0, the full peptide mass, and the mass of every cyclic
        subpeptide of length 1 to len(seq)-1 starting at every position.
        An empty peptide has the spectrum [0].
        '''
        seq = list(seq)
        if not seq: # empty peptide: only the zero mass, not a duplicated [0, 0]
            return [0]
        pepLen = len(seq)
        # Prefix sums over the doubled peptide let every wrap-around subpeptide mass
        # be computed by one subtraction instead of re-summing a slice each time.
        prefix = [0]
        for mass in seq * 2:
            prefix.append(prefix[-1] + mass)
        specList = [0, prefix[pepLen]] # zero mass and the whole-peptide mass
        for start in range(pepLen): # For each position in linear pep,
            for length in range(1, pepLen): # generate subpeptides of len 1 to len(pep)-1
                specList.append(prefix[start + length] - prefix[start])
        return sorted(specList)

    def expand(self, peps:"set[tuple[int, ...]]"):
        '''
        Branch step: extend every candidate peptide (tuple of int masses) in peps by
        one amino acid mass and return the set of extended peptides that survive
        a first consistency check.

        Only amino acid masses that appear in expSpec are tried (the spectrum is ideal,
        so every residue of the real peptide shows up as a single mass), and an extended
        peptide is kept only if ITS total mass appears in expSpec. The final check,
        cyclospectrum(peptide) == expSpec, is done in cyclopeptideSeq.
        '''
        specMasses = set(self.expSpec) # O(1) membership instead of scanning the list
        # Distinct amino acid masses present in the spectrum (drops the I/L and K/Q duplicates)
        usableMasses = sorted({aa for aa in self.AA.values() if aa in specMasses})
        newPeps = set()
        for pep in peps:
            pep = tuple(pep)
            pepMass = sum(pep)
            for aa in usableMasses:
                # Check the mass of the NEW peptide, not of the peptide it grew from;
                # otherwise hopeless peptides survive one extra generation.
                if pepMass + aa in specMasses:
                    newPeps.add(pep + (aa,))
        return newPeps

    def cyclopeptideSeq(self):
        '''
        Inspired by pseudocode from the textbook. cyclopeptideSeq utilizes a branch and bound
        algorithm to build a list of peptides (tuples of int masses) whose cyclic theoretical
        spectrum matches the experimental spectrum, which is known to be ideal.
        '''
        # findCircSpec returns a sorted list, so compare against a sorted copy of the
        # input; an unsorted input spectrum would otherwise never match anything.
        targetSpec = sorted(self.expSpec)
        peps = {()} # set containing only empty peptide, to be expanded
        matches = [] # Accumulate matched peps into this list
        while peps: # While candidate peps remain and can be expanded upon
            peps = self.expand(peps) # every surviving pep grows by one amino acid
            finished = set() # peps that reached the parent mass cannot grow any further
            for pep in peps:
                if sum(pep) == self.parentMass:
                    if self.findCircSpec(list(pep)) == targetSpec:
                        matches.append(pep)
                    finished.add(pep)
            # A set cannot be shrunk while iterating over it, so finished peps are
            # collected first and dropped before the next expansion.
            peps -= finished
        return matches

    def printMatches(self):
        '''
        Return the peptides found by cyclopeptideSeq as one string: each peptide is its
        masses joined by "-", peptides are sorted in reverse (string) order and
        separated by single spaces. Returns "" when nothing matches.
        '''
        outList = ["-".join(str(mass) for mass in pep) for pep in self.cyclopeptideSeq()]
        outList.sort(reverse = True)
        return " ".join(outList)

# Main Function
<br>
Parses an input text file as a list of ints which the TheoreticalSpectrum class requires as input. <br>
Input is assumed to be an ideal experimental mass spectrum. Output is printed to stdout. <br>

In [5]:
def main(infile, outfile='', inCL=None):
    '''
    main function parses in data specified by infile and creates
    a list of int masses which correspond to a cyclopeptide
    spectrum. main function then creates a TheoreticalSpectrum object called
    mySpec to generate output corresponding to possible linear peptides
    which encode for the cyclopeptide represented by the spectrum given
    as input. The result is printed to stdout.

    infile:  path of a text file whose first line holds whitespace-separated int masses.
    outfile: currently unused; kept so existing callers keep working.
    inCL:    currently unused; kept so existing callers keep working.

    Raises ValueError if the first line is blank or contains a non-integer token.
    '''
    # Only the first line is needed, so read it and release the file right away
    # instead of holding it open during the whole search.
    with open(infile,'r') as myfile:
        idealSpecRaw = myfile.readline()
    # split() with no argument tolerates tabs, repeated spaces and trailing whitespace,
    # where split(" ") would produce empty tokens that int() cannot parse.
    idealSpec = [int(mass) for mass in idealSpecRaw.split()]
    if not idealSpec:
        raise ValueError("no spectrum masses found on the first line of " + str(infile))
    mySpec = TheoreticalSpectrum(idealSpec)
    outstring = mySpec.printMatches()
    print(outstring)

# Script entry point: solve the Rosalind BA4E dataset that ships with the notebook.
if __name__ == "__main__":
    main(infile="data/rosalind_ba4e.txt")

99-99-163-101-115-186-115-131-131-113 99-99-113-131-131-115-186-115-101-163 99-163-101-115-186-115-131-131-113-99 99-113-131-131-115-186-115-101-163-99 186-115-131-131-113-99-99-163-101-115 186-115-101-163-99-99-113-131-131-115 163-99-99-113-131-131-115-186-115-101 163-101-115-186-115-131-131-113-99-99 131-131-115-186-115-101-163-99-99-113 131-131-113-99-99-163-101-115-186-115 131-115-186-115-101-163-99-99-113-131 131-113-99-99-163-101-115-186-115-131 115-186-115-131-131-113-99-99-163-101 115-186-115-101-163-99-99-113-131-131 115-131-131-113-99-99-163-101-115-186 115-101-163-99-99-113-131-131-115-186 113-99-99-163-101-115-186-115-131-131 113-131-131-115-186-115-101-163-99-99 101-163-99-99-113-131-131-115-186-115 101-115-186-115-131-131-113-99-99-163


In [3]:
# INSPECTION

# INSPECTION TEAM
# Jodi Lee
# Nabil Mohammed

# RESPONSES
# - Work on implementing main function
# - Write more markdown comments
# - Clean up code
# - Add more docstrings and inline comments

# CORRECTIONS
# - Finished main function
# - Wrote markdown comments
# - Cleaned code
# - Added more docstrings and inline comments