In [3]:
#Import biopython module to open FASTA files
from Bio import SeqIO

In [4]:
"""Inputs: peptide string, max length of split peptide. 
   Outputs: all possible splits that could be formed that are smaller in length than the maxed input """
def splitDictPeptide(peptide, maxed):
    length = len(peptide)
    # splits will hold all possible splits that can occur
    splits = []
    # splitRef will hold a direct reference to the characters making up each split string: for starting peptide ABC,
    # the split AC = [0,2] 
    splitRef = []
    
    # imbedded for loops build all possible splits
    for i in range(0, length):
        character = peptide[i]
        toAdd=""
        # add and append first character and add and append reference number which indexes this character
        toAdd+=character
        splits.append(toAdd)
        ref = []
        ref.append(i)
        temp = list(ref)  # use list because otherwise shared memory overwrites
        splitRef.append(temp)
        
        # iterates through every character after current and adds it to the most recent string if max size
        # requirement is satisfied
        for j in range(i+1, length):
            toAdd+=peptide[j]
            if(maxSize(toAdd, maxed)):
                
                splits.append(toAdd)
                ref.append(j)
                temp = list(ref)
                
                splitRef.append(temp)
                temp=[]
      
      
        ref = []
        
        
    return splits, splitRef

In [5]:
"""Input: splits: list of splits, splitRef: list of the character indexes for splits, mined/maxed: min and max
   size requirements, overlapFlag: boolean value true if overlapping combinations are undesired.
   Output: all combinations of possible splits which meets criteria"""
def combineOverlapPeptide(splits, splitRef, mined, maxed, overlapFlag):
    
    # initialise combinations array to hold the possible combinations from the input splits
    combine = []
   
    # iterate through all of the splits and build up combinations which meet min/max/overlap criteria
    for i in range(0, len(splits)):
        if(minSize(splits[i], mined)):
            #Add linear peptide, can include option if necessary
            combine.append(splits[i])
        
        # toAdd holds temporary for insertion in final matrix if it meets criteria
        toAdd=""
        
        for j in range(i+1, len(splits)): 
            # create forward combinaiton of i and j
            toAdd+=splits[i]
            toAdd+=splits[j]
    
    # look to combine all checks together in a future for clarity
            if(maxSize(toAdd, maxed) and minSize(toAdd, mined)):
            
                if(overlapFlag==True):
                    if(overlapComp(splitRef[i],splitRef[j])):
                        combine.append(toAdd)
                else:
                    combine.append(toAdd)
               
            #create backwards combination of i and j
            toAdd = ""
            toAdd+=splits[j]
            toAdd+=splits[i]
            
            if(maxSize(toAdd, maxed) and minSize(toAdd, mined)):
                if(overlapFlag==True):
                    if(overlapComp(splitRef[i],splitRef[j])):
                        combine.append(toAdd)
                else:
                    combine.append(toAdd)
               
                
                
        
            toAdd=""
    return combine

In [6]:
# ensures length of split is smaller than or equal to max
def maxSize(split, maxed):
    if(len(split)>=maxed):
        return False
    return True

In [7]:
# ensures length of split is greater than min
def minSize(split, mined):
    if(len(split)<mined):
        return False
    return True

In [8]:
# checks if there is an intersection between two strings. Likely input it the splitRef data. 
# Outputs True if no intersection
def overlapComp(ref1,ref2):
    S1 = set(ref1)
    S2 = set(ref2)
    if(len(S1.intersection(S2))==0):
        return True
    return False

In [9]:
# opens FASTA file
def addSequenceList(input_file):
    
    fasta_sequences = SeqIO.parse(open(input_file),'fasta')
    sequenceDictionary = {}
    for fasta in fasta_sequences:
        name, sequence = fasta.id, fasta.seq.tostring()
        sequenceDictionary[name] = sequence
    return sequenceDictionary

In [10]:
# Adapted from https://stackoverflow.com/questions/480214/how-do-you-remove-duplicates-from-a-list-whilst-preserving-order
# removes duplicates given a list
def removeDups(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

In [11]:
# combines an array of strings into one string. Used for ultimately segments from multiple peptides
def combinePeptides(peptideList):
    finalPeptide = ''.join(peptideList)
    return finalPeptide


In [12]:
# generates most of the permutations possible when switching from A to a in all strings originally containing an A
# input is a list of all combinations
def modTest(combine):
    # A, B, C  convert to a, b, c
    modComb = []
    for string in combine:
        if 'A' in string:
            
            numOccur = string.count('A')
            print(numOccur)
            for i in range(0, numOccur):
                temp = string
                temp = temp.replace("A","a", i+1)
                print(temp)
                modComb.append(temp)
    print(modComb)

In [13]:
maxed = 12
mined= 0
overlap = True
splits, splitRef = splitDictPeptide("ABAAA", maxed)
print(splits)
splits = removeDups(splits)
print(splits)




['A', 'AB', 'ABA', 'ABAA', 'ABAAA', 'B', 'BA', 'BAA', 'BAAA', 'A', 'AA', 'AAA', 'A', 'AA', 'A']
['A', 'AB', 'ABA', 'ABAA', 'ABAAA', 'B', 'BA', 'BAA', 'BAAA', 'AA', 'AAA']


In [14]:
print(splits)
print(len(splits))

['A', 'AB', 'ABA', 'ABAA', 'ABAAA', 'B', 'BA', 'BAA', 'BAAA', 'AA', 'AAA']
11


In [15]:
combine = combineOverlapPeptide(splits, splitRef, mined, maxed, overlap)
combine = removeDups(combine)
print(combine)
print(len(combine))

['A', 'AB', 'BA', 'ABA', 'BAA', 'ABAA', 'BAAA', 'ABAAA', 'BAAAA', 'AAA', 'AAAA', 'AAAB', 'AAAAB', 'B', 'AAB', 'AA']
16


In [21]:
# taking FASTA dictionary and passing through our splits and combine functions
sequenceDictionary = addSequenceList("Example.fasta")
for key, value in sequenceDictionary.items():
    
    splits, splitRef = splitDictPeptide(value, maxed)
    
    splits = removeDups(splits)
    combine = combineOverlapPeptide(splits, splitRef, mined, maxed,  overlap)
    combine = removeDups(combine)
    
    print(len(splits))
    print(len(combine))
    
    break;



2343
1648018


Analysis of removing duplicates at split level and combined level:
    - Spit level: 3.1 mil to 1.8 mil
    - Combined level: 1.8 mil to 1.6 mil
    