# Sequence Motif Coverage Predictor

This program reports, for a protein (or a set of proteins), the number of peptides that contain a given sequence motif (e.g. the N-glycosylation motif) and are suitable for mass spectrometry.

The program considers protein-level information from prediction tools such as `TMHMM`, `Phobius`, and `SignalP` in order to better interpret the context of possible sequence motifs. 

## I/O and Usage

### Usage

### Input


### Output



In [16]:
# This cell contains the definitions of the functions that parse the various input file types, including:
    # TMHMM short format output
    # Phobius short format output
    # SignalP short format output
    # PrediSi output
    # SPC score table (CSV)
    # FASTA file

# 

# Dependencies
    # regular expressions (re)



def parse_TMHMM(file_name):
    """Parse TMHMM short-format output into per-protein topology records.

    Every non-blank line must be tab-separated with at least six fields.
    Field 0 is the protein ID; fields 1, 4 and 5 are ``key=value`` pairs whose
    values are read as the sequence length, the number of predicted
    transmembrane helices and the topology string respectively (presumably
    ``len=``, ``PredHel=`` and ``Topology=`` -- the keys themselves are not
    checked).

    Parameters
    ----------
    file_name : str
        Path of the TMHMM output file.

    Returns
    -------
    dict
        ``{ID: {'length': int, 'num_TM': int, 'topo': str,
        'inside': [int, ...], 'outside': [int, ...]}}`` where ``inside`` and
        ``outside`` are 1-based residue positions.  A one-character topology
        (no helix) is stored as ``'n/a'`` and yields empty position lists.

    Raises
    ------
    IndexError, ValueError
        If a non-blank line does not have the expected layout.
    """
    import re

    tm_dict = dict()

    with open(file_name, 'r') as fo:
        for line in fo:
            line = line.rstrip()
            if not line:
                # Blank (e.g. trailing) lines carry no record; skip them
                # instead of failing on the field lookups below.
                continue

            fields = line.split('\t')
            ID = fields[0]
            length = int(fields[1].split('=')[1])
            num_TM = int(fields[4].split('=')[1])
            topo_str = fields[5].split('=')[1]

            # A bare 'i' / 'o' means no helix was predicted.
            if len(topo_str) == 1:
                topo_str = 'n/a'

            tm_dict[ID] = {'length': length, 'num_TM': num_TM, 'topo': topo_str}

    for record in tm_dict.values():
        topo = record['topo']

        # i1/i2 and o1/o2 hold the first/last residue of each inside and
        # outside stretch respectively.
        i1, i2, o1, o2 = [], [], [], []

        if topo.startswith('i'):
            i1.append(1)
        elif topo.startswith('o'):
            o1.append(1)

        # Helix "iA-Bo": the inside stretch ends at A-1, outside starts at B+1.
        for match in re.finditer(r'i(\d+?)-(\d+?)o', topo):
            i2.append(int(match.group(1)) - 1)
            o1.append(int(match.group(2)) + 1)
        # Helix "oA-Bi": the outside stretch ends at A-1, inside starts at B+1.
        for match in re.finditer(r'o(\d+?)-(\d+?)i', topo):
            i1.append(int(match.group(2)) + 1)
            o2.append(int(match.group(1)) - 1)

        # The last stretch is still open; it runs to the end of the protein.
        if len(i1) > len(i2):
            i2.append(record['length'])
        elif len(o1) > len(o2):
            o2.append(record['length'])

        inside_residues = list()
        for first, last in zip(i1, i2):
            inside_residues.extend(range(first, last + 1))

        outside_residues = list()
        for first, last in zip(o1, o2):
            outside_residues.extend(range(first, last + 1))

        record['inside'] = inside_residues
        record['outside'] = outside_residues

    return tm_dict

def parse_Phobius(file_name, length_file='io_files/human_plus_leftovers.tab'):
    """Parse Phobius short-format output into per-protein topology records.

    Phobius output does not carry the sequence length, so lengths are looked
    up in a separate tab-separated table.

    Parameters
    ----------
    file_name : str
        Path of the Phobius output.  Every non-blank line must split on
        whitespace into exactly four tokens: ``ID  num_TM  SP  topology``,
        where ``ID`` contains ``|`` and its second ``|``-separated part is
        used as the protein ID.
    length_file : str, optional
        Path of the length table.  Its first line is skipped as a header; on
        every other line the first whitespace-separated token is the protein
        ID and the 7th tab-separated column is the integer length.  Defaults
        to the path that was previously hard-coded.

    Returns
    -------
    dict
        ``{ID: {'length': int, 'topo': str, 'num_TM': int, 'SP': str,
        'inside': [int, ...], 'outside': [int, ...]}}`` with 1-based residue
        positions.  Proteins whose length is missing (or zero) in the length
        table are left out.

    Raises
    ------
    ValueError, IndexError
        If a non-blank line of either file does not have the expected layout.
    """
    import re

    length_dict = dict()
    with open(length_file, 'r') as fo2:
        next(fo2, None)  # header line
        for line in fo2:
            line = line.rstrip()
            if not line:
                continue
            length_dict[line.split()[0]] = int(line.split('\t')[6])

    tm_dict = dict()
    with open(file_name, 'r') as fo:
        for line in fo:
            line = line.rstrip()
            if not line:
                # Blank lines would break the 4-way unpacking below.
                continue

            ID, num_TM, SP, topo_str = line.split()
            ID = ID.split('|')[1]

            length = length_dict.get(ID)
            if not length:
                continue

            tm_dict[ID] = {'length': length, 'topo': topo_str,
                           'num_TM': int(num_TM), 'SP': SP}

    for record in tm_dict.values():
        topo = record['topo']

        # i1/i2 and o1/o2 hold the first/last residue of each inside and
        # outside stretch respectively.
        i1, i2, o1, o2 = [], [], [], []

        if topo.startswith('i'):
            i1.append(1)
        elif topo.startswith('o'):
            o1.append(1)
        else:
            # Topology has a prefix ending in "/<pos>" before the first i/o
            # (presumably the signal-peptide part, e.g. "n3-14c19/20o...").
            # The first stretch then starts at <pos> instead of residue 1.
            match = re.search(r'\S+/(\S+?)([io])(\S*)', topo)
            if match:
                topo = match.group(2) + match.group(3)
                first_residue = int(match.group(1))
                if topo.startswith('i'):
                    i1.append(first_residue)
                else:
                    o1.append(first_residue)
            else:
                # Unrecognised topology string: report it, leave lists empty.
                print(topo)

        # Helix "iA-Bo": the inside stretch ends at A-1, outside starts at B+1.
        for match in re.finditer(r'i(\d+?)-(\d+?)o', topo):
            i2.append(int(match.group(1)) - 1)
            o1.append(int(match.group(2)) + 1)
        # Helix "oA-Bi": the outside stretch ends at A-1, inside starts at B+1.
        for match in re.finditer(r'o(\d+?)-(\d+?)i', topo):
            i1.append(int(match.group(2)) + 1)
            o2.append(int(match.group(1)) - 1)

        # The last stretch is still open; it runs to the end of the protein.
        if len(i1) > len(i2):
            i2.append(record['length'])
        elif len(o1) > len(o2):
            o2.append(record['length'])

        inside_residues = list()
        for first, last in zip(i1, i2):
            inside_residues.extend(range(first, last + 1))

        outside_residues = list()
        for first, last in zip(o1, o2):
            outside_residues.extend(range(first, last + 1))

        record['inside'] = inside_residues
        record['outside'] = outside_residues

    return tm_dict


def parse_signalP(file_name):
    """Read SignalP short-format output.

    Lines starting with ``#`` are ignored.  On every other line the first
    three whitespace-separated tokens are taken as the protein ID, the
    prediction label and the score.

    Returns ``{ID: {'SP': 'Y' or 0, 'score': str}}``: ``'SP'`` is ``'Y'``
    when the prediction label starts with ``SP`` and the integer ``0``
    otherwise; the score is kept as the raw string.
    """
    SP_dict = dict()

    with open(file_name, 'r') as fo:
        for raw_line in fo:
            if raw_line.startswith('#'):
                continue

            tokens = raw_line.rstrip().split()
            ID = tokens[0]
            label = tokens[1]
            score = tokens[2]

            has_signal = 'Y' if label.startswith('SP') else 0
            SP_dict[ID] = {'SP': has_signal, 'score': score}

    return SP_dict
    
def parse_Predisi(file_name):
    """Read a tab-separated PrediSi result table.

    The first line is skipped as a header.  For every other line the protein
    ID is the second ``|``-separated part of the first column, the
    signal-peptide call is the second-to-last column and the score is the
    fourth-to-last column (both kept as strings).

    Returns ``{ID: {'SP': str, 'score': str}}``.
    """
    predisi_dict = dict()

    with open(file_name, 'r') as fo:
        for line_number, raw_line in enumerate(fo):
            if line_number == 0:
                # header row
                continue

            columns = raw_line.rstrip().split('\t')
            ID = columns[0].split('|')[1]
            predisi_dict[ID] = {'SP': columns[-2], 'score': columns[-4]}

    return predisi_dict
    
def parse_SPC(seq_dict, file_name):
    """Read the SPC score table (CSV) and cover every sequence ID.

    The first line of ``file_name`` is skipped as a header.  Every other line
    must have exactly six comma-separated fields: ``ID, SPC, BF, T, dC, DR``.
    ``SPC`` becomes the integer score; the four remaining fields are flags,
    and the names of those equal to 1 are joined with commas into
    ``stringOut`` (e.g. ``'BF,dC'``; empty string when none is set).

    Any ID of ``seq_dict`` that is absent from the table is added with
    ``{'score': 0, 'stringOut': 'n/a'}``.

    Returns ``{ID: {'score': int, 'stringOut': str}}``.
    """
    flag_names = ('BF', 'T', 'dC', 'DR')
    SPC_dict = dict()

    with open(file_name, 'r') as fo:
        for line_number, raw_line in enumerate(fo):
            if line_number == 0:
                # header row
                continue

            ID, SPC, BF, T, dC, DR = raw_line.rstrip().split(',')

            score = int(SPC)
            set_flags = [
                name
                for name, value in zip(flag_names, (BF, T, dC, DR))
                if int(value) == 1
            ]
            SPC_dict[ID] = {'score': score, 'stringOut': ','.join(set_flags)}

    # Proteins without any SPC evidence still get a (zero) entry.
    for ID in seq_dict:
        if ID not in SPC_dict:
            SPC_dict[ID] = {'score': 0, 'stringOut': 'n/a'}

    return SPC_dict
        
        
            
def fasta_parser(fasta_filename):
    """Read a FASTA file into ``{accession: sequence}``.

    The accession is the second ``|``-separated part of the first
    whitespace-delimited token of each ``>`` header line (e.g. ``P12345``
    from ``>sp|P12345|NAME_HUMAN ...``).  Sequence lines are concatenated
    with trailing whitespace removed.  Records with an empty sequence are
    not stored.

    Parameters
    ----------
    fasta_filename : str
        Path of the FASTA file.

    Returns
    -------
    dict
        Mapping of accession to the full sequence string.

    Raises
    ------
    IndexError
        If a header line has no token or its first token contains no ``|``.
    """
    sequence_dict = {}
    sequence_name = ''
    sequence_string = ''

    # The context manager guarantees the file is closed, also on errors.
    with open(fasta_filename, 'r') as fasta_fileobj:
        for line in fasta_fileobj:
            line = line.rstrip()

            if line.startswith('>'):
                # A new header closes the previous record, if it has residues.
                if len(sequence_string) > 0:
                    sequence_dict[sequence_name] = sequence_string
                    sequence_string = ''

                header = line.lstrip('>')
                sequence_info = header.split(maxsplit=1)
                sequence_name = sequence_info[0].split('|')[1]
            else:
                sequence_string += line

    # The last record has no following header to trigger storage.
    if len(sequence_string) > 0:
        sequence_dict[sequence_name] = sequence_string

    return sequence_dict

  

In [22]:
# This cell contains the definitions of the functions required for in-silico digestion, MS-suitability filtering, and motif/topology analysis

import re

## trypsinize allows up to 2 missed cleavages; the number of missed cleavages is recorded for each peptide

def trypsinize(prot_seq, max_missed_cleavages=2):
    """Digest a protein in silico with trypsin.

    Cleavage happens after every K or R that is not followed by P.  All
    peptides spanning up to ``max_missed_cleavages`` internal cleavage sites
    are returned, ordered by start position and then by number of missed
    cleavages.

    Parameters
    ----------
    prot_seq : str
        Protein sequence (one-letter codes).
    max_missed_cleavages : int, optional
        Largest number of missed cleavages per peptide (default 2).

    Returns
    -------
    list of dict
        One dict per peptide with keys ``'seq'`` (peptide string),
        ``'indices'`` (list of 1-based residue positions in ``prot_seq``)
        and ``'missed_cleavages'`` (int).  A sequence without any cleavage
        site yields a single peptide covering the whole protein; an empty
        sequence yields a single empty peptide.
    """
    # Boundaries between fully cleaved peptides, as 0-based slice offsets.
    cut_sites = [0]
    for i in range(len(prot_seq) - 1):
        if prot_seq[i] in ('K', 'R') and prot_seq[i + 1] != 'P':
            cut_sites.append(i + 1)
    # Internal sites are always < len(prot_seq), so the C-terminus is new.
    cut_sites.append(len(prot_seq))

    peptides = []
    last_site = len(cut_sites) - 1

    for j in range(last_site):
        start = cut_sites[j]
        for missed in range(max_missed_cleavages + 1):
            end_site = j + missed + 1
            if end_site > last_site:
                # Ran past the C-terminus; no longer peptides from here.
                break
            end = cut_sites[end_site]
            peptides.append({
                'seq': prot_seq[start:end],
                'indices': list(range(start + 1, end + 1)),
                'missed_cleavages': missed,
            })

    return peptides

def ion_mim(prot_seq, charge_state):
    """Return the monoisotopic m/z of a peptide ion.

    The neutral peptide mass is the sum of the residue masses plus one water
    (the N-terminal H and C-terminal OH); the previous version left the
    water out, which made every m/z too low by 18.01/charge.  Cysteine is
    counted with a fixed +57.02146 Da modification (the value in the table
    below).  Characters that are not one of the 20 listed residues
    contribute no mass.

    Parameters
    ----------
    prot_seq : str
        Peptide sequence (one-letter codes).
    charge_state : int
        Number of protons carried by the ion; must be non-zero.

    Returns
    -------
    float
        ``(M + charge_state * 1.007276) / charge_state``.

    Raises
    ------
    ZeroDivisionError
        If ``charge_state`` is 0.
    """
    mass_table = {
            "A" : 71.03711,
            "R" : 156.10111,
            "N" : 114.04293,
            "D" : 115.02694,
            "C" : 103.00919 + 57.02146,
            "E" : 129.04259,
            "Q" : 128.05858,
            "G" : 57.02146,
            "H" : 137.05891,
            "I" : 113.08406,
            "L" : 113.08406,
            "K" : 128.09496,
            "M" : 131.04049,
            "F" : 147.06841,
            "P" : 97.05276,
            "S" : 87.03203,
            "T" : 101.04768,
            "W" : 186.07931,
            "Y" : 163.06333,
            "V" : 99.06841
            }
    water_mass = 18.010565    # monoisotopic H2O, added once per peptide
    proton_mass = 1.007276

    mass = water_mass
    for aa, residue_mass in mass_table.items():
        mass += prot_seq.count(aa) * residue_mass

    ion_mass = mass + (charge_state * proton_mass)
    m_z = ion_mass / charge_state

    return m_z


def ok_for_MS(pep_list):
    """Annotate each peptide with the charge states usable for MS.

    A charge state (2 or 3) qualifies when the peptide is longer than five
    residues and ``ion_mim`` reports an m/z below 2000 for that charge.
    The result is stored in ``pep['okForMS']`` as ``'2'``, ``'3'``,
    ``'2,3'`` or ``None`` when no charge state qualifies.

    The peptide dicts are modified in place; the same list is returned.
    """
    for pep in pep_list:
        sequence = pep['seq']
        usable_charges = []

        # Length is checked first so short peptides never reach ion_mim.
        if len(sequence) > 5:
            for charge in (2, 3):
                if ion_mim(sequence, charge) < 2000:
                    usable_charges.append(str(charge))

        if usable_charges:
            pep['okForMS'] = ','.join(usable_charges)
        else:
            pep['okForMS'] = None

    return pep_list

def make_prot_dict(seq_dict, tm_dict, phob_dict, sp_dict, predisi_dict, SPC_dict):
    """Combine sequences and all predictions into one record per protein.

    Only proteins that have a (non-empty) entry in ``tm_dict``,
    ``phob_dict``, ``sp_dict`` and ``predisi_dict`` are kept.  For each of
    them the sequence is digested with ``trypsinize``, the peptides are
    annotated with ``ok_for_MS`` and the 1-based positions of every motif
    are collected.

    Motif positions are those of the leading N of ``N-X-S``, ``N-X-T``,
    ``N-X-C`` and ``N-X-V`` (X is anything but P), plus every C and every K.
    The N-X-* motifs are matched with a lookahead so that overlapping
    occurrences (e.g. both N in ``NNSS``) are all reported; a plain
    ``finditer`` over the 3-residue pattern skips the second one.

    Parameters
    ----------
    seq_dict : dict
        ``{ID: sequence}``.
    tm_dict, phob_dict : dict
        Per-ID records with keys ``'inside'``, ``'outside'``, ``'num_TM'``
        and ``'topo'``; ``phob_dict`` records also need ``'SP'``.
    sp_dict, predisi_dict : dict
        Per-ID records with keys ``'SP'`` and ``'score'``.
    SPC_dict : dict
        Per-ID records with keys ``'score'`` and ``'stringOut'``; must
        contain every ID that passes the filter above.

    Returns
    -------
    dict
        ``{ID: {'seq_info', 'topo', 'signal', 'SPC', 'peptides',
        'motif_sites'}}``.  Each ``motif_sites`` entry has the list ``'all'``
        and an (initially empty) dict ``'extracellular'``.
    """
    # Lookahead keeps the match one residue wide so neighbours can overlap.
    glyco_patterns = [
        ('NXS', r'N(?=[^P]S)'),
        ('NXT', r'N(?=[^P]T)'),
        ('NXC', r'N(?=[^P]C)'),
        ('NXV', r'N(?=[^P]V)'),
    ]

    prot_dict = dict()

    for ID in seq_dict:

        seq = seq_dict[ID]

        if not (tm_dict.get(ID) and phob_dict.get(ID)
                and sp_dict.get(ID) and predisi_dict.get(ID)):
            continue

        pep_list = trypsinize(seq)
        pep_list = ok_for_MS(pep_list)

        motif_sites = dict()

        for motif_name, pattern in glyco_patterns:
            positions = [match.start() + 1 for match in re.finditer(pattern, seq)]
            # Every motif gets its own 'extracellular' dict; it is filled
            # in later per predictor and must not be shared.
            motif_sites[motif_name] = {'all': positions, 'extracellular': dict()}

        for residue in ('C', 'K'):
            positions = [i + 1 for i, aa in enumerate(seq) if aa == residue]
            motif_sites[residue] = {'all': positions, 'extracellular': dict()}

        prot_dict[ID] = {
            'seq_info': {
                'seq': seq,
                'seq_len': len(seq),
            },

            'topo': {
                'TMHMM': {
                    'inside': tm_dict[ID]['inside'],
                    'outside': tm_dict[ID]['outside'],
                    'num_TM': tm_dict[ID]['num_TM'],
                    'stringOut': tm_dict[ID]['topo'],
                },
                'Phobius': {
                    'inside': phob_dict[ID]['inside'],
                    'outside': phob_dict[ID]['outside'],
                    'num_TM': phob_dict[ID]['num_TM'],
                    'stringOut': phob_dict[ID]['topo'],
                },
            },

            'signal': {
                # Phobius gives no score here; 0 is a placeholder.
                'Phobius': {'SP': phob_dict[ID]['SP'], 'score': 0},
                'SignalP': {'SP': sp_dict[ID]['SP'], 'score': sp_dict[ID]['score']},
                'PrediSi': {'SP': predisi_dict[ID]['SP'], 'score': predisi_dict[ID]['score']},
            },

            'SPC': {
                'score': int(SPC_dict[ID]['score']),
                'stringOut': SPC_dict[ID]['stringOut'],
            },

            'peptides': pep_list,

            'motif_sites': motif_sites,
        }

    return prot_dict

def EC_analysis(prot):
    """Annotate a protein record with extracellular motif sites.

    For each predictor (``'TMHMM'``, ``'Phobius'``) and each motif, the
    motif positions that fall in the predictor's ``'outside'`` residues are
    stored, sorted, in
    ``prot['motif_sites'][motif]['extracellular'][predictor]``.

    Every peptide in ``prot['peptides']`` then gets, per motif, a dict
    ``pep[motif]`` of the form::

        {'extracellular': {'TMHMM': {...}, 'Phobius': {...}}, 'all': {...}}

    where each ``{...}`` is ``{'num': int, 'indices': sorted list}`` for the
    motif positions inside the peptide, or ``{'num': '0', 'indices': 'n/a'}``
    when there are none (the string ``'0'`` is kept for compatibility with
    the code that consumes these records).

    The record is modified in place and also returned.  Protein-level lists
    are filled even when the protein has no peptides.
    """
    motif_list = ['NXS', 'NXT', 'NXC', 'NXV', 'C', 'K']
    predictors = ['TMHMM', 'Phobius']

    # Build every set once per protein instead of once per peptide.
    motif_sets = {
        motif: set(prot['motif_sites'][motif]['all']) for motif in motif_list
    }

    outside_motifs = dict()
    for pred in predictors:
        outside_positions = set(prot['topo'][pred]['outside'])
        outside_motifs[pred] = dict()
        for motif in motif_list:
            hits = motif_sets[motif] & outside_positions
            outside_motifs[pred][motif] = hits
            # sorted() makes the output independent of set iteration order.
            prot['motif_sites'][motif]['extracellular'][pred] = sorted(hits)

    for pep in prot['peptides']:
        pep_positions = set(pep['indices'])

        for motif in motif_list:
            record = {'extracellular': dict()}

            in_peptide = motif_sets[motif] & pep_positions
            if in_peptide:
                record['all'] = {'num': len(in_peptide), 'indices': sorted(in_peptide)}
            else:
                record['all'] = {'num': '0', 'indices': 'n/a'}

            for pred in predictors:
                outside_in_peptide = outside_motifs[pred][motif] & pep_positions
                if outside_in_peptide:
                    record['extracellular'][pred] = {
                        'num': len(outside_in_peptide),
                        'indices': sorted(outside_in_peptide),
                    }
                else:
                    record['extracellular'][pred] = {'num': '0', 'indices': 'n/a'}

            pep[motif] = record

    return prot




In [18]:
# Load every input table from the io_files/ directory and print how many
# records each parser returned.  The resulting dictionaries are notebook
# globals that later cells read.
file_name = 'can_uniprot-proteome_UP000005640+reviewed_yes.fasta'
path = 'io_files/'

# {accession: sequence}
seqs_dict = fasta_parser(path + file_name)
print('sequnces:',len(seqs_dict))

# SignalP signal-peptide calls
sp = parse_signalP(path + 'signalp_out.tsv')
print('SignalP:',len(sp))

# TMHMM topology
TMHMM_dict  = parse_TMHMM(path + 'TMHMM_out_clean.tsv')
print('TMHMM:',len(TMHMM_dict))

# Phobius topology (parse_Phobius additionally reads a length table)
Phobius_dict = parse_Phobius(path + 'Phobius_out_clean.tsv')
print('Phobius:',len(Phobius_dict))

# SPC scores; parse_SPC adds a zero-score entry for every ID in seqs_dict
# that is missing from the CSV
SPC_dict = parse_SPC(seqs_dict, path + 'SPC_by_Source.csv')
print('SPC:',len(SPC_dict))

# PrediSi signal-peptide calls
predisi_dict = parse_Predisi(path + 'predisi.txt')
print('Predisi:',len(predisi_dict))

sequnces: 20416
SignalP: 20413
TMHMM: 20412
Phobius: 20412
SPC: 20424
Predisi: 20416


In [19]:
# Merge all inputs into one record per protein.  make_prot_dict only keeps
# IDs that have an entry in the TMHMM, Phobius, SignalP and PrediSi tables.
prot_dict = make_prot_dict(seqs_dict, TMHMM_dict, Phobius_dict, sp, predisi_dict, SPC_dict)



In [6]:
# Spot-check: print the full record of a single protein.
print(str(prot_dict['Q8N112']))

{'seq_info': {'seq': 'MPSLAPDCPLLAMPEETQEDSVAPMMPSQRSRGPLAPNHVHEVCLHQVESISDLHSGAGTLRPYLTEEARPWDELLGVLPPSLCAQAGCSPVYRRGGFLLLLALLVLTCLVLALLAVYLSVLQSESLRILAHTLRTQEETLLKLRLASLSQLRRLNSSEAQAPS', 'seq_len': 164}, 'topo': {'TMHMM': {'inside': [123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164], 'outside': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 'num_TM': 1, 'stringOut': 'o100-122i'}, 'Phobius': {'inside': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,

In [None]:
# Write one row per peptide to pepOut.tsv.
# Columns: unique peptide id (ID_start-end), sequence, protein ID, residue
# range, missed cleavages, okForMS; then for each of 'all', 'Phobius' and
# 'TMHMM': six motif counts followed by six motif index lists.
with open('pepOut.tsv', 'w') as fo:
    
    # NOTE(review): the header is empty, so the first line of the file is
    # blank -- confirm whether column names should be written here.
    header = ''
    fo.write(header + '\n')
    
    topo_list = ['all', 'Phobius', 'TMHMM']
    motif_list = ['NXS','NXT','NXC','NXV','C','K'] 
        
    for ID in prot_dict:

        # Adds the per-peptide motif dictionaries read below.
        prot_dict[ID] = EC_analysis(prot_dict[ID])

        peps = prot_dict[ID]['peptides']

        for pep in peps:

            out = list()
            # First and last 1-based residue position of the peptide.
            pep_range = str(pep['indices'][0]) + '-' + str(pep['indices'][-1])
            uniq = ID + '_' + pep_range

            out.append(uniq)
            out.append(pep['seq'])
            out.append(ID)
            out.append(pep_range)
            out.append(str(pep['missed_cleavages']))

            if pep['okForMS']:
                out.append(pep['okForMS'])
            else:
                out.append('None')

            for topo in topo_list:

                # Counts: 'all' is stored directly under pep[motif]; the
                # predictor names are stored under pep[motif]['extracellular'].
                for motif in motif_list:
                    if pep[motif]: 
                        if pep[motif].get(topo):
                            out.append(str(pep[motif][topo]['num']))
                        elif pep[motif]['extracellular'].get(topo):
                            out.append(str(pep[motif]['extracellular'][topo]['num']))
                    else:
                        out.append('0')

                # Index lists, looked up the same way as the counts.
                for motif in motif_list:
                    if pep[motif]:
                        if pep[motif].get(topo):
                            out.append(str(pep[motif][topo]['indices']))
                        elif pep[motif]['extracellular'].get(topo):
                            out.append(str(pep[motif]['extracellular'][topo]['indices']))
                    else:
                        out.append('0')

            fo.write('\t'.join(out)+'\n')
                        
            

In [54]:
# Write one row per protein to protOut.tsv.
# Columns, in order: ID, sequence, length; per topology predictor (Phobius,
# TMHMM): topology string, number of TM helices, number of inside residues,
# number of outside residues; per signal-peptide predictor (Phobius,
# SignalP, PrediSi): Y/N call and score; number of predictors calling a
# signal peptide; SPC score and SPC flags; per location ('all', 'Phobius',
# 'TMHMM') and motif: site count and site list; cumulative peptide counts
# and MS-suitable peptide counts for 0, <=1 and <=2 missed cleavages; and
# finally, per location and missed-cleavage limit, the number of MS-suitable
# peptides containing each motif.
with open ('protOut.tsv', 'w') as fo:
    
    # NOTE(review): the header is empty, so the first line of the file is
    # blank -- confirm whether column names should be written here.
    header = ''
    fo.write(header + '\n')
    
    motif_list = ['NXS','NXT','NXC','NXV','C','K']
    
    for ID in prot_dict:
        
        out = list()     
        prot = prot_dict[ID]
        
        out.append(ID)
        
        si = prot['seq_info']
        out.append(si['seq'])
        out.append(str(si['seq_len']))

        for meth in ['Phobius', 'TMHMM']:
            topo = prot['topo'][meth]
                   
            out.append(topo['stringOut'])
            out.append(str(topo['num_TM']))
            out.append(str(len(topo['inside'])))
            out.append(str(len(topo['outside'])))
                   
        # Number of predictors that call a signal peptide.  This must be
        # initialised once, before the loop: resetting it per method (as the
        # code did before) made the count reflect only the last predictor.
        sppc = 0
        for meth in ['Phobius', 'SignalP', 'PrediSi']:
            sig = prot['signal'][meth]
                   
            if sig['SP'] == 'Y':
                out.append('Y')
                sppc += 1
            else:
                out.append('N')
                   
            # Phobius has no score in the record (placeholder only).
            if meth == 'Phobius':
                out.append('n/a')
            else:
                out.append(str(sig['score']))
        out.append(str(sppc))       
        
        spc = prot['SPC']
        out.append(str(spc['score']))
        out.append(spc['stringOut'])      
        
        # Fills the protein-level 'extracellular' lists and the per-peptide
        # motif dictionaries read below.
        prot = EC_analysis(prot)
        
        ms = prot['motif_sites'] 
        for loc in ['all', 'Phobius', 'TMHMM']:
            for motif in motif_list:
                if loc == 'all':
                    sites = ms[motif][loc]
                else:
                    sites = ms[motif]['extracellular'][loc]
                
                ns = len(sites)
                out.append(str(ns))
                if ns:
                    out.append(str(sites))
                else:
                    out.append('n/a')
                
        peps = prot['peptides']
        
        # pepN / okN: number of peptides / MS-suitable peptides with at most
        # N missed cleavages (cumulative).
        pep0 = 0
        pep1 = 0
        pep2 = 0
        
        ok0 = 0
        ok1 = 0
        ok2 = 0
        
        # counts[location][missed_cleavages][motif]: number of MS-suitable
        # peptides with exactly that many missed cleavages that contain the
        # motif (not cumulative; summed up when written out below).
        counts = {'all' : dict(), 'Phobius': dict(), 'TMHMM' :dict()}
        for key in counts:
            counts[key] = [dict(), dict(), dict()]

            for i in counts[key]:
                for motif in motif_list:
                    i[motif] = 0
            
        for pep in peps:
            mc = pep['missed_cleavages']
            ok = pep['okForMS']
            
            if mc == 0:
                pep0 += 1
                pep1 += 1
                pep2 += 1
                if ok:
                    ok0 += 1
                    ok1 += 1
                    ok2 += 1
            
            elif mc == 1: 
                pep1 += 1
                pep2 += 1
                if ok:
                    ok1 += 1
                    ok2 += 1
                                   
            else:
                pep2 += 1
                if ok:
                    ok2 += 1
                    
            if ok:
                for motif in motif_list:
                    for loc in ['all', 'Phobius', 'TMHMM']: 
                        # 'num' is an int or the string '0', hence int().
                        if loc == 'all':
                            if int(pep[motif][loc]['num']) > 0:
                                counts[loc][mc][motif] += 1
                        else:
                            if int(pep[motif]['extracellular'][loc]['num']) > 0:
                                counts[loc][mc][motif] += 1
                                                        
        for var in [pep0, pep1, pep2, ok0, ok1, ok2]:
            out.append(str(var))
            
        for loc in ['all', 'Phobius', 'TMHMM']:
            
            for i in [0,1,2]:
                
                for motif in motif_list:
                    
                    # Turn the exact-missed-cleavage counts into cumulative
                    # "at most i missed cleavages" counts.
                    if i == 0:
                        count = counts[loc][i][motif]
                        
                    elif i == 1:
                        count = counts[loc][i][motif] + counts[loc][0][motif]
                                                
                    else:
                        count = counts[loc][i][motif] + counts[loc][1][motif] + counts[loc][0][motif]
                        
                    
                    out.append(str(count))
    
                        
        
        fo.write('\t'.join(out)+'\n')

In [44]:

# Scratch cell: build and print the zero-initialised counter structure
# counts[location][missed_cleavages][motif].
# It relies on motif_list, which is not defined in this cell and must
# already exist from an earlier cell.
counts = {'all' : dict(), 'Phobius': dict(), 'TMHMM' :dict()}
for key in counts:
    # one counter dict per missed-cleavage count (0, 1, 2)
    counts[key] = [dict(), dict(), dict()]
    
    for i in counts[key]:
        for motif in motif_list:
            i[motif] = 0
            
print(counts)


{'all': [{'NXS': 0, 'NXT': 0, 'NXC': 0, 'NXV': 0, 'C': 0, 'K': 0}, {'NXS': 0, 'NXT': 0, 'NXC': 0, 'NXV': 0, 'C': 0, 'K': 0}, {'NXS': 0, 'NXT': 0, 'NXC': 0, 'NXV': 0, 'C': 0, 'K': 0}], 'Phobius': [{'NXS': 0, 'NXT': 0, 'NXC': 0, 'NXV': 0, 'C': 0, 'K': 0}, {'NXS': 0, 'NXT': 0, 'NXC': 0, 'NXV': 0, 'C': 0, 'K': 0}, {'NXS': 0, 'NXT': 0, 'NXC': 0, 'NXV': 0, 'C': 0, 'K': 0}], 'TMHMM': [{'NXS': 0, 'NXT': 0, 'NXC': 0, 'NXV': 0, 'C': 0, 'K': 0}, {'NXS': 0, 'NXT': 0, 'NXC': 0, 'NXV': 0, 'C': 0, 'K': 0}, {'NXS': 0, 'NXT': 0, 'NXC': 0, 'NXV': 0, 'C': 0, 'K': 0}]}


In [49]:
        # Scratch fragment: counts the peptides of one protein.
        # It relies on peps and motif_list, which are not defined in this
        # cell and must already exist from an earlier cell.
        # pepN / okN: number of peptides / MS-suitable peptides with at most
        # N missed cleavages (cumulative).
        pep0 = 0
        pep1 = 0
        pep2 = 0
        
        ok0 = 0
        ok1 = 0
        ok2 = 0
        
        # counts[location][missed_cleavages][motif], all starting at zero.
        counts = {'all' : dict(), 'Phobius': dict(), 'TMHMM' :dict()}
        for key in counts:
            counts[key] = [dict(), dict(), dict()]

            for i in counts[key]:
                for motif in motif_list:
                    i[motif] = 0
    
        for pep in peps:
            mc = pep['missed_cleavages']
            ok = pep['okForMS']
            
            if mc == 0:
                pep0 += 1
                pep1 += 1
                pep2 += 1
                if ok:
                    ok0 += 1
                    ok1 += 1
                    ok2 += 1
            
            elif mc == 1: 
                pep1 += 1
                pep2 += 1
                if ok:
                    ok1 += 1
                    ok2 += 1
                                   
            else:
                pep2 += 1
                if ok:
                    ok2 += 1
                    
            # Only MS-suitable peptides contribute to the motif counters.
            if ok:
                for motif in motif_list:
                    for loc in ['all', 'Phobius', 'TMHMM']: 
                        # 'all' is stored directly under pep[motif]; predictor
                        # names are stored under pep[motif]['extracellular'].
                        if loc == 'all':
                            if int(pep[motif][loc]['num']) > 0:
                                counts[loc][mc][motif] += 1
                        else:
                            if int(pep[motif]['extracellular'][loc]['num']) > 0:
                                counts[loc][mc][motif] += 1