# Sequence Motif Coverage Predictor

This program is designed to return the number of peptides for a protein (or set of proteins) which contain a given sequence motif (N-glyco motif) and are suitable for mass spectrometry. 

The program considers protein-level information from prediction tools such as `TMHMM`, `Phobius`, and `SignalP` in order to better interpret the context of possible sequence motifs. 

## I/O and Usage

### Usage

### Input


### Output



In [1]:
# This cell contains the definitions of functions to parse the various input file types, including:
    # TMHMM short format output
    # Phobius short format output
    # Signal P short format output
    # Fasta file

# 

# Dependencies
    # regular expressions (re)



def parse_TMHMM(file_name):
    import re
    with open(file_name, 'r') as fo:
        tm_dict = dict()
        for line in fo:
            line = line.rstrip()
            ID = line.split('\t')[0]
            length = int(line.split('\t')[1].split('=')[1])
            topo_str = line.split('\t')[5].split('=')[1]
            num_TM = int(line.split('\t')[4].split('=')[1])
            
            if len(topo_str) == 1:
                tm_dict[ID] = { 'length' : length, 'num_TM' : num_TM , 'topo' : None }
            else:
                tm_dict[ID] = { 'length' : length, 'num_TM' : num_TM , 'topo' : topo_str } 
                
        
        for key in tm_dict:
            inside = list()
            outside = list()
            i1 = list()
            i2 = list()
            o1 = list()
            o2 = list()
            
            topo = tm_dict[key]['topo']
            
            if topo:
                
                if topo.startswith('i'):
                   i1.append(1)
                elif topo.startswith('o'):
                   o1.append(1)
                    
                for match in re.finditer(r'i(\d+?)-(\d+?)o', topo):
                    i2.append(int(match.group(1))-1)
                    o1.append(int(match.group(2))+1)
                for match in re.finditer(r'o(\d+?)-(\d+?)i', topo):
                    i1.append(int(match.group(2))+1)
                    o2.append(int(match.group(1))-1)
        
                if len(i1) > len(i2):
                    i2.append(tm_dict[key]['length'])
                elif len(o1) > len(o2):
                    o2.append(tm_dict[key]['length'])
             
            inside_residues = list()
            outside_residues = list()
            
            for i in range(len(i1)):
                inside_residues.extend(list(range(i1[i],i2[i]+1)))
                
            for i in range(len(o1)):
                outside_residues.extend(list(range(o1[i],o2[i]+1)))
                
            tm_dict[key]['inside'] = inside_residues
            tm_dict[key]['outside'] = outside_residues
            
                    
    return tm_dict

def parse_Phobius(file_name):
    import re
    with open(file_name, 'r') as fo:
        
        tm_dict = dict()
        
        with open('io_files/human_plus_leftovers.tab' , 'r') as fo2:
            length_dict = dict()
            header = True
            for line in fo2:
                if header:
                    header = False
                else:
                    line = line.rstrip()
                    ID = line.split()[0]
                    length = line.split('\t')[6]
                    length_dict[ID] = int(length)
                            
        for line in fo:
            line = line.rstrip()
            ID, num_TM, SP, topo_str = line.split()
            ID = ID.split('|')[1]
            
            if bool(length_dict.get(ID)):
                length = length_dict[ID]
            else:
                continue   
            
            if len(topo_str) == 1:
                tm_dict[ID] = { 'length' : length, 'topo' : None , \
                                'num_TM' : int(num_TM), 'SP' : SP}
            else:
                tm_dict[ID] = { 'length' : length, 'topo' : topo_str ,  \
                                'num_TM' : int(num_TM), 'SP' : SP}
                
        for key in tm_dict:
            inside = list()
            outside = list()
            i1 = list()
            i2 = list()
            o1 = list()
            o2 = list()
            
            topo = tm_dict[key]['topo']
            
            if topo:
                
                if topo.startswith('i'):
                    i1.append(1)
                elif topo.startswith('o'):
                    o1.append(1)
                else:
                    match = re.search(r'\S+/(\S+?)([io])(\S*)', topo)
                    if match:
                        topo = match.group(2) + match.group(3)
                    else:
                        print(topo)
                    if topo.startswith('i'):
                        i1.append(int(match.group(1)))
                    elif topo.startswith('o'):
                        o1.append(int(match.group(1)))
                   
                   
                    
                for match in re.finditer(r'i(\d+?)-(\d+?)o', topo):
                    i2.append(int(match.group(1))-1)
                    o1.append(int(match.group(2))+1)
                for match in re.finditer(r'o(\d+?)-(\d+?)i', topo):
                    i1.append(int(match.group(2))+1)
                    o2.append(int(match.group(1))-1)
        
                if len(i1) > len(i2):
                    i2.append(tm_dict[key]['length'])
                elif len(o1) > len(o2):
                    o2.append(tm_dict[key]['length'])
             
            inside_residues = list()
            outside_residues = list()
            
            for i in range(len(i1)):
                inside_residues.extend(list(range(i1[i],i2[i]+1)))
                
            for i in range(len(o1)):
                outside_residues.extend(list(range(o1[i],o2[i]+1)))
                
            tm_dict[key]['inside'] = inside_residues
            tm_dict[key]['outside'] = outside_residues
            
    return tm_dict


def parse_signalP(file_name):
    
        with open(file_name, 'r') as fo:
            
            SP_dict = dict()
            
            for line in fo:
                if line.startswith('#'):
                    continue
                else:
                    line = line.rstrip()
                    ID = line.split()[0]
                    SP = line.split()[1]
                    
                    if SP.startswith('SP'):
                        SP = 'Y'
                    else:
                        SP = 0
            
                    SP_dict[ID] = { 'SP': SP } 
        
        return SP_dict
    
def parse_Predisi(file_name):
    
    with open (file_name, 'r') as fo:
        
        predisi_dict = dict()
        header = True
        
        for line in fo:
            if header:
                header = False
            else:
                line = line.rstrip()
                ID = line.split('\t')[0].split('|')[1]
                SP = line.split('\t')[-2]
                
                predisi_dict[ID] = {'SP' : SP}
                
                
    return predisi_dict
    
def parse_SPC(seq_dict, file_name):
    
    with open(file_name, 'r') as fo:
        
            SPC_dict = dict()
            header = True

            for line in fo:
                    if header:
                        header = False
                    else:
                        line = line.rstrip()
                        ID, SPC = line.split(',')[0], line.split(',')[1]

                        SPC_dict[ID] = int(SPC)
            
            for ID in seq_dict:
                
                if SPC_dict.get(ID) == None:
                    SPC_dict[ID] = 0
            
    return SPC_dict
        
        
            
def fasta_parser(fasta_filename):   
    

    fasta_fileobj = open(fasta_filename, 'r')	## create a file obj from the specified file

    sequence_name = ''				## initialize strings to populate from file object info
    sequence_desc = ''
    sequence_string = ''
    sequence_dict = {}

    for line in fasta_fileobj:  			## iterate through file object with for loop
        line = line.rstrip()			## strip white space on the right side (like a new line character!) 

        if line.startswith('>'):
            
            if len(sequence_string) > 0:
                sequence_dict[sequence_name] = sequence_string	
                sequence_string = ''  		## reset for the new sequence
            
            line = line.lstrip('>')  		## remove leading `>` char
            sequence_info = line.split(maxsplit=1)  ## split on only first space
            sequence_name = sequence_info[0].split('|')[1]
	
            if len(sequence_info) > 1:
                sequence_desc = sequence_info[1]
            else:					## sequence has no description, set to empty
                sequence_desc = ''
		
           
            line = line.lstrip('>')  		## remove leading `>` char
            sequence_info = line.split(maxsplit=1)  	## split on only first space
           
            if len(sequence_info) > 1:
                sequence_desc = sequence_info[1]
           
            else:
            # sequence has no description, set to empty
                sequence_desc = ''
             
        else:
            sequence_string += line  # incrementally elongate seq

# When we reach the end of the FASTA file, we drop out of the
# 'for' loop. However, we still have the last sequence record
# stored in memory, which we haven't processed yet, because we
# haven't observed a '>' symbol, so we must copy and paste any
# code that we used to process sequences above to the code block
# below. Check if sequence_string has a non-zero length to
# determine whether to execute the sequence processing code:

    if len(sequence_string) > 0:
        sequence_dict[sequence_name] = sequence_string
        
    return sequence_dict

  

In [2]:
# This cell contatins the definition of functions required 

import re

## 2 missed cleavages are used for trypsinize, where the number of missed cleavages are recorded

def trypsinize(prot_seq):
    peptides= []
    cut_sites=[0]
    indices = []
    pep = ''

    for i in range(0,len(prot_seq)-1):
        if prot_seq[i] == 'K' and prot_seq[i+1] != 'P':
            cut_sites.append(i+1)
        elif prot_seq[i] == 'R' and prot_seq[i+1] != 'P':
            cut_sites.append(i+1)
        
    if cut_sites[-1]!=len(prot_seq):
            cut_sites.append(len(prot_seq))
            
    if len(cut_sites)>2:
            
        for j in range(0,len(cut_sites)-3):

            pep = prot_seq[cut_sites[j]:cut_sites[j+1]]
            for i in range(cut_sites[j],cut_sites[j+1]):
                indices.append(i+1)
            peptides.append({'seq': pep,'indices': indices, 'missed_cleavages' : 0})
            indices = []

            pep = prot_seq[cut_sites[j]:cut_sites[j+2]]
            for i in range(cut_sites[j],cut_sites[j+2]):
                indices.append(i+1)
            peptides.append({'seq': pep,'indices': indices, 'missed_cleavages' : 1})
            indices = []

            pep = prot_seq[cut_sites[j]:cut_sites[j+3]]
            for i in range(cut_sites[j],cut_sites[j+3]):
                indices.append(i+1)
            peptides.append({'seq': pep,'indices': indices, 'missed_cleavages' : 2})
            indices = []

        pep = prot_seq[cut_sites[-3]:cut_sites[-2]]
        for i in range(cut_sites[-3],cut_sites[-2]):
            indices.append(i+1)
        peptides.append({'seq': pep,'indices': indices, 'missed_cleavages' : 0})
        indices = []

        pep = prot_seq[cut_sites[-3]:cut_sites[-1]]
        for i in range(cut_sites[-3],cut_sites[-1]):
            indices.append(i+1)
        peptides.append({'seq': pep,'indices': indices, 'missed_cleavages' : 1})
        indices = []

        pep = prot_seq[cut_sites[-2]:cut_sites[-1]]
        for i in range(cut_sites[-2],cut_sites[-1]):
            indices.append(i+1)
        peptides.append({'seq': pep,'indices': indices, 'missed_cleavages' : 0})
        indices = []
                    
    else: #there is no trypsin site in the protein sequence
        peptides.append({'seq' : prot_seq, 'indices' : range(1,len(prot_seq)+1), 'missed_cleavages' : 0})
        
    return peptides

def ion_mim(prot_seq, charge_state):
    
    mass_table = {
            "A" : 71.03711,
            "R" : 156.10111,
            "N" : 114.04293,
            "D" : 115.02694,
            "C" : 103.00919 + 57.02146,
            "E" : 129.04259,
            "Q" : 128.05858,
            "G" : 57.02146,
            "H" : 137.05891,
            "I" : 113.08406,
            "L" : 113.08406,
            "K" : 128.09496,
            "M" : 131.04049,
            "F" : 147.06841,
            "P" : 97.05276,
            "S" : 87.03203,
            "T" : 101.04768,
            "W" : 186.07931,
            "Y" : 163.06333,
            "V" : 99.06841
            }
    
    mass = 0
    
    for aa in mass_table:
        mass += prot_seq.count(aa) * mass_table[aa]
    
    
    ion_mass = mass + (charge_state * 1.007276)
    m_z = ion_mass/charge_state
    
    return m_z


def ok_for_MS(pep_list):
    
    for pep in pep_list:
        
        pep['okForMS'] = ''
        
        if len(pep['seq']) > 5 and (ion_mim(pep['seq'], 2) < 2000):
            pep['okForMS'] += '2'
            
        if len(pep['seq']) > 5 and (ion_mim(pep['seq'], 3) < 2000):
            pep['okForMS'] += ',3'
        
        if len(pep['seq']) > 5 and (ion_mim(pep['seq'], 4) < 2000):    
            pep['okForMS'] += ',4'
        
        else:
            pep['okForMS'] = None
            
    return pep_list      

def make_prot_dict(seq_dict, tm_dict, phob_dict, sp_dict, predisi_dict, SPC_dict):
    
    prot_dict = dict()
    
    for ID in seq_dict:
      
      seq = seq_dict[ID]
    
      if tm_dict.get(ID) and phob_dict.get(ID) and sp_dict.get(ID) and predisi_dict.get(ID):
        
        
        pep_list = trypsinize(seq)
        
        pep_list = ok_for_MS(pep_list)

        
        glyco_indices_S = list()
        
        for match in re.finditer(r'N[^P]S', seq ):
            glyco_indices_S.append(match.start()+1)
        
        glyco_indices_T = list()
        
        for match in re.finditer(r'N[^P]T', seq ):
            glyco_indices_T.append(match.start()+1)    
            
        glyco_indices_C = list()
        
        for match in re.finditer(r'N[^P]C', seq ):
            glyco_indices_C.append(match.start()+1)   
        
        glyco_indices_V = list()
        
        for match in re.finditer(r'N[^P]V', seq ):
            glyco_indices_V.append(match.start()+1)
            
        K_indices = list()
        C_indices = list()
        
        for i in range(0, len(seq)):
            if seq[i] == 'K':
                K_indices.append(i+1)
            elif seq[i] == 'C':
                C_indices.append(i+1) 
                
        

        
        prot_dict[ID] = {                             
                            'seq_info' :
                              { 
                                  'seq' : seq , 
                                  'seq_len' : len(seq) 
                              } , 
                             
                            'topo' :
                              {
                                 'TMHMM' :
                                    { 'inside' : tm_dict[ID]['inside']  ,
                                      'outside' : tm_dict[ID]['outside']  ,
                                      'num_TM' : tm_dict[ID]['num_TM'] } ,
                                 'Phobius' :
                                    { 'inside' : phob_dict[ID]['inside']  , 
                                      'outside' : phob_dict[ID]['outside']  ,
                                      'num_TM' : phob_dict[ID]['num_TM'] }  
                              } , 
                           
                            'signal' : 
                              {
                                  'Phobius' : phob_dict[ID]['SP'] ,
                                  'SignalP' : sp_dict[ID]['SP'] , 
                                  'PrediSi' : predisi_dict[ID]['SP']  
                              } , 
                            
                            'SPC' : int(SPC_dict[ID]), 
            
                            'peptides' : pep_list, 

            
                            'motif_sites' : 
                               { 
                                 'NXS' :
                                   { 'all' : glyco_indices_S , 'extracellular' : dict()  } ,
                                   
                                 'NXT' :
                                   { 'all' : glyco_indices_T , 'extracellular' : dict()  } , 
                                   
                                 'NXC' :
                                   { 'all' : glyco_indices_C , 'extracellular' : dict()  } ,
                                   
                                 'NXV' :
                                   { 'all' : glyco_indices_V , 'extracellular' : dict()  } , 
                                   
                                 'C' :
                                   { 'all' : C_indices , 'extracellular' : dict()  } , 
                                 
                                 'K' :
                                   { 'all' : K_indices , 'extracellular' : dict()  } , 
                                    
                               }                          
                                           
                        }
    print('step1_done')

    
    return prot_dict

def EC_analysis(prot_dict, pred_method):
    count = 0
    
    motif_list = ['NXS','NXT','NXC','NXV','C','K'] 
        
    for pred in pred_method: 
    
        for ID, value in prot_dict.items():
            count += 1

            if count%200 == 0:
                print('count is at',str(count))
            
            outside_indices = value['topo'][pred]['outside']
            
            if len(outside_indices) == 0:
                for motif in motif_list:
                    value['motif_sites'][motif]['extracellular'][pred] = []
            
            else: 
                for motif in motif_list:
                    
                    motif_indices = value['motif_sites'][motif]['all']
                    out_motif = list( set(motif_indices) & set(outside_indices) )

                    value['motif_sites'][motif]['extracellular'][pred] = out_motif 

                    for pep in value['peptides']:

                        if pep['okForMS'] == None:
                            pep[motif] = None

                        else:                 
                            pep[motif] = dict()
                            pep[motif]['extracellular'] = dict()

                            all = set(motif_indices) & set(pep['indices'])
                            if all:    
                                pep[motif]['all'] = {'num' : len(all), 'indices' : list(all) }

                                out = set(out_motif) & set(pep['indices'])

                                if out:
                                    pep[motif]['extracellular'][pred] = {'num' : len(out), 'indices' : list(out)}

    return prot_dict




In [3]:
file_name = 'can_uniprot-proteome_UP000005640+reviewed_yes.fasta'
path = 'io_files/'

seqs_dict = fasta_parser(path + file_name)
print('sequnces:',len(seqs_dict))

sp = parse_signalP(path + 'signalp_out.tsv')
print('SignalP:',len(sp))

TMHMM_dict  = parse_TMHMM(path + 'TMHMM_out_clean.tsv')
print('TMHMM:',len(TMHMM_dict))

Phobius_dict = parse_Phobius(path + 'Phobius_out_clean.tsv')
print('Phobius:',len(Phobius_dict))

SPC_dict = parse_SPC(seqs_dict, path + 'SPC_by_Source.csv')
print('SPC:',len(SPC_dict))

predisi_dict = parse_Predisi(path + 'predisi.txt')
print('Predisi:',len(predisi_dict))

sequnces: 20416
SignalP: 20413
TMHMM: 20412
Phobius: 20412
SPC: 20424
Predisi: 20416


In [4]:
prot_dict = make_prot_dict(seqs_dict, TMHMM_dict, Phobius_dict, sp, predisi_dict, SPC_dict)



step1_done


In [5]:
print('Prot dict:', len(prot_dict))

Prot dict: 20405


In [8]:
pep_count = 0
not_ok = 0

for prot in prot_dict:
    pep_count += len(prot_dict[prot]['peptides'])

    for pep in prot_dict[prot]['peptides']:
        if pep['okForMS'] == None:
            not_ok += 1
            
print(pep_count)
print(not_ok)

3629130
850683


In [None]:
pred_methods = ['TMHMM','Phobius']

EC_analysis(prot_dict, pred_methods)


count is at 200
count is at 400
count is at 600
count is at 800
count is at 1000
count is at 1200
count is at 1400
count is at 1600
count is at 1800
count is at 2000
count is at 2200
count is at 2400
count is at 2600
count is at 2800
count is at 3000
count is at 3200
count is at 3400
count is at 3600
count is at 3800
count is at 4000
count is at 4200
count is at 4400
count is at 4600
count is at 4800
count is at 5000
count is at 5200
count is at 5400
count is at 5600
count is at 5800
count is at 6000
count is at 6200
count is at 6400
count is at 6600
count is at 6800
count is at 7000
count is at 7200
count is at 7400
count is at 7600
count is at 7800
count is at 8000
count is at 8200
count is at 8400
count is at 8600
count is at 8800
count is at 9000
count is at 9200
count is at 9400
count is at 9600
count is at 9800
count is at 10000
count is at 10200
count is at 10400
count is at 10600
count is at 10800
count is at 11000
count is at 11200
count is at 11400
count is at 11600
count is 

In [9]:
print(prot_dict['Q8N112'])

{'seq_info': {'seq': 'MPSLAPDCPLLAMPEETQEDSVAPMMPSQRSRGPLAPNHVHEVCLHQVESISDLHSGAGTLRPYLTEEARPWDELLGVLPPSLCAQAGCSPVYRRGGFLLLLALLVLTCLVLALLAVYLSVLQSESLRILAHTLRTQEETLLKLRLASLSQLRRLNSSEAQAPS', 'seq_len': 164}, 'topo': {'TMHMM': {'inside': [123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164], 'outside': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 'num_TM': 1}, 'Phobius': {'inside': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 3

## for proteins of various num TM domains, report the ratios of EC residues to IC residues

with open('TManalysis.tab', 'w') as fo:

    header = 'Accession'
    
    for pred in ['TMHMM', 'Phobius', 'consensus', 'inclusive']:
        header += '\t' + pred+' #TM' +'\t'+ pred+'I' +'\t'+ pred+'O' +'\t'+ pred+'Ratio(O/I)'
        
    fo.write(header+'\n')
    
    for ID in prot_dict:
        out = ''
        out += ID
        
        prot = prot_dict[ID]
                
        for pred in ['TMHMM', 'Phobius', 'consensus', 'inclusive']:
            num_TM = prot['topo'][pred]['num_TM']
            I = len(prot['topo'][pred]['inside'])
            O = len(prot['topo'][pred]['outside'])
            
            if I == 0:
                ratio = 'inf'
            else:
                ratio = str(O/I)
            
            out += '\t'+ str(num_TM) +'\t'+ str(I) +'\t'+ str(O) +'\t'+ ratio
            
        fo.write(out +'\n')
            


## for each glycomotif record the number that are predicted extracellular, have SPC, have SPC

counts = { 
    'no_info' : { 'S' : 0 , 'C' : 0 , 'T' : 0 , 'V' : 0 } , 
    'SPC_only' : { 'S' : 0 , 'C' : 0 , 'T' : 0 , 'V' : 0 } ,
    'SP_only' : { 'S' : 0 , 'C' : 0 , 'T' : 0 , 'V' : 0 },
    'EC_only' : { 'S' : 0 , 'C' : 0 , 'T' : 0 , 'V' : 0 },
    'SPC-EC' : { 'S' : 0 , 'C' : 0 , 'T' : 0 , 'V' : 0 },
    'SP-EC' : { 'S' : 0 , 'C' : 0 , 'T' : 0 , 'V' : 0 },
    'SPC-SP' : { 'S' : 0 , 'C' : 0 , 'T' : 0 , 'V' : 0 },
    'all_info' : { 'S' : 0 , 'C' : 0 , 'T' : 0 , 'V' : 0 },
    'total' : { 'S' : 0 , 'C' : 0 , 'T' : 0 , 'V' : 0 }
    }

for ID in prot_dict :
    prot = prot_dict[ID]
    
    SP = prot['signal']['inclusive']
    SPC = 
    
    for site in prot['glyco_sites']:
        sites_dict = prot['glyco_sites'][site]
        
        all = set(sites_dict['all'])
        inclusive = set(sites_dict['extracellular']['inclusive'])
        num_ic = len(all) - len(inclusive)
    
        counts['total'][site] += len(all)
        
        if SP == 'Y' and SPC > 0:
            counts['all_info'][site] += len(inclusive)
            counts['SPC-SP'][site] += num_ic
            
        elif SP == 'Y' and SPC == 0:
            counts['SP-EC'][site] += len(inclusive)
            counts['SP_only'][site] += num_ic
            
        elif SP != 'Y' and SPC > 0:
            counts['SPC-EC'][site] += len(inclusive)
            counts['SPC_only'][site] += num_ic
            
        else:
            counts['EC_only'][site] += len(inclusive)
            counts['no_info'][site] += num_ic
        
    


## for each accession --> report SPC, signal, and TM --> report number of glycopeptide, number of EC-glycopeptides 

with open('glycopep.tab', 'w') as fo:

    header = 'Accession\tSPC\tspInc\tspCon\tspTot\ttmInc\ttmCon'
    for aa in ['S', 'T', 'C', 'V']:
        header += '\t'+ aa+'gp' +'\t'+ aa+'gpInc' +'\t'+ aa+'gpCon'
    fo.write(header+'\n')
    
    for ID in prot_dict:
        out = ''
                
        prot = prot_dict[ID]
        SPC = str(prot['SPC'])
        spInc = prot['signal']['inclusive'] 
        spCon = prot['signal']['consensus']
        spTot = 0
        
        if prot['signal']['Phobius']  == 'Y':
            spTot += 1
        if prot['signal']['PrediSi']  == 'Y':
            spTot += 1
        if prot['signal']['SignalP']  == 'Y':
            spTot += 1            
            
        tmInc = str(prot['topo']['inclusive']['num_TM'])
        tmCon = str(prot['topo']['consensus']['num_TM'])
        
        out += ID +'\t'+ SPC +'\t'+ spInc +'\t'+ spCon +'\t'+ str(spTot) +'\t'+ tmInc +'\t'+ tmCon 
        
        for aa in ['S', 'T', 'C', 'V']:
            gp = len(prot['glycopeptides'][aa]['2']['all'])
            ECgpInc =  len(prot['glycopeptides'][aa]['2']['extracellular']['inclusive'])
            ECgpCon =  len(prot['glycopeptides'][aa]['2']['extracellular']['consensus'])
            
            out += '\t' + str(gp) +'\t'+ str(ECgpInc) +'\t'+ str(ECgpCon) 
            
        fo.write(out + '\n')
            
            

In [None]:
with open('prot_dict_new.out', 'w') as fo:
    fo.write(str(prot_dict))