# Python app: Find proteins containing c-terminal degrons

This app finds and selects all proteins that contain c-terminal degrons.

1. Upload human gene data &rarr; annotate gene_name, transcript_name, protein_name

2. Translate &rarr; annotate proteins

3. Find c-degron sequences: use consensus sequences 

4. Results visualization  

|Number|C-degrons|
|--:|---------:|
|1|-GG|  
|2|-RG|  
|3|-PG|  
|4|-XR|  
|5|-RXXG|  
|6|-EE| 
|7|-RXX|  
|8|-VX|  
|9|-AX|  
|10|-A|    

Varshavsky *et al.* 2019 (**Fig. S3**, Supplementary material)  
Lin *et al.* 2018  

### 1. Upload genomic data

In [1]:
from pyensembl import EnsemblRelease
from Bio import SeqIO
import pandas as pd
import re


In [1]:
#Pending: download data from ensembl, use pyensembl (pypi.org/project/pyensembl/)

#help(EnsemblRelease)


In [2]:
#Load the cDNA records from the local FASTA file, keeping IDs and sequences in parallel lists

cdna_ids, cdna_seqs = [], []
for entry in SeqIO.parse("Data/Homo_sapiens.GRCh38.cdna.abinitio.fa", "fasta"):
    cdna_ids.append(entry.id)
    cdna_seqs.append(str(entry.seq))

#Sanity check: preview the first three entries and confirm both lists have the same length
print(cdna_ids[:3])
print(len(cdna_ids))
print(cdna_seqs[:3])
print(len(cdna_seqs))

['GENSCAN00000000001', 'GENSCAN00000000002', 'GENSCAN00000000003']
51756
['ATGGAAAGAGGAAAGAAGAAAAGAATTTCCAATAAGTTACAACAAACTTTTCACCATTCTAAAGAACCCACTTTCCTTATCAACCAAGCTGGGCTTCTCTCTAGTGACTCCTATTCTAGCCTTTCCCCAGAAACAGAGAGTGTTAATCCTGGTGAAAATATAAAGACAGACACTCAGAAAAAGAGACCTGGGACTGTGATACTATCAAAACTGTCAAGTAGAAGAATTATATCGGAAAGCCAGCTTAGCCCCCCTGTGATCCCGGCCCGCAGGCCTGGATTCCGGGTATGCTATATCTGTGGCCGAGAATTTGGGTCCCAGTCAATTGCCATTCATGAACCCCAGTGCTTGCAGAAGTGGCATATTGAAAACAGCAAGTTGCCCAAGCATTTGAGGAGGCCAGAACCCTCCAAACCACAGTCTCTCAGCAGCAGTGGGTCCTACAGTCTTCAGGCAACTAACGAGGCTGCATTTCAGAGTGCCCAGGCTCAGCTGCTGCCCTGTGAATCCTGTGGCCGCACATTCTTGCCAGATCATCTTCTTGTTCATCACAGAAGCTGCAAGCCAAAGGGTGAGGGTCCCAGAGCACCACACTCAAACAGTTCTGATCATCTTACTGGCCTCAAGAAAGCTTGTAGTGGAACCCCAGCCCGACCAAGGACTGTTATCTGCTACATATGTGGTAAGGAATTTGGCACCCTGTCCCTTCCTATTCATGAGCCCAAATGCCTGGAAAAGTGGAAAATGGAAAATGACCGGCTCCCTGTGGAGCTCCACCAGCCACTCCCACAGAAGCCTCAGCCCCTTCCGAATGCACAGTCCAGCCAAGCGGGACCAAATCAAGCTCAGCTTGTGTTCTGCCCACATTGTAGCCGAATCTTTACCTCAGACCGCCTCCTGGTACACCAGAGAAGTTGTAAAACTC

- Create tables (IDs + sequences):

In [3]:
#Build the cDNA table: one row per record, its ID next to its sequence

frame = dict(ID=cdna_ids, Sequences=cdna_seqs)
cdna_df = pd.DataFrame(data=frame)
print(cdna_df)

                       ID                                          Sequences
0      GENSCAN00000000001  ATGGAAAGAGGAAAGAAGAAAAGAATTTCCAATAAGTTACAACAAA...
1      GENSCAN00000000002  ATGATGAACAGAATGGCCCCAGAGAATTTCCAGCCAGACCCTTTCA...
2      GENSCAN00000000003  ATGGATGACTCTAAGGGCAATGGAAAGAGGGCTAAGATTAGAGGTA...
3      GENSCAN00000000004  ATGAAGGAATATCTGGATCATGGAGCACTCGAGTTTTTGCTCCAAC...
4      GENSCAN00000000005  ATGGAAGCCCCTGAATACCTTGATTTGGATGAAATTGACTTTAGTG...
...                   ...                                                ...
51751  GENSCAN00000056639  ATGCATGAAGCTTTCTGCGGTTTAGAAGCCAAATCATTCTGGTGGT...
51752  GENSCAN00000056640  ATGATGGCACGTCGGGACCCCAAATCTTGGGCCAAGAGACTGGTGA...
51753  GENSCAN00000056641  ATGGGACTGACCCTGAACTTAGAGCCGCCCCCAGGCTGGAACCTGG...
51754  GENSCAN00000056642  ATGGAGGACGACTCACTCTACTTGGGAGGTGAGTGGCAGTTCAACC...
51755  GENSCAN00000056643  ATGACGGCACGTCGTGACCCCAAACCTGGGGCAAAGAGACTGGTGA...

[51756 rows x 2 columns]


In [4]:
#Load the protein records from the local FASTA file, keeping IDs and sequences in parallel lists

prot_ids, prot_seqs = [], []
for entry in SeqIO.parse("Data/Homo_sapiens.GRCh38.pep.abinitio.fa", "fasta"):
    prot_ids.append(entry.id)
    prot_seqs.append(str(entry.seq))

#Sanity check: preview the first three entries and confirm both lists have the same length
print(prot_ids[:3])
print(len(prot_ids))
print(prot_seqs[:3])
print(len(prot_seqs))

['GENSCAN00000000001', 'GENSCAN00000000002', 'GENSCAN00000000003']
51756
['MERGKKKRISNKLQQTFHHSKEPTFLINQAGLLSSDSYSSLSPETESVNPGENIKTDTQKKRPGTVILSKLSSRRIISESQLSPPVIPARRPGFRVCYICGREFGSQSIAIHEPQCLQKWHIENSKLPKHLRRPEPSKPQSLSSSGSYSLQATNEAAFQSAQAQLLPCESCGRTFLPDHLLVHHRSCKPKGEGPRAPHSNSSDHLTGLKKACSGTPARPRTVICYICGKEFGTLSLPIHEPKCLEKWKMENDRLPVELHQPLPQKPQPLPNAQSSQAGPNQAQLVFCPHCSRIFTSDRLLVHQRSCKTHPYGPKYQNLNLGSKGGLKEYTNSKQQRNRAAPSVTDKVIHATQDALGEPGGALCL', 'MMNRMAPENFQPDPFINRNDSNMKYEELEALFSQTMFPDRNLQEKLALKRNLLESTGKGLVQELAIQIEAAAAAAAAAAISKASKPDPFIQEECAHLP', 'MDDSKGNGKRAKIRGKGPKIFLKSLLATLPNTSYVCASEPQLSPYLCEFFPGVNLLDVEHDRTNTGEQQSSKQMIVLHTKEVGMDIG']
51756


In [5]:
#Build the protein table: one row per record, its ID next to its sequence

frame = dict(ID=prot_ids, Sequences=prot_seqs)
prot_df = pd.DataFrame(data=frame)
print(prot_df)

                       ID                                          Sequences
0      GENSCAN00000000001  MERGKKKRISNKLQQTFHHSKEPTFLINQAGLLSSDSYSSLSPETE...
1      GENSCAN00000000002  MMNRMAPENFQPDPFINRNDSNMKYEELEALFSQTMFPDRNLQEKL...
2      GENSCAN00000000003  MDDSKGNGKRAKIRGKGPKIFLKSLLATLPNTSYVCASEPQLSPYL...
3      GENSCAN00000000004  MKEYLDHGALEFLLQQKQWSCFDSTAQWWAEGGNGDCRRNLDGEID...
4      GENSCAN00000000005  MEAPEYLDLDEIDFSDDISDNRSQGNRLQKLGLEDTDREDAMGFGS...
...                   ...                                                ...
51751  GENSCAN00000056639  MHEAFCGLEAKSFWWFNLCQKTPLSFWGRDEGQGAASKDALRTARA...
51752  GENSCAN00000056640  MMARRDPKSWAKRLVRAQTLQKQRRAPVGPRAPPPDEEDPRLKCKN...
51753  GENSCAN00000056641  MGLTLNLEPPPGWNLGSPARPPGGTGALPRGPRLVLSSFQAQAWRP...
51754  GENSCAN00000056642  MEDDSLYLGGEWQFNHFSKLTSSRPDAAFAEIQRTSLPEKSPLSSE...
51755  GENSCAN00000056643  MTARRDPKPGAKRLVRAQTLQKQRRAPVGPRAPPPDEEDPRLKCKN...

[51756 rows x 2 columns]


### 2. cDNA to RNA and translation

- From cDNA to RNA:

In [6]:
def cdna_to_rna(dna_seq_list):
    """Transcribe a list of cDNA sequences into RNA sequences.

    Every 'T' becomes 'U'; 'A', 'C' and 'G' are kept as they are; 'N' bases
    are dropped from the output.

    NOTE(review): dropping an 'N' shortens the sequence by one base, which
    shifts every downstream codon when the result is read in triplets.
    Confirm this is the intended treatment of ambiguous bases.

    input: a list of cDNA strings (upper-case A/C/G/T/N)
    return: a list of RNA strings, in the same order as the input
    raises: KeyError for any character other than A, C, G, T or N
    """
    # Built once per call instead of once per sequence
    base_dna_rna = {'A': 'A', 'T': 'U', 'C': 'C', 'G': 'G'}
    return [
        # join() assembles each sequence in one pass instead of repeated '+='
        ''.join(base_dna_rna[base] for base in dna_seq if base != 'N')
        for dna_seq in dna_seq_list
    ]

#Transcribe every loaded cDNA and preview the first three RNA sequences
rna_seqs = cdna_to_rna(cdna_seqs)
print(rna_seqs[0:3])

['AUGGAAAGAGGAAAGAAGAAAAGAAUUUCCAAUAAGUUACAACAAACUUUUCACCAUUCUAAAGAACCCACUUUCCUUAUCAACCAAGCUGGGCUUCUCUCUAGUGACUCCUAUUCUAGCCUUUCCCCAGAAACAGAGAGUGUUAAUCCUGGUGAAAAUAUAAAGACAGACACUCAGAAAAAGAGACCUGGGACUGUGAUACUAUCAAAACUGUCAAGUAGAAGAAUUAUAUCGGAAAGCCAGCUUAGCCCCCCUGUGAUCCCGGCCCGCAGGCCUGGAUUCCGGGUAUGCUAUAUCUGUGGCCGAGAAUUUGGGUCCCAGUCAAUUGCCAUUCAUGAACCCCAGUGCUUGCAGAAGUGGCAUAUUGAAAACAGCAAGUUGCCCAAGCAUUUGAGGAGGCCAGAACCCUCCAAACCACAGUCUCUCAGCAGCAGUGGGUCCUACAGUCUUCAGGCAACUAACGAGGCUGCAUUUCAGAGUGCCCAGGCUCAGCUGCUGCCCUGUGAAUCCUGUGGCCGCACAUUCUUGCCAGAUCAUCUUCUUGUUCAUCACAGAAGCUGCAAGCCAAAGGGUGAGGGUCCCAGAGCACCACACUCAAACAGUUCUGAUCAUCUUACUGGCCUCAAGAAAGCUUGUAGUGGAACCCCAGCCCGACCAAGGACUGUUAUCUGCUACAUAUGUGGUAAGGAAUUUGGCACCCUGUCCCUUCCUAUUCAUGAGCCCAAAUGCCUGGAAAAGUGGAAAAUGGAAAAUGACCGGCUCCCUGUGGAGCUCCACCAGCCACUCCCACAGAAGCCUCAGCCCCUUCCGAAUGCACAGUCCAGCCAAGCGGGACCAAAUCAAGCUCAGCUUGUGUUCUGCCCACAUUGUAGCCGAAUCUUUACCUCAGACCGCCUCCUGGUACACCAGAGAAGUUGUAAAACUCAUCCUUAUGGGCCAAAAUAUCAGAAUUUGAAUUUAGGGAGUAAAGGAGGCCUAAAAGAGUACACUAAUUCCAA

- From RNA to protein (translation):

In [10]:
#Method 1 to generate the translated sequence (using this dictionary)

#Codon table: maps each RNA triplet to its one-letter amino acid code.
#The three stop codons (UAA, UAG, UGA) map to '*'. translation() looks codons up here.
triplet_rna_aa = {'GAA': 'E', 'CGA': 'R', 'GUG': 'V', 'UAA': '*', 'CGU': 'R', 'AUA': 'I', 'GAC': 'D', 'UCG': 'S', 
                  'GAU': 'D', 'AUG': 'M', 'CUG': 'L', 'CUA': 'L', 'UAC': 'Y', 'GGA': 'G', 'CGG': 'R', 'AGC': 'S', 
                  'UCU': 'S', 'UGA': '*', 'AAA': 'K', 'ACC': 'T', 'ACA': 'T', 'UGC': 'C', 'AAG': 'K', 'GUC': 'V', 
                  'UCC': 'S', 'ACU': 'T', 'AGA': 'R', 'CUU': 'L', 'GCC': 'A', 'GUA': 'V', 'UAG': '*', 'CAA': 'Q', 
                  'CAC': 'H', 'GCU': 'A', 'UUA': 'L', 'CAU': 'H', 'CGC': 'R', 'UUC': 'F', 'AUU': 'I', 'GGC': 'G', 
                  'CAG': 'Q', 'AAC': 'N', 'CCC': 'P', 'GUU': 'V', 'AGG': 'R', 'UGU': 'C', 'CCG': 'P', 'GGG': 'G', 
                  'AUC': 'I', 'UUU': 'F', 'AAU': 'N', 'UCA': 'S', 'GAG': 'E', 'CCA': 'P', 'GCA': 'A', 'UAU': 'Y', 
                  'GGU': 'G', 'UGG': 'W', 'GCG': 'A', 'CUC': 'L', 'UUG': 'L', 'CCU': 'P', 'ACG': 'T', 'AGU': 'S'}


In [8]:
#Method 2 to generate the translated sequence (using a function for the genetic code and gencode.txt file)

#Generate a function for the genetic code

def genetic_code(file):
    """Load a codon table from a text file.

    Each non-blank line must hold exactly two whitespace-separated fields:
    the codon and the amino acid it encodes. Blank lines are skipped.

    input: path of the codon table file
    return: a dictionary mapping each codon to its amino acid
    raises: ValueError if a non-blank line does not have exactly two fields;
            OSError if the file cannot be opened
    """
    codon_table = {}
    # The context manager closes the file even if a line fails to parse
    with open(file) as gencode:
        for line in gencode.read().splitlines():
            fields = line.split()
            if not fields:
                # Tolerate empty lines, e.g. a trailing blank line at the end of the file
                continue
            codon, aa = fields
            codon_table[codon] = aa
    return codon_table

In [9]:
#Translate the first RNA sequence with the codon table stored in gencode.txt
rna1 = rna_seqs[0]

#Load the codon table once: calling genetic_code() per codon would re-read the file every time
triplet_to_aa = genetic_code('gencode.txt')

rna_seq_triplets = [rna1[i:i+3] for i in range(0, len(rna1), 3)]
prot_seq = ''
for triplets in rna_seq_triplets:
    #Only the last chunk can be shorter than 3 bases; it is not a codon, so stop there
    if len(triplets) < 3:
        break
    prot_seq += triplet_to_aa[triplets]

print(rna1)
print(rna_seq_triplets)
print(prot_seq)


AUGGAAAGAGGAAAGAAGAAAAGAAUUUCCAAUAAGUUACAACAAACUUUUCACCAUUCUAAAGAACCCACUUUCCUUAUCAACCAAGCUGGGCUUCUCUCUAGUGACUCCUAUUCUAGCCUUUCCCCAGAAACAGAGAGUGUUAAUCCUGGUGAAAAUAUAAAGACAGACACUCAGAAAAAGAGACCUGGGACUGUGAUACUAUCAAAACUGUCAAGUAGAAGAAUUAUAUCGGAAAGCCAGCUUAGCCCCCCUGUGAUCCCGGCCCGCAGGCCUGGAUUCCGGGUAUGCUAUAUCUGUGGCCGAGAAUUUGGGUCCCAGUCAAUUGCCAUUCAUGAACCCCAGUGCUUGCAGAAGUGGCAUAUUGAAAACAGCAAGUUGCCCAAGCAUUUGAGGAGGCCAGAACCCUCCAAACCACAGUCUCUCAGCAGCAGUGGGUCCUACAGUCUUCAGGCAACUAACGAGGCUGCAUUUCAGAGUGCCCAGGCUCAGCUGCUGCCCUGUGAAUCCUGUGGCCGCACAUUCUUGCCAGAUCAUCUUCUUGUUCAUCACAGAAGCUGCAAGCCAAAGGGUGAGGGUCCCAGAGCACCACACUCAAACAGUUCUGAUCAUCUUACUGGCCUCAAGAAAGCUUGUAGUGGAACCCCAGCCCGACCAAGGACUGUUAUCUGCUACAUAUGUGGUAAGGAAUUUGGCACCCUGUCCCUUCCUAUUCAUGAGCCCAAAUGCCUGGAAAAGUGGAAAAUGGAAAAUGACCGGCUCCCUGUGGAGCUCCACCAGCCACUCCCACAGAAGCCUCAGCCCCUUCCGAAUGCACAGUCCAGCCAAGCGGGACCAAAUCAAGCUCAGCUUGUGUUCUGCCCACAUUGUAGCCGAAUCUUUACCUCAGACCGCCUCCUGGUACACCAGAGAAGUUGUAAAACUCAUCCUUAUGGGCCAAAAUAUCAGAAUUUGAAUUUAGGGAGUAAAGGAGGCCUAAAAGAGUACACUAAUUCCAAGC

In [11]:
def translation(rna_seq_list):
    """Translate each RNA sequence into a protein string, codon by codon.

    Reading starts at the first base of every sequence. Each triplet is
    looked up in triplet_rna_aa and the results are concatenated; a stop
    codon does not end the translation, it is emitted as whatever the table
    maps it to. One or two leftover bases at the end are ignored.

    input: a list of RNA strings
    return: a list of protein strings, in the same order as the input
    raises: KeyError for a triplet that is missing from triplet_rna_aa
    """
    proteins = []
    for transcript in rna_seq_list:
        # Length rounded down to a whole number of codons
        coding_length = len(transcript) - len(transcript) % 3
        residues = [
            triplet_rna_aa[transcript[start:start + 3]]
            for start in range(0, coding_length, 3)
        ]
        proteins.append(''.join(residues))
    return proteins

#Translate every RNA sequence and preview the first three proteins
prot_seqs2 = translation(rna_seqs)
print(prot_seqs2[:3])


['MERGKKKRISNKLQQTFHHSKEPTFLINQAGLLSSDSYSSLSPETESVNPGENIKTDTQKKRPGTVILSKLSSRRIISESQLSPPVIPARRPGFRVCYICGREFGSQSIAIHEPQCLQKWHIENSKLPKHLRRPEPSKPQSLSSSGSYSLQATNEAAFQSAQAQLLPCESCGRTFLPDHLLVHHRSCKPKGEGPRAPHSNSSDHLTGLKKACSGTPARPRTVICYICGKEFGTLSLPIHEPKCLEKWKMENDRLPVELHQPLPQKPQPLPNAQSSQAGPNQAQLVFCPHCSRIFTSDRLLVHQRSCKTHPYGPKYQNLNLGSKGGLKEYTNSKQQRNRAAPSVTDKVIHATQDALGEPGGALCL*', 'MMNRMAPENFQPDPFINRNDSNMKYEELEALFSQTMFPDRNLQEKLALKRNLLESTGKGLVQELAIQIEAAAAAAAAAAISKASKPDPFIQEECAHLP*', 'MDDSKGNGKRAKIRGKGPKIFLKSLLATLPNTSYVCASEPQLSPYLCEFFPGVNLLDVEHDRTNTGEQQSSKQMIVLHTKEVGMDIG*']


In [35]:
#STOP treatments: count how the '*' stop symbol is distributed in the translated sequences

#Raw strings hand the backslash to the regex engine untouched; '\*' in a plain
#string literal is an invalid string escape that recent Python versions warn about
has_stop = re.compile(r'\*')
ends_with_stop = re.compile(r'\*$')
stop_then_more = re.compile(r'\*.')

#search() is enough here: only the presence of a match matters, not the list of matches
prot_seqsa = [prot_seq for prot_seq in prot_seqs2 if has_stop.search(prot_seq)]
prot_seqsb = [prot_seq for prot_seq in prot_seqs2 if ends_with_stop.search(prot_seq)]
prot_seqsc = [prot_seq for prot_seq in prot_seqs2 if stop_then_more.search(prot_seq)]

print(len(prot_seqs2)) # all the sequences
print(len(prot_seqsa)) # sequences with at least one *
print(len(prot_seqsb)) # sequences with * as the last character
print(len(prot_seqsc)) # sequences with * followed by at least one more character

#The last two groups overlap: a sequence with an internal * and a final * is in both,
#so this sum can exceed the number of sequences that contain a *
print(len(prot_seqsb)+len(prot_seqsc))

51756
50969
49638
1332
50970


In [33]:
#Cut each translated protein at its first '*' (stop symbol)

def elim_stop(prot_seqs2):
    """Truncate each translated sequence at its first stop symbol ('*').

    Everything from the first '*' onwards is discarded, so a sequence with
    an internal stop is shortened to the part before it, and a sequence that
    only ends in '*' just loses that final character. Sequences without '*'
    are returned unchanged.

    input: a list of translated protein strings
    return: a list of truncated protein strings, in the same order
    """
    # str.partition needs no regex, which avoids the invalid '\*' escape of a
    # non-raw pattern literal; [0] is the text before the first '*'
    return [prot_seq.partition('*')[0] for prot_seq in prot_seqs2]
        
#Apply elim_stop to every translated sequence; preview the first three and check the total count
prot_seqs3 = elim_stop(prot_seqs2)
print(prot_seqs3[0:3])
print(len(prot_seqs3))


['MERGKKKRISNKLQQTFHHSKEPTFLINQAGLLSSDSYSSLSPETESVNPGENIKTDTQKKRPGTVILSKLSSRRIISESQLSPPVIPARRPGFRVCYICGREFGSQSIAIHEPQCLQKWHIENSKLPKHLRRPEPSKPQSLSSSGSYSLQATNEAAFQSAQAQLLPCESCGRTFLPDHLLVHHRSCKPKGEGPRAPHSNSSDHLTGLKKACSGTPARPRTVICYICGKEFGTLSLPIHEPKCLEKWKMENDRLPVELHQPLPQKPQPLPNAQSSQAGPNQAQLVFCPHCSRIFTSDRLLVHQRSCKTHPYGPKYQNLNLGSKGGLKEYTNSKQQRNRAAPSVTDKVIHATQDALGEPGGALCL', 'MMNRMAPENFQPDPFINRNDSNMKYEELEALFSQTMFPDRNLQEKLALKRNLLESTGKGLVQELAIQIEAAAAAAAAAAISKASKPDPFIQEECAHLP', 'MDDSKGNGKRAKIRGKGPKIFLKSLLATLPNTSYVCASEPQLSPYLCEFFPGVNLLDVEHDRTNTGEQQSSKQMIVLHTKEVGMDIG']
51756


In [36]:
#Compare the proteins read from the FASTA file (prot_seqs) with the ones
#translated from the cDNAs and processed by elim_stop (prot_seqs3)
print(len(prot_seqs))
print(prot_seqs[0:3])
print(len(prot_seqs3))
print(prot_seqs3[0:3])

51756
['MERGKKKRISNKLQQTFHHSKEPTFLINQAGLLSSDSYSSLSPETESVNPGENIKTDTQKKRPGTVILSKLSSRRIISESQLSPPVIPARRPGFRVCYICGREFGSQSIAIHEPQCLQKWHIENSKLPKHLRRPEPSKPQSLSSSGSYSLQATNEAAFQSAQAQLLPCESCGRTFLPDHLLVHHRSCKPKGEGPRAPHSNSSDHLTGLKKACSGTPARPRTVICYICGKEFGTLSLPIHEPKCLEKWKMENDRLPVELHQPLPQKPQPLPNAQSSQAGPNQAQLVFCPHCSRIFTSDRLLVHQRSCKTHPYGPKYQNLNLGSKGGLKEYTNSKQQRNRAAPSVTDKVIHATQDALGEPGGALCL', 'MMNRMAPENFQPDPFINRNDSNMKYEELEALFSQTMFPDRNLQEKLALKRNLLESTGKGLVQELAIQIEAAAAAAAAAAISKASKPDPFIQEECAHLP', 'MDDSKGNGKRAKIRGKGPKIFLKSLLATLPNTSYVCASEPQLSPYLCEFFPGVNLLDVEHDRTNTGEQQSSKQMIVLHTKEVGMDIG']
51756
['MERGKKKRISNKLQQTFHHSKEPTFLINQAGLLSSDSYSSLSPETESVNPGENIKTDTQKKRPGTVILSKLSSRRIISESQLSPPVIPARRPGFRVCYICGREFGSQSIAIHEPQCLQKWHIENSKLPKHLRRPEPSKPQSLSSSGSYSLQATNEAAFQSAQAQLLPCESCGRTFLPDHLLVHHRSCKPKGEGPRAPHSNSSDHLTGLKKACSGTPARPRTVICYICGKEFGTLSLPIHEPKCLEKWKMENDRLPVELHQPLPQKPQPLPNAQSSQAGPNQAQLVFCPHCSRIFTSDRLLVHQRSCKTHPYGPKYQNLNLGSKGGLKEYTNSKQQRNRAAPSVTDKVIHATQDALGEPGGALCL', 'MMNRMAPENFQPDPFINRNDSNMKYEELEALFSQTMFPDRNLQEKLALKRNLLEST

### 3. List all proteins containing c-terminal degrons
- Find c-degrons (main function):

In [37]:
def find_cdegron(prot_seq_list, cdegron_motif):
    """Select the protein sequences in which the c-degron pattern occurs.

    input: a list of protein sequences and the c-degron motif, given as a
           regular expression
    return: the sequences with at least one match, in their original order
    """
    return [
        candidate
        for candidate in prot_seq_list
        if re.search(cdegron_motif, candidate) is not None
    ]


In [None]:
#Pending: function to find degrons from cdna sequences
#make if statements: if cdna... if protein... else print("The sequence cannot be recognized. Please upload a list of cdnas or proteins")...

In [None]:
#Make function to use df as input and output, with id

def find_cdegron(prot_df, cdegron_motif):
    """Finds all proteins containing a c-terminal degron (c-degron).

    Accepts either the protein table or a plain list of sequences, so code
    that calls this function with a list keeps working after this
    definition replaces the list-only one.

    input: prot_df - a DataFrame with a 'Sequences' column (other columns,
           such as 'ID', are carried along), or a list of protein sequences
           cdegron_motif - the c-degron motif as a regular expression
    return: for a DataFrame, the rows whose sequence matches the motif (same
            columns and index labels); for a list, the matching sequences
    raises: KeyError if a DataFrame has no 'Sequences' column
    """
    pattern = re.compile(cdegron_motif)

    if isinstance(prot_df, pd.DataFrame):
        # Iterating a DataFrame yields its column names, so the sequences
        # have to be read from the 'Sequences' column explicitly.
        # Non-string cells (e.g. missing values) count as "no motif".
        has_motif = pd.Series(
            [isinstance(seq, str) and pattern.search(seq) is not None
             for seq in prot_df['Sequences']],
            index=prot_df.index,
            dtype=bool,
        )
        return prot_df[has_motif]

    return [prot_seq for prot_seq in prot_df if pattern.search(prot_seq)]

#Example: find_cdegron(prot_df, 'GG$') returns the rows whose sequence ends in GG

In [9]:
#Preview the first three rows of the protein table
prot_df[0:3]

Unnamed: 0,ID,Sequences
0,GENSCAN00000000001,MERGKKKRISNKLQQTFHHSKEPTFLINQAGLLSSDSYSSLSPETE...
1,GENSCAN00000000002,MMNRMAPENFQPDPFINRNDSNMKYEELEALFSQTMFPDRNLQEKL...
2,GENSCAN00000000003,MDDSKGNGKRAKIRGKGPKIFLKSLLATLPNTSYVCASEPQLSPYL...


- Functions for finding more than one c-degron motif in a given prot_seqs list:

In [38]:
cdegron_motifs = ['GG', 'RG', 'PG', 'XR', 'RXXG', 'EE', 'RXX', 'VX', 'AX', 'A']

#Turn the consensus motifs into regular expressions:
def cdegron_to_re(cdegron_motifs):
    """Convert a list of c-degron consensus motifs into regular expressions.

    A '$' is appended to each motif so that it is matched at the end of the
    sequence, and every 'X' is replaced by '.', which matches any character.
    """
    return [(consensus + '$').replace('X', '.') for consensus in cdegron_motifs]

In [39]:
#Search for each cdegron motif in a list of protein sequences:

def find_cdegron_list(cdegron_motifs, prot_seqs):
    """Count, for each motif in the list, the proteins that contain it.

    The motifs are first converted with cdegron_to_re, then each resulting
    pattern is passed to find_cdegron together with the protein sequences.

    return: a list with one count per motif, in the order of cdegron_motifs
    """
    return [
        len(find_cdegron(prot_seqs, pattern))
        for pattern in cdegron_to_re(cdegron_motifs)
    ]

In [40]:
#Calculate the percentage of each c-degron motif:

def percentages_cdegron(n_cdegron_motifs, prot_seqs):
    """Calculates the percentage of proteins containing each c-degron provided.

    input: n_cdegron_motifs - the number of matching proteins per motif
           prot_seqs - the protein list the counts were obtained from (only
           its length is used)
    return: [total count, list of per-motif percentages, total percentage],
            with percentages rounded to 3 decimals

    The total is the plain sum of the per-motif counts, so a protein that
    matches several motifs is counted once per motif.
    An empty protein list gives 0.0 for every percentage instead of raising
    ZeroDivisionError.
    """
    n_proteins = len(prot_seqs)

    def as_percent(count):
        # Guard the division: there is nothing to take a share of in an empty list
        if n_proteins == 0:
            return 0.0
        return round(count / n_proteins * 100, 3)

    total_n_cdegron = sum(n_cdegron_motifs)
    percent_degron_list = [as_percent(count) for count in n_cdegron_motifs]
    return [total_n_cdegron, percent_degron_list, as_percent(total_n_cdegron)]


In [41]:
#Results summary message:

def results_summary(cdegron_motifs, n_cdegron_motifs, percentages_degrons, prot_seqs):
    """Build the text report of the c-degron search.

    input: the motif list, the per-motif protein counts, the three-item list
           [total count, per-motif percentages, total percentage], and the
           protein list that was searched (only its length is reported)
    return: the report as one string: a header, one line per motif, the
            totals line and the size of the protein list
    """
    grand_total = percentages_degrons[0]
    per_motif_percent = percentages_degrons[1]
    overall_percent = percentages_degrons[2]

    motif_line = "- {} proteins with the {} c-degron motif ({} %)\n"
    report = ["The protein list you provided harbors:\n"]
    for position, motif in enumerate(cdegron_motifs):
        report.append(
            motif_line.format(
                str(n_cdegron_motifs[position]),
                str(motif),
                str(per_motif_percent[position]),
            )
        )
    report.append(
        "- {} proteins with all c-degron motifs ({}%)\n".format(
            str(grand_total), str(overall_percent)
        )
    )
    report.append("from a total {} proteins.\n".format(str(len(prot_seqs))))
    return "".join(report)

#Pending: create a dataframe with sequences for each cdegron motif

- Test the functions:  
a) prot_seqs3 (proteins translated from the cDNAs and cut at the first stop symbol `*`)

In [42]:
#Data: the ten consensus motifs from the table at the top of the notebook
cdegron_motifs = ['GG', 'RG', 'PG', 'XR', 'RXXG', 'EE', 'RXX', 'VX', 'AX', 'A']
#NOTE(review): prot_seqs1000 is not used by the rest of this cell, which runs on the full prot_seqs3 list
prot_seqs1000 = prot_seqs[0:1000]

#cdegron_to_re: show the regular expressions generated from the motifs
cdegron_re = cdegron_to_re(cdegron_motifs)
print(cdegron_re)

#find_cdegron_list: number of prot_seqs3 proteins matching each motif
cdegron_list3 = find_cdegron_list(cdegron_motifs, prot_seqs3)
print(cdegron_list3)

#percentages_cdegron: [total count, per-motif percentages, total percentage]
percents3 = percentages_cdegron(cdegron_list3, prot_seqs3)
print(percents3)

#results_summary: the same numbers as a readable report
output3 = results_summary(cdegron_motifs, cdegron_list3, percents3, prot_seqs3)
print(output3)

#Pending: Implement functions inside other functions!!


['GG$', 'RG$', 'PG$', '.R$', 'R..G$', 'EE$', 'R..$', 'V.$', 'A.$', 'A$']
[204, 230, 186, 2322, 171, 301, 2993, 2587, 3225, 3111]
[15330, [0.394, 0.444, 0.359, 4.486, 0.33, 0.582, 5.783, 4.998, 6.231, 6.011], 29.62]
The protein list you provided harbors:
- 204 proteins with the GG c-degron motif (0.394 %)
- 230 proteins with the RG c-degron motif (0.444 %)
- 186 proteins with the PG c-degron motif (0.359 %)
- 2322 proteins with the XR c-degron motif (4.486 %)
- 171 proteins with the RXXG c-degron motif (0.33 %)
- 301 proteins with the EE c-degron motif (0.582 %)
- 2993 proteins with the RXX c-degron motif (5.783 %)
- 2587 proteins with the VX c-degron motif (4.998 %)
- 3225 proteins with the AX c-degron motif (6.231 %)
- 3111 proteins with the A c-degron motif (6.011 %)
- 15330 proteins with all c-degron motifs (29.62%)
from a total 51756 proteins.



b) prot_seqs

In [43]:
#Data: the ten consensus motifs from the table at the top of the notebook
cdegron_motifs = ['GG', 'RG', 'PG', 'XR', 'RXXG', 'EE', 'RXX', 'VX', 'AX', 'A']

#cdegron_to_re: show the regular expressions generated from the motifs
cdegron_re = cdegron_to_re(cdegron_motifs)
print(cdegron_re)

#find_cdegron_list: number of prot_seqs proteins (read from the FASTA file) matching each motif
cdegron_list = find_cdegron_list(cdegron_motifs, prot_seqs)
print(cdegron_list)

#percentages_cdegron: [total count, per-motif percentages, total percentage]
percents = percentages_cdegron(cdegron_list, prot_seqs)
print(percents)

#results_summary: the same numbers as a readable report
output = results_summary(cdegron_motifs, cdegron_list, percents, prot_seqs)
print(output)

#Pending: Implement functions inside other functions!!

['GG$', 'RG$', 'PG$', '.R$', 'R..G$', 'EE$', 'R..$', 'V.$', 'A.$', 'A$']
[201, 239, 184, 2283, 176, 308, 2960, 2590, 3221, 3144]
[15306, [0.388, 0.462, 0.356, 4.411, 0.34, 0.595, 5.719, 5.004, 6.223, 6.075], 29.573]
The protein list you provided harbors:
- 201 proteins with the GG c-degron motif (0.388 %)
- 239 proteins with the RG c-degron motif (0.462 %)
- 184 proteins with the PG c-degron motif (0.356 %)
- 2283 proteins with the XR c-degron motif (4.411 %)
- 176 proteins with the RXXG c-degron motif (0.34 %)
- 308 proteins with the EE c-degron motif (0.595 %)
- 2960 proteins with the RXX c-degron motif (5.719 %)
- 2590 proteins with the VX c-degron motif (5.004 %)
- 3221 proteins with the AX c-degron motif (6.223 %)
- 3144 proteins with the A c-degron motif (6.075 %)
- 15306 proteins with all c-degron motifs (29.573%)
from a total 51756 proteins.



In [None]:
#Pending: List and enumerate all proteins containing c-degrons: ANNOTATION, pandas


#Pending: Save outputs (protein ids+seqs, summary, etc) in files

## Data visualisation

In [None]:
# Percentages, etc...

