# Python app: Find proteins containing c-terminal degrons

This app finds and selects all proteins that contain c-terminal degrons.

1. Upload human gene data &rarr; annotate gene_name, transcript_name, protein_name

2. Transcribe and translate &rarr; annotate proteins

3. Find c-degron sequences: use consensus sequences  

|C-degrons|
|---------:|
|-GG|  
|-RG|  
|-PG|  
|-XR|  
|-RXXG|  
|-EE| 
|-RXX|  
|-VX|  
|-AX|  
|-A|    

Varshavsky *et al.* 2019 (**Fig. S3**, Supplementary material)  
Lin *et al* 2018  

### 1. Upload genomic data

In [8]:
# Load genomic data with pyensembl (https://pypi.org/project/pyensembl/)

from pyensembl import EnsemblRelease
import pandas as pd



# Print the built-in documentation of the EnsemblRelease class.
help(EnsemblRelease)


# List all cDNA/CDS sequences

Help on class EnsemblRelease in module pyensembl.ensembl_release:

class EnsemblRelease(pyensembl.genome.Genome)
 |  EnsemblRelease(release=99, species=Species(latin_name='homo_sapiens', synonyms=['human'], reference_assemblies={'GRCh38': (76, 99), 'GRCh37': (55, 75), 'NCBI36': (54, 54)}), server='ftp://ftp.ensembl.org')
 |  
 |  Bundles together the genomic annotation and sequence data associated with
 |  a particular release of the Ensembl database.
 |  
 |  Method resolution order:
 |      EnsemblRelease
 |      pyensembl.genome.Genome
 |      serializable.serializable.Serializable
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __eq__(self, other)
 |      Return self==value.
 |  
 |  __hash__(self)
 |      Return hash(self).
 |  
 |  __init__(self, release=99, species=Species(latin_name='homo_sapiens', synonyms=['human'], reference_assemblies={'GRCh38': (76, 99), 'GRCh37': (55, 75), 'NCBI36': (54, 54)}), server='ftp://ftp.ensembl.org')
 |      Parameters
 |      ---

SequenceData(fasta_paths=['/home/monica/.cache/pyensembl/GRCh38/ensembl99/Homo_sapiens.GRCh38.pep.all.fa.gz'])

In [54]:
from Bio import SeqIO

# Parse the Ensembl peptide FASTA once, then split the records into two
# parallel lists so that protein_ids[i] is the ID of protein_seqs[i].
records = list(SeqIO.parse("Data/Homo_sapiens.GRCh38.pep.all.fa", "fasta"))
protein_ids = [rec.id for rec in records]
protein_seqs = [rec.seq for rec in records]

# Sanity check: preview the first nine entries of each list, followed by
# the list's total length (IDs first, then sequences).
for values in (protein_ids, protein_seqs):
    print(values[:9])
    print(len(values))


['ENSP00000451515.1', 'ENSP00000451042.1', 'ENSP00000452494.1', 'ENSP00000488240.1', 'ENSP00000487941.1', 'ENSP00000419773.1', 'ENSP00000430034.1', 'ENSP00000488695.1', 'ENSP00000488000.1']
111047
[Seq('PSY', SingleLetterAlphabet()), Seq('EI', SingleLetterAlphabet()), Seq('TGGY', SingleLetterAlphabet()), Seq('GTGG', SingleLetterAlphabet()), Seq('GTGG', SingleLetterAlphabet()), Seq('VLLWFGELL', SingleLetterAlphabet()), Seq('*LQ*L', SingleLetterAlphabet()), Seq('LTG', SingleLetterAlphabet()), Seq('GIVGAT', SingleLetterAlphabet())]
111047


In [56]:
import pandas as pd

# Build a two-column table pairing each protein ID with its sequence.
# protein_ids and protein_seqs are parallel lists built from the same FASTA
# records, so row i of 'ID' belongs to row i of 'Sequences'.
# The DataFrame is built straight from the lists; the intermediate
# pd.Series objects that used to be created here were never used.
# NOTE(review): the 'Sequences' values are Biopython Seq objects (pandas
# prints them as tuples of letters). They will presumably need str() before
# regex matching in step 3 -- confirm when that step is written.
frame = {'ID': protein_ids, 'Sequences': protein_seqs}
result = pd.DataFrame(frame)
print(result)

                       ID                                          Sequences
0       ENSP00000451515.1                                          (P, S, Y)
1       ENSP00000451042.1                                             (E, I)
2       ENSP00000452494.1                                       (T, G, G, Y)
3       ENSP00000488240.1                                       (G, T, G, G)
4       ENSP00000487941.1                                       (G, T, G, G)
...                   ...                                                ...
111042  ENSP00000494625.1  (R, Q, G, R, C, D, T, Y, A, T, E, F, D, L, E, ...
111043  ENSP00000494933.1  (M, A, G, R, R, V, N, V, N, V, G, V, L, G, H, ...
111044  ENSP00000495578.1  (M, A, G, R, R, V, N, V, N, V, G, V, L, G, H, ...
111045  ENSP00000496548.1  (M, P, S, M, L, E, R, I, S, K, N, L, V, K, E, ...
111046  ENSP00000494855.1  (M, P, S, M, L, E, R, I, S, K, N, L, V, K, E, ...

[111047 rows x 2 columns]


<_io.TextIOWrapper encoding='iso-8859-1'>

### 2. Gene transcription and translation

In [None]:
#Transcribe & translate
#Dictionaries: transcription and translation

### 3. List all proteins containing c-terminal degrons

In [None]:
#Find c-degrons
#Use regular expressions
#List and enumerate all proteins containing c-degrons.
#What proportion of all proteins do they represent?