<a href="https://colab.research.google.com/github/nitrozyna/Rosalind/blob/master/15_splc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Problem description:
[RNA Splicing](http://rosalind.info/problems/splc/)

After identifying the exons and introns of an **RNA string**, we only need to delete the introns and concatenate the exons to form a new string ready for translation.

---

### Given: A DNA string s (of length at most 1 kbp) and a collection of substrings of s acting as introns. All strings are given in FASTA format.

### Return: A protein string resulting from transcribing and translating the exons of s. (Note: Only one solution will exist for the dataset provided.)

Sample Dataset

```
>Rosalind_10
ATGGTCTACATAGCTGACAAACAGCACGTAGCAATCGGTCGAATCTCGAGAGGCATATGGTCACATGATCGGTCGAGCGTGTTTCAAAGTTTGCGCCTAG
>Rosalind_12
ATCGGTCGAA
>Rosalind_15
ATCGGTCGAGCGTGT
```

Sample Output
```
MVYIADKQHVASREAYGHMFKVCA
```


In [0]:
#@title Importing some modules to make a connection between Colab and Drive to download the current dataset
# Install the PyDrive client via a shell escape (note: no version is pinned).
!pip install PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate through Colab, then attach the application-default credentials to PyDrive.
# NOTE(review): presumably authenticate_user() triggers an interactive sign-in prompt — confirm.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the handle the dataset-loading cell uses to fetch the input file.
drive = GoogleDrive(gauth)


In [0]:
#@title Loading test dataset
# Drive file ID of the dataset; the trailing #@param exposes it as an editable Colab form field.
fileID = "1ddDIJ0zN-dZc9KO5chxUtn0ClHGqL6dX" #@param {type:"string"}
# Create a Drive file handle for that ID (requires `drive` from the authentication cell).
downloaded = drive.CreateFile({'id':fileID})
# Write the file's contents to the local path that the preprocessing cell opens.
downloaded.GetContentFile('rosalind_splc.txt')  # replace the file name with your file

In [0]:
#@title Codon table
# RNA codon -> one-letter amino acid code; "*" marks a stop codon.
# Entries are grouped by first base, one row per second base, in U, C, A, G order.
codons = {
    # First base U
    "UUU": "F", "UUC": "F", "UUA": "L", "UUG": "L",
    "UCU": "S", "UCC": "S", "UCA": "S", "UCG": "S",
    "UAU": "Y", "UAC": "Y", "UAA": "*", "UAG": "*",
    "UGU": "C", "UGC": "C", "UGA": "*", "UGG": "W",
    # First base C
    "CUU": "L", "CUC": "L", "CUA": "L", "CUG": "L",
    "CCU": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "CAU": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    "CGU": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    # First base A
    "AUU": "I", "AUC": "I", "AUA": "I", "AUG": "M",
    "ACU": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "AAU": "N", "AAC": "N", "AAA": "K", "AAG": "K",
    "AGU": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    # First base G
    "GUU": "V", "GUC": "V", "GUA": "V", "GUG": "V",
    "GCU": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "GAU": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    "GGU": "G", "GGC": "G", "GGA": "G", "GGG": "G",
}

In [0]:
#@title Function to return a DNA string without a given intron
def removeDnaMotifs(dna,motif):
    """Return `dna` with every occurrence of `motif` deleted.

    Matching follows `str.replace`: occurrences are found left to right and
    do not overlap. `dna` itself is not modified; a new string is returned.
    """
    nothing = ""
    spliced = dna.replace(motif, nothing)
    return spliced

In [0]:
#@title Function for transcribing DNA to RNA
def translateDNA(dna):
    """Transcribe a DNA string to RNA by substituting "U" for every "T".

    Despite the name, this performs transcription, not translation.
    Only uppercase "T" is substituted; all other characters pass through.
    """
    pieces = dna.split("T")
    return "U".join(pieces)

In [0]:
#@title Function for translating RNA into protein
def translateRnaToProtein(rna, codon_table=None):
    """Translate an RNA string into a protein string, reading from position 0.

    Args:
        rna: RNA sequence as a string; read in consecutive 3-base codons.
        codon_table: optional mapping of codon -> one-letter amino acid, with
            "*" marking a stop codon. Defaults to the notebook-level `codons`
            table when omitted.

    Returns:
        The amino-acid string produced up to, but not including, the first
        stop codon. Codons missing from the table are skipped without
        contributing a residue, and a trailing fragment shorter than three
        bases is ignored.
    """
    table = codons if codon_table is None else codon_table
    codon_len = 3
    residues = []
    # The range stops at the last start index that still leaves a full codon.
    for start in range(0, len(rna) - codon_len + 1, codon_len):
        codon = rna[start:start + codon_len]
        if codon not in table:
            # Unknown codon (e.g. ambiguous base): skip it, as before.
            continue
        residue = table[codon]
        if residue == '*':
            break
        residues.append(residue)
    # Join once at the end instead of growing a string per residue.
    return "".join(residues)

In [0]:
#@title Preprocessing, separating motifs and dna into separate list elements

# Read the FASTA file and collect each record's sequence (headers are discarded).
# Sequence lines belonging to one record are concatenated into a single string.
with open("rosalind_splc.txt") as f:
    motif = ""
    all_motifs = []
    for line in f:
        if line.strip().startswith(">"):
            # A header closes the previous record. At the very first header
            # `motif` is still "", so all_motifs[0] is an empty placeholder and
            # the real records start at index 1.
            all_motifs.append(motif)
            motif = ""
        else:
            motif += line.strip()
    # Flush the final record, which has no following header to trigger the append.
    all_motifs.append(motif)        

In [18]:
# Removing DNA motifs, transcribing the DNA into RNA and translating the RNA into protein
# all_motifs[0] is the empty placeholder appended at the first FASTA header, so the
# main DNA string sits at index 1 and the introns follow from index 2 onward.
dna = all_motifs[1]
# Remove the introns one at a time, in file order; each pass deletes every occurrence.
for motif in all_motifs[2:]:
    dna = removeDnaMotifs(dna,motif)
# translateDNA performs the T -> U substitution (i.e. transcription), despite its name.
rna = translateDNA(dna)
print(translateRnaToProtein(rna))

MIKVRIGWSDEFPCWKHDGIGAFPSFAFLVPYWEVELHTHMVRVGVRLNRLVRICMVDKVASPRVGPDYFAYPCSDHPSRPKVALHGDGLNHGNASTRAYALLTMRPDLIAQSAVHERYTSIPGFTSHSRDPTSGGYPAWTDPQSCYIEGVSSRPSNLTRTQNYGLSIEKPIRVGVAFVQADRTCETECPVLFPIRGGVPMGRQG
