# Translating RNA into Protein

In [11]:
# RNA codon -> one-letter amino-acid code ("Stop" for the terminating codons).
#
# The 64 codons are enumerated with the first base varying slowest and the
# third base fastest, each position cycling through U, C, A, G. The residue
# string lists the amino acid for every codon in that same order, one row of
# 16 per first base, with "*" standing in for the stop codons UAA, UAG, UGA.
CODON_TABLE = {
    codon: "Stop" if residue == "*" else residue
    for codon, residue in zip(
        (
            first + second + third
            for first in "UCAG"
            for second in "UCAG"
            for third in "UCAG"
        ),
        "FFLLSSSSYY**CC*W"  # U..
        "LLLLPPPPHHQQRRRR"  # C..
        "IIIMTTTTNNKKSSRR"  # A..
        "VVVVAAAADDEEGGGG",  # G..
    )
}


def encode_protein(s):
    """Translate the RNA string ``s`` into a protein string.

    ``s`` is read left to right in consecutive, non-overlapping chunks of
    three characters. Each chunk is looked up in ``CODON_TABLE``:

    * a chunk mapped to ``"Stop"`` ends translation immediately;
    * a chunk missing from the table (including a short trailing chunk)
      is reported on stdout as ``Invalid codon: ...`` and skipped;
    * any other chunk contributes its amino-acid letter to the result.

    Returns the concatenated amino-acid letters as a single string.
    """
    step = 3
    residues = []

    for start in range(0, len(s), step):
        triplet = s[start : start + step]

        # Unknown chunk: report it and carry on with the next one.
        if triplet not in CODON_TABLE:
            print(f"Invalid codon: {triplet}")
            continue

        residue = CODON_TABLE[triplet]
        # A stop codon terminates the protein; nothing after it is read.
        if residue == "Stop":
            break

        residues.append(residue)

    return "".join(residues)

In [12]:
# Worked example: translate a short RNA string and compare the result with
# the protein string it is expected to produce.
sample_input = "AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA"
expected_output = "MAMAPRTEINSTRING"
output = encode_protein(sample_input)
# Print before asserting so the actual value is visible even if the check fails.
print(output)
assert output == expected_output

MAMAPRTEINSTRING


In [14]:
# Load the RNA string from the dataset file and print its protein translation.
fn = "data/rosalind_prot.txt"
# An explicit encoding keeps the read independent of the platform's locale default.
with open(fn, "r", encoding="utf-8") as file:
    # Remove *all* whitespace, not only at the ends: a line break inside the
    # sequence would otherwise shift the reading frame and corrupt codons.
    s = "".join(file.read().split())

print(encode_protein(s))

MGALPSGSTKYPRNRSFTHQSCGQEDLRQSTASLLSLFKHLSGCNWLLKATRMVRPKVTRPAHDCKEHVPSTGQCHALQGEILNLTVIHLRNQCKYLLLPIRAAAKRVTFERTNARSRYHVTRMLAGEYWSLNRQRCVHGLLMGLLLLEYGFLAGDPQLRSVIFLRQESGGRMWPSCNMGNVDNYAAPDRSSYMGPDSSRARQIPLSENGSYSLDRKRCSHISSLSRNASMGFAAPDVPCLQQGLAFPYRLLSAQACTHRLGHIFCRRWTPHGSYLFQLRKILVRATSPPIVGRVLGMCLGLNINLVSGQCALTSHRNNDRSADLPRATSPTSLQALRLGGSSVIKPCQGNLYKSVELLKSEWSCQVFINRGEATAGGCILRYYIQLRQRCLCRSIAATEHSCDPVLLILSPSDLTSIDSVYITGQRAQVLTIATLVGQPRSCPIGRGGINKALADIEAPRMSAGRTASRVREGGINIRAIRAFTVLLVKRPAYESLNLWLGLLAELARGVEVPRSTMVHIAHSLDLSPRVPATFAEKISNYRDLGHQRSFFPATADSVTVRQKGGEEKLTSNKPPACLGTVHIFIYAHAMQAKGCGIVCVLLTLGLETNHAAGGAPTNRGASLRNSLLGRTSMLGLPYLVYSAGFHARSVTVGAILNRSDSNVPAPIVERRPNQPTTRDARSDAFRVRAAAGLLYKSISMSDRNQLLFLQSRPPGMYVQDWLARSRTSLRAVAAVMRGGITIILDAALASHMSTGASPYTCSLGIPASVPHPNHGHGRPYLHGCQTIVGYSRAALICISGPSGTNPWPGSCFGTTRFTIRRFQVETHRAELFVAEAINPQTKSHSRCGVCIVWRISPHSWWSRKSAGVQAACTWSMIKVVQSDAVLMRSLDLFNKRWDWTHEKEVMPTLSFGGRSRSVHAELSRSCVILLVVIGLSRRTSENTSKLPCADNHIGTFDDVGKAIGVTPGHFREADTAIRIREFAAPSIPVHADGFVGWVI