## Aula sobre representação de sequências biológicas

In [2]:
# Reference DNA sequence used by the example cells of this notebook.
dna1 = "ATGAAATTATGAATGAGCCTCAGCTGAAGCATCGCGCATCAGACTACGCTCAGACTCAGACTCAGCATTATAGTGAATGTTAATAAATAAAATAA"
# Variant whose first base is lower-case ("a"); used as the positive validation case.
dna_valid = "aTGAAATTATGAATGAGCCTCAGCTGAAGCATCGCGCATCAGACTACGCTCAGACTCAGACTCAGCATTATAGTGAATGTTAATAAATAAAATAA"
# Starts with "x", which is not a nucleotide code; used as the negative validation case.
dna_invalid = "xGAAATTATGAATGAGCCTCAGCTGAAGCATCGCGCATCAGACTACGCTCAGACTCAGACTCAGCATTATAGTGAATGTTAATAAATAAAATAA"

In [3]:
def is_valid(dna, codes=set(["A","T","C","G"])):
    """Tell whether every symbol of ``dna`` is an allowed nucleotide code.

    The sequence is upper-cased before the comparison, so the check is
    case-insensitive with respect to ``dna``. An empty sequence is reported
    as valid because it contains no disallowed symbol.

    Args:
        dna: Sequence to check, as a string.
        codes: Container of allowed (upper-case) symbols; defaults to
            A, T, C and G.

    Returns:
        True if no symbol of ``dna`` falls outside ``codes``, else False.
    """
    for symbol in dna.upper():
        if symbol not in codes:
            # Stop at the first offending symbol.
            return False
    return True

In [4]:
# Positive check: dna_valid starts with a lower-case base, and is_valid
# upper-cases its input before testing it, so this must pass.
assert is_valid(dna_valid), "this message should not appear"

In [5]:
# Negative check: dna_invalid starts with "x", which is not among the default
# nucleotide codes, so is_valid must return False.
assert not is_valid(dna_invalid), "this is invalid on purpose but the message should not appear"

In [6]:
from collections import Counter
def frequency(dna):
    """Return the percentage of each symbol found in ``dna``.

    The sequence is upper-cased before counting so that "a" and "A" are
    treated as the same base, in line with the other sequence helpers in
    this notebook (which all normalise case before processing).

    Args:
        dna: Sequence to analyse, as a string.

    Returns:
        A dict mapping each distinct (upper-case) symbol to its share of the
        sequence, expressed as a percentage between 0 and 100. An empty
        sequence yields an empty dict.
    """
    sequence = dna.upper()
    # Use the normalised sequence for the total as well, so the percentages
    # always add up to 100.
    total = len(sequence)
    return {symbol: 100 * count / total for symbol, count in Counter(sequence).items()}

In [7]:
frequency(dna1)

{'A': 38.94736842105263,
 'C': 18.94736842105263,
 'G': 17.894736842105264,
 'T': 24.210526315789473}

In [8]:
# Order (key, value) pairs by decreasing value (here: by frequency).
def sort_dict(d):
    """Return the items of ``d`` as a list sorted from largest to smallest value.

    Args:
        d: Mapping whose values are mutually comparable.

    Returns:
        A list of ``(key, value)`` tuples in descending order of value. Items
        with equal values keep the relative order they had in ``d``.
    """
    def value_of(pair):
        return pair[1]

    pairs = list(d.items())
    pairs.sort(key=value_of, reverse=True)
    return pairs

In [9]:
sort_dict(frequency(dna1))

[('A', 38.94736842105263),
 ('T', 24.210526315789473),
 ('C', 18.94736842105263),
 ('G', 17.894736842105264)]

In [10]:
# GC content of a sequence
def gc_content(dna):
    """Return the fraction of bases in ``dna`` that are G or C.

    The comparison is case-insensitive because the sequence is upper-cased
    first.

    Args:
        dna: Sequence to analyse, as a string.

    Returns:
        A float between 0.0 and 1.0. An empty sequence returns 0.0 instead of
        raising ``ZeroDivisionError``.
    """
    sequence = dna.upper()
    if not sequence:
        # No bases at all: report 0.0 rather than dividing by zero.
        return 0.0
    gc_bases = sequence.count("G") + sequence.count("C")
    return gc_bases / len(sequence)

In [11]:
gc_content(dna1)

0.3684210526315789

In [12]:
# GC content of consecutive, non-overlapping windows of size k
def gc_content_non_overlapping(dna, k=100):
    """Lazily yield the GC content of each successive ``k``-sized slice of ``dna``.

    Slices start at offsets 0, k, 2k, ... and do not overlap. The final slice
    is shorter than ``k`` when ``len(dna)`` is not a multiple of ``k``.

    Args:
        dna: Sequence to analyse, as a string.
        k: Window size in bases (defaults to 100).

    Yields:
        The value returned by ``gc_content`` for each window, in order.
    """
    window_starts = range(0, len(dna), k)
    yield from (gc_content(dna[start:start + k]) for start in window_starts)

In [13]:
# Show the sequence, then the GC fraction of each consecutive 10-base window.
# list() is needed because gc_content_non_overlapping is a generator.
print(dna1)
print(list(gc_content_non_overlapping(dna1, 10)))

ATGAAATTATGAATGAGCCTCAGCTGAAGCATCGCGCATCAGACTACGCTCAGACTCAGACTCAGCATTATAGTGAATGTTAATAAATAAAATAA
[0.1, 0.5, 0.6, 0.6, 0.5, 0.5, 0.4, 0.3, 0.0, 0.0]


In [14]:
def transcription(dna):
    """Transcribe a DNA sequence into RNA by turning every T into U.

    The input is first checked with ``is_valid``; the result is always
    upper-case.

    Args:
        dna: DNA sequence, as a string.

    Returns:
        The upper-cased sequence with each "T" replaced by "U".

    Raises:
        AssertionError: if ``is_valid(dna)`` is false. Note that ``assert``
            statements are skipped when Python runs with ``-O``.
    """
    assert is_valid(dna), "Invalid DNA sequence"
    thymine_to_uracil = str.maketrans("T", "U")
    return dna.upper().translate(thymine_to_uracil)

In [15]:
transcription(dna1)

'AUGAAAUUAUGAAUGAGCCUCAGCUGAAGCAUCGCGCAUCAGACUACGCUCAGACUCAGACUCAGCAUUAUAGUGAAUGUUAAUAAAUAAAAUAA'

In [16]:
def reverse_complement(dna):
    """Return the reverse complement of a DNA sequence.

    Each base is swapped for its pairing partner (A<->T, C<->G) after
    upper-casing, and the resulting sequence is then reversed.

    Args:
        dna: DNA sequence, as a string.

    Returns:
        The upper-case reverse complement of ``dna``.

    Raises:
        KeyError: if ``dna`` contains a symbol other than A, T, C or G
            (in either case).
    """
    pairing = {"A": "T", "T": "A", "C": "G", "G": "C"}
    # Complement in reading order first, then flip the whole sequence.
    complemented = [pairing[base] for base in dna.upper()]
    complemented.reverse()
    return "".join(complemented)

In [17]:
# Sanity check on a short sequence: the complement of AATTGCGC is TTAACGCG,
# which reads GCGCAATT when reversed.
assert reverse_complement("AATTGCGC") == "GCGCAATT", "this should not appear"

In [18]:
reverse_complement(dna1)

'TTATTTTATTTATTAACATTCACTATAATGCTGAGTCTGAGTCTGAGCGTAGTCTGATGCGCGATGCTTCAGCTGAGGCTCATTCATAATTTCAT'

In [19]:
def get_dict_of_aminoacids(path="genetic_code.txt"):
    """Load a codon -> amino-acid lookup table from a text file.

    Every non-blank line is expected to hold a codon followed by its
    amino-acid symbol, separated by whitespace. Double quotes around the
    fields are removed. Blank lines are skipped, and any run of spaces or
    tabs counts as a single separator.

    Args:
        path: Location of the table file; defaults to "genetic_code.txt" in
            the current working directory.

    Returns:
        A dict mapping each codon (first field) to its symbol (second field).

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if a non-blank line has fewer than two fields.
    """
    table = {}
    with open(path) as f:
        for line in f:
            fields = line.replace("\"", "").split()
            if not fields:
                # Blank (or whitespace-only) line: nothing to record.
                continue
            table[fields[0]] = fields[1]
    return table

In [20]:
print(get_dict_of_aminoacids())

{'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'TGT': 'C', 'TGC': 'C', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'TTT': 'F', 'TTC': 'F', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', 'CAT': 'H', 'CAC': 'H', 'ATA': 'I', 'ATT': 'I', 'ATC': 'I', 'AAA': 'K', 'AAG': 'K', 'TTA': 'L', 'TTG': 'L', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'ATG': 'M', 'AAT': 'N', 'AAC': 'N', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'AGA': 'R', 'AGG': 'R', 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'AGT': 'S', 'AGC': 'S', 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'TGG': 'W', 'TAT': 'Y', 'TAC': 'Y', 'TAA': '_', 'TAG': '_', 'TGA': '_'}


In [23]:
# Derive an RNA codon table from the DNA one: every "T" in a codon key is
# replaced by "U", while the amino-acid values are kept unchanged.
d = get_dict_of_aminoacids()
new_d = dict()
for k, v in d.items():
    new_d[k.replace("T", "U")] = v
print(new_d)

{'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'UGU': 'C', 'UGC': 'C', 'GAU': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'UUU': 'F', 'UUC': 'F', 'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', 'CAU': 'H', 'CAC': 'H', 'AUA': 'I', 'AUU': 'I', 'AUC': 'I', 'AAA': 'K', 'AAG': 'K', 'UUA': 'L', 'UUG': 'L', 'CUU': 'L', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L', 'AUG': 'M', 'AAU': 'N', 'AAC': 'N', 'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAA': 'Q', 'CAG': 'Q', 'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'AGA': 'R', 'AGG': 'R', 'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S', 'AGU': 'S', 'AGC': 'S', 'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V', 'UGG': 'W', 'UAU': 'Y', 'UAC': 'Y', 'UAA': '_', 'UAG': '_', 'UGA': '_'}
