## Aula sobre representação de sequências biológicas (continuação)

In [1]:
from abi import is_valid, get_dict_of_aminoacids, dna_valid, dna_invalid

In [2]:
def read_sequence(filename="example_Hinfluenzae.txt"):
    with open(filename) as f:
        return "".join(l.strip() for l in f)

In [3]:
read_sequence()

'ATTTAAAAGAACTTAATGATCAAGTTCATCAAAATCTTATTGGGGTGCCAAATAAACGTACCCTTGAATTTGCAAAATATTTGCAAAAACGTAATCAACATACCTGGATTCGTTATGTTGTGGTTCCTGGTTATACTGATAGCGATCACGATGTGCATTTATTAGGTCAGTTTATTGAAGGTATGACCAATATTGAAAAAGTTGAACTTCTTCCTTATCATCGATTAGGTGTGCATAAATGGAAAACCCTTGGGTTAGATTATGAGCTTGAAAATGTATTACCGCCAACTAAAGAATCCTTAGAACATATTAAAACAATCCTAGAAGGTTATGGACACACTGTAAAATTCTAGAATAAATGTCAGCTAACATAAGGAGTAAATAATGAAAAAAATTATTTTAACATTATCACTTGGGTTACTTACCGCTTGTTCTGCTCAAATCCAAAAGGCTGAACAAAATGATGTGAAGCTGGCACCGCCGACTGATGTACGAAGCGGATATATACGTTTGGTAAAGAATGTGAATTATTACATCGATAGTGAATCGATCTGGGTGGATAACCAAGAGCCACAAATTGTACATTTTGATGCTGTGGTGAATTTAGATAGGGGATTGTATGTTTATCCTGAGCCTAAACGTTATGCACGTTCTGTTCGTCAGTATAAGATTTTGAATTGTGCAAATTATCATTTAACTCAAATACGAACTGATTTCTATGATGAATTTTGGGGACAGGGTTTGCGGGCAGCACCTAAAAAGCAAAAGAAACATACGTTAAGTTTAACACCTGATACAACGCTTTATAATGCTGCTCAGATTATTTGTGCAAATTATGGTAAAGCATTTTCAGTTGATAAAAAATAAAAAAATCTGCACCTTAATTAGTTTAAATTTTATTCAATTTTTAGGGTGCAGAGAGTATTCGATTTTTCTGCAGTTATTGCTATTTTACTGCTGGCACTTTTAAGTCTGGCTCGTTTGGTTTTTCAATTGGTG

In [4]:
def write_seq_to_file(seq, filename="output.txt"):
    with open(filename, "w") as f:
        [f.write(seq[i:i+60] + "\n") for i in range(0, len(seq)+1, 60)]

In [5]:
write_seq_to_file("AATTGCGCGATAT")
read_sequence("output.txt")

'AATTGCGCGATAT'

#### Read FASTA sequence
```python
>description
ATATCATCTATCGCGCAGCTAGCATCGACT

>description of the 2nd
ATCGCTCGTCGCTCGATCGTACGACTCGCATGCAAGCATCGACTACGATCG
```

In [25]:
def read_fasta(filename="example_Hinfluenzae.openreadingframes.txt"):
    with open(filename) as f:
        fastas = ("\n" + f.read().lstrip()).split("\n>") # force split on descriptions
        for f in fastas[1:]: # ignore the first empty
            p = f.split("\n")
            yield (p[0], "".join(p[1:]))

In [26]:
list(read_fasta())

[('ORF number 1 in reading frame 1 on the direct strand extends from base 16 to base 165.',
  'ATGATCAAGTTCATCAAAATCTTATTGGGGTGCCAAATAAACGTACCCTTGAATTTGCAAAATATTTGCAAAAACGTAATCAACATACCTGGATTCGTTATGTTGTGGTTCCTGGTTATACTGATAGCGATCACGATGTGCATTTATTAG'),
 ('Translation of ORF number 1 in reading frame 1 on the direct strand.',
  'MIKFIKILLGCQINVPLNLQNICKNVINIPGFVMLWFLVILIAITMCIY*'),
 ('ORF number 2 in reading frame 1 on the direct strand extends from base 385 to base 867.',
  'ATGAAAAAAATTATTTTAACATTATCACTTGGGTTACTTACCGCTTGTTCTGCTCAAATCCAAAAGGCTGAACAAAATGATGTGAAGCTGGCACCGCCGACTGATGTACGAAGCGGATATATACGTTTGGTAAAGAATGTGAATTATTACATCGATAGTGAATCGATCTGGGTGGATAACCAAGAGCCACAAATTGTACATTTTGATGCTGTGGTGAATTTAGATAGGGGATTGTATGTTTATCCTGAGCCTAAACGTTATGCACGTTCTGTTCGTCAGTATAAGATTTTGAATTGTGCAAATTATCATTTAACTCAAATACGAACTGATTTCTATGATGAATTTTGGGGACAGGGTTTGCGGGCAGCACCTAAAAAGCAAAAGAAACATACGTTAAGTTTAACACCTGATACAACGCTTTATAATGCTGCTCAGATTATTTGTGCAAATTATGGTAAAGCATTTTCAGTTGATAAAAAATAA'),
 ('Translation of ORF number 2 in rea

In [27]:
len(list(read_fasta()))

16

### Translate a sequence of codons into aminoacid sequence

In [18]:
DICT_AMINOACIDS = get_dict_of_aminoacids()
def translate_codon(cod):
    return DICT_AMINOACIDS[cod] if cod in DICT_AMINOACIDS else None

In [22]:
print(set(DICT_AMINOACIDS.values()))

{'P', 'F', '_', 'M', 'I', 'R', 'K', 'A', 'L', 'V', 'Q', 'E', 'C', 'N', 'W', 'H', 'S', 'T', 'D', 'G', 'Y'}


In [9]:
translate_codon("GCT")

'A'

In [10]:
def translate_seq(dna, ini_pos=0):
    assert is_valid(dna), "DNA should be valid"
    return "".join(translate_codon(dna[i:i+3]) for i in range(ini_pos,len(dna) - 2, 3))

In [11]:
print(translate_seq(dna_valid()))

MKL_MSLS_SIAHQTTLRLRLSIIVNVNK_N


#### Codon frequency
Given that each aminoacid can be enconded by more than one codon and that different species tend to have different codons it is relevant to check their frequency

In [12]:
def codon_frequency(dna, aminoacid):
    dna = dna.upper()
    assert is_valid(dna), "DNA should be valid"
    codons = [k for k, v in get_dict_of_aminoacids().items() if v == aminoacid] # get relevant codons
    dna_codons = [dna[i:i+3] for i in range(0, len(dna)-2, 3)] # get all the codons in the dna sequence
    return {codon: dna_codons.count(codon)/len(dna_codons) for codon in codons} # output the frequency

In [13]:
codon_frequency(dna_valid(), "R")

{'AGA': 0.06451612903225806,
 'AGG': 0.0,
 'CGA': 0.0,
 'CGC': 0.0,
 'CGG': 0.0,
 'CGT': 0.0}

In [14]:
# given a dna sequence obtain all the possible aminoacid sequences so as to get their reading frames
def reading_frames(dna):
    rdna = dna[::-1]
    return [translate_seq(dna), translate_seq(dna, 1), translate_seq(dna, 2), translate_seq(rdna), translate_seq(rdna, 1), translate_seq(rdna, 2)]

In [15]:
reading_frames(dna_valid())

['MKL_MSLS_SIAHQTTLRLRLSIIVNVNK_N',
 '_NYE_ASAEASRIRLRSDSDSAL__MLINKI',
 'EIMNEPQLKHRASDYAQTQTQHYSEC__IK_',
 'NKINNCK_YYDSDSDSHQTTRYEVDSE_VLK',
 'IK_IIVSDITTQTQTRIRLRATKSTPSKY_S',
 '_NK_L_VILRLRLRLASDYALRSRLRVSIKV']

In [16]:
# get reading frames that start with M and end with _
def open_reading_frames(dna):
    frames = reading_frames(dna)
    for frame in frames:
        orf, ms = [], []
        for i, f in enumerate(frame):
            if f == "M": ms.append(i)
            elif f == "_":
                for m in ms: orf.append(frame[m:i+1])
                ms = []
        yield orf

In [17]:
list(open_reading_frames(dna_valid()))

[['MKL_', 'MSLS_'], [], ['MNEPQLKHRASDYAQTQTQHYSEC_'], [], [], []]