Translation of a **DNA** sequence into **protein** in all six reading frames

In [1]:
# Codon -> amino-acid look-up table.
# Codons are enumerated in T, C, A, G order (first base varies slowest,
# third base fastest), so the i-th codon pairs with the i-th letter of
# `amino_acids`; '*' marks the positions assigned to stop codons.
bases = tuple('TCAG')
codons = [first + second + third
          for first in bases
          for second in bases
          for third in bases]
amino_acids = (
    'FFLLSSSSYY**CC*W'   # codons starting with T
    'LLLLPPPPHHQQRRRR'   # codons starting with C
    'IIIMTTTTNNKKSSRR'   # codons starting with A
    'VVVVAAAADDEEGGGG'   # codons starting with G
)
codon_table = {codon: residue for codon, residue in zip(codons, amino_acids)}

In [2]:
def translate(sequence, table=None):
    """Translate a nucleotide sequence into a protein string, codon by codon.

    Reading starts at index 0 and advances three bases at a time; one or two
    trailing bases that do not form a complete codon are ignored. Translation
    does not stop at stop codons - they are emitted as whatever symbol the
    table assigns to them.

    Parameters
    ----------
    sequence : str
        DNA sequence in any letter case.
    table : dict, optional
        Mapping from upper-case three-letter codon to a one-letter amino acid
        symbol. Defaults to the notebook-level ``codon_table``.

    Returns
    -------
    str
        One character per complete codon. Codons that are not in the table
        (for example those containing an ambiguous base such as ``N``) are
        written as ``'X'`` instead of raising ``KeyError``.
    """
    if table is None:
        # fall back to the look-up table defined at notebook level
        table = codon_table

    # normalise case once rather than once per codon
    sequence = sequence.upper()

    # len(sequence) - 2 is the last index at which a full codon can start
    return ''.join(
        table.get(sequence[start:start + 3], 'X')
        for start in range(0, len(sequence) - 2, 3)
    )

In [3]:
def reverse_complement(sequence):
    """Return the reverse complement of a DNA sequence, in lower case.

    The input may be in any letter case. Besides a/c/g/t, the IUPAC
    ambiguity codes (r, y, k, m, s, w, b, d, h, v, n) are complemented too,
    so sequences containing e.g. ``N`` no longer fail.

    Parameters
    ----------
    sequence : str
        DNA sequence.

    Returns
    -------
    str
        Lower-case reverse complement of ``sequence``.

    Raises
    ------
    KeyError
        If the sequence contains a character that is not a DNA base or an
        IUPAC ambiguity code.
    """
    # look-up table for complement basepairs, including IUPAC ambiguity codes
    complement = {
        'a': 't', 't': 'a', 'c': 'g', 'g': 'c',
        'r': 'y', 'y': 'r',   # purine <-> pyrimidine
        'k': 'm', 'm': 'k',   # keto <-> amino
        'b': 'v', 'v': 'b',   # not-A <-> not-T
        'd': 'h', 'h': 'd',   # not-C <-> not-G
        's': 's', 'w': 'w',   # strong / weak pairs are self-complementary
        'n': 'n',             # any base
    }

    # walk the lower-cased sequence back to front, complementing each base
    return ''.join(complement[base] for base in reversed(sequence.lower()))

In [4]:
# MAIN PROGRAM:

# read input sequence from file:
# lines starting with '>' are skipped; every other line is stripped of
# surrounding whitespace, upper-cased and concatenated into one string
file = 'test.seq'
# the with-block closes the file handle even if reading raises an exception
with open(file, 'r') as fh:
    dna = ''.join(line.strip().upper() for line in fh if not line.startswith('>'))

print(f'read {len(dna)} bp sequence from {file}')

read 291 bp sequence from test.seq


In [5]:
# running frame number: 1-3 are read from the input strand,
# 4-6 from its reverse complement
frame = 0

for on_reverse_strand in (False, True) :
    if on_reverse_strand :
        # the reverse complement is only generated once the three
        # forward frames have been printed
        rev_comp = reverse_complement(dna)
        template = rev_comp
    else :
        template = dna

    # the three frames of a strand differ only in their starting index,
    # which moves forward by one nucleotide per frame
    for i in range(3) :
        seq = template[i:]
        prot = translate(seq)
        frame += 1
        print(f'Frame {frame}:\n\n{prot}\n')

Frame 1:

MGSAILSALLSRRSQRATTIIYHYARITTQRAHGLCDIIGPPITTLSASDDTTQRAHGLCDIIGPPITTLSASDDYYISLRTHHDAARTYYISLRTY

Frame 2:

WALRYYRPSYHDALSERRLLYITTHASRRSAHMGSAILSALLSRRSQRATTRRSAHMGSAILSALLSRRSQRATTIIYHYARITTQRAPIIYHYAR

Frame 3:

GLCDIIGPPITTLSASDDYYISLRTHHDAARTWALRYYRPSYHDALSERRHDAARTWALRYYRPSYHDALSERRLLYITTHASRRSAHLLYITTHV

Frame 4:

VRA**YIIGARCVVMRA**YIIVVAR*ERRDRRADNIAEPMCALRRVVAR*ERRDRRADNIAEPMCALRRDACVVIYNSRRSLRAS**EGR*YRRAH

Frame 5:

YVRSDI**VRAAS*CVRSDI**SSLAESVVIGGPIISQSPCARCVVSSLAESVVIGGPIISQSPCARCVVMRA**YIIVVAR*ERRDRRADNIAEP

Frame 6:

TCVVIYNRCALRRDACVVIYNSRRSLRAS**EGR*YRRAHVRAASCRRSLRAS**EGR*YRRAHVRAAS*CVRSDI**SSLAESVVIGGPIISQSP

