In [7]:
from Bio.Seq import Seq

In [8]:
from Bio.Alphabet import IUPAC

In [19]:
from Bio.SeqUtils import GC

# Sequences and Alphabets

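The `Bio.Alphabet.IUPAC` module provides the following alphabets:

- Protein: `IUPACProtein`, `ExtendedIUPACProtein`
- DNA: `IUPACUnambiguousDNA`, `IUPACAmbiguousDNA`, `ExtendedIUPACDNA`
- RNA: `IUPACAmbiguousRNA`, `IUPACUnambiguousRNA`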

In [9]:
my_seq = Seq("AGTACACTGGT", IUPAC.unambiguous_dna)

In [10]:
my_seq

Seq('AGTACACTGGT', IUPACUnambiguousDNA())

In [11]:
my_seq.alphabet

IUPACUnambiguousDNA()

In [12]:
my_prot = Seq("AGTACACTGGT", IUPAC.protein)

In [13]:
my_prot

Seq('AGTACACTGGT', IUPACProtein())

# Sequences act like strings

In [14]:
my_seq = Seq("GATCG", IUPAC.unambiguous_dna)

In [16]:
# Iterating over a Seq yields its letters one at a time, just like a string.
for index, letter in enumerate(my_seq):
    # Two equivalent ways of formatting the same "index letter" line. Each
    # print call receives a single pre-formatted string, so the cell emits
    # the same text under both Python 2 and Python 3.
    print('%i %s' % (index, letter))
    print('{0} {1}'.format(index, letter))

0 G
0 G
1 A
1 A
2 T
2 T
3 C
3 C
4 G
4 G


In [17]:
my_seq = Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC', IUPAC.unambiguous_dna)

In [20]:
GC(my_seq)

46.875

# Slicing a sequence

In [21]:
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)

In [22]:
my_seq[4:12]

Seq('GATGGGCC', IUPACUnambiguousDNA())

In [25]:
my_seq[::-1]

Seq('CGCTAAAAGCTAGGATATATCCGGGTAGCTAG', IUPACUnambiguousDNA())

# Concatenating or adding sequences

In [26]:
from Bio.Alphabet import generic_nucleotide

In [27]:
# One sequence with the generic nucleotide alphabet and one with the IUPAC
# unambiguous DNA alphabet; they are added together further down.
nuc_seq = Seq("GATCGATGC", generic_nucleotide)
dna_seq = Seq("ACGT", IUPAC.unambiguous_dna)

In [28]:
nuc_seq

Seq('GATCGATGC', NucleotideAlphabet())

In [29]:
dna_seq

Seq('ACGT', IUPACUnambiguousDNA())

In [30]:
nuc_seq + dna_seq

Seq('GATCGATGCACGT', NucleotideAlphabet())

In [32]:
from Bio.Alphabet import generic_dna

In [33]:
# Three short DNA fragments that will be joined end to end.
list_of_seqs = [
    Seq("ACGT", generic_dna),
    Seq("AACC", generic_dna),
    Seq("GGTT", generic_dna),
]
# Empty DNA sequence used as the starting value for the concatenation.
concatenated = Seq("", generic_dna)

In [36]:
# Start from an empty sequence inside this cell so that re-running it does not
# keep appending to the result of a previous run.
concatenated = Seq("", generic_dna)
for s in list_of_seqs:
    concatenated += s
concatenated

Seq('ACGTAACCGGTT', DNAAlphabet())

In [37]:
sum(list_of_seqs, Seq("", generic_dna))

Seq('ACGTAACCGGTT', DNAAlphabet())

Unlike the Python string, the Biopython Seq does not (currently) have a .join method.

# Changing case

In [43]:
dna_seq = Seq("acgtACGT", generic_dna)

In [44]:
dna_seq

Seq('acgtACGT', DNAAlphabet())

In [40]:
dna_seq.upper()

Seq('ACGTACGT', DNAAlphabet())

# Nucleotide sequences and (reverse) complements

In [45]:
dna_seq.complement()

Seq('tgcaTGCA', DNAAlphabet())

In [48]:
dna_seq.reverse_complement()

Seq('ACGTacgt', DNAAlphabet())

# Transcription

In [49]:
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)

In [50]:
coding_dna

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())

In [51]:
template_dna = coding_dna.reverse_complement()

In [52]:
template_dna

Seq('CTATCGGGCACCCTTTCAGCGGCCCATTACAATGGCCAT', IUPACUnambiguousDNA())

### Transcription (DNA → mRNA)

In [53]:
messenger_rna = coding_dna.transcribe()

In [54]:
messenger_rna

Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())

The actual biological transcription process works from the template strand, doing a reverse complement (TCAG → CUGA) to give the mRNA. However, in Biopython and bioinformatics in general, we typically work directly with the coding strand because this means we can get the mRNA sequence just by switching T → U.

### Back-transcription (mRNA → DNA)

In [55]:
messenger_rna.back_transcribe()

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())

# Translation

In [56]:
messenger_rna.translate()

Seq('MAIVMGR*KGAR*', HasStopCodon(IUPACProtein(), '*'))

In [57]:
coding_dna

Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())

In [59]:
coding_dna.translate()

Seq('MAIVMGR*KGAR*', HasStopCodon(IUPACProtein(), '*'))

### Specifying the translation table (optional)

In [60]:
coding_dna.translate(table="Vertebrate Mitochondrial")

Seq('MAIVMGRWKGAR*', HasStopCodon(IUPACProtein(), '*'))

In [61]:
coding_dna.translate(table = 2)

Seq('MAIVMGRWKGAR*', HasStopCodon(IUPACProtein(), '*'))

### Translating only up to the first stop codon

In [62]:
coding_dna.translate(to_stop=True)

Seq('MAIVMGR', IUPACProtein())

In [63]:
# Example coding sequence. The adjacent string literals are joined by Python
# into a single string, so no explicit "+" or line continuation is needed.
gene = Seq(
    "GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGGTCGCTCCCATGGCA"
    "GCACAGGCTGCGGAAATTACGTTAGTCCCGTCAGTAAAATTACAGATAGGCGATCGTGAT"
    "AATCGTGGCTATTACTGGGATGGAGGTCACTGGCGCGACCACGGCTGGTGGAAACAACAT"
    "TATGAATGGCGAGGCAATCGCTGGCACCTACACGGACCGCCGCCACCGCCGCGCCACCAT"
    "AAGAAAGCTCCTCATGATCATCACGGCGGTCATGGTCCAGGCAAACATCACCGCTAA",
    generic_dna,
)

In [64]:
gene.translate( table = 'Bacterial')

Seq('VKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDH...HR*', HasStopCodon(ExtendedIUPACProtein(), '*'))

In [65]:
gene.translate(table = 'Bacterial', to_stop=True)

Seq('VKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDH...HHR', ExtendedIUPACProtein())

### In the bacterial genetic code GTG is a valid start codon, and while it does normally encode Valine, if used as a start codon it should be translated as methionine.

In [66]:
gene.translate(table = 'Bacterial', cds=True)

Seq('MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGHWRDH...HHR', ExtendedIUPACProtein())

# Translation Tables

In [71]:
from Bio.Data import CodonTable

In [72]:
# Look up two codon tables by name from the unambiguous-DNA table registry.
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
mito_table = CodonTable.unambiguous_dna_by_name["Vertebrate Mitochondrial"]

In [73]:
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(standard_table)

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------+---------+---------+---------+--

In [74]:
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(mito_table)

Table 2 Vertebrate Mitochondrial, SGC1

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA W   | A
T | TTG L   | TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L   | CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I(s)| ACT T   | AAT N   | AGT S   | T
A | ATC I(s)| ACC T   | AAC N   | AGC S   | C
A | ATA M(s)| ACA T   | AAA K   | AGA Stop| A
A | ATG M(s)| ACG T   | AAG K   | AGG Stop| G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V(s)| GCG A   | GAG E   | GGG G   | G
--+---------+---------+---------+---------+--

In [75]:
mito_table.stop_codons

['TAA', 'TAG', 'AGA', 'AGG']

In [76]:
mito_table.start_codons

['ATT', 'ATC', 'ATA', 'ATG', 'GTG']

In [77]:
mito_table.forward_table['ACG']

'T'

# Comparing Seq objects

In [78]:
seq1 = Seq("ACGT", IUPAC.unambiguous_dna)
seq2 = Seq("ACGT", IUPAC.ambiguous_dna)

In [79]:
seq1 == seq2

True

In [81]:
# Build a DNA sequence and a protein sequence that share the same letters.
from Bio.Alphabet import generic_dna, generic_protein
dna_seq = Seq("ACGT", generic_dna)
prot_seq = Seq('ACGT', generic_protein)
# The recorded result of this comparison is True even though the two
# alphabets differ.
dna_seq == prot_seq



True

# MutableSeq objects

In [82]:
my_seq = Seq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)

In [83]:
mutable_seq = my_seq.tomutable()

In [84]:
mutable_seq

MutableSeq('GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA', IUPACUnambiguousDNA())

### Alternatively, create a MutableSeq directly from a string

In [86]:
from Bio.Seq import MutableSeq

In [87]:
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)

In [88]:
mutable_seq[5] = 'C'

In [89]:
mutable_seq.remove('T')

In [90]:
mutable_seq

MutableSeq('GCCACGTAATGGGCCGCTGAAAGGGTGCCCGA', IUPACUnambiguousDNA())

In [93]:
mutable_seq.reverse()

In [94]:
mutable_seq

MutableSeq('AGCCCGTGGGAAAGTCGCCGGGTAATGCACCG', IUPACUnambiguousDNA())

An important technical difference between mutable and immutable objects in Python means that you can’t use a MutableSeq object as a dictionary key, but you can use a Python string or a Seq object in this way.

In [95]:
new_seq = mutable_seq.toseq()

In [96]:
new_seq

Seq('AGCCCGTGGGAAAGTCGCCGGGTAATGCACCG', IUPACUnambiguousDNA())

# UnknownSeq objects

In [98]:
from Bio.Seq import UnknownSeq

In [99]:
unk = UnknownSeq(20)

In [100]:
unk

UnknownSeq(20, alphabet = Alphabet(), character = '?')

In [101]:
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(unk)

????????????????????


You can also specify an alphabet; for nucleotide sequences the placeholder character then defaults to “N”, and for proteins to “X”.

In [104]:
unk_dna = UnknownSeq(20, alphabet=IUPAC.ambiguous_dna, character = 'N')

In [106]:
unk_dna

UnknownSeq(20, alphabet = IUPACAmbiguousDNA(), character = 'N')

In [107]:
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(unk_dna)

NNNNNNNNNNNNNNNNNNNN


In [108]:
unk_protein = unk_dna.translate()

In [109]:
unk_protein

UnknownSeq(6, alphabet = ProteinAlphabet(), character = 'X')

In [110]:
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(unk_protein)

XXXXXX


Some sequence file formats don’t always include the actual sequence, for example GenBank and EMBL files may include a list of features but for the sequence just present the contig information. 

# Working with strings directly