In [3]:
from Bio.Seq import Seq

# Print every base of the sequence next to its 0-based position.
my_seq = Seq("GATCG")
for index in range(len(my_seq)):
    letter = my_seq[index]
    print("%i %s" % (index, letter))

0 G
1 A
2 T
3 C
4 G


In [4]:
# A Seq is indexed like a string: first letter, third letter, last letter
# (negative index), and then the third letter once more.
for position in (0, 2, -1, 2):
    print(my_seq[position])

G
T
G
T


In [5]:
# str.count() counts non-overlapping occurrences: "AA" is found twice in
# "AAAA" (the overlapping match in the middle is not counted).
"AAAA".count("AA")

2

In [6]:
# GC content as a percentage: (number of G + number of C) / length * 100
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC")
gc_bases = sum(my_seq.count(base) for base in ("G", "C"))
100 * float(gc_bases) / len(my_seq)

46.875

In [7]:
# directly calculating GC content (as a percentage) with Bio.SeqUtils
try:
    # Older Biopython releases ship GC(), which returns a percentage (0-100).
    from Bio.SeqUtils import GC
except ImportError:
    # Newer releases dropped GC() in favour of gc_fraction(), which returns a
    # 0-1 fraction. Provide a drop-in GC() so this notebook keeps working.
    from Bio.SeqUtils import gc_fraction

    def GC(seq):
        """Return the GC content of ``seq`` as a percentage (0-100).

        ``ambiguous="ignore"`` divides by the full sequence length, which is
        how the manual calculation in the previous cell works as well.
        """
        return 100 * gc_fraction(seq, ambiguous="ignore")

GC(my_seq)

46.875

In [13]:
# slicing sequence (Seq supports the usual start:stop:step syntax)
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC")
print(my_seq[4:12])  # positions 4 through 11 (the stop index is excluded)
print(my_seq[::1])   # step 1 from position 0: the whole sequence
print(my_seq[::2])   # step 2 from position 0: every second letter

# step 3: every third letter, starting from position 0, 1 and 2 in turn
for start in range(3):
    print(my_seq[start::3])


GATGGGCC
GATCGATGGGCCTATATAGGATCGAAAATCGC
GTGTGCTTTGACAATG
GCTGTAGTAAG
AGGCATGCATC
TAGCTAAGAC


In [16]:
# turning Seq objects into strings

# str() gives the sequence as a plain Python string. Keep the result in a
# variable: a bare expression in the middle of a cell is silently discarded.
seq_string = str(my_seq)
print(seq_string)

# %-formatting inserts the sequence letters into the template
fasta_format_string = ">Name\n%s\n" % my_seq
print(fasta_format_string)

GATCGATGGGCCTATATAGGATCGAAAATCGC
>Name
GATCGATGGGCCTATATAGGATCGAAAATCGC



In [18]:
# joining sequences together with "+"
protein_seq, dna_seq = Seq("EVRNAK"), Seq("ACGT")
# The concatenation succeeds even though it mixes a protein and a DNA
# sequence, hence the variable name.
wrongSeq = protein_seq + dna_seq
print(wrongSeq)

EVRNAKACGT


In [19]:
# joining several sequences by repeated "+", starting from an empty Seq
list_of_seqs = [Seq(fragment) for fragment in ("ACGT", "AACC", "GGTT")]
concatenated = sum(list_of_seqs, Seq(""))
print(concatenated)

ACGTAACCGGTT


In [20]:
# joining sequences with the 'join' method: the spacer (ten "N" letters)
# is inserted between consecutive contigs
spacer = Seq(10 * "N")
contigs = [Seq(fragment) for fragment in ("ATG", "ATCCCG", "TTGCA")]
spacer.join(contigs)

Seq('ATGNNNNNNNNNNATCCCGNNNNNNNNNNTTGCA')

In [24]:
# changing case: upper() and lower() each return a new sequence
dna_seq = Seq("acgtACGT")
for recased in (dna_seq.upper(), dna_seq.lower()):
    print(recased)

# membership tests are case sensitive: "GTAC" is only found after upper()
for candidate in (dna_seq, dna_seq.upper()):
    print("GTAC" in candidate)

ACGTACGT
acgtacgt
False
True


In [26]:
# complement and reverse complement of a DNA sequence
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC")
for strand in (my_seq.complement(), my_seq.reverse_complement()):
    print(strand)


CTAGCTACCCGGATATATCCTAGCTTTTAGCG
GCGATTTTCGATCCTATATAGGCCCATCGATC


In [28]:
# transcription: coding-strand DNA -> mRNA (T becomes U), and back again
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
template_dna = coding_dna.reverse_complement()

messenger_rna = coding_dna.transcribe()
recovered_dna = messenger_rna.back_transcribe()  # round trip back to DNA

print(messenger_rna)
print(recovered_dna)

AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG
ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG


In [33]:
# translation: works from the mRNA and directly from the coding DNA
for nucleotides in (messenger_rna, coding_dna):
    print(nucleotides.translate())

# choose translation table: https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
# a table can be selected by name or by its ID from the link above
for table_choice in ("Vertebrate Mitochondrial", 2):
    print(coding_dna.translate(table=table_choice))

# translate only up to the first stop codon
print(coding_dna.translate(to_stop=True))



MAIVMGR*KGAR*
MAIVMGR*KGAR*
MAIVMGRWKGAR*
MAIVMGRWKGAR*
MAIVMGR


In [36]:
# print out codon tables

from Bio.Data import CodonTable

# look the tables up by ID (1 is the standard table)
dna_tables = CodonTable.unambiguous_dna_by_id
standard_table, mito_table = dna_tables[1], dna_tables[2]

for codon_table in (standard_table, mito_table):
    print(codon_table)

# stop and start codons of the second table
for codons in (mito_table.stop_codons, mito_table.start_codons):
    print(codons)

Table 1 Standard, SGC0

  |  T      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
T | TTT F   | TCT S   | TAT Y   | TGT C   | T
T | TTC F   | TCC S   | TAC Y   | TGC C   | C
T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
--+---------+---------+---------+---------+--
C | CTT L   | CCT P   | CAT H   | CGT R   | T
C | CTC L   | CCC P   | CAC H   | CGC R   | C
C | CTA L   | CCA P   | CAA Q   | CGA R   | A
C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | ATT I   | ACT T   | AAT N   | AGT S   | T
A | ATC I   | ACC T   | AAC N   | AGC S   | C
A | ATA I   | ACA T   | AAA K   | AGA R   | A
A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
--+---------+---------+---------+---------+--
G | GTT V   | GCT A   | GAT D   | GGT G   | T
G | GTC V   | GCC A   | GAC D   | GGC G   | C
G | GTA V   | GCA A   | GAA E   | GGA G   | A
G | GTG V   | GCG A   | GAG E   | GGG G   | G
--+---------

In [37]:
## comparing sequences
seq1 = Seq("ACGT")
# A Seq can be compared directly with a plain string; this evaluates to True
# here because the letters are exactly the same.
"ACGT" == seq1 ## check if seq are exactly the same

True

In [39]:
# mutability
from Bio.Seq import MutableSeq

my_seq = Seq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA")

# Build an editable MutableSeq from the string form of the Seq. This replaces
# Seq.tomutable(), which newer Biopython releases no longer provide.
mutable_seq = MutableSeq(str(my_seq))
mutable_seq[5] = "C"

# remove() deletes one "T" per call; after the edit above the sequence holds
# five of them, so call it five times.
for _removal in range(5):
    mutable_seq.remove("T")

# Convert back to an immutable Seq (replaces MutableSeq.toseq(), also gone
# from newer releases).
new_seq = Seq(str(mutable_seq))
print(new_seq)


GCCACGAAGGGCCGCGAAAGGGGCCCGA


In [40]:
# unknown sequence objects: a sequence of known length but unknown letters
try:
    from Bio.Seq import UnknownSeq
    unk_dna = UnknownSeq(20, character="N") ## use N for DNA and X for protein
except ImportError:
    # Newer Biopython releases no longer ship UnknownSeq. Fall back to an
    # ordinary Seq made of the placeholder letter, which prints the same.
    from Bio.Seq import Seq
    unk_dna = Seq("N" * 20)
print(unk_dna)

NNNNNNNNNNNNNNNNNNNN
