# 설치
```bash
pip install biopython
```


In [2]:
from Bio.Seq import Seq
my_seq = Seq("CATAAGCATGACAGATACAGCATGCCGA")
my_seq

Seq('CATAAGCATGACAGATACAGCATGCCGA')

In [3]:
my_seq.alphabet

Alphabet()

알파벳이 아닌 아미노산을 지정해 줄 수도 있습니다.

In [5]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq('AGTACGATGACGTPK', IUPAC.protein)
my_seq

Seq('AGTACGATGACGTPK', IUPACProtein())

In [6]:
my_seq.alphabet

IUPACProtein()

## 3.2 시퀀스의 문자열 처리

In [48]:
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq('''CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTG
AATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGTGACCCTGATTTGTTGTTGGG
CCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCCCGGCGCAGTTTGGGCGCCAAGCCATATGAA
AGCATCACCGGCGAATGGCATTGTCTTCCCCAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGA
ATTTTGATGACTCTCGCAAACGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGAT
AAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCA
GGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAATGCTTGCCCGGCATACAGCC
AGGCCGGCGTGGTGCGGATGTGAAAGATTGGCCCCTTGTGCCTAGGTGCGGCGGGTCCAAGAGCTGGTGT
TTTGATGGCCCGGAACCCGGCAAGAGGTGGACGGATGCTGGCAGCAGCTGCCGTGCGAATCCCCCATGTT
GTCGTGCTTGTCGGACAGGCAGGAGAACCCTTCCGAACCCCAATGGAGGGCGGTTGACCGCCATTCGGAT
GTGACCCCAGGTCAGGCGGGGGCACCCGCTGAGTTTACGC''', IUPAC.unambiguous_dna)
len(my_seq)

750

In [35]:
my_seq.count('G')

241

In [36]:
100*float(my_seq.count('G') + my_seq.count('C'))/len(my_seq)

59.19463087248322

In [37]:
from Bio.SeqUtils import GC
GC(my_seq)

59.19463087248322

In [38]:
my_seq[0:10]

Seq('CGTAACAAGG', IUPACUnambiguousDNA())

In [39]:
my_seq[739:]

Seq('TTACGC', IUPACUnambiguousDNA())

## 3.6 Changing case
   

In [40]:
my_seq[0:10].lower()

Seq('cgtaacaagg', DNAAlphabet())

In [41]:
my_seq[0:10].complement()

Seq('GCATTGTTCC', IUPACUnambiguousDNA())

In [42]:
my_seq[0:10].reverse_complement()

Seq('CCTTGTTACG', IUPACUnambiguousDNA())

In [44]:
my_seq[0:10:-1]

Seq('', IUPACUnambiguousDNA())

In [60]:
my_seq = Seq('CATAAGCATGATACAGATACAGCATGCCGA',IUPAC.unambiguous_dna)
mRNA = my_seq.transcribe()
mRNA

Seq('CAUAAGCAUGAUACAGAUACAGCAUGCCGA', IUPACUnambiguousRNA())

In [61]:
len(mRNA)

30

In [62]:
mRNA.translate()

Seq('HKHDTDTACR', IUPACProtein())

In [63]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_protein

record = SeqRecord(Seq("MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD" \
+"GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK" \
+"NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM" \
+"SSAC", generic_protein),
id="gi|14150838|gb|AAK54648.1|AF376133_1",
description="chalcone synthase [Cucumis sativus]")

In [64]:
print(record.format('fasta'))

>gi|14150838|gb|AAK54648.1|AF376133_1 chalcone synthase [Cucumis sativus]
MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD
GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK
NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM
SSAC



In [67]:
from Bio import SeqIO
# record = SeqIO.read('./data/ls_orchid.gbk', 'genbank')
help(SeqIO)

Help on package Bio.SeqIO in Bio:

NAME
    Bio.SeqIO - Sequence input/output as SeqRecord objects.

DESCRIPTION
    Bio.SeqIO is also documented at SeqIO_ and by a whole chapter in our tutorial:
    
      - `HTML Tutorial`_
      - `PDF Tutorial`_
    
    .. _SeqIO: http://biopython.org/wiki/SeqIO
    .. _`HTML Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.html
    .. _`PDF Tutorial`: http://biopython.org/DIST/docs/tutorial/Tutorial.pdf
    
    Input
    -----
    The main function is Bio.SeqIO.parse(...) which takes an input file handle
    (or in recent versions of Biopython alternatively a filename as a string),
    and format string.  This returns an iterator giving SeqRecord objects:
    
    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("Fasta/f002", "fasta"):
    ...     print("%s %i" % (record.id, len(record)))
    gi|1348912|gb|G26680|G26680 633
    gi|1348917|gb|G26685|G26685 413
    gi|1592936|gb|G29385|G29385 471
    
    Note that the parse(

In [68]:
for seq_record in SeqIO.parse('./data/ls_orchid.fasta', 'fasta'):
    print(seq_record.id)
    print(len(seq_record))

gi|2765658|emb|Z78533.1|CIZ78533
740
gi|2765657|emb|Z78532.1|CCZ78532
753
gi|2765656|emb|Z78531.1|CFZ78531
748
gi|2765655|emb|Z78530.1|CMZ78530
744
gi|2765654|emb|Z78529.1|CLZ78529
733
gi|2765652|emb|Z78527.1|CYZ78527
718
gi|2765651|emb|Z78526.1|CGZ78526
730
gi|2765650|emb|Z78525.1|CAZ78525
704
gi|2765649|emb|Z78524.1|CFZ78524
740
gi|2765648|emb|Z78523.1|CHZ78523
709
gi|2765647|emb|Z78522.1|CMZ78522
700
gi|2765646|emb|Z78521.1|CCZ78521
726
gi|2765645|emb|Z78520.1|CSZ78520
753
gi|2765644|emb|Z78519.1|CPZ78519
699
gi|2765643|emb|Z78518.1|CRZ78518
658
gi|2765642|emb|Z78517.1|CFZ78517
752
gi|2765641|emb|Z78516.1|CPZ78516
726
gi|2765640|emb|Z78515.1|MXZ78515
765
gi|2765639|emb|Z78514.1|PSZ78514
755
gi|2765638|emb|Z78513.1|PBZ78513
742
gi|2765637|emb|Z78512.1|PWZ78512
762
gi|2765636|emb|Z78511.1|PEZ78511
745
gi|2765635|emb|Z78510.1|PCZ78510
750
gi|2765634|emb|Z78509.1|PPZ78509
731
gi|2765633|emb|Z78508.1|PLZ78508
741
gi|2765632|emb|Z78507.1|PLZ78507
740
gi|2765631|emb|Z78506.1|PLZ78506
727
g

In [69]:
identifiers = [seq_record.id for seq_record in SeqIO.parse('./data/ls_orchid.gbk','genbank')]
identifiers

['Z78533.1',
 'Z78532.1',
 'Z78531.1',
 'Z78530.1',
 'Z78529.1',
 'Z78527.1',
 'Z78526.1',
 'Z78525.1',
 'Z78524.1',
 'Z78523.1',
 'Z78522.1',
 'Z78521.1',
 'Z78520.1',
 'Z78519.1',
 'Z78518.1',
 'Z78517.1',
 'Z78516.1',
 'Z78515.1',
 'Z78514.1',
 'Z78513.1',
 'Z78512.1',
 'Z78511.1',
 'Z78510.1',
 'Z78509.1',
 'Z78508.1',
 'Z78507.1',
 'Z78506.1',
 'Z78505.1',
 'Z78504.1',
 'Z78503.1',
 'Z78502.1',
 'Z78501.1',
 'Z78500.1',
 'Z78499.1',
 'Z78498.1',
 'Z78497.1',
 'Z78496.1',
 'Z78495.1',
 'Z78494.1',
 'Z78493.1',
 'Z78492.1',
 'Z78491.1',
 'Z78490.1',
 'Z78489.1',
 'Z78488.1',
 'Z78487.1',
 'Z78486.1',
 'Z78485.1',
 'Z78484.1',
 'Z78483.1',
 'Z78482.1',
 'Z78481.1',
 'Z78480.1',
 'Z78479.1',
 'Z78478.1',
 'Z78477.1',
 'Z78476.1',
 'Z78475.1',
 'Z78474.1',
 'Z78473.1',
 'Z78472.1',
 'Z78471.1',
 'Z78470.1',
 'Z78469.1',
 'Z78468.1',
 'Z78467.1',
 'Z78466.1',
 'Z78465.1',
 'Z78464.1',
 'Z78463.1',
 'Z78462.1',
 'Z78461.1',
 'Z78460.1',
 'Z78459.1',
 'Z78458.1',
 'Z78457.1',
 'Z78456.1',

In [70]:
from Bio import SeqIO
record_iterator = SeqIO.parse('./data/ls_orchid.gbk','genbank')
first_record = next(record_iterator)
print(first_record)

ID: Z78533.1
Name: Z78533
Description: C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA
Number of features: 5
/molecule_type=DNA
/topology=linear
/data_file_division=PLN
/date=30-NOV-2006
/accessions=['Z78533']
/sequence_version=1
/gi=2765658
/keywords=['5.8S ribosomal RNA', '5.8S rRNA gene', 'internal transcribed spacer', 'ITS1', 'ITS2']
/source=Cypripedium irapeanum
/organism=Cypripedium irapeanum
/taxonomy=['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', 'Tracheophyta', 'Spermatophyta', 'Magnoliophyta', 'Liliopsida', 'Asparagales', 'Orchidaceae', 'Cypripedioideae', 'Cypripedium']
/references=[Reference(title='Phylogenetics of the slipper orchids (Cypripedioideae: Orchidaceae): nuclear rDNA ITS sequences', ...), Reference(title='Direct Submission', ...)]
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', IUPACAmbiguousDNA())


In [71]:
len(first_record)

740

In [72]:
len(first_record.features)

5

In [73]:
print(first_record.annotations['source'])

Cypripedium irapeanum


# parsing sequence from the net

In [74]:
from Bio import Entrez
from Bio import SeqIO
Entrez.email = 'text@example.com'
with Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text", id="6273291") as handle:
    seq_record = SeqIO.read(handle, "fasta")
print("%s with %i features" % (seq_record.id, len(seq_record.features)))

AF191665.1 with 0 features
