In [1]:
from Bio import SeqIO
gb_file = "sequence.gb"
for gb_record in SeqIO.parse(open(gb_file,"r"), "genbank") :
    # now do something with the record
    print("Name %s, %i features" % (gb_record.name, len(gb_record.features)))
    print(repr(gb_record.seq))

Name AE017199, 1107 features
Seq('TCTCGCAGAGTTCTTTTTTGTATTAACAAACCCAAAACCCATAGAATTTAATGA...TTA', IUPACAmbiguousDNA())


# A nice tutorial at warwick uni
[https://www2.warwick.ac.uk/fac/sci/moac/people/students/peter_cock/python/genbank2fasta/]

[https://www2.warwick.ac.uk/fac/sci/moac/people/students/peter_cock/python/genbank/]

In [3]:
from Bio import SeqIO
gbk_filename = "sequence.gb"
faa_filename = "NC_005213_converted.faa"
input_handle  = open(gbk_filename, "r")
output_handle = open(faa_filename, "w")

for seq_record in SeqIO.parse(input_handle, "genbank") :
    print "Dealing with GenBank record %s" % seq_record.id
    for seq_feature in seq_record.features :
        if seq_feature.type=="CDS" :
            assert len(seq_feature.qualifiers['translation'])==1
            output_handle.write(">%s from %s\n%s\n" % (
                   seq_feature.qualifiers['locus_tag'][0],
                   seq_record.name,
                   seq_feature.qualifiers['translation'][0]))

output_handle.close()
input_handle.close()

Dealing with GenBank record AE017199.1


In [4]:
from Bio import SeqIO

In [2]:
#dir(SeqIO)

In [6]:
print(SeqIO.__path__)

['C:\\Users\\andriy\\Anaconda2\\lib\\site-packages\\Bio\\SeqIO']


In [3]:
#dir(record.features[4])

In [7]:
record=SeqIO.read('NC_005816.gb', 'genbank')

In [44]:
record.id

'NC_005816.1'

In [4]:
# count=0
# for item in record.features:
#     count+=1
    
#     if item.type == 'CDS':
#         print(count)
#         print item

In [31]:
f=record.features[3]

In [32]:
type(f)

Bio.SeqFeature.SeqFeature

In [35]:
f.type

'CDS'

In [5]:
#dir(f)

In [46]:
type(f.qualifiers.get('translation'))

list

# Chapter 19 of Python for Bioinformatics

In [43]:
from Bio import SeqIO, SeqRecord, Seq
from Bio.Alphabet import IUPAC

GB_FILE = 'NC_006581.gb'
OUT_FILE = 'nadh.fasta'
with open(GB_FILE) as gb_fh:
    record = SeqIO.read(gb_fh, 'genbank')
seqs_for_fasta = []
for feature in record.features:
    # Each Genbank record may have several features, the program
    # will walk over all of them.
    qualifier = feature.qualifiers
    # Each feature has several parameters
    # Pick selected parameters.
    if 'NADH' in qualifier.get('product',[''])[0] and \
    'product' in qualifier and 'translation' in qualifier:
        id_ = qualifier['db_xref'][0][3:]
        desc = qualifier['product'][0]
        # nadh_sq is a NADH protein sequence
        nadh_sq = Seq.Seq(qualifier['translation'][0], IUPAC.protein)
        # 'srec' is a SeqRecord object from nadh_sq sequence.
        srec = SeqRecord.SeqRecord(nadh_sq, id=id_, description=desc)
        # Add this SeqRecord object into seqsforfasta list.
        seqs_for_fasta.append(srec)
with open(OUT_FILE, 'w') as outf:
    # Write all the sequences as a FASTA file.
    SeqIO.write(seqs_for_fasta, outf, 'fasta')

In [1]:
from Bio import SeqIO, SeqRecord, Seq
from Bio.Alphabet import IUPAC

GB_FILE = 'mrosea_contigs.gb'
OUT_FILE = 'mrosea_proteins.fasta'
seqs_for_fasta = []
with open(GB_FILE) as gb_fh:
    for record in SeqIO.parse(gb_fh, 'genbank'):

        for feature in record.features:
            # Each Genbank record may have several features, the program
            # will walk over all of them.
            qualifier = feature.qualifiers
        #     print("<<")
        #     print(qualifier)
        #     print(">>")
            if 'product' in qualifier and 'translation' in qualifier:#we've got a protein
                id_ = qualifier['protein_id'][0]
                desc = qualifier['product'][0]
                sq = Seq.Seq(qualifier['translation'][0], IUPAC.protein)
                srec = SeqRecord.SeqRecord(sq, id=id_, description=desc)
                seqs_for_fasta.append(srec)

with open(OUT_FILE, 'w') as outf:
    # Write all the sequences as a FASTA file.
    SeqIO.write(seqs_for_fasta, outf, 'fasta')

## A new touch on a genbank with multiple entries

In [3]:
import re
#re.match(r'(ftp|http)://.*\.(jpg|png)$', s)
with open("GCF_000372845.1_ASM37284v1_genomic.gbff", 'r') as f:
    txt=f.read()
txt[:300]

'LOCUS       NZ_KB889963          3546604 bp    DNA     linear   CON 17-APR-2017\nDEFINITION  Methylocystis rosea SV97 A3OODRAFT_scaffold1.1, whole genome\n            shotgun sequence.\nACCESSION   NZ_KB889963 NZ_ARCT01000000\nVERSION     NZ_KB889963.1\nDBLINK      BioProject: PRJNA224116\n            Bio'

In [None]:
re.match(r'(LOCUS)\w')

In [25]:

import itertools as it
filename='test.dat'
entries=list()
headers=list()

with open("GCF_000372845.1_ASM37284v1_genomic.gbff",'r') as f:
    for key,group in it.groupby(f,lambda line: line.startswith('LOCUS')):
            if not key:
                entries.append(list(group))
            else:
                headers.append(list(group))

            

In [29]:
combined=map(lambda x1, x2: x1+x2, headers, entries)

In [48]:
c2=''.join(combined[1])
c1[:300]

'LOCUS       NZ_KB889963          3546604 bp    DNA     linear   CON 17-APR-2017\nDEFINITION  Methylocystis rosea SV97 A3OODRAFT_scaffold1.1, whole genome\n            shotgun sequence.\nACCESSION   NZ_KB889963 NZ_ARCT01000000\nVERSION     NZ_KB889963.1\nDBLINK      BioProject: PRJNA224116\n            Bio'

In [49]:
from Bio import SeqIO, SeqRecord, Seq
from Bio.Alphabet import IUPAC

try:
    from cStringIO import StringIO
except:
    from StringIO import StringIO

GB_FILE = 'mrosea_contigs.gb'
OUT_FILE = 'mros_ncbi_part2.fasta'

sio=StringIO(c2)

seqs_for_fasta = []

for record in SeqIO.parse(sio, 'genbank'):

    for feature in record.features:
        # Each Genbank record may have several features, the program
        # will walk over all of them.
        qualifier = feature.qualifiers
    #     print("<<")
    #     print(qualifier)
    #     print(">>")
        if 'product' in qualifier and 'translation' in qualifier:#we've got a protein
            id_ = qualifier['protein_id'][0]
            desc = qualifier['product'][0]
            sq = Seq.Seq(qualifier['translation'][0], IUPAC.protein)
            srec = SeqRecord.SeqRecord(sq, id=id_, description=desc)
            seqs_for_fasta.append(srec)

with open(OUT_FILE, 'w') as outf:
    # Write all the sequences as a FASTA file.
    SeqIO.write(seqs_for_fasta, outf, 'fasta')

In [22]:

import itertools as it
filename='test.dat'
entries=list()
headers=list()

with open("GCF_000372845.1_ASM37284v1_genomic.gbff",'r') as f:
    for key,group in it.groupby(f,lambda line: line.startswith('LOCUS')):
            group = list(group)
            print("====")
            print(group[:10])

====
['LOCUS       NZ_KB889963          3546604 bp    DNA     linear   CON 17-APR-2017\n']
====
['DEFINITION  Methylocystis rosea SV97 A3OODRAFT_scaffold1.1, whole genome\n', '            shotgun sequence.\n', 'ACCESSION   NZ_KB889963 NZ_ARCT01000000\n', 'VERSION     NZ_KB889963.1\n', 'DBLINK      BioProject: PRJNA224116\n', '            BioSample: SAMN02256431\n', '            Assembly: GCF_000372845.1\n', 'KEYWORDS    WGS; GSC:MIGS:2.1; IMPROVED_HIGH_QUALITY_DRAFT; RefSeq.\n', 'SOURCE      Methylocystis rosea SV97\n', '  ORGANISM  Methylocystis rosea SV97\n']
====
['LOCUS       NZ_KB889964           365446 bp    DNA     linear   CON 17-APR-2017\n']
====
['DEFINITION  Methylocystis rosea SV97 A3OODRAFT_scaffold2.2, whole genome\n', '            shotgun sequence.\n', 'ACCESSION   NZ_KB889964 NZ_ARCT01000000\n', 'VERSION     NZ_KB889964.1\n', 'DBLINK      BioProject: PRJNA224116\n', '            BioSample: SAMN02256431\n', '            Assembly: GCF_000372845.1\n', 'KEYWORDS    WGS; GSC

In [23]:

import itertools as it
filename='test.dat'
entries=list()
headers=list()

with open("GCF_000372845.1_ASM37284v1_genomic.gbff",'r') as f:
    for group in it.groupby(f,lambda line: line.startswith('LOCUS')):
            group = list(group)
            print("====")
            print(group[:10])

====
[True, <itertools._grouper object at 0x0000000008D039B0>]
====
[False, <itertools._grouper object at 0x0000000003DDE518>]
====
[True, <itertools._grouper object at 0x0000000003DDE470>]
====
[False, <itertools._grouper object at 0x0000000003DDE518>]


In [33]:
with open("GCF_000372845.1_ASM37284v1_genomic.gbff",'r') as f:
    chunks = re.split(r'^LOCUS', f.read())

In [34]:
type(chunks)

list

In [36]:
len(chunks)

2

In [37]:
re.split()

['       NZ_KB889963          3546604 bp    DNA     linear   CON 17-APR-2017\nDEFINITION  Methylocystis rosea SV97 A3OODRAFT_scaffold1.1, whole genome\n            shotgun sequence.\nACCESSION   NZ_KB889963 NZ_ARCT01000000\nVERSION     NZ_KB889963.1\nDBLINK      BioProject: PRJNA224116\n            BioSample: SAMN02256431\n            Assembly: GCF_000372845.1\nKEYWORDS    WGS; GSC:MIGS:2.1; IMPROVED_HIGH_QUALITY_DRAFT; RefSeq.\nSOURCE      Methylocystis rosea SV97\n  ORGANISM  Methylocystis rosea SV97\n            Bacteria; Proteobacteria; Alphaproteobacteria; Rhizobiales;\n            Methylocystaceae; Methylocystis.\nREFERENCE   1  (bases 1 to 3546604)\n  AUTHORS   Kalyuzhnaya,M.G., Huntemann,M., Han,J., Chen,A., Kyrpides,N.,\n            Mavromatis,K., Markowitz,V., Palaniappan,K., Ivanova,N.,\n            Schaumberg,A., Pati,A., Liolios,K., Nordberg,H.P., Cantor,M.N.,\n            Hua,S.X. and Woyke,T.\n  TITLE     Direct Submission\n  JOURNAL   Submitted (16-APR-2013) DOE Joint G