In [4]:
def batch_iterator(iterator, batch_size):
    """Returns lists of length batch_size.

    This can be used on any iterator, for example to batch up
    SeqRecord objects from Bio.SeqIO.parse(...), or to batch
    Alignment objects from Bio.AlignIO.parse(...), or simply
    lines from a file handle.

    This is a generator function, and it returns lists of the
    entries from the supplied iterator.  Each list will have
    batch_size entries, although the final list may be shorter.
    """
    entry = True  # Make sure we loop once
    while entry:
        batch = []
        while len(batch) < batch_size:
            try:
                entry = iterator.next()
            except StopIteration:
                entry = None
            if entry is None:
                # End of file
                break
            batch.append(entry)
        if batch:
            yield batch



In [7]:
%%time
from Bio import SeqIO

record_iter = SeqIO.parse(open("../76969.assembled.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 50000)):
    filename = "group_%i.fasta" % (i + 1)
    with open(filename, "w") as handle:
        count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))



Wrote 50000 records to group_1.fastq
Wrote 50000 records to group_2.fastq
Wrote 50000 records to group_3.fastq
Wrote 50000 records to group_4.fastq
Wrote 50000 records to group_5.fastq
Wrote 50000 records to group_6.fastq
Wrote 50000 records to group_7.fastq
Wrote 50000 records to group_8.fastq
Wrote 50000 records to group_9.fastq
Wrote 50000 records to group_10.fastq
Wrote 50000 records to group_11.fastq
Wrote 50000 records to group_12.fastq
Wrote 50000 records to group_13.fastq
Wrote 50000 records to group_14.fastq
Wrote 50000 records to group_15.fastq
Wrote 50000 records to group_16.fastq
Wrote 50000 records to group_17.fastq
Wrote 50000 records to group_18.fastq
Wrote 50000 records to group_19.fastq
Wrote 50000 records to group_20.fastq
Wrote 50000 records to group_21.fastq
Wrote 50000 records to group_22.fastq
Wrote 50000 records to group_23.fastq
Wrote 50000 records to group_24.fastq
Wrote 50000 records to group_25.fastq
Wrote 50000 records to group_26.fastq
Wrote 50000 records t

## In-memory files
[https://stackoverflow.com/questions/23028071/open-a-file-in-memory]

In [11]:
#import io##for python3
import StringIO

output = StringIO.StringIO()
output.write('First line.\n')
print >>output, 'Second line.'

# Retrieve file contents -- this will be
# 'First line.\nSecond line.\n'
contents = output.getvalue()
print(contents)
# Close object and discard memory buffer --
# .getvalue() will now raise an exception.
output.close()


First line.
Second line.



## Redoing generator with an in-memory implementation

In [15]:
import StringIO

In [16]:
%%time
from Bio import SeqIO
import StringIO
#output = io.StringIO()
record_iter = SeqIO.parse(open("../76969.assembled.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 50000)):
    filename = "group_%i.fastq" % (i + 1)
    filename = StringIO.StringIO()
    with open(filename, "w") as handle:
        count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))

TypeError: coercing to Unicode: need string or buffer, instance found

## memory map
[https://docs.python.org/2/library/mmap.html]

In [None]:
import mmap
from Bio import SeqIO
#import StringIO

record_iter = SeqIO.parse(open("../76969.assembled.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 50000)):
    filename = "group_%i.fastq" % (i + 1)
    with open(filename, "w") as handle:
        count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))