In [15]:
def batch_iterator(iterator, batch_size):
    """Returns lists of length batch_size.

    This can be used on any iterator, for example to batch up
    SeqRecord objects from Bio.SeqIO.parse(...), or to batch
    Alignment objects from Bio.AlignIO.parse(...), or simply
    lines from a file handle.

    This is a generator function, and it returns lists of the
    entries from the supplied iterator.  Each list will have
    batch_size entries, although the final list may be shorter.
    """
    entry = True  # Make sure we loop once
    while entry:
        batch = []
        while len(batch) < batch_size:
            try:
                entry = iterator.next()
            except StopIteration:
                entry = None
            if entry is None:
                # End of file
                break
            batch.append(entry)
        if batch:
            yield batch



In [5]:
%%time
from Bio import SeqIO

record_iter = SeqIO.parse(open("../2501025493.genes.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 1000)):
    filename = "group_%i.fasta" % (i + 1)
    with open(filename, "w") as handle:
        count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))



Wrote 1000 records to group_1.fasta
Wrote 1000 records to group_2.fasta
Wrote 1000 records to group_3.fasta
Wrote 818 records to group_4.fasta
Wall time: 254 ms


# Trying temporary files

In [14]:
import tempfile

# create a temporary file and write some data to it
fp = tempfile.TemporaryFile()
fp.write(b'Hello world!')
# read data from file
fp.seek(0)

txt=fp.read()
print(txt)
# close the file, it will be removed
fp.close()
#print(txt)
# # create a temporary file using a context manager
# with tempfile.TemporaryFile() as fp:
#     fp.write(b'Hello world!')
#     fp.seek(0)
#     fp.read()

# # file is now closed and removed

# # create a temporary directory using the context manager
# with tempfile.TemporaryDirectory() as tmpdirname:
#     print('created temporary directory', tmpdirname)

# directory and contents have been removed

Hello world!


In [17]:
import tempfile
from Bio import SeqIO

record_iter = SeqIO.parse(open("../2501025493.genes.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 1000)):
    
    filename = "group_%i.fasta" % (i + 1)
    filename = tempfile.TemporaryFile()
    filename.seek(0)
    count = SeqIO.write(batch, filename, "fasta")
#     with open(filename, "w") as handle:
#         count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))

Wrote 1000 records to <open file '<fdopen>', mode 'w+b' at 0x000000000621DC00>
Wrote 1000 records to <open file '<fdopen>', mode 'w+b' at 0x0000000006CE55D0>
Wrote 1000 records to <open file '<fdopen>', mode 'w+b' at 0x0000000006D3B930>
Wrote 818 records to <open file '<fdopen>', mode 'w+b' at 0x000000000655A030>


In [19]:
txt=filename.seek(0)
txt=filename.read()

In [27]:
filename

<open file '<fdopen>', mode 'w+b' at 0x000000000655A030>

In [24]:
txt2

''

In [22]:
#Coool

In [26]:
txt2=filename.read()

## In-memory files
[https://stackoverflow.com/questions/23028071/open-a-file-in-memory]

In [11]:
#import io##for python3
import StringIO

output = StringIO.StringIO()
output.write('First line.\n')
print >>output, 'Second line.'

# Retrieve file contents -- this will be
# 'First line.\nSecond line.\n'
contents = output.getvalue()
print(contents)
# Close object and discard memory buffer --
# .getvalue() will now raise an exception.
output.close()


First line.
Second line.



## Redoing generator with an in-memory implementation

In [15]:
import StringIO

In [16]:
%%time
from Bio import SeqIO
import StringIO
#output = io.StringIO()
record_iter = SeqIO.parse(open("../76969.assembled.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 50000)):
    filename = "group_%i.fastq" % (i + 1)
    filename = StringIO.StringIO()
    with open(filename, "w") as handle:
        count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))

TypeError: coercing to Unicode: need string or buffer, instance found

## memory map
[https://docs.python.org/2/library/mmap.html]

In [None]:
import mmap
from Bio import SeqIO
#import StringIO

record_iter = SeqIO.parse(open("../76969.assembled.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 50000)):
    filename = "group_%i.fastq" % (i + 1)
    with open(filename, "w") as handle:
        count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))