Merge pull request #2942 from btel/npz_optimize
BUG: Fix loading npz files >2GB on 64bit systems
charris committed Jun 12, 2013
2 parents d4b4ff0 + 7c4e9e1 commit f7ea474
Showing 4 changed files with 50 additions and 11 deletions.
10 changes: 10 additions & 0 deletions doc/release/1.8.0-notes.rst
@@ -80,6 +80,11 @@ fact, it never was true in some corner cases). Instead, use
For more information check the "Internal memory layout of an ndarray"
section in the documentation.

+IO compatibility with large files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Large NPZ files >2GB can be loaded on 64-bit systems.
+

New Features
============
@@ -136,6 +141,11 @@ A simple test runner script ``runtests.py`` was added. It also builds Numpy via
Improvements
============

+IO performance improvements
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Performance in reading large files was improved by chunking (see also IO compatibility).
+
Performance improvements to `pad`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The `pad` function has a new implementation, greatly improving performance for
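As a quick illustration of the behavior the notes describe, a round trip like the following previously failed past the 2GB mark and should now work on a 64-bit Python (a minimal sketch; the path and size here are made up for illustration):

    import numpy as np

    # A bit over 2 GB of uint8 data; requires a 64-bit Python and
    # enough free disk space.
    a = np.zeros((1 << 31) + 1000, dtype=np.uint8)
    np.savez('/tmp/big.npz', a=a)    # hypothetical path

    npz = np.load('/tmp/big.npz')
    b = npz['a']                     # now read back in bounded chunks
    npz.close()
    assert b.nbytes == a.nbytes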
20 changes: 17 additions & 3 deletions numpy/lib/format.py
@@ -148,6 +148,7 @@

MAGIC_PREFIX = asbytes('\x93NUMPY')
MAGIC_LEN = len(MAGIC_PREFIX) + 2
+BUFFER_SIZE = 2 ** 18  # size of buffer for reading npz files in bytes

def magic(major, minor):
    """ Return the magic string for the given file format version.
@@ -457,9 +458,22 @@ def read_array(fp):
    else:
        # This is not a real file. We have to read it the memory-intensive
        # way.
-        # XXX: we can probably chunk this to avoid the memory hit.
-        data = fp.read(int(count * dtype.itemsize))
-        array = numpy.fromstring(data, dtype=dtype, count=count)
+        # The crc32 module fails on reads greater than 2 ** 32 bytes,
+        # breaking large reads from gzip streams. Chunk reads to
+        # BUFFER_SIZE bytes to avoid the issue and to reduce the memory
+        # overhead of the read. In the non-chunked case
+        # count < max_read_count, so only one read is performed.
+
+        max_read_count = BUFFER_SIZE // dtype.itemsize
+
+        array = numpy.empty(count, dtype=dtype)
+
+        for i in range(0, count, max_read_count):
+            read_count = min(max_read_count, count - i)
+
+            data = fp.read(int(read_count * dtype.itemsize))
+            array[i:i+read_count] = numpy.frombuffer(data, dtype=dtype,
+                                                     count=read_count)

    if fortran_order:
        array.shape = shape[::-1]
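The loop above replaces a single fp.read of the entire payload with reads bounded at BUFFER_SIZE bytes, which both avoids the crc32 overflow on gzip streams and caps the size of the temporary bytes object. A standalone sketch of the same chunking technique (the helper name read_chunked and its default buffer size are illustrative, not numpy API):

    import numpy as np

    def read_chunked(fp, count, dtype, buffer_size=2 ** 18):
        # Read `count` items of `dtype` from the file-like `fp`,
        # requesting at most `buffer_size` bytes per call to fp.read.
        dtype = np.dtype(dtype)
        # Always read at least one item, even for very large itemsizes.
        max_read_count = max(1, buffer_size // dtype.itemsize)
        array = np.empty(count, dtype=dtype)
        for i in range(0, count, max_read_count):
            read_count = min(max_read_count, count - i)
            data = fp.read(int(read_count * dtype.itemsize))
            array[i:i + read_count] = np.frombuffer(data, dtype=dtype,
                                                    count=read_count)
        return array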
14 changes: 7 additions & 7 deletions numpy/lib/npyio.py
@@ -25,8 +25,6 @@
    asbytes, asstr, asbytes_nested, bytes, basestring, unicode
    )

-from io import BytesIO
-
if sys.version_info[0] >= 3:
    import pickle
else:
@@ -244,12 +242,14 @@ def __getitem__(self, key):
            member = 1
            key += '.npy'
        if member:
-            bytes = self.zip.read(key)
-            if bytes.startswith(format.MAGIC_PREFIX):
-                value = BytesIO(bytes)
-                return format.read_array(value)
+            bytes = self.zip.open(key)
+            magic = bytes.read(len(format.MAGIC_PREFIX))
+            bytes.close()
+            if magic == format.MAGIC_PREFIX:
+                bytes = self.zip.open(key)
+                return format.read_array(bytes)
            else:
-                return bytes
+                return self.zip.read(key)
        else:
            raise KeyError("%s is not a file in the archive" % key)

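The rewritten __getitem__ streams each zip member into format.read_array instead of loading it whole with self.zip.read and wrapping it in BytesIO, which is what removes the 2GB ceiling. In isolation, the peek-then-reopen pattern looks roughly like this (archive and member names are hypothetical; the member is reopened because the peek consumes the magic bytes and a compressed zip stream cannot seek backwards):

    import zipfile
    from numpy.lib import format

    with zipfile.ZipFile('archive.npz') as zf:    # hypothetical archive
        stream = zf.open('a.npy')
        magic = stream.read(len(format.MAGIC_PREFIX))
        stream.close()
        if magic == format.MAGIC_PREFIX:
            stream = zf.open('a.npy')             # reopen from the start
            arr = format.read_array(stream)       # stream-parse the .npy
        else:
            raw = zf.read('a.npy')                # not .npy: raw bytes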
17 changes: 16 additions & 1 deletion numpy/lib/tests/test_io.py
@@ -4,10 +4,10 @@
import gzip
import os
import threading
+from tempfile import mkstemp, mktemp, NamedTemporaryFile
import time
import warnings
import gc
-from tempfile import mkstemp, NamedTemporaryFile
from io import BytesIO
from datetime import datetime
from numpy.testing.utils import WarningManager
@@ -43,6 +43,8 @@ def writelines(self, lines):


MAJVER, MINVER = sys.version_info[:2]
+IS_64BIT = sys.maxsize > 2**32
+

def strptime(s, fmt=None):
"""This function is available in the datetime module only
@@ -139,6 +141,19 @@ def roundtrip(self, *args, **kwargs):
        for n, arr in enumerate(self.arr):
            assert_equal(arr, self.arr_reloaded['arr_%d' % n])

+    @np.testing.dec.skipif(not IS_64BIT, "Works only with 64bit systems")
+    @np.testing.dec.slow
+    def test_big_arrays(self):
+        L = (1 << 31) + 100000
+        tmp = mktemp(suffix='.npz')
+        a = np.empty(L, dtype=np.uint8)
+        np.savez(tmp, a=a)
+        del a
+        npfile = np.load(tmp)
+        a = npfile['a']
+        npfile.close()
+        os.remove(tmp)
+
    def test_multiple_arrays(self):
        a = np.array([[1, 2], [3, 4]], float)
        b = np.array([[1 + 2j, 2 + 7j], [3 - 6j, 4 + 12j]], complex)
