Merge pull request #2942 from btel/npz_optimize
BUG: Fix loading npz files >2GB on 64bit systems
charris committed Jun 12, 2013
2 parents d4b4ff0 + 7c4e9e1 commit f7ea474
Showing 4 changed files with 50 additions and 11 deletions.
10 changes: 10 additions & 0 deletions doc/release/1.8.0-notes.rst
@@ -80,6 +80,11 @@ fact, it never was true in some corner cases). Instead, use
For more information check the "Internal memory layout of an ndarray"
section in the documentation.

+IO compatibility with large files
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Large NPZ files >2GB can be loaded on 64-bit systems.
+

New Features
============
@@ -136,6 +141,11 @@ A simple test runner script ``runtests.py`` was added. It also builds Numpy via
Improvements
============

+IO performance improvements
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Performance in reading large files was improved by chunking (see also IO compatibility).
+
Performance improvements to `pad`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The `pad` function has a new implementation, greatly improving performance for
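As a quick illustration of the behavior the notes describe, a round trip like the following previously failed past the 2GB mark and should now work on a 64-bit Python (a minimal sketch; the path and size here are made up for illustration):

    import numpy as np

    # A bit over 2 GB of uint8 data; requires a 64-bit Python and
    # enough free disk space.
    a = np.zeros((1 << 31) + 1000, dtype=np.uint8)
    np.savez('/tmp/big.npz', a=a)    # hypothetical path

    npz = np.load('/tmp/big.npz')
    b = npz['a']                     # now read back in bounded chunks
    npz.close()
    assert b.nbytes == a.nbytes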
20 changes: 17 additions & 3 deletions numpy/lib/format.py
@@ -148,6 +148,7 @@

MAGIC_PREFIX = asbytes('\x93NUMPY')
MAGIC_LEN = len(MAGIC_PREFIX) + 2
+BUFFER_SIZE = 2 ** 18  # size of buffer for reading npz files in bytes

def magic(major, minor):
    """ Return the magic string for the given file format version.
@@ -457,9 +458,22 @@ def read_array(fp):
    else:
        # This is not a real file. We have to read it the memory-intensive
        # way.
-        # XXX: we can probably chunk this to avoid the memory hit.
-        data = fp.read(int(count * dtype.itemsize))
-        array = numpy.fromstring(data, dtype=dtype, count=count)
+        # The crc32 module fails on reads greater than 2 ** 32 bytes,
+        # breaking large reads from gzip streams. Chunk reads to
+        # BUFFER_SIZE bytes to avoid the issue and to reduce the memory
+        # overhead of the read. In the non-chunked case
+        # count < max_read_count, so only one read is performed.
+
+        max_read_count = BUFFER_SIZE // dtype.itemsize
+
+        array = numpy.empty(count, dtype=dtype)
+
+        for i in range(0, count, max_read_count):
+            read_count = min(max_read_count, count - i)
+
+            data = fp.read(int(read_count * dtype.itemsize))
+            array[i:i+read_count] = numpy.frombuffer(data, dtype=dtype,
+                                                     count=read_count)

    if fortran_order:
        array.shape = shape[::-1]
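The loop above replaces a single fp.read of the entire payload with reads bounded at BUFFER_SIZE bytes, which both avoids the crc32 overflow on gzip streams and caps the size of the temporary bytes object. A standalone sketch of the same chunking technique (the helper name read_chunked and its default buffer size are illustrative, not numpy API):

    import numpy as np

    def read_chunked(fp, count, dtype, buffer_size=2 ** 18):
        # Read `count` items of `dtype` from the file-like `fp`,
        # requesting at most `buffer_size` bytes per call to fp.read.
        dtype = np.dtype(dtype)
        # Always read at least one item, even for very large itemsizes.
        max_read_count = max(1, buffer_size // dtype.itemsize)
        array = np.empty(count, dtype=dtype)
        for i in range(0, count, max_read_count):
            read_count = min(max_read_count, count - i)
            data = fp.read(int(read_count * dtype.itemsize))
            array[i:i + read_count] = np.frombuffer(data, dtype=dtype,
                                                    count=read_count)
        return array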
14 changes: 7 additions & 7 deletions numpy/lib/npyio.py
@@ -25,8 +25,6 @@
    asbytes, asstr, asbytes_nested, bytes, basestring, unicode
    )

-from io import BytesIO
-
if sys.version_info[0] >= 3:
    import pickle
else:
@@ -244,12 +242,14 @@ def __getitem__(self, key):
            member = 1
            key += '.npy'
        if member:
-            bytes = self.zip.read(key)
-            if bytes.startswith(format.MAGIC_PREFIX):
-                value = BytesIO(bytes)
-                return format.read_array(value)
+            bytes = self.zip.open(key)
+            magic = bytes.read(len(format.MAGIC_PREFIX))
+            bytes.close()
+            if magic == format.MAGIC_PREFIX:
+                bytes = self.zip.open(key)
+                return format.read_array(bytes)
            else:
-                return bytes
+                return self.zip.read(key)
        else:
            raise KeyError("%s is not a file in the archive" % key)

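The rewritten __getitem__ streams each zip member into format.read_array instead of loading it whole with self.zip.read and wrapping it in BytesIO, which is what removes the 2GB ceiling. In isolation, the peek-then-reopen pattern looks roughly like this (archive and member names are hypothetical; the member is reopened because the peek consumes the magic bytes and a compressed zip stream cannot seek backwards):

    import zipfile
    from numpy.lib import format

    with zipfile.ZipFile('archive.npz') as zf:    # hypothetical archive
        stream = zf.open('a.npy')
        magic = stream.read(len(format.MAGIC_PREFIX))
        stream.close()
        if magic == format.MAGIC_PREFIX:
            stream = zf.open('a.npy')             # reopen from the start
            arr = format.read_array(stream)       # stream-parse the .npy
        else:
            raw = zf.read('a.npy')                # not .npy: raw bytes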
17 changes: 16 additions & 1 deletion numpy/lib/tests/test_io.py
@@ -4,10 +4,10 @@
import gzip
import os
import threading
+from tempfile import mkstemp, mktemp, NamedTemporaryFile
import time
import warnings
import gc
-from tempfile import mkstemp, NamedTemporaryFile
from io import BytesIO
from datetime import datetime
from numpy.testing.utils import WarningManager
@@ -43,6 +43,8 @@ def writelines(self, lines):


MAJVER, MINVER = sys.version_info[:2]
+IS_64BIT = sys.maxsize > 2**32
+

def strptime(s, fmt=None):
"""This function is available in the datetime module only
@@ -139,6 +141,19 @@ def roundtrip(self, *args, **kwargs):
        for n, arr in enumerate(self.arr):
            assert_equal(arr, self.arr_reloaded['arr_%d' % n])

+    @np.testing.dec.skipif(not IS_64BIT, "Works only with 64bit systems")
+    @np.testing.dec.slow
+    def test_big_arrays(self):
+        L = (1 << 31) + 100000
+        tmp = mktemp(suffix='.npz')
+        a = np.empty(L, dtype=np.uint8)
+        np.savez(tmp, a=a)
+        del a
+        npfile = np.load(tmp)
+        a = npfile['a']
+        npfile.close()
+        os.remove(tmp)
+
    def test_multiple_arrays(self):
        a = np.array([[1, 2], [3, 4]], float)
        b = np.array([[1 + 2j, 2 + 7j], [3 - 6j, 4 + 12j]], complex)
