mongodb · ShaneHarvey · Nov 7, 2022 · Nov 4, 2022 · Nov 4, 2022 · Nov 4, 2022
@@ -4,10 +4,23 @@ Changelog
 Changes in Version 4.3.3
 ------------------------
 
-- Fixed a performance regression in :meth:`~gridfs.GridOut.download_to_stream`
-  and :meth:`~gridfs.GridOut.download_to_stream_by_name` by reading in chunks
-  instead of line by line.
+Version 4.3.3 fixes a number of bugs:
 
+- Fixed a performance regression in :meth:`~gridfs.GridFSBucket.download_to_stream`
+  and :meth:`~gridfs.GridFSBucket.download_to_stream_by_name` by reading in chunks
+  instead of line by line (`PYTHON-3502`_).
+- Improved performance of :meth:`gridfs.grid_file.GridOut.read` and
+  :meth:`gridfs.grid_file.GridOut.readline` (`PYTHON-3508`_).
+
+Issues Resolved
+...............
+
+See the `PyMongo 4.3.3 release notes in JIRA`_ for the list of resolved issues
+in this release.
+
+.. _PYTHON-3502: https://jira.mongodb.org/browse/PYTHON-3502
+.. _PYTHON-3508: https://jira.mongodb.org/browse/PYTHON-3508
+.. _PyMongo 4.3.3 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=34709
 
 Changes in Version 4.3 (4.3.2)
 ------------------------------

@@ -463,7 +463,10 @@ def __init__(
         self.__files = root_collection.files
         self.__file_id = file_id
         self.__buffer = EMPTY
+        # Start position within the current buffered chunk.
+        self.__buffer_pos = 0
         self.__chunk_iter = None
+        # Position within the total file.
         self.__position = 0
         self._file = file_document
         self._session = session
@@ -510,12 +513,12 @@ def readchunk(self) -> bytes:
         """Reads a chunk at a time. If the current position is within a
         chunk the remainder of the chunk is returned.
         """
-        received = len(self.__buffer)
+        received = len(self.__buffer) - self.__buffer_pos
         chunk_data = EMPTY
         chunk_size = int(self.chunk_size)
 
         if received > 0:
-            chunk_data = self.__buffer
+            chunk_data = self.__buffer[self.__buffer_pos :]
         elif self.__position < int(self.length):
             chunk_number = int((received + self.__position) / chunk_size)
             if self.__chunk_iter is None:
@@ -531,25 +534,12 @@ def readchunk(self) -> bytes:
 
         self.__position += len(chunk_data)
         self.__buffer = EMPTY
+        self.__buffer_pos = 0
         return chunk_data
 
-    def read(self, size: int = -1) -> bytes:
-        """Read at most `size` bytes from the file (less if there
-        isn't enough data).
-
-        The bytes are returned as an instance of :class:`str` (:class:`bytes`
-        in python 3). If `size` is negative or omitted all data is read.
-
-        :Parameters:
-          - `size` (optional): the number of bytes to read
-
-        .. versionchanged:: 3.8
-           This method now only checks for extra chunks after reading the
-           entire file. Previously, this method would check for extra chunks
-           on every call.
-        """
+    def _read_size_or_line(self, size: int = -1, line: bool = False) -> bytes:
+        """Internal read() and readline() helper."""
         self._ensure_file()
-
         remainder = int(self.length) - self.__position
         if size < 0 or size > remainder:
             size = remainder
@@ -558,11 +548,36 @@ def read(self, size: int = -1) -> bytes:
             return EMPTY
 
         received = 0
-        data = io.BytesIO()
+        data = []
         while received < size:
-            chunk_data = self.readchunk()
+            needed = size - received
+            if self.__buffer:
+                # Optimization: Read the buffer with zero byte copies.
+                buf = self.__buffer
+                chunk_start = self.__buffer_pos
+                chunk_data = memoryview(buf)[self.__buffer_pos :]
+                self.__buffer = EMPTY
+                self.__buffer_pos = 0
+                self.__position += len(chunk_data)
+            else:
+                buf = self.readchunk()
+                chunk_start = 0
+                chunk_data = memoryview(buf)
+            if line:
+                pos = buf.find(NEWLN, chunk_start, chunk_start + needed) - chunk_start
+                if pos >= 0:
+                    # Newline found: shrink size so the loop stops right after it.
+                    size = received + pos + 1
+                    needed = pos + 1
+            if len(chunk_data) > needed:
+                data.append(chunk_data[:needed])
+                # Optimization: Save the buffer with zero byte copies.
+                self.__buffer = buf
+                self.__buffer_pos = chunk_start + needed
+                self.__position -= len(self.__buffer) - self.__buffer_pos
+            else:
+                data.append(chunk_data)
             received += len(chunk_data)
-            data.write(chunk_data)
 
         # Detect extra chunks after reading the entire file.
         if size == remainder and self.__chunk_iter:
@@ -571,47 +586,32 @@ def read(self, size: int = -1) -> bytes:
             except StopIteration:
                 pass
 
-        self.__position -= received - size
+        return b"".join(data)
+
+    def read(self, size: int = -1) -> bytes:
+        """Read at most `size` bytes from the file (less if there
+        isn't enough data).
+
+        The bytes are returned as an instance of :class:`bytes`.
+        If `size` is negative or omitted all data is read.
+
+        :Parameters:
+          - `size` (optional): the number of bytes to read
 
-        # Return 'size' bytes and store the rest.
-        data.seek(size)
-        self.__buffer = data.read()
-        data.seek(0)
-        return data.read(size)
+        .. versionchanged:: 3.8
+           This method now only checks for extra chunks after reading the
+           entire file. Previously, this method would check for extra chunks
+           on every call.
+        """
+        return self._read_size_or_line(size=size)
 
     def readline(self, size: int = -1) -> bytes:  # type: ignore[override]
         """Read one line or up to `size` bytes from the file.
 
         :Parameters:
          - `size` (optional): the maximum number of bytes to read
         """
-        remainder = int(self.length) - self.__position
-        if size < 0 or size > remainder:
-            size = remainder
-
-        if size == 0:
-            return EMPTY
-
-        received = 0
-        data = io.BytesIO()
-        while received < size:
-            chunk_data = self.readchunk()
-            pos = chunk_data.find(NEWLN, 0, size)
-            if pos != -1:
-                size = received + pos + 1
-
-            received += len(chunk_data)
-            data.write(chunk_data)
-            if pos != -1:
-                break
-
-        self.__position -= received - size
-
-        # Return 'size' bytes and store the rest.
-        data.seek(size)
-        self.__buffer = data.read()
-        data.seek(0)
-        return data.read(size)
+        return self._read_size_or_line(size=size, line=True)
 
     def tell(self) -> int:
         """Return the current position of this file."""
@@ -651,6 +651,7 @@ def seek(self, pos: int, whence: int = _SEEK_SET) -> int:
 
         self.__position = new_pos
         self.__buffer = EMPTY
+        self.__buffer_pos = 0
         if self.__chunk_iter:
             self.__chunk_iter.close()
             self.__chunk_iter = None