From 9bc72aedb6dfbc2e5a7bc34359b12f85f6154c0c Mon Sep 17 00:00:00 2001
From: Shane Harvey <shnhrv@gmail.com>
Date: Fri, 4 Nov 2022 12:59:33 -0700
Subject: [PATCH 1/4] PYTHON-3508 Improve the performance of GridOut.readline
 and GridOut.read

---
 gridfs/grid_file.py | 90 +++++++++++++++++++--------------------------
 1 file changed, 38 insertions(+), 52 deletions(-)

diff --git a/gridfs/grid_file.py b/gridfs/grid_file.py
index cec7d57a22..f64ca267e1 100644
--- a/gridfs/grid_file.py
+++ b/gridfs/grid_file.py
@@ -533,23 +533,9 @@ def readchunk(self) -> bytes:
         self.__buffer = EMPTY
         return chunk_data
 
-    def read(self, size: int = -1) -> bytes:
-        """Read at most `size` bytes from the file (less if there
-        isn't enough data).
-
-        The bytes are returned as an instance of :class:`str` (:class:`bytes`
-        in python 3). If `size` is negative or omitted all data is read.
-
-        :Parameters:
-          - `size` (optional): the number of bytes to read
-
-        .. versionchanged:: 3.8
-           This method now only checks for extra chunks after reading the
-           entire file. Previously, this method would check for extra chunks
-           on every call.
-        """
+    def _read_size_or_line(self, size: int = -1, line: bool = False) -> bytes:
+        """Internal read() and readline() helper."""
         self._ensure_file()
-
         remainder = int(self.length) - self.__position
         if size < 0 or size > remainder:
             size = remainder
@@ -558,26 +544,52 @@ def read(self, size: int = -1) -> bytes:
             return EMPTY
 
         received = 0
-        data = io.BytesIO()
+        data = []
+        extra = EMPTY
         while received < size:
+            needed = size - received
             chunk_data = self.readchunk()
             received += len(chunk_data)
-            data.write(chunk_data)
+            if line:
+                pos = chunk_data.find(NEWLN, 0, needed)
+                if pos != -1:
+                    data.append(chunk_data[: pos + 1])
+                    extra = chunk_data[pos + 1 :]
+                    break
+            if len(chunk_data) > needed:
+                data.append(chunk_data[:needed])
+                extra = chunk_data[needed:]
+            else:
+                data.append(chunk_data)
+
+        self.__buffer = extra
+        self.__position -= len(extra)
 
         # Detect extra chunks after reading the entire file.
-        if size == remainder and self.__chunk_iter:
+        if size == remainder and self.__chunk_iter and not extra:
             try:
                 self.__chunk_iter.next()
             except StopIteration:
                 pass
 
-        self.__position -= received - size
+        return b"".join(data)
 
-        # Return 'size' bytes and store the rest.
-        data.seek(size)
-        self.__buffer = data.read()
-        data.seek(0)
-        return data.read(size)
+    def read(self, size: int = -1) -> bytes:
+        """Read at most `size` bytes from the file (less if there
+        isn't enough data).
+
+        The bytes are returned as an instance of :class:`str` (:class:`bytes`
+        in python 3). If `size` is negative or omitted all data is read.
+
+        :Parameters:
+          - `size` (optional): the number of bytes to read
+
+        .. versionchanged:: 3.8
+           This method now only checks for extra chunks after reading the
+           entire file. Previously, this method would check for extra chunks
+           on every call.
+        """
+        return self._read_size_or_line(size=size)
 
     def readline(self, size: int = -1) -> bytes:  # type: ignore[override]
         """Read one line or up to `size` bytes from the file.
@@ -585,33 +597,7 @@ def readline(self, size: int = -1) -> bytes:  # type: ignore[override]
         :Parameters:
          - `size` (optional): the maximum number of bytes to read
         """
-        remainder = int(self.length) - self.__position
-        if size < 0 or size > remainder:
-            size = remainder
-
-        if size == 0:
-            return EMPTY
-
-        received = 0
-        data = io.BytesIO()
-        while received < size:
-            chunk_data = self.readchunk()
-            pos = chunk_data.find(NEWLN, 0, size)
-            if pos != -1:
-                size = received + pos + 1
-
-            received += len(chunk_data)
-            data.write(chunk_data)
-            if pos != -1:
-                break
-
-        self.__position -= received - size
-
-        # Return 'size' bytes and store the rest.
-        data.seek(size)
-        self.__buffer = data.read()
-        data.seek(0)
-        return data.read(size)
+        return self._read_size_or_line(size=size, line=True)
 
     def tell(self) -> int:
         """Return the current position of this file."""

From ecfa62343fa010221f62d1a5e94420fd1bc43e37 Mon Sep 17 00:00:00 2001
From: Shane Harvey <shnhrv@gmail.com>
Date: Fri, 4 Nov 2022 15:18:16 -0700
Subject: [PATCH 2/4] PYTHON-3508 Futher improve read() readline() by avoiding
 memory copies on cached buffer

---
 gridfs/grid_file.py | 44 +++++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/gridfs/grid_file.py b/gridfs/grid_file.py
index f64ca267e1..2bb1a0cd34 100644
--- a/gridfs/grid_file.py
+++ b/gridfs/grid_file.py
@@ -463,7 +463,10 @@ def __init__(
         self.__files = root_collection.files
         self.__file_id = file_id
         self.__buffer = EMPTY
+        # Start position within the current buffered chunk.
+        self.__buffer_pos = 0
         self.__chunk_iter = None
+        # Position within the total file.
         self.__position = 0
         self._file = file_document
         self._session = session
@@ -510,12 +513,12 @@ def readchunk(self) -> bytes:
         """Reads a chunk at a time. If the current position is within a
         chunk the remainder of the chunk is returned.
         """
-        received = len(self.__buffer)
+        received = len(self.__buffer) - self.__buffer_pos
         chunk_data = EMPTY
         chunk_size = int(self.chunk_size)
 
         if received > 0:
-            chunk_data = self.__buffer
+            chunk_data = self.__buffer[self.__buffer_pos :]
         elif self.__position < int(self.length):
             chunk_number = int((received + self.__position) / chunk_size)
             if self.__chunk_iter is None:
@@ -531,6 +534,7 @@ def readchunk(self) -> bytes:
 
         self.__position += len(chunk_data)
         self.__buffer = EMPTY
+        self.__buffer_pos = 0
         return chunk_data
 
     def _read_size_or_line(self, size: int = -1, line: bool = False) -> bytes:
@@ -545,28 +549,37 @@ def _read_size_or_line(self, size: int = -1, line: bool = False) -> bytes:
 
         received = 0
         data = []
-        extra = EMPTY
         while received < size:
             needed = size - received
-            chunk_data = self.readchunk()
-            received += len(chunk_data)
+            if self.__buffer:
+                # Optimization: Read the buffer with zero byte copies.
+                buf = self.__buffer
+                chunk_start = self.__buffer_pos
+                chunk_data = memoryview(buf)[self.__buffer_pos :]
+                self.__buffer = EMPTY
+                self.__buffer_pos = 0
+                self.__position += len(chunk_data)
+            else:
+                buf = chunk_data = self.readchunk()
+                chunk_start = 0
             if line:
-                pos = chunk_data.find(NEWLN, 0, needed)
-                if pos != -1:
-                    data.append(chunk_data[: pos + 1])
-                    extra = chunk_data[pos + 1 :]
-                    break
+                pos = buf.find(NEWLN, chunk_start, chunk_start + needed) - chunk_start
+                if pos >= 0:
+                    # Decrease size to exit the loop.
+                    size = received + pos + 1
+                    needed = pos + 1
             if len(chunk_data) > needed:
                 data.append(chunk_data[:needed])
-                extra = chunk_data[needed:]
+                # Optimization: Save the buffer with zero byte copies.
+                self.__buffer = buf
+                self.__buffer_pos = chunk_start + needed
+                self.__position -= len(self.__buffer) - self.__buffer_pos
             else:
                 data.append(chunk_data)
-
-        self.__buffer = extra
-        self.__position -= len(extra)
+            received += len(chunk_data)
 
         # Detect extra chunks after reading the entire file.
-        if size == remainder and self.__chunk_iter and not extra:
+        if size == remainder and self.__chunk_iter:
             try:
                 self.__chunk_iter.next()
             except StopIteration:
@@ -637,6 +650,7 @@ def seek(self, pos: int, whence: int = _SEEK_SET) -> int:
 
         self.__position = new_pos
         self.__buffer = EMPTY
+        self.__buffer_pos = 0
         if self.__chunk_iter:
             self.__chunk_iter.close()
             self.__chunk_iter = None

From 375db12d71ba4fd1d62bb4f64236aa32831226c2 Mon Sep 17 00:00:00 2001
From: Shane Harvey <shnhrv@gmail.com>
Date: Fri, 4 Nov 2022 15:46:46 -0700
Subject: [PATCH 3/4] PYTHON-3508 Add changelog

---
 doc/changelog.rst | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/doc/changelog.rst b/doc/changelog.rst
index 4f4e5ace71..b3587e04ca 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -4,10 +4,23 @@ Changelog
 Changes in Version 4.3.3
 ------------------------
 
-- Fixed a performance regression in :meth:`~gridfs.GridOut.download_to_stream`
-  and :meth:`~gridfs.GridOut.download_to_stream_by_name` by reading in chunks
-  instead of line by line.
+Version 4.3.3 fixes a number of bugs:
 
+- Fixed a performance regression in :meth:`~gridfs.GridFSBucket.download_to_stream`
+  and :meth:`~gridfs.GridFSBucket.download_to_stream_by_name` by reading in chunks
+  instead of line by line (`PYTHON-3502`_).
+- Improved performance of :meth:`gridfs.grid_file.GridOut.read` and
+  :meth:`gridfs.grid_file.GridOut.readline` (`PYTHON-3508`_).
+
+Issues Resolved
+...............
+
+See the `PyMongo 4.3.3 release notes in JIRA`_ for the list of resolved issues
+in this release.
+
+.. _PYTHON-3502: https://jira.mongodb.org/browse/PYTHON-3502
+.. _PYTHON-3508: https://jira.mongodb.org/browse/PYTHON-3508
+.. _PyMongo 4.3.3 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=34709
 
 Changes in Version 4.3 (4.3.2)
 ------------------------------

From a7cb2761ef040e549dbd19786a5a41294268cd4c Mon Sep 17 00:00:00 2001
From: Shane Harvey <shnhrv@gmail.com>
Date: Fri, 4 Nov 2022 15:53:05 -0700
Subject: [PATCH 4/4] PYTHON-3508 Fix mypy error

---
 gridfs/grid_file.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gridfs/grid_file.py b/gridfs/grid_file.py
index 2bb1a0cd34..50efc0cd23 100644
--- a/gridfs/grid_file.py
+++ b/gridfs/grid_file.py
@@ -560,8 +560,9 @@ def _read_size_or_line(self, size: int = -1, line: bool = False) -> bytes:
                 self.__buffer_pos = 0
                 self.__position += len(chunk_data)
             else:
-                buf = chunk_data = self.readchunk()
+                buf = self.readchunk()
                 chunk_start = 0
+                chunk_data = memoryview(buf)
             if line:
                 pos = buf.find(NEWLN, chunk_start, chunk_start + needed) - chunk_start
                 if pos >= 0: