From 9bc72aedb6dfbc2e5a7bc34359b12f85f6154c0c Mon Sep 17 00:00:00 2001 From: Shane Harvey Date: Fri, 4 Nov 2022 12:59:33 -0700 Subject: [PATCH 1/4] PYTHON-3508 Improve the performance of GridOut.readline and GridOut.read --- gridfs/grid_file.py | 90 +++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 52 deletions(-) diff --git a/gridfs/grid_file.py b/gridfs/grid_file.py index cec7d57a22..f64ca267e1 100644 --- a/gridfs/grid_file.py +++ b/gridfs/grid_file.py @@ -533,23 +533,9 @@ def readchunk(self) -> bytes: self.__buffer = EMPTY return chunk_data - def read(self, size: int = -1) -> bytes: - """Read at most `size` bytes from the file (less if there - isn't enough data). - - The bytes are returned as an instance of :class:`str` (:class:`bytes` - in python 3). If `size` is negative or omitted all data is read. - - :Parameters: - - `size` (optional): the number of bytes to read - - .. versionchanged:: 3.8 - This method now only checks for extra chunks after reading the - entire file. Previously, this method would check for extra chunks - on every call. - """ + def _read_size_or_line(self, size: int = -1, line: bool = False) -> bytes: + """Internal read() and readline() helper.""" self._ensure_file() - remainder = int(self.length) - self.__position if size < 0 or size > remainder: size = remainder @@ -558,26 +544,52 @@ def read(self, size: int = -1) -> bytes: return EMPTY received = 0 - data = io.BytesIO() + data = [] + extra = EMPTY while received < size: + needed = size - received chunk_data = self.readchunk() received += len(chunk_data) - data.write(chunk_data) + if line: + pos = chunk_data.find(NEWLN, 0, needed) + if pos != -1: + data.append(chunk_data[: pos + 1]) + extra = chunk_data[pos + 1 :] + break + if len(chunk_data) > needed: + data.append(chunk_data[:needed]) + extra = chunk_data[needed:] + else: + data.append(chunk_data) + + self.__buffer = extra + self.__position -= len(extra) # Detect extra chunks after reading the entire file. - if size == remainder and self.__chunk_iter: + if size == remainder and self.__chunk_iter and not extra: try: self.__chunk_iter.next() except StopIteration: pass - self.__position -= received - size + return b"".join(data) - # Return 'size' bytes and store the rest. - data.seek(size) - self.__buffer = data.read() - data.seek(0) - return data.read(size) + def read(self, size: int = -1) -> bytes: + """Read at most `size` bytes from the file (less if there + isn't enough data). + + The bytes are returned as an instance of :class:`str` (:class:`bytes` + in python 3). If `size` is negative or omitted all data is read. + + :Parameters: + - `size` (optional): the number of bytes to read + + .. versionchanged:: 3.8 + This method now only checks for extra chunks after reading the + entire file. Previously, this method would check for extra chunks + on every call. + """ + return self._read_size_or_line(size=size) def readline(self, size: int = -1) -> bytes: # type: ignore[override] """Read one line or up to `size` bytes from the file. @@ -585,33 +597,7 @@ def readline(self, size: int = -1) -> bytes: # type: ignore[override] :Parameters: - `size` (optional): the maximum number of bytes to read """ - remainder = int(self.length) - self.__position - if size < 0 or size > remainder: - size = remainder - - if size == 0: - return EMPTY - - received = 0 - data = io.BytesIO() - while received < size: - chunk_data = self.readchunk() - pos = chunk_data.find(NEWLN, 0, size) - if pos != -1: - size = received + pos + 1 - - received += len(chunk_data) - data.write(chunk_data) - if pos != -1: - break - - self.__position -= received - size - - # Return 'size' bytes and store the rest. - data.seek(size) - self.__buffer = data.read() - data.seek(0) - return data.read(size) + return self._read_size_or_line(size=size, line=True) def tell(self) -> int: """Return the current position of this file.""" From ecfa62343fa010221f62d1a5e94420fd1bc43e37 Mon Sep 17 00:00:00 2001 From: Shane Harvey Date: Fri, 4 Nov 2022 15:18:16 -0700 Subject: [PATCH 2/4] PYTHON-3508 Futher improve read() readline() by avoiding memory copies on cached buffer --- gridfs/grid_file.py | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/gridfs/grid_file.py b/gridfs/grid_file.py index f64ca267e1..2bb1a0cd34 100644 --- a/gridfs/grid_file.py +++ b/gridfs/grid_file.py @@ -463,7 +463,10 @@ def __init__( self.__files = root_collection.files self.__file_id = file_id self.__buffer = EMPTY + # Start position within the current buffered chunk. + self.__buffer_pos = 0 self.__chunk_iter = None + # Position within the total file. self.__position = 0 self._file = file_document self._session = session @@ -510,12 +513,12 @@ def readchunk(self) -> bytes: """Reads a chunk at a time. If the current position is within a chunk the remainder of the chunk is returned. """ - received = len(self.__buffer) + received = len(self.__buffer) - self.__buffer_pos chunk_data = EMPTY chunk_size = int(self.chunk_size) if received > 0: - chunk_data = self.__buffer + chunk_data = self.__buffer[self.__buffer_pos :] elif self.__position < int(self.length): chunk_number = int((received + self.__position) / chunk_size) if self.__chunk_iter is None: @@ -531,6 +534,7 @@ def readchunk(self) -> bytes: self.__position += len(chunk_data) self.__buffer = EMPTY + self.__buffer_pos = 0 return chunk_data def _read_size_or_line(self, size: int = -1, line: bool = False) -> bytes: @@ -545,28 +549,37 @@ def _read_size_or_line(self, size: int = -1, line: bool = False) -> bytes: received = 0 data = [] - extra = EMPTY while received < size: needed = size - received - chunk_data = self.readchunk() - received += len(chunk_data) + if self.__buffer: + # Optimization: Read the buffer with zero byte copies. + buf = self.__buffer + chunk_start = self.__buffer_pos + chunk_data = memoryview(buf)[self.__buffer_pos :] + self.__buffer = EMPTY + self.__buffer_pos = 0 + self.__position += len(chunk_data) + else: + buf = chunk_data = self.readchunk() + chunk_start = 0 if line: - pos = chunk_data.find(NEWLN, 0, needed) - if pos != -1: - data.append(chunk_data[: pos + 1]) - extra = chunk_data[pos + 1 :] - break + pos = buf.find(NEWLN, chunk_start, chunk_start + needed) - chunk_start + if pos >= 0: + # Decrease size to exit the loop. + size = received + pos + 1 + needed = pos + 1 if len(chunk_data) > needed: data.append(chunk_data[:needed]) - extra = chunk_data[needed:] + # Optimization: Save the buffer with zero byte copies. + self.__buffer = buf + self.__buffer_pos = chunk_start + needed + self.__position -= len(self.__buffer) - self.__buffer_pos else: data.append(chunk_data) - - self.__buffer = extra - self.__position -= len(extra) + received += len(chunk_data) # Detect extra chunks after reading the entire file. - if size == remainder and self.__chunk_iter and not extra: + if size == remainder and self.__chunk_iter: try: self.__chunk_iter.next() except StopIteration: @@ -637,6 +650,7 @@ def seek(self, pos: int, whence: int = _SEEK_SET) -> int: self.__position = new_pos self.__buffer = EMPTY + self.__buffer_pos = 0 if self.__chunk_iter: self.__chunk_iter.close() self.__chunk_iter = None From 375db12d71ba4fd1d62bb4f64236aa32831226c2 Mon Sep 17 00:00:00 2001 From: Shane Harvey Date: Fri, 4 Nov 2022 15:46:46 -0700 Subject: [PATCH 3/4] PYTHON-3508 Add changelog --- doc/changelog.rst | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index 4f4e5ace71..b3587e04ca 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -4,10 +4,23 @@ Changelog Changes in Version 4.3.3 ------------------------ -- Fixed a performance regression in :meth:`~gridfs.GridOut.download_to_stream` - and :meth:`~gridfs.GridOut.download_to_stream_by_name` by reading in chunks - instead of line by line. +Version 4.3.3 fixes a number of bugs: +- Fixed a performance regression in :meth:`~gridfs.GridFSBucket.download_to_stream` + and :meth:`~gridfs.GridFSBucket.download_to_stream_by_name` by reading in chunks + instead of line by line (`PYTHON-3502`_). +- Improved performance of :meth:`gridfs.grid_file.GridOut.read` and + :meth:`gridfs.grid_file.GridOut.readline` (`PYTHON-3508`_). + +Issues Resolved +............... + +See the `PyMongo 4.3.3 release notes in JIRA`_ for the list of resolved issues +in this release. + +.. _PYTHON-3502: https://jira.mongodb.org/browse/PYTHON-3502 +.. _PYTHON-3508: https://jira.mongodb.org/browse/PYTHON-3508 +.. _PyMongo 4.3.3 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=34709 Changes in Version 4.3 (4.3.2) ------------------------------ From a7cb2761ef040e549dbd19786a5a41294268cd4c Mon Sep 17 00:00:00 2001 From: Shane Harvey Date: Fri, 4 Nov 2022 15:53:05 -0700 Subject: [PATCH 4/4] PYTHON-3508 Fix mypy error --- gridfs/grid_file.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gridfs/grid_file.py b/gridfs/grid_file.py index 2bb1a0cd34..50efc0cd23 100644 --- a/gridfs/grid_file.py +++ b/gridfs/grid_file.py @@ -560,8 +560,9 @@ def _read_size_or_line(self, size: int = -1, line: bool = False) -> bytes: self.__buffer_pos = 0 self.__position += len(chunk_data) else: - buf = chunk_data = self.readchunk() + buf = self.readchunk() chunk_start = 0 + chunk_data = memoryview(buf) if line: pos = buf.find(NEWLN, chunk_start, chunk_start + needed) - chunk_start if pos >= 0: