Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,23 @@ Changelog
Changes in Version 4.3.3
------------------------

- Fixed a performance regression in :meth:`~gridfs.GridOut.download_to_stream`
and :meth:`~gridfs.GridOut.download_to_stream_by_name` by reading in chunks
instead of line by line.
Version 4.3.3 fixes a number of bugs:

- Fixed a performance regression in :meth:`~gridfs.GridFSBucket.download_to_stream`
and :meth:`~gridfs.GridFSBucket.download_to_stream_by_name` by reading in chunks
instead of line by line (`PYTHON-3502`_).
- Improved performance of :meth:`gridfs.grid_file.GridOut.read` and
:meth:`gridfs.grid_file.GridOut.readline` (`PYTHON-3508`_).

Issues Resolved
...............

See the `PyMongo 4.3.3 release notes in JIRA`_ for the list of resolved issues
in this release.

.. _PYTHON-3502: https://jira.mongodb.org/browse/PYTHON-3502
.. _PYTHON-3508: https://jira.mongodb.org/browse/PYTHON-3508
.. _PyMongo 4.3.3 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=34709

Changes in Version 4.3 (4.3.2)
------------------------------
Expand Down
109 changes: 55 additions & 54 deletions gridfs/grid_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,10 @@ def __init__(
self.__files = root_collection.files
self.__file_id = file_id
self.__buffer = EMPTY
# Start position within the current buffered chunk.
self.__buffer_pos = 0
self.__chunk_iter = None
# Position within the total file.
self.__position = 0
self._file = file_document
self._session = session
Expand Down Expand Up @@ -510,12 +513,12 @@ def readchunk(self) -> bytes:
"""Reads a chunk at a time. If the current position is within a
chunk the remainder of the chunk is returned.
"""
received = len(self.__buffer)
received = len(self.__buffer) - self.__buffer_pos
chunk_data = EMPTY
chunk_size = int(self.chunk_size)

if received > 0:
chunk_data = self.__buffer
chunk_data = self.__buffer[self.__buffer_pos :]
elif self.__position < int(self.length):
chunk_number = int((received + self.__position) / chunk_size)
if self.__chunk_iter is None:
Expand All @@ -531,25 +534,12 @@ def readchunk(self) -> bytes:

self.__position += len(chunk_data)
self.__buffer = EMPTY
self.__buffer_pos = 0
return chunk_data

def read(self, size: int = -1) -> bytes:
"""Read at most `size` bytes from the file (less if there
isn't enough data).

The bytes are returned as an instance of :class:`str` (:class:`bytes`
in Python 3). If `size` is negative or omitted, all data is read.

:Parameters:
- `size` (optional): the number of bytes to read

.. versionchanged:: 3.8
This method now only checks for extra chunks after reading the
entire file. Previously, this method would check for extra chunks
on every call.
"""
def _read_size_or_line(self, size: int = -1, line: bool = False) -> bytes:
"""Internal read() and readline() helper."""
self._ensure_file()

remainder = int(self.length) - self.__position
if size < 0 or size > remainder:
size = remainder
Expand All @@ -558,11 +548,36 @@ def read(self, size: int = -1) -> bytes:
return EMPTY

received = 0
data = io.BytesIO()
data = []
while received < size:
chunk_data = self.readchunk()
needed = size - received
if self.__buffer:
# Optimization: Read the buffer with zero byte copies.
buf = self.__buffer
chunk_start = self.__buffer_pos
chunk_data = memoryview(buf)[self.__buffer_pos :]
self.__buffer = EMPTY
self.__buffer_pos = 0
self.__position += len(chunk_data)
else:
buf = self.readchunk()
chunk_start = 0
chunk_data = memoryview(buf)
if line:
pos = buf.find(NEWLN, chunk_start, chunk_start + needed) - chunk_start
if pos >= 0:
# Decrease size to exit the loop.
size = received + pos + 1
needed = pos + 1
if len(chunk_data) > needed:
data.append(chunk_data[:needed])
# Optimization: Save the buffer with zero byte copies.
self.__buffer = buf
self.__buffer_pos = chunk_start + needed
self.__position -= len(self.__buffer) - self.__buffer_pos
else:
data.append(chunk_data)
received += len(chunk_data)
data.write(chunk_data)

# Detect extra chunks after reading the entire file.
if size == remainder and self.__chunk_iter:
Expand All @@ -571,47 +586,32 @@ def read(self, size: int = -1) -> bytes:
except StopIteration:
pass

self.__position -= received - size
return b"".join(data)

def read(self, size: int = -1) -> bytes:
"""Read at most `size` bytes from the file (less if there
isn't enough data).

The bytes are returned as an instance of :class:`str` (:class:`bytes`
in Python 3). If `size` is negative or omitted, all data is read.

:Parameters:
- `size` (optional): the number of bytes to read

# Return 'size' bytes and store the rest.
data.seek(size)
self.__buffer = data.read()
data.seek(0)
return data.read(size)
.. versionchanged:: 3.8
This method now only checks for extra chunks after reading the
entire file. Previously, this method would check for extra chunks
on every call.
"""
return self._read_size_or_line(size=size)

def readline(self, size: int = -1) -> bytes: # type: ignore[override]
"""Read one line or up to `size` bytes from the file.

:Parameters:
- `size` (optional): the maximum number of bytes to read
"""
remainder = int(self.length) - self.__position
if size < 0 or size > remainder:
size = remainder

if size == 0:
return EMPTY

received = 0
data = io.BytesIO()
while received < size:
chunk_data = self.readchunk()
pos = chunk_data.find(NEWLN, 0, size)
if pos != -1:
size = received + pos + 1

received += len(chunk_data)
data.write(chunk_data)
if pos != -1:
break

self.__position -= received - size

# Return 'size' bytes and store the rest.
data.seek(size)
self.__buffer = data.read()
data.seek(0)
return data.read(size)
return self._read_size_or_line(size=size, line=True)

def tell(self) -> int:
"""Return the current position of this file."""
Expand Down Expand Up @@ -651,6 +651,7 @@ def seek(self, pos: int, whence: int = _SEEK_SET) -> int:

self.__position = new_pos
self.__buffer = EMPTY
self.__buffer_pos = 0
if self.__chunk_iter:
self.__chunk_iter.close()
self.__chunk_iter = None
Expand Down