From 7cf00eec216a40e058dd5216c228a7916b86936e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 13:56:35 +0000 Subject: [PATCH 01/11] refactor: make JSON and XML loaders truly streaming/lazy Agent-Logs-Url: https://github.com/pickwicksoft/pystreamapi/sessions/82b1cf1d-99d7-44ca-b24f-32dd8310dc66 Co-authored-by: garlontas <70283087+garlontas@users.noreply.github.com> --- pyproject.toml | 4 +- pystreamapi/loaders/__json/__json_loader.py | 113 ++++++++++++++++---- pystreamapi/loaders/__xml/__xml_loader.py | 60 ++++++----- 3 files changed, 131 insertions(+), 46 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 88958a8..d1c783b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ packages = [ python = ">=3.10,<4.0" joblib = ">=1.2.0" defusedxml = { version = ">=0.7,<0.8", optional = true } +ijson = { version = ">=3.1", optional = true } pyyaml = "^6.0.1" tomlkit = "^0.13.2" setuptools = ">=70.0.0" @@ -24,7 +25,8 @@ setuptools = ">=70.0.0" xml_loader = ["defusedxml"] yaml_loader = ["pyyaml"] toml_loader = ["tomlkit"] -all = ["defusedxml", "pyyaml", "tomlkit"] +json_loader = ["ijson"] +all = ["defusedxml", "ijson", "pyyaml", "tomlkit"] [tool.poetry.group.test.dependencies] parameterized = "*" diff --git a/pystreamapi/loaders/__json/__json_loader.py b/pystreamapi/loaders/__json/__json_loader.py index 70364b1..c01071c 100644 --- a/pystreamapi/loaders/__json/__json_loader.py +++ b/pystreamapi/loaders/__json/__json_loader.py @@ -1,9 +1,60 @@ -import json as jsonlib +import io from collections import namedtuple from typing import Any, Iterator +try: + import ijson +except ImportError as exc: + raise ImportError( + "Please install the json_loader extra dependency (ijson) to use the json loader." + ) from exc + from pystreamapi.loaders.__loader_utils import LoaderUtils +_PEEK_SIZE = 4096 + + +class _TextToBytesWrapper: + """Wraps a text-mode file handle and converts its output to bytes for ijson.""" + + def __init__(self, handle, encoding='utf-8'): + self._handle = handle + self._encoding = encoding + + def read(self, size=-1): + data = self._handle.read(size) + if isinstance(data, str): + return data.encode(self._encoding) + return data if data else b'' + + +class _PeekableBytesReader: + """Replays a pre-read buffer before delegating further reads to the underlying source.""" + + def __init__(self, buffer: bytes, source): + self._buf = buffer + self._src = source + + def read(self, size=-1): + if size == -1: + tail = self._src.read() + if isinstance(tail, str): + tail = tail.encode('utf-8') + result = self._buf + tail + self._buf = b'' + return result + if len(self._buf) >= size: + result = self._buf[:size] + self._buf = self._buf[size:] + return result + needed = size - len(self._buf) + more = self._src.read(needed) + if isinstance(more, str): + more = more.encode('utf-8') + result = self._buf + more + self._buf = b'' + return result + def json(src: str, read_from_src=False) -> Iterator[Any]: """ @@ -24,44 +75,64 @@ def json(src: str, read_from_src=False) -> Iterator[Any]: def __lazy_load_json_file(file_path: str) -> Iterator[Any]: - """Lazily read and parse a JSON file, yielding namedtuples.""" + """Lazily read and parse a JSON file, yielding namedtuples incrementally.""" def generator(): - """Generate namedtuples from the JSON file contents.""" + """Yield namedtuples from the JSON file using a streaming parser.""" # skipcq: PTC-W6004 with open(file_path, mode='r', encoding='utf-8') as jsonfile: - src = jsonfile.read() - if not src.strip(): - return - result = jsonlib.loads(src, object_hook=__dict_to_namedtuple) - if isinstance(result, list): - yield from result - else: - yield result + yield from __stream_json_items(jsonfile) return generator() def __lazy_load_json_string(json_string: str) -> Iterator[Any]: - """Lazily parse a JSON string, yielding namedtuples.""" + """Lazily parse a JSON string, yielding namedtuples incrementally.""" def generator(): - """Internal generator that yields namedtuples by parsing the JSON string on demand.""" - if not json_string.strip(): - return - result = jsonlib.loads(json_string, object_hook=__dict_to_namedtuple) - if isinstance(result, list): - yield from result - else: - yield result + """Yield namedtuples by streaming-parsing the JSON string.""" + yield from __stream_json_items(io.StringIO(json_string)) return generator() +def __stream_json_items(handle) -> Iterator[Any]: + """Stream JSON items from a text-mode file-like handle using ijson. + + Reads an initial chunk to detect whether the root value is an array or a + single object, then replays that chunk together with the remainder of the + handle through a bytes wrapper so that ijson can parse incrementally. + """ + initial = handle.read(_PEEK_SIZE) + if isinstance(initial, str): + initial_str = initial + initial_bytes = initial.encode('utf-8') + else: + initial_bytes = initial + initial_str = initial.decode('utf-8', errors='replace') + + stripped = initial_str.lstrip() + if not stripped: + return + + first_char = stripped[0] + reader = _PeekableBytesReader(initial_bytes, _TextToBytesWrapper(handle)) + + if first_char == '[': + for item in ijson.items(reader, 'item', use_float=True): + yield __dict_to_namedtuple(item) + else: + obj = next(ijson.items(reader, '', use_float=True), None) + if obj is not None: + yield __dict_to_namedtuple(obj) + + def __dict_to_namedtuple(d, name='Item'): - """Convert a dictionary to a namedtuple""" + """Convert a dictionary (and any nested dicts/lists) to namedtuples recursively.""" if isinstance(d, dict): fields = list(d.keys()) Item = namedtuple(name, fields) return Item(**{k: __dict_to_namedtuple(v, k) for k, v in d.items()}) + if isinstance(d, list): + return [__dict_to_namedtuple(item) for item in d] return d diff --git a/pystreamapi/loaders/__xml/__xml_loader.py b/pystreamapi/loaders/__xml/__xml_loader.py index 1a12aba..fc7d13c 100644 --- a/pystreamapi/loaders/__xml/__xml_loader.py +++ b/pystreamapi/loaders/__xml/__xml_loader.py @@ -1,3 +1,4 @@ +import io from typing import Iterator, Any try: @@ -36,36 +37,55 @@ def xml(src: str, read_from_src=False, retrieve_children=True, cast_types=True, def _lazy_parse_xml_file(file_path: str, encoding: str, retrieve_children: bool, cast_types: bool) -> Iterator[Any]: - """Lazily parse an XML file by reading its content and yielding parsed namedtuples.""" + """Lazily parse an XML file using iterparse, yielding namedtuples without reading all at once.""" def generator(): - """Generator that reads the XML file and yields parsed namedtuples lazily.""" + """Generator that streams XML elements from the file and yields namedtuples lazily.""" # skipcq: PTC-W6004 with open(file_path, mode='r', encoding=encoding) as xmlfile: - xml_string = xmlfile.read() - yield from _parse_xml_string_lazy(xml_string, retrieve_children, cast_types) + yield from _iterparse_xml(xmlfile, retrieve_children, cast_types) return generator() def _lazy_parse_xml_string(xml_string: str, retrieve_children: bool, cast_types: bool) -> Iterator[Any]: - """Lazily parse an XML string by yielding parsed namedtuples for each element.""" + """Lazily parse an XML string using iterparse, yielding namedtuples without a full DOM build.""" def generator(): - """Generator that yields parsed namedtuples from the XML string lazily.""" - yield from _parse_xml_string_lazy(xml_string, retrieve_children, cast_types) + """Generator that streams XML elements from a string source and yields namedtuples.""" + yield from _iterparse_xml(io.StringIO(xml_string), retrieve_children, cast_types) return generator() -def _parse_xml_string_lazy(xml_string: str, retrieve_children: bool, - cast_types: bool) -> Iterator[Any]: - """Parse an XML string into namedtuples, optionally yielding child elements lazily.""" - root = ElementTree.fromstring(xml_string) - parsed = __parse_xml(root, cast_types) - if retrieve_children: - yield from __flatten(parsed) - else: - yield parsed +def _iterparse_xml(source, retrieve_children: bool, cast_types: bool) -> Iterator[Any]: + """Drive iterparse over *source* and yield namedtuples incrementally. + + When *retrieve_children* is True each direct child of the root element is + converted and yielded as soon as its closing tag is encountered; the child + is then removed from the root so that memory is freed immediately. + + When *retrieve_children* is False the entire document is consumed and the + root element is converted and yielded once. + """ + depth = 0 + root = None + context = ElementTree.iterparse(source, events=('start', 'end')) + + for event, elem in context: + if event == 'start': + depth += 1 + if root is None: + root = elem + else: # 'end' + depth -= 1 + if retrieve_children: + if depth == 1: + yield __parse_xml(elem, cast_types) + root.remove(elem) + else: + if depth == 0: + yield __parse_xml(root, cast_types) + return def __parse_xml(element, cast_types: bool): @@ -106,11 +126,3 @@ def __filter_single_items(tag_dict): """Filter out single-item lists from a dictionary.""" return {key: value[0] if len(value) == 1 else value for key, value in tag_dict.items()} - -def __flatten(data): - """Yield flattened elements from a possibly nested structure.""" - for item in data: - if isinstance(item, list): - yield from item - else: - yield item From 17116c6682fa67cbaa19b5339652c89c9c3300d1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 13:59:06 +0000 Subject: [PATCH 02/11] address code review: add comments and type annotation Agent-Logs-Url: https://github.com/pickwicksoft/pystreamapi/sessions/82b1cf1d-99d7-44ca-b24f-32dd8310dc66 Co-authored-by: garlontas <70283087+garlontas@users.noreply.github.com> --- pystreamapi/loaders/__json/__json_loader.py | 10 +++++++++- pystreamapi/loaders/__xml/__xml_loader.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pystreamapi/loaders/__json/__json_loader.py b/pystreamapi/loaders/__json/__json_loader.py index c01071c..655e409 100644 --- a/pystreamapi/loaders/__json/__json_loader.py +++ b/pystreamapi/loaders/__json/__json_loader.py @@ -37,6 +37,8 @@ def __init__(self, buffer: bytes, source): def read(self, size=-1): if size == -1: + # Full-read path: used by non-chunking callers (e.g. test helpers). + # Streaming callers (like ijson) always pass an explicit chunk size. tail = self._src.read() if isinstance(tail, str): tail = tail.encode('utf-8') @@ -128,7 +130,13 @@ def __stream_json_items(handle) -> Iterator[Any]: def __dict_to_namedtuple(d, name='Item'): - """Convert a dictionary (and any nested dicts/lists) to namedtuples recursively.""" + """Convert a dictionary (and any nested dicts/lists) to namedtuples recursively. + + List values are materialised eagerly because namedtuple field values must be + concrete sequences. This is O(size of the current item) — the same behaviour + as the previous json.loads(object_hook=...) approach — while top-level streaming + (one item at a time) is handled by the ijson layer above. + """ if isinstance(d, dict): fields = list(d.keys()) Item = namedtuple(name, fields) diff --git a/pystreamapi/loaders/__xml/__xml_loader.py b/pystreamapi/loaders/__xml/__xml_loader.py index fc7d13c..800fc5e 100644 --- a/pystreamapi/loaders/__xml/__xml_loader.py +++ b/pystreamapi/loaders/__xml/__xml_loader.py @@ -57,7 +57,7 @@ def generator(): return generator() -def _iterparse_xml(source, retrieve_children: bool, cast_types: bool) -> Iterator[Any]: +def _iterparse_xml(source: "IO[Any]", retrieve_children: bool, cast_types: bool) -> Iterator[Any]: """Drive iterparse over *source* and yield namedtuples incrementally. When *retrieve_children* is True each direct child of the root element is From 18ca5257ffe343d92201eb5ad50a407b80452a06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20G=C3=A2rlon=C8=9Ba?= <70283087+garlontas@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:11:28 +0200 Subject: [PATCH 03/11] Add ijson to dependencies in tox.ini --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index e377225..85ebbab 100644 --- a/tox.ini +++ b/tox.ini @@ -11,6 +11,7 @@ deps = defusedxml pyyaml tomlkit + ijson commands = coverage run -m unittest discover -s tests -t tests --pattern 'test_*.py' coverage xml From 74e81a13cee0f50fe75d9bb668d396069be6f4ef Mon Sep 17 00:00:00 2001 From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com> Date: Mon, 13 Apr 2026 14:13:08 +0000 Subject: [PATCH 04/11] refactor: fix docstrings and remove trailing blank lines This PR refactors documentation and file formatting to comply with style guidelines. It wraps long docstring lines, cleans up extraneous blank lines at the ends of files, and adds missing module and function docstrings. - Doc line too long: The patch rewraps overly long docstrings into multiple lines, breaking sentences at natural boundaries and aligning each line with proper indentation. This ensures that all documentation lines stay within the prescribed maximum line length. - Multiple blank lines detected at end of the file: Removed extraneous blank lines at the end of files to enforce a single newline termination. This change eliminates unexpected trailing whitespace and maintains consistent file formatting. - Missing module/function docstring: Added descriptive module-level and function-level docstrings where none existed, including explanations of purpose, parameters, and return values. These additions improve code readability and satisfy documentation coverage requirements. > This Autofix was generated by AI. Please review the change before merging. --- pystreamapi/loaders/__json/__json_loader.py | 10 ++++++++++ pystreamapi/loaders/__xml/__xml_loader.py | 14 ++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/pystreamapi/loaders/__json/__json_loader.py b/pystreamapi/loaders/__json/__json_loader.py index 655e409..712a11b 100644 --- a/pystreamapi/loaders/__json/__json_loader.py +++ b/pystreamapi/loaders/__json/__json_loader.py @@ -14,28 +14,38 @@ _PEEK_SIZE = 4096 +""" +Module for wrapping text-mode file handles and converting their output to bytes for use with ijson. +""" + class _TextToBytesWrapper: """Wraps a text-mode file handle and converts its output to bytes for ijson.""" def __init__(self, handle, encoding='utf-8'): + """Initialize the wrapper with a file handle and text encoding.""" self._handle = handle self._encoding = encoding def read(self, size=-1): + """Read up to size characters from the handle and return bytes, encoding text as needed.""" data = self._handle.read(size) if isinstance(data, str): return data.encode(self._encoding) return data if data else b'' +"""Module providing a reader that replays a pre-read buffer before delegating further reads to an underlying source.""" + class _PeekableBytesReader: """Replays a pre-read buffer before delegating further reads to the underlying source.""" def __init__(self, buffer: bytes, source): + """Initialize the peekable bytes reader with a pre-read buffer and underlying source.""" self._buf = buffer self._src = source def read(self, size=-1): + """Read up to size bytes, replaying the pre-read buffer before reading from the underlying source.""" if size == -1: # Full-read path: used by non-chunking callers (e.g. test helpers). # Streaming callers (like ijson) always pass an explicit chunk size. diff --git a/pystreamapi/loaders/__xml/__xml_loader.py b/pystreamapi/loaders/__xml/__xml_loader.py index 800fc5e..d044d9e 100644 --- a/pystreamapi/loaders/__xml/__xml_loader.py +++ b/pystreamapi/loaders/__xml/__xml_loader.py @@ -37,9 +37,15 @@ def xml(src: str, read_from_src=False, retrieve_children=True, cast_types=True, def _lazy_parse_xml_file(file_path: str, encoding: str, retrieve_children: bool, cast_types: bool) -> Iterator[Any]: - """Lazily parse an XML file using iterparse, yielding namedtuples without reading all at once.""" + """ + Lazily parse an XML file using iterparse, yielding namedtuples + without reading all at once. + """ def generator(): - """Generator that streams XML elements from the file and yields namedtuples lazily.""" + """ + Generator that streams XML elements from the file and yields + namedtuples lazily. + """ # skipcq: PTC-W6004 with open(file_path, mode='r', encoding=encoding) as xmlfile: yield from _iterparse_xml(xmlfile, retrieve_children, cast_types) @@ -110,7 +116,7 @@ def __parse_single_element(element, cast_types: bool): return Item(sub_item) -def __parse_multiple_elements(element, cast_types: bool): + def __parse_multiple_elements(element, cast_types: bool): """Parse XML element with multiple children and convert it into a namedtuple.""" tag_dict = {} for e in element: @@ -122,7 +128,7 @@ def __parse_multiple_elements(element, cast_types: bool): return Item(*filtered_dict.values()) -def __filter_single_items(tag_dict): + def __filter_single_items(tag_dict): """Filter out single-item lists from a dictionary.""" return {key: value[0] if len(value) == 1 else value for key, value in tag_dict.items()} From 663680c6d3a339f65d9b4fad09c60be724bfd123 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 14:17:18 +0000 Subject: [PATCH 05/11] fix: resolve syntax/lint errors in xml and json loaders, run poetry lock Agent-Logs-Url: https://github.com/pickwicksoft/pystreamapi/sessions/5d738f28-a5b0-447a-b1aa-dcc973d07552 Co-authored-by: garlontas <70283087+garlontas@users.noreply.github.com> --- poetry.lock | 111 +++++++++++++++++++- pystreamapi/loaders/__json/__json_loader.py | 9 +- pystreamapi/loaders/__xml/__xml_loader.py | 5 +- 3 files changed, 113 insertions(+), 12 deletions(-) diff --git a/poetry.lock b/poetry.lock index 3aae21a..b2c3d56 100644 --- a/poetry.lock +++ b/poetry.lock @@ -436,6 +436,112 @@ type1 = ["xattr ; sys_platform == \"darwin\""] unicode = ["unicodedata2 (>=17.0.0) ; python_version <= \"3.14\""] woff = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "zopfli (>=0.1.4)"] +[[package]] +name = "ijson" +version = "3.5.0" +description = "Iterative JSON parser with standard Python iterator interfaces" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"json-loader\" or extra == \"all\"" +files = [ + {file = "ijson-3.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ea8dcac10d86adaeead454bc25c97b68d0bda573d5fd6f86f5e21cf8f7906f88"}, + {file = "ijson-3.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:92b0495bbb2150bbf14fc5d98fb6d76bcd1c526605a172709e602e6fedc96495"}, + {file = "ijson-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7af0c4c8943be8b09a4e57bdc1da6001dae7b36526d4154fe5c8224738d0921f"}, + {file = "ijson-3.5.0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:45887d5e84ff0d2b138c926cebd9071830733968afe8d9d12080b3c178c7f918"}, + {file = "ijson-3.5.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9a70b575be8e57a28c80e90ed349ad3a851c3478524c70e36e07d6092ecd12c9"}, + {file = "ijson-3.5.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2adeecd45830bfd5580ca79a584154713aabef0b9607e16249133df5d2859813"}, + {file = "ijson-3.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d873e72889e7fc5962ab58909f1adff338d7c2f49e450e5b5fe844eff8155a14"}, + {file = "ijson-3.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9a88c559456a79708592234d697645d92b599718f4cbbeaa6515f83ac63ca0ae"}, + {file = "ijson-3.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cf83f58ad50dc0d39a2105cb26d4f359b38f42cef68b913170d4d47d97d97ba5"}, + {file = "ijson-3.5.0-cp310-cp310-win32.whl", hash = "sha256:aec4580a7712a19b1f95cd41bed260fc6a31266d37ef941827772a4c199e8143"}, + {file = "ijson-3.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:9a9c4c70501e23e8eb1675330686d1598eebfa14b6f0dbc8f00c2e081cc628fa"}, + {file = "ijson-3.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5616311404b858d32740b7ad8b9a799c62165f5ecb85d0a8ed16c21665a90533"}, + {file = "ijson-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e9733f94029dd41702d573ef64752e2556e72aea14623d6dbb7a44ca1ccf30fd"}, + {file = "ijson-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:db8398c6721b98412a4f618da8022550c8b9c5d9214040646071b5deb4d4a393"}, + {file = "ijson-3.5.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:c061314845c08163b1784b6076ea5f075372461a32e6916f4e5f211fd4130b64"}, + {file = "ijson-3.5.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1111a1c5ac79119c5d6e836f900c1a53844b50a18af38311baa6bb61e2645aca"}, + {file = "ijson-3.5.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1e74aff8c681c24002b61b1822f9511d4c384f324f7dbc08c78538e01fdc9fcb"}, + {file = "ijson-3.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:739a7229b1b0cc5f7e2785a6e7a5fc915e850d3fed9588d0e89a09f88a417253"}, + {file = "ijson-3.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ef88712160360cab3ca6471a4e5418243f8b267cf1fe1620879d1b5558babc71"}, + {file = "ijson-3.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6ca0d1b6b5f8166a6248f4309497585fb8553b04bc8179a0260fad636cfdb798"}, + {file = "ijson-3.5.0-cp311-cp311-win32.whl", hash = "sha256:966039cf9047c7967febf7b9a52ec6f38f5464a4c7fbb5565e0224b7376fefff"}, + {file = "ijson-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:6bad6a1634cb7c9f3f4c7e52325283b35b565f5b6cc27d42660c6912ce883422"}, + {file = "ijson-3.5.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1ebefbe149a6106cc848a3eaf536af51a9b5ccc9082de801389f152dba6ab755"}, + {file = "ijson-3.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:19e30d9f00f82e64de689c0b8651b9cfed879c184b139d7e1ea5030cec401c21"}, + {file = "ijson-3.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a04a33ee78a6f27b9b8528c1ca3c207b1df3b8b867a4cf2fcc4109986f35c227"}, + {file = "ijson-3.5.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7d48dc2984af02eb3c56edfb3f13b3f62f2f3e4fe36f058c8cfc75d93adf4fed"}, + {file = "ijson-3.5.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1e73a44844d9adbca9cf2c4132cd875933e83f3d4b23881fcaf82be83644c7d"}, + {file = "ijson-3.5.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7389a56b8562a19948bdf1d7bae3a2edc8c7f86fb59834dcb1c4c722818e645a"}, + {file = "ijson-3.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3176f23f8ebec83f374ed0c3b4e5a0c4db7ede54c005864efebbed46da123608"}, + {file = "ijson-3.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:6babd88e508630c6ef86c9bebaaf13bb2fb8ec1d8f8868773a03c20253f599bc"}, + {file = "ijson-3.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:dc1b3836b174b6db2fa8319f1926fb5445abd195dc963368092103f8579cb8ed"}, + {file = "ijson-3.5.0-cp312-cp312-win32.whl", hash = "sha256:6673de9395fb9893c1c79a43becd8c8fbee0a250be6ea324bfd1487bb5e9ee4c"}, + {file = "ijson-3.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:f4f7fabd653459dcb004175235f310435959b1bb5dfa8878578391c6cc9ad944"}, + {file = "ijson-3.5.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e9cedc10e40dd6023c351ed8bfc7dcfce58204f15c321c3c1546b9c7b12562a4"}, + {file = "ijson-3.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3647649f782ee06c97490b43680371186651f3f69bebe64c6083ee7615d185e5"}, + {file = "ijson-3.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:90e74be1dce05fce73451c62d1118671f78f47c9f6be3991c82b91063bf01fc9"}, + {file = "ijson-3.5.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:78e9ad73e7be2dd80627504bd5cbf512348c55ce2c06e362ed7683b5220e8568"}, + {file = "ijson-3.5.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9577449313cc94be89a4fe4b3e716c65f09cc19636d5a6b2861c4e80dddebd58"}, + {file = "ijson-3.5.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3e4c1178fb50aff5f5701a30a5152ead82a14e189ce0f6102fa1b5f10b2f54ff"}, + {file = "ijson-3.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0eb402ab026ffb37a918d75af2b7260fe6cfbce13232cc83728a714dd30bd81d"}, + {file = "ijson-3.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5b08ee08355f9f729612a8eb9bf69cc14f9310c3b2a487c6f1c3c65d85216ec4"}, + {file = "ijson-3.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:bda62b6d48442903e7bf56152108afb7f0f1293c2b9bef2f2c369defea76ab18"}, + {file = "ijson-3.5.0-cp313-cp313-win32.whl", hash = "sha256:8d073d9b13574cfa11083cc7267c238b7a6ed563c2661e79192da4a25f09c82c"}, + {file = "ijson-3.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:2419f9e32e0968a876b04d8f26aeac042abd16f582810b576936bbc4c6015069"}, + {file = "ijson-3.5.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:4d4b0cd676b8c842f7648c1a783448fac5cd3b98289abd83711b3e275e143524"}, + {file = "ijson-3.5.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:252dec3680a48bb82d475e36b4ae1b3a9d7eb690b951bb98a76c5fe519e30188"}, + {file = "ijson-3.5.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:aa1b5dca97d323931fde2501172337384c958914d81a9dac7f00f0d4bfc76bc7"}, + {file = "ijson-3.5.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7a5ec7fd86d606094bba6f6f8f87494897102fa4584ef653f3005c51a784c320"}, + {file = "ijson-3.5.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:009f41443e1521847701c6d87fa3923c0b1961be3c7e7de90947c8cb92ea7c44"}, + {file = "ijson-3.5.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e4c3651d1f9fe2839a93fdf8fd1d5ca3a54975349894249f3b1b572bcc4bd577"}, + {file = "ijson-3.5.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:945b7abcfcfeae2cde17d8d900870f03536494245dda7ad4f8d056faa303256c"}, + {file = "ijson-3.5.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:0574b0a841ff97495c13e9d7260fbf3d85358b061f540c52a123db9dbbaa2ed6"}, + {file = "ijson-3.5.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f969ffb2b89c5cdf686652d7fb66252bc72126fa54d416317411497276056a18"}, + {file = "ijson-3.5.0-cp313-cp313t-win32.whl", hash = "sha256:59d3f9f46deed1332ad669518b8099920512a78bda64c1f021fcd2aff2b36693"}, + {file = "ijson-3.5.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5c2839fa233746d8aad3b8cd2354e441613f5df66d721d59da4a09394bd1db2b"}, + {file = "ijson-3.5.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:25a5a6b2045c90bb83061df27cfa43572afa43ba9408611d7bfe237c20a731a9"}, + {file = "ijson-3.5.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:8976c54c0b864bc82b951bae06567566ac77ef63b90a773a69cd73aab47f4f4f"}, + {file = "ijson-3.5.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:859eb2038f7f1b0664df4241957694cc35e6295992d71c98659b22c69b3cbc10"}, + {file = "ijson-3.5.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:c911aa02991c7c0d3639b6619b93a93210ff1e7f58bf7225d613abea10adc78e"}, + {file = "ijson-3.5.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:903cbdc350173605220edc19796fbea9b2203c8b3951fb7335abfa8ed37afda8"}, + {file = "ijson-3.5.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a4549d96ded5b8efa71639b2160235415f6bdb8c83367615e2dbabcb72755c33"}, + {file = "ijson-3.5.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:6b2dcf6349e6042d83f3f8c39ce84823cf7577eba25bac5aae5e39bbbbbe9c1c"}, + {file = "ijson-3.5.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:e44af39e6f8a17e5627dcd89715d8279bf3474153ff99aae031a936e5c5572e5"}, + {file = "ijson-3.5.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:9260332304b7e7828db56d43f08fc970a3ab741bf84ff10189361ea1b60c395b"}, + {file = "ijson-3.5.0-cp314-cp314-win32.whl", hash = "sha256:63bc8121bb422f6969ced270173a3fa692c29d4ae30c860a2309941abd81012a"}, + {file = "ijson-3.5.0-cp314-cp314-win_amd64.whl", hash = "sha256:01b6dad72b7b7df225ef970d334556dfad46c696a2c6767fb5d9ed8889728bca"}, + {file = "ijson-3.5.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:2ea4b676ec98e374c1df400a47929859e4fa1239274339024df4716e802aa7e4"}, + {file = "ijson-3.5.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:014586eec043e23c80be9a923c56c3a0920a0f1f7d17478ce7bc20ba443968ef"}, + {file = "ijson-3.5.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d5b8b886b0248652d437f66e7c5ac318bbdcb2c7137a7e5327a68ca00b286f5f"}, + {file = "ijson-3.5.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:498fd46ae2349297e43acf97cdc421e711dbd7198418677259393d2acdc62d78"}, + {file = "ijson-3.5.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22a51b4f9b81f12793731cf226266d1de2112c3c04ba4a04117ad4e466897e05"}, + {file = "ijson-3.5.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9636c710dc4ac4a281baa266a64f323b4cc165cec26836af702c44328b59a515"}, + {file = "ijson-3.5.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f7168a39e8211107666d71b25693fd1b2bac0b33735ef744114c403c6cac21e1"}, + {file = "ijson-3.5.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:8696454245415bc617ab03b0dc3ae4c86987df5dc6a90bad378fe72c5409d89e"}, + {file = "ijson-3.5.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c21bfb61f71f191565885bf1bc29e0a186292d866b4880637b833848360bdc1b"}, + {file = "ijson-3.5.0-cp314-cp314t-win32.whl", hash = "sha256:a2619460d6795b70d0155e5bf016200ac8a63ab5397aa33588bb02b6c21759e6"}, + {file = "ijson-3.5.0-cp314-cp314t-win_amd64.whl", hash = "sha256:4f24b78d4ef028d17eb57ad1b16c0aed4a17bdd9badbf232dc5d9305b7e13854"}, + {file = "ijson-3.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:0ec62d397447cbe4941818c53e22b054e03250ff9cdbaea75144b11bc6db44ed"}, + {file = "ijson-3.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:75980237a16e5e36ad46fbdd33e3f3d817c187624974c48947df0a2bfa104b9e"}, + {file = "ijson-3.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a9c321e8e1cdeac8aac698d09a90d98a049c9be8e8330c89cf2fcc517c96d51d"}, + {file = "ijson-3.5.0-cp39-cp39-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:92878b130d7ad71919c70b4f50ad23ec7fbf2d09a9c675f9179d49c4be869a63"}, + {file = "ijson-3.5.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a1ab890d43656c1d12c4a8dafb7fac5a2278ed3e4408102e0971f48b6ed4583d"}, + {file = "ijson-3.5.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a55185e8983fef0b21abc1a0bbaa11eeb2fabdc651e2167f23defa9fe4eb999b"}, + {file = "ijson-3.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5a3af031e30751164c3289294f249f942535fbe7e8f35eb3ecc374247449214e"}, + {file = "ijson-3.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:f4c8f5ccf7230a9a94c1d836322783ed0c0ec2a151f3d53b2e0a67c89ad66970"}, + {file = "ijson-3.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6e249796d2090afc1c42d2458ab0dbf0072a30ffa246b5683e3f7b9dc9b1b7f9"}, + {file = "ijson-3.5.0-cp39-cp39-win32.whl", hash = "sha256:1b2cf2c0c79313fbc607a0d90788ffb4f4614872983af4aa85c5b92533ec4da2"}, + {file = "ijson-3.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:d38cb03f6b7cc26d542ff710adfe98e5f6d53878461c45456c97d3668297ec0d"}, + {file = "ijson-3.5.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:d64c624da0e9d692d6eb0ff63a79656b59d76bf80773a17c5b0f835e4e8ef627"}, + {file = "ijson-3.5.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:876f7df73b7e0d6474f9caa729b9cdbfc8e76de9075a4887dfd689e29e85c4ca"}, + {file = "ijson-3.5.0-pp311-pypy311_pp73-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:e7dbff2c8d9027809b0cde663df44f3210da10ea377121d42896fb6ee405dd31"}, + {file = "ijson-3.5.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4217a1edc278660679e1197c83a1a2a2d367792bfbb2a3279577f4b59b93730d"}, + {file = "ijson-3.5.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:04f0fc740311388ee745ba55a12292b722d6f52000b11acbb913982ba5fbdf87"}, + {file = "ijson-3.5.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:fdeee6957f92e0c114f65c55cf8fe7eabb80cfacab64eea6864060913173f66d"}, + {file = "ijson-3.5.0.tar.gz", hash = "sha256:94688760720e3f5212731b3cb8d30267f9a045fb38fb3870254e7b9504246f31"}, +] + [[package]] name = "isort" version = "8.0.1" @@ -1310,7 +1416,8 @@ files = [ ] [extras] -all = ["defusedxml", "pyyaml", "tomlkit"] +all = ["defusedxml", "ijson", "pyyaml", "tomlkit"] +json-loader = ["ijson"] toml-loader = ["tomlkit"] xml-loader = ["defusedxml"] yaml-loader = ["pyyaml"] @@ -1318,4 +1425,4 @@ yaml-loader = ["pyyaml"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<4.0" -content-hash = "c436b17bc26b05df172933d6c2063092dfbfc27c1f05ee549a0f288ba4b89bc4" +content-hash = "f2c5681125edbd7634dc356fff1f75814ce64802800d96a1a3dcd731facd928e" diff --git a/pystreamapi/loaders/__json/__json_loader.py b/pystreamapi/loaders/__json/__json_loader.py index 712a11b..2da5b5b 100644 --- a/pystreamapi/loaders/__json/__json_loader.py +++ b/pystreamapi/loaders/__json/__json_loader.py @@ -14,10 +14,6 @@ _PEEK_SIZE = 4096 -""" -Module for wrapping text-mode file handles and converting their output to bytes for use with ijson. -""" - class _TextToBytesWrapper: """Wraps a text-mode file handle and converts its output to bytes for ijson.""" @@ -34,8 +30,6 @@ def read(self, size=-1): return data if data else b'' -"""Module providing a reader that replays a pre-read buffer before delegating further reads to an underlying source.""" - class _PeekableBytesReader: """Replays a pre-read buffer before delegating further reads to the underlying source.""" @@ -45,7 +39,8 @@ def __init__(self, buffer: bytes, source): self._src = source def read(self, size=-1): - """Read up to size bytes, replaying the pre-read buffer before reading from the underlying source.""" + """Read up to size bytes, replaying the pre-read buffer before + reading from the underlying source.""" if size == -1: # Full-read path: used by non-chunking callers (e.g. test helpers). # Streaming callers (like ijson) always pass an explicit chunk size. diff --git a/pystreamapi/loaders/__xml/__xml_loader.py b/pystreamapi/loaders/__xml/__xml_loader.py index d044d9e..7bc1e62 100644 --- a/pystreamapi/loaders/__xml/__xml_loader.py +++ b/pystreamapi/loaders/__xml/__xml_loader.py @@ -116,7 +116,7 @@ def __parse_single_element(element, cast_types: bool): return Item(sub_item) - def __parse_multiple_elements(element, cast_types: bool): +def __parse_multiple_elements(element, cast_types: bool): """Parse XML element with multiple children and convert it into a namedtuple.""" tag_dict = {} for e in element: @@ -128,7 +128,6 @@ def __parse_multiple_elements(element, cast_types: bool): return Item(*filtered_dict.values()) - def __filter_single_items(tag_dict): +def __filter_single_items(tag_dict): """Filter out single-item lists from a dictionary.""" return {key: value[0] if len(value) == 1 else value for key, value in tag_dict.items()} - From 15b9269b64afbaaaa655d622b47076f907179220 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20G=C3=A2rlon=C8=9Ba?= <70283087+garlontas@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:21:51 +0200 Subject: [PATCH 06/11] Correct docstring formatting in read method Fix formatting of docstring in read method. --- pystreamapi/loaders/__json/__json_loader.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pystreamapi/loaders/__json/__json_loader.py b/pystreamapi/loaders/__json/__json_loader.py index 2da5b5b..8701c1e 100644 --- a/pystreamapi/loaders/__json/__json_loader.py +++ b/pystreamapi/loaders/__json/__json_loader.py @@ -39,8 +39,10 @@ def __init__(self, buffer: bytes, source): self._src = source def read(self, size=-1): - """Read up to size bytes, replaying the pre-read buffer before - reading from the underlying source.""" + """ + Read up to size bytes, replaying the pre-read buffer before + reading from the underlying source. + """ if size == -1: # Full-read path: used by non-chunking callers (e.g. test helpers). # Streaming callers (like ijson) always pass an explicit chunk size. From 638a2d82e0d405381bc591d03d33274dc8009096 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20G=C3=A2rlon=C8=9Ba?= <70283087+garlontas@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:25:53 +0200 Subject: [PATCH 07/11] Refactor XML parsing with helper functions --- pystreamapi/loaders/__xml/__xml_loader.py | 65 +++++++++++++++++------ 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/pystreamapi/loaders/__xml/__xml_loader.py b/pystreamapi/loaders/__xml/__xml_loader.py index 7bc1e62..c88992f 100644 --- a/pystreamapi/loaders/__xml/__xml_loader.py +++ b/pystreamapi/loaders/__xml/__xml_loader.py @@ -69,29 +69,60 @@ def _iterparse_xml(source: "IO[Any]", retrieve_children: bool, cast_types: bool) When *retrieve_children* is True each direct child of the root element is converted and yielded as soon as its closing tag is encountered; the child is then removed from the root so that memory is freed immediately. - When *retrieve_children* is False the entire document is consumed and the root element is converted and yielded once. """ - depth = 0 - root = None context = ElementTree.iterparse(source, events=('start', 'end')) + root, depth = _consume_iterparse(context, cast_types, retrieve_children) + +def _consume_iterparse( + context: Iterator[tuple[str, Any]], + cast_types: bool, + retrieve_children: bool, +) -> tuple[Any, int]: + """Consume *context* events, yielding elements per *retrieve_children* strategy.""" + depth = 0 + root = None for event, elem in context: - if event == 'start': - depth += 1 - if root is None: - root = elem - else: # 'end' - depth -= 1 - if retrieve_children: - if depth == 1: - yield __parse_xml(elem, cast_types) - root.remove(elem) - else: - if depth == 0: - yield __parse_xml(root, cast_types) - return + depth, root = _handle_event(event, elem, depth, root) + if retrieve_children: + yield from _maybe_yield_child(event, elem, depth, root, cast_types) + elif _is_root_closed(event, depth): + yield __parse_xml(root, cast_types) + return + + +def _handle_event( + event: str, + elem: Any, + depth: int, + root: Any, +) -> tuple[int, Any]: + """Update depth and capture root on the first start event.""" + if event == 'start': + if root is None: + root = elem + return depth + 1, root + return depth - 1, root + + +def _maybe_yield_child( + event: str, + elem: Any, + depth: int, + root: Any, + cast_types: bool, +) -> Iterator[Any]: + """Yield and evict a direct child when its closing tag is reached at depth 1.""" + if event == 'end' and depth == 1: + yield __parse_xml(elem, cast_types) + root.remove(elem) + + +def _is_root_closed(event: str, depth: int) -> bool: + """Return True when the root element's closing tag has been processed.""" + return event == 'end' and depth == 0 def __parse_xml(element, cast_types: bool): From a343fe667f1119950fc7f62112d770d018ece3f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20G=C3=A2rlon=C8=9Ba?= <70283087+garlontas@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:30:27 +0200 Subject: [PATCH 08/11] Refactor XML parsing logic in __xml_loader.py --- pystreamapi/loaders/__xml/__xml_loader.py | 65 ++++++----------------- 1 file changed, 17 insertions(+), 48 deletions(-) diff --git a/pystreamapi/loaders/__xml/__xml_loader.py b/pystreamapi/loaders/__xml/__xml_loader.py index c88992f..7bc1e62 100644 --- a/pystreamapi/loaders/__xml/__xml_loader.py +++ b/pystreamapi/loaders/__xml/__xml_loader.py @@ -69,60 +69,29 @@ def _iterparse_xml(source: "IO[Any]", retrieve_children: bool, cast_types: bool) When *retrieve_children* is True each direct child of the root element is converted and yielded as soon as its closing tag is encountered; the child is then removed from the root so that memory is freed immediately. + When *retrieve_children* is False the entire document is consumed and the root element is converted and yielded once. """ - context = ElementTree.iterparse(source, events=('start', 'end')) - root, depth = _consume_iterparse(context, cast_types, retrieve_children) - - -def _consume_iterparse( - context: Iterator[tuple[str, Any]], - cast_types: bool, - retrieve_children: bool, -) -> tuple[Any, int]: - """Consume *context* events, yielding elements per *retrieve_children* strategy.""" depth = 0 root = None + context = ElementTree.iterparse(source, events=('start', 'end')) + for event, elem in context: - depth, root = _handle_event(event, elem, depth, root) - if retrieve_children: - yield from _maybe_yield_child(event, elem, depth, root, cast_types) - elif _is_root_closed(event, depth): - yield __parse_xml(root, cast_types) - return - - -def _handle_event( - event: str, - elem: Any, - depth: int, - root: Any, -) -> tuple[int, Any]: - """Update depth and capture root on the first start event.""" - if event == 'start': - if root is None: - root = elem - return depth + 1, root - return depth - 1, root - - -def _maybe_yield_child( - event: str, - elem: Any, - depth: int, - root: Any, - cast_types: bool, -) -> Iterator[Any]: - """Yield and evict a direct child when its closing tag is reached at depth 1.""" - if event == 'end' and depth == 1: - yield __parse_xml(elem, cast_types) - root.remove(elem) - - -def _is_root_closed(event: str, depth: int) -> bool: - """Return True when the root element's closing tag has been processed.""" - return event == 'end' and depth == 0 + if event == 'start': + depth += 1 + if root is None: + root = elem + else: # 'end' + depth -= 1 + if retrieve_children: + if depth == 1: + yield __parse_xml(elem, cast_types) + root.remove(elem) + else: + if depth == 0: + yield __parse_xml(root, cast_types) + return def __parse_xml(element, cast_types: bool): From f4b4b5a599d6b51828c524291af670c76a47fffe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20G=C3=A2rlon=C8=9Ba?= <70283087+garlontas@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:33:46 +0200 Subject: [PATCH 09/11] Fix XML parsing to free space completely --- pystreamapi/loaders/__xml/__xml_loader.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pystreamapi/loaders/__xml/__xml_loader.py b/pystreamapi/loaders/__xml/__xml_loader.py index 7bc1e62..da43d01 100644 --- a/pystreamapi/loaders/__xml/__xml_loader.py +++ b/pystreamapi/loaders/__xml/__xml_loader.py @@ -87,10 +87,13 @@ def _iterparse_xml(source: "IO[Any]", retrieve_children: bool, cast_types: bool) if retrieve_children: if depth == 1: yield __parse_xml(elem, cast_types) + yield __parse_xml(elem, cast_types) + elem.clear() root.remove(elem) else: if depth == 0: yield __parse_xml(root, cast_types) + root.remove(elem) return From 029e56b78af03f1d117d78d3a3279294556312dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20G=C3=A2rlon=C8=9Ba?= <70283087+garlontas@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:35:34 +0200 Subject: [PATCH 10/11] Update __xml_loader.py --- pystreamapi/loaders/__xml/__xml_loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pystreamapi/loaders/__xml/__xml_loader.py b/pystreamapi/loaders/__xml/__xml_loader.py index da43d01..4f3f159 100644 --- a/pystreamapi/loaders/__xml/__xml_loader.py +++ b/pystreamapi/loaders/__xml/__xml_loader.py @@ -86,7 +86,6 @@ def _iterparse_xml(source: "IO[Any]", retrieve_children: bool, cast_types: bool) depth -= 1 if retrieve_children: if depth == 1: - yield __parse_xml(elem, cast_types) yield __parse_xml(elem, cast_types) elem.clear() root.remove(elem) From 3e20fa8f17b937c517cccb39ba3e91ebb614bbce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20G=C3=A2rlon=C8=9Ba?= <70283087+garlontas@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:37:10 +0200 Subject: [PATCH 11/11] Fix XML parsing by removing unnecessary element removal Remove redundant root.remove(elem) call in XML parsing. --- pystreamapi/loaders/__xml/__xml_loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pystreamapi/loaders/__xml/__xml_loader.py b/pystreamapi/loaders/__xml/__xml_loader.py index 4f3f159..7be5822 100644 --- a/pystreamapi/loaders/__xml/__xml_loader.py +++ b/pystreamapi/loaders/__xml/__xml_loader.py @@ -92,7 +92,6 @@ def _iterparse_xml(source: "IO[Any]", retrieve_children: bool, cast_types: bool) else: if depth == 0: yield __parse_xml(root, cast_types) - root.remove(elem) return