diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index f11b6af24e4e4..a9219cd811b6f 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -3287,6 +3287,45 @@ output (as shown below for demonstration) for easier parse into ``DataFrame``:
     df = pd.read_xml(xml, stylesheet=xsl)
     df
 
+For very large XML files that can range in hundreds of megabytes to gigabytes, :func:`pandas.read_xml`
+supports parsing such sizeable files using `lxml's iterparse`_ and `etree's iterparse`_
+which are memory-efficient methods to iterate through an XML tree and extract specific elements and attributes
+without holding entire tree in memory.
+
+.. versionadded:: 1.5.0
+
+.. _`lxml's iterparse`: https://lxml.de/3.2/parsing.html#iterparse-and-iterwalk
+.. _`etree's iterparse`: https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.iterparse
+
+To use this feature, you must pass a physical XML file path into ``read_xml`` and use the ``iterparse`` argument.
+Files should not be compressed or point to online sources but stored on local disk. Also, ``iterparse`` should be
+a dictionary where the key is the repeating node in document (which becomes the rows) and the value is a list of
+any element or attribute that is a descendant (i.e., child, grandchild) of repeating node. Since XPath is not
+used in this method, descendants do not need to share same relationship with one another. Below shows example
+of reading in Wikipedia's very large (12 GB+) latest article data dump.
+
+.. code-block:: ipython
+
+    In [1]: df = pd.read_xml(
+    ...         "/path/to/downloaded/enwikisource-latest-pages-articles.xml",
+    ...         iterparse = {"page": ["title", "ns", "id"]}
+    ...     )
+    ...     df
+    Out[2]:
+                                                        title  ns        id
+    0                                      Gettysburg Address   0     21450
+    1                                               Main Page   0     42950
+    2                           Declaration by United Nations   0      8435
+    3            Constitution of the United States of America   0      8435
+    4                    Declaration of Independence (Israel)   0     17858
+    ...                                                   ...  ..       ...
+    3578760              Page:Black cat 1897 07 v2 n10.pdf/17  104    219649
+    3578761              Page:Black cat 1897 07 v2 n10.pdf/43  104    219649
+    3578762              Page:Black cat 1897 07 v2 n10.pdf/44  104    219649
+    3578763       The History of Tom Jones, a Foundling/Book IX   0  12084291
+    3578764  Page:Shakespeare of Stratford (1926) Yale.djvu/91  104    21450
+
+    [3578765 rows x 3 columns]
 
 .. _io.xml:
 
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index c4a760efd9a40..74639cc693cb7 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -162,6 +162,43 @@ apply converter methods, and parse dates (:issue:`43567`).
     df
     df.dtypes
 
+.. _whatsnew_150.read_xml_iterparse:
+
+read_xml now supports large XML using ``iterparse``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For very large XML files that can range in hundreds of megabytes to gigabytes, :func:`pandas.read_xml`
+now supports parsing such sizeable files using `lxml's iterparse`_ and `etree's iterparse`_
+which are memory-efficient methods to iterate through XML trees and extract specific elements
+and attributes without holding entire tree in memory (:issue:`45442`).
+
+.. code-block:: ipython
+
+    In [1]: df = pd.read_xml(
+    ...         "/path/to/downloaded/enwikisource-latest-pages-articles.xml",
+    ...         iterparse = {"page": ["title", "ns", "id"]}
+    ...     )
+    ...     df
+    Out[2]:
+                                                        title  ns        id
+    0                                      Gettysburg Address   0     21450
+    1                                               Main Page   0     42950
+    2                           Declaration by United Nations   0      8435
+    3            Constitution of the United States of America   0      8435
+    4                    Declaration of Independence (Israel)   0     17858
+    ...                                                   ...  ..       ...
+    3578760              Page:Black cat 1897 07 v2 n10.pdf/17  104    219649
+    3578761              Page:Black cat 1897 07 v2 n10.pdf/43  104    219649
+    3578762              Page:Black cat 1897 07 v2 n10.pdf/44  104    219649
+    3578763       The History of Tom Jones, a Foundling/Book IX   0  12084291
+    3578764  Page:Shakespeare of Stratford (1926) Yale.djvu/91  104    21450
+
+    [3578765 rows x 3 columns]
+
+
+..
_`lxml's iterparse`: https://lxml.de/3.2/parsing.html#iterparse-and-iterwalk +.. _`etree's iterparse`: https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.iterparse + .. _whatsnew_150.api_breaking.api_breaking2: api_breaking_change2 diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 3e4a54fe19032..181b0fe115f4c 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -5,7 +5,10 @@ from __future__ import annotations import io -from typing import Sequence +from typing import ( + Any, + Sequence, +) from pandas._typing import ( CompressionOptions, @@ -35,6 +38,7 @@ from pandas.io.common import ( file_exists, get_handle, + infer_compression, is_fsspec_url, is_url, stringify_path, @@ -97,6 +101,13 @@ class _XMLFrameParser: URL, file, file-like object, or a raw string containing XSLT, `etree` does not support XSLT but retained for consistency. + iterparse : dict, optional + Dict with row element as key and list of descendant elements + and/or attributes as value to be retrieved in iterparsing of + XML document. + + .. versionadded:: 1.5.0 + {decompression_options} .. versionchanged:: 1.4.0 Zstandard support. 
@@ -113,6 +124,7 @@ class _XMLFrameParser: To subclass this class effectively you must override the following methods:` * :func:`parse_data` * :func:`_parse_nodes` + * :func:`_iterparse_nodes` * :func:`_parse_doc` * :func:`_validate_names` * :func:`_validate_path` @@ -135,6 +147,7 @@ def __init__( parse_dates: ParseDatesArg | None, encoding: str | None, stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, + iterparse: dict[str, list[str]] | None, compression: CompressionOptions, storage_options: StorageOptions, ) -> None: @@ -149,6 +162,7 @@ def __init__( self.parse_dates = parse_dates self.encoding = encoding self.stylesheet = stylesheet + self.iterparse = iterparse self.is_style = None self.compression = compression self.storage_options = storage_options @@ -178,9 +192,34 @@ def _parse_nodes(self) -> list[dict[str, str | None]]: Notes ----- - Namespace URIs will be removed from return node values.Also, + Namespace URIs will be removed from return node values. Also, elements with missing children or attributes compared to siblings - will have optional keys filled withi None values. + will have optional keys filled with None values. + """ + + raise AbstractMethodError(self) + + def _iterparse_nodes(self) -> list[dict[str, str | None]]: + """ + Iterparse xml nodes. + + This method will read in local disk, decompressed XML files for elements + and underlying descendants using iterparse, a method to iterate through + an XML tree without holding entire XML tree in memory. + + Raises + ------ + TypeError + * If `iterparse` is not a dict or its dict value is not list-like. + ParserError + * If `path_or_buffer` is not a physical, decompressed file on disk. + * If no data is returned from selected items in `iterparse`. + + Notes + ----- + Namespace URIs will be removed from return node values. Also, + elements with missing children or attributes in submitted list + will have optional keys filled with None values. 
""" raise AbstractMethodError(self) @@ -240,12 +279,17 @@ def parse_data(self) -> list[dict[str, str | None]]: "To use stylesheet, you need lxml installed and selected as parser." ) - self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) + if self.iterparse is None: + self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) + self._validate_path() - self._validate_path() self._validate_names() - return self._parse_nodes() + xml_dicts: list[dict[str, str | None]] = ( + self._parse_nodes() if self.iterparse is None else self._iterparse_nodes() + ) + + return xml_dicts def _parse_nodes(self) -> list[dict[str, str | None]]: elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) @@ -331,6 +375,67 @@ def _parse_nodes(self) -> list[dict[str, str | None]]: return dicts + def _iterparse_nodes(self) -> list[dict[str, str | None]]: + from xml.etree.ElementTree import iterparse + + dicts: list[dict[str, str | None]] = [] + row: dict[str, str | None] | None = None + + if not isinstance(self.iterparse, dict): + raise TypeError( + f"{type(self.iterparse).__name__} is not a valid type for iterparse" + ) + + row_node = next(iter(self.iterparse.keys())) if self.iterparse else "" + if not is_list_like(self.iterparse[row_node]): + raise TypeError( + f"{type(self.iterparse[row_node])} is not a valid type " + "for value in iterparse" + ) + + if ( + not isinstance(self.path_or_buffer, str) + or is_url(self.path_or_buffer) + or is_fsspec_url(self.path_or_buffer) + or self.path_or_buffer.startswith((" None: """ Notes @@ -361,9 +466,14 @@ def _validate_path(self) -> None: ) def _validate_names(self) -> None: + children: list[Any] + if self.names: - parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces) - children = parent.findall("*") if parent else [] + if self.iterparse: + children = self.iterparse[next(iter(self.iterparse))] + else: + parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces) + children = parent.findall("*") if parent else [] if 
is_list_like(self.names): if len(self.names) < len(children): @@ -413,16 +523,22 @@ def parse_data(self) -> list[dict[str, str | None]]: """ from lxml.etree import XML - self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) + if self.iterparse is None: + self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) - if self.stylesheet is not None: - self.xsl_doc = XML(self._parse_doc(self.stylesheet)) - self.xml_doc = XML(self._transform_doc()) + if self.stylesheet: + self.xsl_doc = XML(self._parse_doc(self.stylesheet)) + self.xml_doc = XML(self._transform_doc()) + + self._validate_path() - self._validate_path() self._validate_names() - return self._parse_nodes() + xml_dicts: list[dict[str, str | None]] = ( + self._parse_nodes() if self.iterparse is None else self._iterparse_nodes() + ) + + return xml_dicts def _parse_nodes(self) -> list[dict[str, str | None]]: elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) @@ -507,6 +623,70 @@ def _parse_nodes(self) -> list[dict[str, str | None]]: return dicts + def _iterparse_nodes(self) -> list[dict[str, str | None]]: + from lxml.etree import iterparse + + dicts: list[dict[str, str | None]] = [] + row: dict[str, str | None] | None = None + + if not isinstance(self.iterparse, dict): + raise TypeError( + f"{type(self.iterparse).__name__} is not a valid type for iterparse" + ) + + row_node = next(iter(self.iterparse.keys())) if self.iterparse else "" + if not is_list_like(self.iterparse[row_node]): + raise TypeError( + f"{type(self.iterparse[row_node])} is not a valid type " + "for value in iterparse" + ) + + if ( + not isinstance(self.path_or_buffer, str) + or is_url(self.path_or_buffer) + or is_fsspec_url(self.path_or_buffer) + or self.path_or_buffer.startswith((" None: msg = ( @@ -528,21 +708,15 @@ def _validate_path(self) -> None: raise ValueError(msg) def _validate_names(self) -> None: - """ - Validate names. - - This method will check if names is a list and aligns with - length of parse nodes. 
+ children: list[Any] - Raises - ------ - ValueError - * If value is not a list and less then length of nodes. - """ if self.names: - children = self.xml_doc.xpath( - self.xpath + "[1]/*", namespaces=self.namespaces - ) + if self.iterparse: + children = self.iterparse[next(iter(self.iterparse))] + else: + children = self.xml_doc.xpath( + self.xpath + "[1]/*", namespaces=self.namespaces + ) if is_list_like(self.names): if len(self.names) < len(children): @@ -704,6 +878,7 @@ def _parse( encoding: str | None, parser: XMLParsers, stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, + iterparse: dict[str, list[str]] | None, compression: CompressionOptions, storage_options: StorageOptions, **kwargs, @@ -741,6 +916,7 @@ def _parse( parse_dates, encoding, stylesheet, + iterparse, compression, storage_options, ) @@ -760,6 +936,7 @@ def _parse( parse_dates, encoding, stylesheet, + iterparse, compression, storage_options, ) @@ -798,6 +975,7 @@ def read_xml( encoding: str | None = "utf-8", parser: XMLParsers = "lxml", stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None, + iterparse: dict[str, list[str]] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> DataFrame: @@ -890,6 +1068,20 @@ def read_xml( transformation and not the original XML document. Only XSLT 1.0 scripts and not later versions is currently supported. + iterparse : dict, optional + The nodes or attributes to retrieve in iterparsing of XML document + as a dict with key being the name of repeating element and value being + list of elements or attribute names that are descendants of the repeated + element. Note: If this option is used, it will replace ``xpath`` parsing + and unlike xpath, descendants do not need to relate to each other but can + exist any where in document under the repeating element. This memory- + efficient method should be used for very large XML files (500MB, 1GB, or 5GB+). 
+ For example, :: + + iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}} + + .. versionadded:: 1.5.0 + {decompression_options} .. versionchanged:: 1.4.0 Zstandard support. @@ -938,6 +1130,10 @@ def read_xml( exceptions due to issues with XML document, ``xpath``, or other parameters. + See the :ref:`read_xml documentation in the IO section of the docs + ` for more information in using this method to parse XML + files to DataFrames. + Examples -------- >>> xml = ''' @@ -1022,6 +1218,7 @@ def read_xml( encoding=encoding, parser=parser, stylesheet=stylesheet, + iterparse=iterparse, compression=compression, storage_options=storage_options, ) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index f0fd500bb443c..bfb6bb19452bd 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -14,11 +14,16 @@ from pandas.compat import is_ci_environment from pandas.compat._optional import import_optional_dependency +from pandas.errors import ( + EmptyDataError, + ParserError, +) import pandas.util._test_decorators as td from pandas import DataFrame import pandas._testing as tm +from pandas.io.common import get_handle from pandas.io.xml import read_xml """ @@ -243,6 +248,21 @@ def parser(request): return request.param +def read_xml_iterparse(data, **kwargs): + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(data) + return read_xml(path, **kwargs) + + +def read_xml_iterparse_comp(comp_path, compression_only, **kwargs): + with get_handle(comp_path, "r", compression=compression_only) as handles: + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(handles.handle.read()) + return read_xml(path, **kwargs) + + # FILE / URL @@ -252,12 +272,24 @@ def test_parser_consistency_file(datapath): df_file_lxml = read_xml(filename, parser="lxml") df_file_etree = read_xml(filename, parser="etree") + df_iter_lxml = read_xml( + filename, + parser="lxml", + iterparse={"book": 
["category", "title", "year", "author", "price"]}, + ) + df_iter_etree = read_xml( + filename, + parser="etree", + iterparse={"book": ["category", "title", "year", "author", "price"]}, + ) + tm.assert_frame_equal(df_file_lxml, df_file_etree) + tm.assert_frame_equal(df_file_lxml, df_iter_lxml) + tm.assert_frame_equal(df_iter_lxml, df_iter_etree) @pytest.mark.network @pytest.mark.slow -@td.skip_if_no("lxml") @tm.network( url=( "https://data.cityofchicago.org/api/views/" @@ -265,15 +297,47 @@ def test_parser_consistency_file(datapath): ), check_before_test=True, ) -def test_parser_consistency_url(): +def test_parser_consistency_url(parser): url = ( "https://data.cityofchicago.org/api/views/" "8pix-ypme/rows.xml?accessType=DOWNLOAD" ) - df_url_lxml = read_xml(url, xpath=".//row/row", parser="lxml") - df_url_etree = read_xml(url, xpath=".//row/row", parser="etree") - tm.assert_frame_equal(df_url_lxml, df_url_etree) + with tm.ensure_clean(filename="cta.xml") as path: + (read_xml(url, xpath=".//row/row", parser=parser).to_xml(path, index=False)) + + df_xpath = read_xml(path, parser=parser) + df_iter = read_xml( + path, + parser=parser, + iterparse={ + "row": [ + "_id", + "_uuid", + "_position", + "_address", + "stop_id", + "direction_id", + "stop_name", + "station_name", + "station_descriptive_name", + "map_id", + "ada", + "red", + "blue", + "g", + "brn", + "p", + "pexp", + "y", + "pnk", + "o", + "location", + ] + }, + ) + + tm.assert_frame_equal(df_xpath, df_iter) def test_file_like(datapath, parser, mode): @@ -479,6 +543,12 @@ def test_default_namespace(parser): parser=parser, ) + df_iter = read_xml_iterparse( + xml_default_nmsp, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) + df_expected = DataFrame( { "shape": ["square", "circle", "triangle"], @@ -488,6 +558,7 @@ def test_default_namespace(parser): ) tm.assert_frame_equal(df_nmsp, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_prefix_namespace(parser): @@ -497,6 +568,9 
@@ def test_prefix_namespace(parser): namespaces={"doc": "http://example.com"}, parser=parser, ) + df_iter = read_xml_iterparse( + xml_prefix_nmsp, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]} + ) df_expected = DataFrame( { @@ -507,6 +581,7 @@ def test_prefix_namespace(parser): ) tm.assert_frame_equal(df_nmsp, df_expected) + tm.assert_frame_equal(df_iter, df_expected) @td.skip_if_no("lxml") @@ -591,6 +666,11 @@ def test_none_namespace_prefix(key): def test_file_elems_and_attrs(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, parser=parser) + df_iter = read_xml( + filename, + parser=parser, + iterparse={"book": ["category", "title", "author", "year", "price"]}, + ) df_expected = DataFrame( { "category": ["cooking", "children", "web"], @@ -602,19 +682,27 @@ def test_file_elems_and_attrs(datapath, parser): ) tm.assert_frame_equal(df_file, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_file_only_attrs(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, attrs_only=True, parser=parser) + df_iter = read_xml(filename, parser=parser, iterparse={"book": ["category"]}) df_expected = DataFrame({"category": ["cooking", "children", "web"]}) tm.assert_frame_equal(df_file, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_file_only_elems(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, elems_only=True, parser=parser) + df_iter = read_xml( + filename, + parser=parser, + iterparse={"book": ["title", "author", "year", "price"]}, + ) df_expected = DataFrame( { "title": ["Everyday Italian", "Harry Potter", "Learning XML"], @@ -625,6 +713,7 @@ def test_file_only_elems(datapath, parser): ) tm.assert_frame_equal(df_file, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_elem_and_attrs_only(datapath, parser): @@ -661,7 +750,13 @@ def 
test_attribute_centric_xml(): df_lxml = read_xml(xml, xpath=".//station") df_etree = read_xml(xml, xpath=".//station", parser="etree") + df_iter_lx = read_xml_iterparse(xml, iterparse={"station": ["Name", "coords"]}) + df_iter_et = read_xml_iterparse( + xml, parser="etree", iterparse={"station": ["Name", "coords"]} + ) + tm.assert_frame_equal(df_lxml, df_etree) + tm.assert_frame_equal(df_iter_lx, df_iter_et) # NAMES @@ -672,6 +767,12 @@ def test_names_option_output(datapath, parser): df_file = read_xml( filename, names=["Col1", "Col2", "Col3", "Col4", "Col5"], parser=parser ) + df_iter = read_xml( + filename, + parser=parser, + names=["Col1", "Col2", "Col3", "Col4", "Col5"], + iterparse={"book": ["category", "title", "author", "year", "price"]}, + ) df_expected = DataFrame( { @@ -684,6 +785,7 @@ def test_names_option_output(datapath, parser): ) tm.assert_frame_equal(df_file, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_names_option_wrong_length(datapath, parser): @@ -736,10 +838,25 @@ def test_ascii_encoding(datapath, parser): @td.skip_if_no("lxml") def test_parser_consistency_with_encoding(datapath): filename = datapath("io", "data", "xml", "baby_names.xml") - df_lxml = read_xml(filename, parser="lxml", encoding="ISO-8859-1") - df_etree = read_xml(filename, parser="etree", encoding="iso-8859-1") + df_xpath_lxml = read_xml(filename, parser="lxml", encoding="ISO-8859-1") + df_xpath_etree = read_xml(filename, parser="etree", encoding="iso-8859-1") - tm.assert_frame_equal(df_lxml, df_etree) + df_iter_lxml = read_xml( + filename, + parser="lxml", + encoding="ISO-8859-1", + iterparse={"row": ["rank", "malename", "femalename"]}, + ) + df_iter_etree = read_xml( + filename, + parser="etree", + encoding="ISO-8859-1", + iterparse={"row": ["rank", "malename", "femalename"]}, + ) + + tm.assert_frame_equal(df_xpath_lxml, df_xpath_etree) + tm.assert_frame_equal(df_xpath_etree, df_iter_etree) + tm.assert_frame_equal(df_iter_lxml, df_iter_etree) 
@td.skip_if_no("lxml") @@ -805,7 +922,22 @@ def test_stylesheet_file(datapath): stylesheet=xsl, ) + df_iter = read_xml( + kml, + iterparse={ + "Placemark": [ + "id", + "name", + "styleUrl", + "extrude", + "altitudeMode", + "coordinates", + ] + }, + ) + tm.assert_frame_equal(df_kml, df_style) + tm.assert_frame_equal(df_kml, df_iter) def test_read_xml_passing_as_positional_deprecated(datapath, parser): @@ -1029,6 +1161,143 @@ def test_empty_stylesheet(val): read_xml(kml, stylesheet=val) +# ITERPARSE + + +def test_string_error(parser): + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + read_xml( + xml_default_nmsp, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + + +def test_file_like_error(datapath, parser, mode): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + with open(filename) as f: + read_xml( + f, + parser=parser, + iterparse={"book": ["category", "title", "year", "author", "price"]}, + ) + + +@pytest.mark.network +@tm.network(url="https://www.w3schools.com/xml/books.xml", check_before_test=True) +def test_url_path_error(parser): + url = "https://www.w3schools.com/xml/books.xml" + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + read_xml( + url, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + + +def test_compression_error(parser, compression_only): + with tm.ensure_clean(filename="geom_xml.zip") as path: + geom_df.to_xml(path, parser=parser, compression=compression_only) + + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + read_xml( + path, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + compression=compression_only, + ) + + +def test_wrong_dict_type(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + 
with pytest.raises(TypeError, match="list is not a valid type for iterparse"): + read_xml( + filename, + parser=parser, + iterparse=["category", "title", "year", "author", "price"], + ) + + +def test_wrong_dict_value(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises( + TypeError, match=" is not a valid type for value in iterparse" + ): + read_xml(filename, parser=parser, iterparse={"book": "category"}) + + +def test_bad_xml(datapath, parser): + bad_xml = """\ + + + square + 00360 + 4.0 + 2020-01-01 + + + circle + 00360 + + 2021-01-01 + + + triangle + 00180 + 3.0 + 2022-01-01 + +""" + with tm.ensure_clean(filename="bad.xml") as path: + with open(path, "w") as f: + f.write(bad_xml) + + with pytest.raises( + SyntaxError, + match=( + "Extra content at the end of the document|" + "junk after document element" + ), + ): + read_xml( + path, + parser=parser, + parse_dates=["date"], + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + + +def test_no_result(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises( + ParserError, match="No result from selected items in iterparse." 
+ ): + read_xml( + filename, + parser=parser, + iterparse={"node": ["attr1", "elem1", "elem2", "elem3"]}, + ) + + +def test_empty_data(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises(EmptyDataError, match="No columns to parse from file"): + read_xml( + filename, + parser=parser, + iterparse={"book": ["attr1", "elem1", "elem2", "elem3"]}, + ) + + @pytest.mark.network @td.skip_if_no("lxml") @tm.network( @@ -1071,12 +1340,23 @@ def test_online_stylesheet(): def test_compression_read(parser, compression_only): - with tm.ensure_clean() as path: - geom_df.to_xml(path, index=False, parser=parser, compression=compression_only) + with tm.ensure_clean() as comp_path: + geom_df.to_xml( + comp_path, index=False, parser=parser, compression=compression_only + ) - xml_df = read_xml(path, parser=parser, compression=compression_only) + df_xpath = read_xml(comp_path, parser=parser, compression=compression_only) + + df_iter = read_xml_iterparse_comp( + comp_path, + compression_only, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides"]}, + compression=compression_only, + ) - tm.assert_frame_equal(xml_df, geom_df) + tm.assert_frame_equal(df_xpath, geom_df) + tm.assert_frame_equal(df_iter, geom_df) def test_wrong_compression(parser, compression, compression_only): diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index 801461ed4288a..6aa4ddfac7628 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -20,6 +20,20 @@ def parser(request): return request.param +@pytest.fixture( + params=[None, {"book": ["category", "title", "author", "year", "price"]}] +) +def iterparse(request): + return request.param + + +def read_xml_iterparse(data, **kwargs): + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(data) + return read_xml(path, **kwargs) + + xml_types = """\ @@ -68,6 +82,12 @@ def parser(request): def 
test_dtype_single_str(parser): df_result = read_xml(xml_types, dtype={"degrees": "str"}, parser=parser) + df_iter = read_xml_iterparse( + xml_types, + parser=parser, + dtype={"degrees": "str"}, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) df_expected = DataFrame( { @@ -78,10 +98,17 @@ def test_dtype_single_str(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_dtypes_all_str(parser): df_result = read_xml(xml_dates, dtype="string", parser=parser) + df_iter = read_xml_iterparse( + xml_dates, + parser=parser, + dtype="string", + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -94,6 +121,7 @@ def test_dtypes_all_str(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_dtypes_with_names(parser): @@ -103,6 +131,13 @@ def test_dtypes_with_names(parser): dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64"}, parser=parser, ) + df_iter = read_xml_iterparse( + xml_dates, + parser=parser, + names=["Col1", "Col2", "Col3", "Col4"], + dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64"}, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -114,10 +149,17 @@ def test_dtypes_with_names(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_dtype_nullable_int(parser): df_result = read_xml(xml_types, dtype={"sides": "Int64"}, parser=parser) + df_iter = read_xml_iterparse( + xml_types, + parser=parser, + dtype={"sides": "Int64"}, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) df_expected = DataFrame( { @@ -128,10 +170,17 @@ def test_dtype_nullable_int(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_dtype_float(parser): df_result = read_xml(xml_types, dtype={"degrees": "float"}, parser=parser) + df_iter = 
read_xml_iterparse( + xml_types, + parser=parser, + dtype={"degrees": "float"}, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) df_expected = DataFrame( { @@ -142,13 +191,15 @@ def test_dtype_float(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) -def test_wrong_dtype(parser): +def test_wrong_dtype(datapath, parser, iterparse): + filename = datapath("io", "data", "xml", "books.xml") with pytest.raises( - ValueError, match=('Unable to parse string "square" at position 0') + ValueError, match=('Unable to parse string "Everyday Italian" at position 0') ): - read_xml(xml_types, dtype={"shape": "Int64"}, parser=parser) + read_xml(filename, dtype={"title": "Int64"}, parser=parser, iterparse=iterparse) def test_both_dtype_converters(parser): @@ -167,8 +218,16 @@ def test_both_dtype_converters(parser): converters={"degrees": str}, parser=parser, ) + df_iter = read_xml_iterparse( + xml_types, + dtype={"degrees": "str"}, + converters={"degrees": str}, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) # CONVERTERS @@ -176,6 +235,12 @@ def test_both_dtype_converters(parser): def test_converters_str(parser): df_result = read_xml(xml_types, converters={"degrees": str}, parser=parser) + df_iter = read_xml_iterparse( + xml_types, + parser=parser, + converters={"degrees": str}, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) df_expected = DataFrame( { @@ -186,6 +251,7 @@ def test_converters_str(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_converters_date(parser): @@ -193,6 +259,12 @@ def test_converters_date(parser): df_result = read_xml( xml_dates, converters={"date": convert_to_datetime}, parser=parser ) + df_iter = read_xml_iterparse( + xml_dates, + parser=parser, + converters={"date": convert_to_datetime}, + iterparse={"row": 
["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -204,21 +276,29 @@ def test_converters_date(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) -def test_wrong_converters_type(parser): +def test_wrong_converters_type(datapath, parser, iterparse): + filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(TypeError, match=("Type converters must be a dict or subclass")): - read_xml(xml_types, converters={"degrees", str}, parser=parser) + read_xml(filename, converters={"year", str}, parser=parser, iterparse=iterparse) -def test_callable_func_converters(parser): +def test_callable_func_converters(datapath, parser, iterparse): + filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(TypeError, match=("'float' object is not callable")): - read_xml(xml_types, converters={"degrees": float()}, parser=parser) + read_xml( + filename, converters={"year": float()}, parser=parser, iterparse=iterparse + ) -def test_callable_str_converters(parser): +def test_callable_str_converters(datapath, parser, iterparse): + filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(TypeError, match=("'str' object is not callable")): - read_xml(xml_types, converters={"degrees": "float"}, parser=parser) + read_xml( + filename, converters={"year": "float"}, parser=parser, iterparse=iterparse + ) # PARSE DATES @@ -226,6 +306,12 @@ def test_callable_str_converters(parser): def test_parse_dates_column_name(parser): df_result = read_xml(xml_dates, parse_dates=["date"], parser=parser) + df_iter = read_xml_iterparse( + xml_dates, + parser=parser, + parse_dates=["date"], + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -237,10 +323,17 @@ def test_parse_dates_column_name(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_parse_dates_column_index(parser): df_result = 
read_xml(xml_dates, parse_dates=[3], parser=parser) + df_iter = read_xml_iterparse( + xml_dates, + parser=parser, + parse_dates=[3], + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -252,11 +345,19 @@ def test_parse_dates_column_index(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_parse_dates_true(parser): df_result = read_xml(xml_dates, parse_dates=True, parser=parser) + df_iter = read_xml_iterparse( + xml_dates, + parser=parser, + parse_dates=True, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + df_expected = DataFrame( { "shape": ["square", "circle", "triangle"], @@ -267,6 +368,7 @@ def test_parse_dates_true(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_parse_dates_dictionary(parser): @@ -301,6 +403,12 @@ def test_parse_dates_dictionary(parser): df_result = read_xml( xml, parse_dates={"date_end": ["year", "month", "day"]}, parser=parser ) + df_iter = read_xml_iterparse( + xml, + parser=parser, + parse_dates={"date_end": ["year", "month", "day"]}, + iterparse={"row": ["shape", "degrees", "sides", "year", "month", "day"]}, + ) df_expected = DataFrame( { @@ -312,6 +420,7 @@ def test_parse_dates_dictionary(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_day_first_parse_dates(parser): @@ -351,11 +460,20 @@ def test_day_first_parse_dates(parser): UserWarning, match="Parsing '31/12/2020' in DD/MM/YYYY format" ): df_result = read_xml(xml, parse_dates=["date"], parser=parser) + df_iter = read_xml_iterparse( + xml, + parse_dates=["date"], + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) -def test_wrong_parse_dates_type(parser): +def test_wrong_parse_dates_type(datapath, parser, iterparse): + 
filename = datapath("io", "data", "xml", "books.xml") with pytest.raises( TypeError, match=("Only booleans, lists, and dictionaries are accepted") ): - read_xml(xml_dates, parse_dates={"date"}, parser=parser) + read_xml(filename, parse_dates={"date"}, parser=parser, iterparse=iterparse)