From 640c70e094b533d45b21ceb1c0c799e7843b82ce Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 30 Jan 2022 16:44:09 -0600 Subject: [PATCH 1/6] ENH: Add large file support for read_xml --- doc/source/user_guide/io.rst | 40 ++ doc/source/whatsnew/v1.5.0.rst | 38 ++ pandas/io/xml.py | 232 ++++++- pandas/tests/io/xml/test_xml_iterparse.py | 746 ++++++++++++++++++++++ 4 files changed, 1031 insertions(+), 25 deletions(-) create mode 100644 pandas/tests/io/xml/test_xml_iterparse.py diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 34f10c1b3ec28..43baaaebecd11 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3287,6 +3287,46 @@ output (as shown below for demonstration) for easier parse into ``DataFrame``: df = pd.read_xml(xml, stylesheet=xsl) df +For very large XML files that can range in hundreds of megabytes to gigabytes, :func:`pandas.read_xml` +supports parsing such sizeable files using `lxml's iterparse`_ and `etree's iterparse`_ +which are memory-efficient methods to iterate through an XML tree and extract specific elements and attributes. +without holding entire tree in memory. + +.. _`lxml's iterparse`: https://lxml.de/3.2/parsing.html#iterparse-and-iterwalk +.. _`etree's iterparse`: https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.iterparse + +To use, you must pass the XML file path into ``read_xml`` and use the ``iterparse`` argument. Files should +not be compressed or from online sources but stored on local disk. Also, ``iterparse`` should be a dictionary +where the key is the repeating nodes in document (which become the rows) and the value is a list of any +element or attribute that is a descendant (i.e., child, grandchild) of repeating node. Since XPath is not +used in this method, descendants do not need to share same relationship with one another. Below shows example +of reading in Wikipedia's very large (10 GB+) latest article data dump. + +.. 
code-block:: ipython + + In [1]: df = pd.read_xml( + ... "/path/to/downloaded/enwikisource-latest-pages-articles.xml, + ... iterparse = {"page": ["title", "ns", "id"]}) + ... ) + ... df + Out[2]: + title ns id + 0 Gettysburg Address 0 21450 + 1 Main Page 0 42950 + 2 Declaration by United Nations 0 8435 + 3 Constitution of the United States of America 0 8435 + 4 Declaration of Independence (Israel) 0 17858 + ... ... ... ... + 3578760 Page:Black cat 1897 07 v2 n10.pdf/17 104 219649 + 3578761 Page:Black cat 1897 07 v2 n10.pdf/43 104 219649 + 3578762 Page:Black cat 1897 07 v2 n10.pdf/44 104 219649 + 3578763 The History of Tom Jones, a Foundling/Book IX 0 12084291 + 3578764 Page:Shakespeare of Stratford (1926) Yale.djvu/91 104 21450 + + [3578765 rows x 3 columns] + + .. versionadded:: 1.5.0 + .. _io.xml: diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 0f5b4a16d2f01..aa6c41eedf974 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -131,6 +131,44 @@ apply converter methods, and parse dates (:issue:`43567`). df df.dtypes +.. _whatsnew_150.read_xml_iterparse: + +read_xml now supports large XML using iterparse +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For very large XML files that can range in hundreds of megabytes to gigabytes, :func:`pandas.read_xml` +now supports parsing such sizeable files using `lxml's iterparse`_ and `etree's iterparse`_ +which are memory-efficient methods to iterate through XML trees and extract specific elements +and attributes without holding entire tree in memory. + +.. code-block:: ipython + + In [1]: df = pd.read_xml( + ... "/path/to/downloaded/enwikisource-latest-pages-articles.xml, + ... iterparse = {"page": ["title", "ns", "id"]}) + ... 
) + df + Out[2]: + title ns id + 0 Gettysburg Address 0 21450 + 1 Main Page 0 42950 + 2 Declaration by United Nations 0 8435 + 3 Constitution of the United States of America 0 8435 + 4 Declaration of Independence (Israel) 0 17858 + ... ... ... ... + 3578760 Page:Black cat 1897 07 v2 n10.pdf/17 104 219649 + 3578761 Page:Black cat 1897 07 v2 n10.pdf/43 104 219649 + 3578762 Page:Black cat 1897 07 v2 n10.pdf/44 104 219649 + 3578763 The History of Tom Jones, a Foundling/Book IX 0 12084291 + 3578764 Page:Shakespeare of Stratford (1926) Yale.djvu/91 104 21450 + + [3578765 rows x 3 columns] + + +.. _`lxml's iterparse`: https://lxml.de/3.2/parsing.html#iterparse-and-iterwalk +.. _`etree's iterparse`: https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.iterparse + + .. _whatsnew_150.api_breaking.other: Other API changes diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 8e463c94340c8..f80a48ff8a32d 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -5,7 +5,10 @@ from __future__ import annotations import io -from typing import Sequence +from typing import ( + Any, + Sequence, +) from pandas._typing import ( CompressionOptions, @@ -35,6 +38,7 @@ from pandas.io.common import ( file_exists, get_handle, + infer_compression, is_fsspec_url, is_url, stringify_path, @@ -134,6 +138,7 @@ def __init__( parse_dates: ParseDatesArg | None, encoding: str | None, stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, + iterparse: dict[str, list[str]] | None, compression: CompressionOptions, storage_options: StorageOptions, ): @@ -148,6 +153,7 @@ def __init__( self.parse_dates = parse_dates self.encoding = encoding self.stylesheet = stylesheet + self.iterparse = iterparse self.is_style = None self.compression = compression self.storage_options = storage_options @@ -184,6 +190,28 @@ def _parse_nodes(self) -> list[dict[str, str | None]]: raise AbstractMethodError(self) + def _iterparse_nodes(self) -> list[dict[str, str | None]]: + """ + Parse 
xml nodes. + + This method will parse elements and underlying descendants + and attributes by iterparse, a method to iterate through an XML + tree without parsing entire XML tree in memory. + + Raises + ------ + ValueError + * If only elements and only attributes are specified. + + Notes + ----- + Namespace URIs will be removed from return node values.Also, + elements with missing children or attributes compared to siblings + will have optional keys filled withi None values. + """ + + raise AbstractMethodError(self) + def _validate_path(self) -> None: """ Validate xpath. @@ -239,12 +267,17 @@ def parse_data(self) -> list[dict[str, str | None]]: "To use stylesheet, you need lxml installed and selected as parser." ) - self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) + if self.iterparse is None: + self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) + self._validate_path() - self._validate_path() self._validate_names() - return self._parse_nodes() + xml_dicts: list[dict[str, str | None]] = ( + self._parse_nodes() if self.iterparse is None else self._iterparse_nodes() + ) + + return xml_dicts def _parse_nodes(self) -> list[dict[str, str | None]]: elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) @@ -330,6 +363,67 @@ def _parse_nodes(self) -> list[dict[str, str | None]]: return dicts + def _iterparse_nodes(self) -> list[dict[str, str | None]]: + from xml.etree.ElementTree import iterparse + + dicts: list[dict[str, str | None]] = [] + row: dict[str, str | None] | None = None + + if not isinstance(self.iterparse, dict): + raise TypeError( + f"{type(self.iterparse).__name__} is not a valid type for iterparse" + ) + + row_node = next(iter(self.iterparse.keys())) if self.iterparse else "" + if not is_list_like(self.iterparse[row_node]): + raise TypeError( + f"{type(self.iterparse[row_node])} is not a valid type " + "for value in iterparse" + ) + + if ( + not isinstance(self.path_or_buffer, str) + or is_url(self.path_or_buffer) + or 
is_fsspec_url(self.path_or_buffer) + or self.path_or_buffer.startswith((" None: """ Notes @@ -360,9 +454,14 @@ def _validate_path(self) -> None: ) def _validate_names(self) -> None: + children: list[Any] + if self.names: - parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces) - children = parent.findall("*") if parent else [] + if self.iterparse: + children = self.iterparse[next(iter(self.iterparse))] + else: + parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces) + children = parent.findall("*") if parent else [] if is_list_like(self.names): if len(self.names) < len(children): @@ -412,16 +511,22 @@ def parse_data(self) -> list[dict[str, str | None]]: """ from lxml.etree import XML - self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) + if self.iterparse is None: + self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) - if self.stylesheet is not None: - self.xsl_doc = XML(self._parse_doc(self.stylesheet)) - self.xml_doc = XML(self._transform_doc()) + if self.stylesheet: + self.xsl_doc = XML(self._parse_doc(self.stylesheet)) + self.xml_doc = XML(self._transform_doc()) + + self._validate_path() - self._validate_path() self._validate_names() - return self._parse_nodes() + xml_dicts: list[dict[str, str | None]] = ( + self._parse_nodes() if self.iterparse is None else self._iterparse_nodes() + ) + + return xml_dicts def _parse_nodes(self) -> list[dict[str, str | None]]: elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) @@ -506,6 +611,70 @@ def _parse_nodes(self) -> list[dict[str, str | None]]: return dicts + def _iterparse_nodes(self) -> list[dict[str, str | None]]: + from lxml.etree import iterparse + + dicts: list[dict[str, str | None]] = [] + row: dict[str, str | None] | None = None + + if not isinstance(self.iterparse, dict): + raise TypeError( + f"{type(self.iterparse).__name__} is not a valid type for iterparse" + ) + + row_node = next(iter(self.iterparse.keys())) if self.iterparse else "" + if not 
is_list_like(self.iterparse[row_node]): + raise TypeError( + f"{type(self.iterparse[row_node])} is not a valid type " + "for value in iterparse" + ) + + if ( + not isinstance(self.path_or_buffer, str) + or is_url(self.path_or_buffer) + or is_fsspec_url(self.path_or_buffer) + or self.path_or_buffer.startswith((" None: msg = ( @@ -527,21 +696,15 @@ def _validate_path(self) -> None: raise ValueError(msg) def _validate_names(self) -> None: - """ - Validate names. - - This method will check if names is a list and aligns with - length of parse nodes. + children: list[Any] - Raises - ------ - ValueError - * If value is not a list and less then length of nodes. - """ if self.names: - children = self.xml_doc.xpath( - self.xpath + "[1]/*", namespaces=self.namespaces - ) + if self.iterparse: + children = self.iterparse[next(iter(self.iterparse))] + else: + children = self.xml_doc.xpath( + self.xpath + "[1]/*", namespaces=self.namespaces + ) if is_list_like(self.names): if len(self.names) < len(children): @@ -703,6 +866,7 @@ def _parse( encoding: str | None, parser: XMLParsers, stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, + iterparse: dict[str, list[str]] | None, compression: CompressionOptions, storage_options: StorageOptions, **kwargs, @@ -740,6 +904,7 @@ def _parse( parse_dates, encoding, stylesheet, + iterparse, compression, storage_options, ) @@ -759,6 +924,7 @@ def _parse( parse_dates, encoding, stylesheet, + iterparse, compression, storage_options, ) @@ -797,6 +963,7 @@ def read_xml( encoding: str | None = "utf-8", parser: XMLParsers = "lxml", stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None, + iterparse: dict[str, list[str]] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> DataFrame: @@ -889,6 +1056,20 @@ def read_xml( transformation and not the original XML document. Only XSLT 1.0 scripts and not later versions is currently supported. 
+ iterparse : dict, optional + The nodes or attributes to retrieve in iterparsing of XML document + as a dict with key being the name of repeating element and value being + list of elements or attribute names that are descendants of the repeated + element. Note: If this option is used, it will replace xpath parsing + and unlike xpath, descendants do not need to relate to each other but can + exist any where in document under the repeating element. This memory- + efficient method should be used for very large XML files (500MB, 1GB, or 5GB+). + For example, :: + + iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}} + + .. versionadded:: 1.5.0 + {decompression_options} .. versionchanged:: 1.4.0 Zstandard support. @@ -1021,6 +1202,7 @@ def read_xml( encoding=encoding, parser=parser, stylesheet=stylesheet, + iterparse=iterparse, compression=compression, storage_options=storage_options, ) diff --git a/pandas/tests/io/xml/test_xml_iterparse.py b/pandas/tests/io/xml/test_xml_iterparse.py new file mode 100644 index 0000000000000..4398786c87f3b --- /dev/null +++ b/pandas/tests/io/xml/test_xml_iterparse.py @@ -0,0 +1,746 @@ +from __future__ import annotations + +import pytest + +from pandas.errors import ParserError +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + Series, + to_datetime, +) +import pandas._testing as tm + +from pandas.io.common import get_handle +from pandas.io.xml import read_xml + + +@pytest.fixture(params=[pytest.param("lxml", marks=td.skip_if_no("lxml")), "etree"]) +def parser(request): + return request.param + + +@pytest.fixture(params=["rb", "r"]) +def mode(request): + return request.param + + +geom_df = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4, float("nan"), 3], + } +) + +xml_str = """\ + + + + square + 00360 + 4.0 + 2020-01-01 + + + circle + 00360 + + 2021-01-01 + + + triangle + 00180 + 3.0 + 2022-01-01 + +""" + +xml_prefix_nmsp = """\ + 
+ + + square + 360 + 4.0 + + + circle + 360 + + + + triangle + 180 + 3.0 + +""" + +bad_xml = """\ + + + square + 00360 + 4.0 + 2020-01-01 + + + circle + 00360 + + 2021-01-01 + + + triangle + 00180 + 3.0 + 2022-01-01 + +""" + +# FILE + + +def test_file(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_iter = read_xml( + filename, + parser=parser, + iterparse={"book": ["category", "title", "year", "author", "price"]}, + ) + + df_expected = DataFrame( + { + "category": ["cooking", "children", "web"], + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_iter, df_expected) + + +def test_file_xpath_compare(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_xpath = read_xml(filename, parser=parser) + df_iter = read_xml( + filename, + parser=parser, + iterparse={"book": ["category", "title", "author", "year", "price"]}, + ) + + tm.assert_frame_equal(df_xpath, df_iter) + + +# LARGE FILE + + +@tm.network +@pytest.mark.slow +def test_large_url_xpath_compare(parser): + with tm.ensure_clean(filename="cta.xml") as path: + url = ( + "https://data.cityofchicago.org/api/views/" + "8pix-ypme/rows.xml?accessType=DOWNLOAD" + ) + (read_xml(url, xpath=".//row/row", parser=parser).to_xml(path, index=False)) + + df_xpath = read_xml(path, parser=parser) + df_iter = read_xml( + path, + parser=parser, + iterparse={ + "row": [ + "_id", + "_uuid", + "_position", + "_address", + "stop_id", + "direction_id", + "stop_name", + "station_name", + "station_descriptive_name", + "map_id", + "ada", + "red", + "blue", + "g", + "brn", + "p", + "pexp", + "y", + "pnk", + "o", + "location", + ] + }, + ) + + tm.assert_frame_equal(df_xpath, df_iter) + + +# NAMESPACES + + +def test_namespace_prefix(parser): + with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path: + 
with open(path, "w") as f: + f.write(xml_prefix_nmsp) + + df_iter = read_xml( + path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]} + ) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_iter, df_expected) + + +def test_namespace_prefix_xpath_compare(parser): + with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path: + with open(path, "w") as f: + f.write(xml_prefix_nmsp) + + df_xpath = read_xml( + path, + xpath=".//ns:row", + namespaces={"ns": "http://example.com"}, + parser=parser, + ) + df_iter = read_xml( + path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]} + ) + + tm.assert_frame_equal(df_xpath, df_iter) + + +def test_default_namespace_xpath_compare(datapath): + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + + df_xpath = read_xml( + kml, xpath=".//k:Placemark", namespaces={"k": "http://www.opengis.net/kml/2.2"} + ) + + df_iter = read_xml( + kml, + iterparse={ + "Placemark": [ + "id", + "name", + "Snippet", + "description", + "styleUrl", + "MultiGeometry", + ] + }, + ) + + tm.assert_frame_equal(df_xpath, df_iter) + + +# ELEMS_ONLY + + +def test_elems_only(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + + df_iter = read_xml( + filename, + parser=parser, + iterparse={"book": ["title", "author", "year", "price"]}, + ) + + df_expected = DataFrame( + { + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. 
Ray"], + "year": [2005, 2005, 2003], + "price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_iter, df_expected) + + +def test_elems_only_xpath_compare(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_xpath = read_xml(filename, elems_only=True, parser=parser) + df_iter = read_xml( + filename, + parser=parser, + iterparse={"book": ["title", "author", "year", "price"]}, + ) + + tm.assert_frame_equal(df_xpath, df_iter) + + +# ATTRS_ONLY + + +def test_attrs_only(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_iter = read_xml(filename, parser=parser, iterparse={"book": ["category"]}) + df_expected = DataFrame({"category": ["cooking", "children", "web"]}) + + tm.assert_frame_equal(df_iter, df_expected) + + +def test_attrs_only_xpath_compare(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_xpath = read_xml(filename, attrs_only=True, parser=parser) + df_iter = read_xml(filename, parser=parser, iterparse={"book": ["category"]}) + + tm.assert_frame_equal(df_xpath, df_iter) + + +# NAMES + + +def test_names(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + + df_iter = read_xml( + filename, + parser=parser, + names=["b_category", "b_title", "b_author", "b_year", "b_price"], + iterparse={"book": ["category", "title", "author", "year", "price"]}, + ) + + df_expected = DataFrame( + { + "b_category": ["cooking", "children", "web"], + "b_title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "b_author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. 
Ray"], + "b_year": [2005, 2005, 2003], + "b_price": [30.00, 29.99, 39.95], + } + ) + + tm.assert_frame_equal(df_iter, df_expected) + + +def test_names_xpath_compare(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + df_xpath = read_xml( + filename, + parser=parser, + names=["b_category", "b_title", "b_author", "b_year", "b_price"], + ) + df_iter = read_xml( + filename, + parser=parser, + names=["b_category", "b_title", "b_author", "b_year", "b_price"], + iterparse={"book": ["category", "title", "author", "year", "price"]}, + ) + + tm.assert_frame_equal(df_xpath, df_iter) + + +# DTYPE + + +def test_dtypes(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + + df_iter = read_xml( + filename, + parser=parser, + dtype={"year": "Int64", "price": "Float64"}, + iterparse={"book": ["category", "title", "year", "author", "price"]}, + ) + + df_expected = DataFrame( + { + "category": ["cooking", "children", "web"], + "title": ["Everyday Italian", "Harry Potter", "Learning XML"], + "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. 
Ray"], + "year": Series([2005, 2005, 2003]).astype("Int64"), + "price": Series([30.00, 29.99, 39.95]).astype("Float64"), + } + ) + + tm.assert_frame_equal(df_iter, df_expected) + + +def test_dtypes_xpath_compare(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + + df_xpath = read_xml( + filename, parser=parser, dtype={"year": "Int64", "price": "Float64"} + ) + + df_iter = read_xml( + filename, + parser=parser, + dtype={"year": "Int64", "price": "Float64"}, + iterparse={"book": ["category", "title", "year", "author", "price"]}, + ) + + tm.assert_frame_equal(df_xpath, df_iter) + + +# CONVERTERS + + +def test_converters(parser): + convert_to_datetime = lambda x: to_datetime(x) + with tm.ensure_clean(filename="xml_string.xml") as path: + with open(path, "w") as f: + f.write(xml_str) + + df_iter = read_xml( + path, + converters={"date": convert_to_datetime}, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + } + ) + + tm.assert_frame_equal(df_iter, df_expected) + + +def test_converters_xpath_compare(parser): + convert_to_datetime = lambda x: to_datetime(x) + with tm.ensure_clean(filename="xml_string.xml") as path: + with open(path, "w") as f: + f.write(xml_str) + + df_xpath = read_xml( + path, converters={"date": convert_to_datetime}, parser=parser + ) + + df_iter = read_xml( + path, + converters={"date": convert_to_datetime}, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + + tm.assert_frame_equal(df_xpath, df_iter) + + +# PARSE_DATES + + +def test_date_parse(parser): + with tm.ensure_clean(filename="xml_string.xml") as path: + with open(path, "w") as f: + f.write(xml_str) + + df_iter = read_xml( + path, + parse_dates=["date"], + parser=parser, + iterparse={"row": 
["shape", "degrees", "sides", "date"]}, + ) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + } + ) + + tm.assert_frame_equal(df_iter, df_expected) + + +def test_date_parse_xpath_compare(parser): + with tm.ensure_clean(filename="xml_string.xml") as path: + with open(path, "w") as f: + f.write(xml_str) + + df_xpath = read_xml(path, parse_dates=["date"], parser=parser) + + df_iter = read_xml( + path, + parse_dates=["date"], + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + + tm.assert_frame_equal(df_xpath, df_iter) + + +# ENCODING + + +def test_encoding(datapath, parser): + filename = datapath("io", "data", "xml", "baby_names.xml") + + df_iter = read_xml( + filename, + parser=parser, + encoding="ISO-8859-1", + iterparse={"row": ["rank", "malename", "femalename"]}, + ) + + df_expected = DataFrame( + { + "rank": [1, 2, 3, 4, 5], + "malename": ["José", "Luis", "Carlos", "Juan", "Jorge"], + "femalename": ["Sofía", "Valentina", "Isabella", "Camila", "Valeria"], + } + ) + + tm.assert_frame_equal(df_iter.head(), df_expected) + + +def test_encoding_xpath_compare(datapath, parser): + filename = datapath("io", "data", "xml", "baby_names.xml") + df_xpath = read_xml(filename, parser=parser, encoding="ISO-8859-1") + + df_iter = read_xml( + filename, + parser=parser, + encoding="ISO-8859-1", + iterparse={"row": ["rank", "malename", "femalename"]}, + ) + + tm.assert_frame_equal(df_xpath, df_iter) + + +# STYLESHEET + + +@td.skip_if_no("lxml") +def test_stylesheet_xpath_compare(datapath): + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + xsl = datapath("io", "data", "xml", "flatten_doc.xsl") + + df_style = read_xml( + kml, + xpath=".//k:Placemark", + namespaces={"k": "http://www.opengis.net/kml/2.2"}, + stylesheet=xsl, + ) + + df_iter = read_xml( + kml, + iterparse={ + 
"Placemark": [ + "id", + "name", + "styleUrl", + "extrude", + "altitudeMode", + "coordinates", + ] + }, + ) + + tm.assert_frame_equal(df_style, df_iter) + + +# COMPRESSION + + +def test_compression_compare(parser, compression_only): + with tm.ensure_clean() as comp_path, tm.ensure_clean() as ext_path: + geom_df.to_xml(comp_path, parser=parser, compression=compression_only) + + with get_handle(comp_path, "r", compression=compression_only) as handles: + with open(ext_path, "w") as f: + f.write(handles.handle.read()) + + df_iter = read_xml( + ext_path, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides"]}, + compression=compression_only, + ) + + tm.assert_frame_equal(geom_df, df_iter) + + +# STORAGE OPTIONS + + +@tm.network +@pytest.mark.slow +def test_s3_xpath_compare(parser): + # Python Software Foundation (2019 IRS-990 RETURN) + s3_path = "s3://irs-form-990/201923199349319487_public.xml" + + df_xpath = read_xml( + s3_path, + xpath=".//irs:Form990PartVIISectionAGrp", + namespaces={"irs": "http://www.irs.gov/efile"}, + parser=parser, + storage_options={"anon": True}, + ) + + with tm.ensure_clean(filename="irs990.xml") as path: + with get_handle(s3_path, "rb", is_text=False) as handles: + with open(path, "wb") as f: + f.write(handles.handle.read()) + + df_iter = read_xml( + path, + parser=parser, + iterparse={ + "Form990PartVIISectionAGrp": [ + "PersonNm", + "TitleTxt", + "AverageHoursPerWeekRt", + "AverageHoursPerWeekRltdOrgRt", + "IndividualTrusteeOrDirectorInd", + "OfficerInd", + "ReportableCompFromOrgAmt", + "ReportableCompFromRltdOrgAmt", + "OtherCompensationAmt", + "HighestCompensatedEmployeeInd", + ] + }, + ) + + tm.assert_frame_equal(df_xpath, df_iter) + + +# PARSER ERROR + + +def test_string_error(parser): + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + read_xml( + xml_str, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + + +def test_file_like_error(datapath, 
parser, mode): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + with open(filename) as f: + read_xml( + f, + parser=parser, + iterparse={"book": ["category", "title", "year", "author", "price"]}, + ) + + +@tm.network +def test_url_path_error(parser): + url = "https://www.w3schools.com/xml/books.xml" + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + read_xml( + url, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + + +def test_compression_error(parser, compression_only): + with tm.ensure_clean(filename="geom_xml.zip") as path: + geom_df.to_xml(path, parser=parser, compression=compression_only) + + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + read_xml( + path, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + compression=compression_only, + ) + + +@tm.network +@td.skip_if_no("s3fs") +def test_storage_options_error(parser): + # Python Software Foundation (2019 IRS-990 RETURN) + s3 = "s3://irs-form-990/201923199349319487_public.xml" + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + read_xml( + s3, + parser=parser, + iterparse={ + "Form990PartVIISectionAGrp": [ + "PersonNm", + "TitleTxt", + "AverageHoursPerWeekRt", + "AverageHoursPerWeekRltdOrgRt", + "IndividualTrusteeOrDirectorInd", + "OfficerInd", + "ReportableCompFromOrgAmt", + "ReportableCompFromRltdOrgAmt", + "OtherCompensationAmt", + ] + }, + storage_options={"anon": True}, + ) + + +# OTHER EXCEPTIONS + + +def test_wrong_dict_type(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises(TypeError, match="list is not a valid type for iterparse"): + read_xml( + filename, + parser=parser, + iterparse=["category", "title", "year", "author", "price"], + ) + + +def 
test_wrong_dict_value(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises( + TypeError, match=" is not a valid type for value in iterparse" + ): + read_xml(filename, parser=parser, iterparse={"book": "category"}) + + +def test_bad_xml(datapath, parser): + with tm.ensure_clean(filename="bad.xml") as path: + with open(path, "w") as f: + f.write(bad_xml) + + with pytest.raises( + SyntaxError, match="Extra content at the end of the document" + ): + read_xml( + path, + parse_dates=["date"], + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + + +def test_no_result(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises( + ParserError, match="No result from selected items in iterparse." + ): + read_xml( + filename, + parser=parser, + iterparse={"node": ["attr1", "elem1", "elem2", "elem3"]}, + ) From 4011b4b467b0788fcb203ca10d4d5c9ccaad6374 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Mon, 31 Jan 2022 08:03:34 -0600 Subject: [PATCH 2/6] Combine tests, slightly fix docs --- doc/source/user_guide/io.rst | 23 +- doc/source/whatsnew/v1.5.0.rst | 4 +- pandas/io/xml.py | 29 +- pandas/tests/io/xml/test_xml.py | 293 ++++++++- pandas/tests/io/xml/test_xml_dtypes.py | 122 ++++ pandas/tests/io/xml/test_xml_iterparse.py | 746 ---------------------- 6 files changed, 415 insertions(+), 802 deletions(-) delete mode 100644 pandas/tests/io/xml/test_xml_iterparse.py diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 43baaaebecd11..f6411d6041871 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3292,23 +3292,25 @@ supports parsing such sizeable files using `lxml's iterparse`_ and `etree's iter which are memory-efficient methods to iterate through an XML tree and extract specific elements and attributes. without holding entire tree in memory. + .. versionadded:: 1.5.0 + .. 
_`lxml's iterparse`: https://lxml.de/3.2/parsing.html#iterparse-and-iterwalk .. _`etree's iterparse`: https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.iterparse -To use, you must pass the XML file path into ``read_xml`` and use the ``iterparse`` argument. Files should -not be compressed or from online sources but stored on local disk. Also, ``iterparse`` should be a dictionary -where the key is the repeating nodes in document (which become the rows) and the value is a list of any -element or attribute that is a descendant (i.e., child, grandchild) of repeating node. Since XPath is not +To use this feature, you must pass a physical XML file path into ``read_xml`` and use the ``iterparse`` argument. +Files should not be compressed or point to online sources but stored on local disk. Also, ``iterparse`` should be +a dictionary where the key is the repeating nodes in document (which become the rows) and the value is a list of +any element or attribute that is a descendant (i.e., child, grandchild) of repeating node. Since XPath is not used in this method, descendants do not need to share same relationship with one another. Below shows example -of reading in Wikipedia's very large (10 GB+) latest article data dump. +of reading in Wikipedia's very large (12 GB+) latest article data dump. .. code-block:: ipython In [1]: df = pd.read_xml( - ... "/path/to/downloaded/enwikisource-latest-pages-articles.xml, - ... iterparse = {"page": ["title", "ns", "id"]}) - ... ) - ... df + ... "/path/to/downloaded/enwikisource-latest-pages-articles.xml", + ... iterparse = {"page": ["title", "ns", "id"]} + ... ) + ... df Out[2]: title ns id 0 Gettysburg Address 0 21450 @@ -3325,9 +3327,6 @@ of reading in Wikipedia's very large (10 GB+) latest article data dump. [3578765 rows x 3 columns] - .. versionadded:: 1.5.0 - - .. 
_io.xml: Writing XML diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index aa6c41eedf974..f1691b1bee331 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -139,12 +139,12 @@ read_xml now supports large XML using iterparse For very large XML files that can range in hundreds of megabytes to gigabytes, :func:`pandas.read_xml` now supports parsing such sizeable files using `lxml's iterparse`_ and `etree's iterparse`_ which are memory-efficient methods to iterate through XML trees and extract specific elements -and attributes without holding entire tree in memory. +and attributes without holding entire tree in memory (:issue:`45442`). .. code-block:: ipython In [1]: df = pd.read_xml( - ... "/path/to/downloaded/enwikisource-latest-pages-articles.xml, + ... "/path/to/downloaded/enwikisource-latest-pages-articles.xml", ... iterparse = {"page": ["title", "ns", "id"]}) ... ) df diff --git a/pandas/io/xml.py b/pandas/io/xml.py index f80a48ff8a32d..b8aa6980cde94 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -98,6 +98,12 @@ class _XMLFrameParser: URL, file, file-like object, or a raw string containing XSLT, `etree` does not support XSLT but retained for consistency. + iterparse : dict, optional + Dict of row element and descendant elements and/or attributes to + retrieve in iterparsing of XML document. + + .. versionadded:: 1.5.0 + {decompression_options} .. versionchanged:: 1.4.0 Zstandard support. @@ -183,30 +189,33 @@ def _parse_nodes(self) -> list[dict[str, str | None]]: Notes ----- - Namespace URIs will be removed from return node values.Also, + Namespace URIs will be removed from return node values. Also, elements with missing children or attributes compared to siblings - will have optional keys filled withi None values. + will have optional keys filled with None values. """ raise AbstractMethodError(self) def _iterparse_nodes(self) -> list[dict[str, str | None]]: """ - Parse xml nodes.
+ Iterparse xml nodes. - This method will parse elements and underlying descendants - and attributes by iterparse, a method to iterate through an XML - tree without parsing entire XML tree in memory. + This method will read in local disk, decompressed XML files for elements + and underlying descendants using iterparse, a method to iterate through + an XML tree without holding entire XML tree in memory. Raises ------ - ValueError - * If only elements and only attributes are specified. + TypeError + * If `iterparse` is not a dict or its dict value is not list-like. + ParserError + * If `path_or_buffer` is not a physical, decompressed file on disk. + * If no data is returned from selected items in `iterparse`. Notes ----- - Namespace URIs will be removed from return node values.Also, - elements with missing children or attributes compared to siblings + Namespace URIs will be removed from return node values. Also, + elements with missing children or attributes in submitted list will have optional keys filled withi None values. 
""" diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 03c176fd7bc8b..2be8dd32f2c76 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -13,11 +13,13 @@ import pytest from pandas.compat._optional import import_optional_dependency +from pandas.errors import ParserError import pandas.util._test_decorators as td from pandas import DataFrame import pandas._testing as tm +from pandas.io.common import get_handle from pandas.io.xml import read_xml """ @@ -251,21 +253,65 @@ def test_parser_consistency_file(datapath): df_file_lxml = read_xml(filename, parser="lxml") df_file_etree = read_xml(filename, parser="etree") + df_iter_lxml = read_xml( + filename, + parser="lxml", + iterparse={"book": ["category", "title", "year", "author", "price"]}, + ) + df_iter_etree = read_xml( + filename, + parser="etree", + iterparse={"book": ["category", "title", "year", "author", "price"]}, + ) + tm.assert_frame_equal(df_file_lxml, df_file_etree) + tm.assert_frame_equal(df_file_lxml, df_iter_lxml) + tm.assert_frame_equal(df_iter_lxml, df_iter_etree) @tm.network @pytest.mark.slow -@td.skip_if_no("lxml") -def test_parser_consistency_url(): +def test_parser_consistency_url(parser): url = ( "https://data.cityofchicago.org/api/views/" "8pix-ypme/rows.xml?accessType=DOWNLOAD" ) - df_url_lxml = read_xml(url, xpath=".//row/row", parser="lxml") - df_url_etree = read_xml(url, xpath=".//row/row", parser="etree") - tm.assert_frame_equal(df_url_lxml, df_url_etree) + with tm.ensure_clean(filename="cta.xml") as path: + (read_xml(url, xpath=".//row/row", parser=parser).to_xml(path, index=False)) + + df_xpath = read_xml(path, parser=parser) + df_iter = read_xml( + path, + parser=parser, + iterparse={ + "row": [ + "_id", + "_uuid", + "_position", + "_address", + "stop_id", + "direction_id", + "stop_name", + "station_name", + "station_descriptive_name", + "map_id", + "ada", + "red", + "blue", + "g", + "brn", + "p", + "pexp", + "y", + "pnk", + 
"o", + "location", + ] + }, + ) + + tm.assert_frame_equal(df_xpath, df_iter) def test_file_like(datapath, parser, mode): @@ -401,26 +447,6 @@ def test_wrong_file_path_etree(): read_xml(filename, parser="etree") -@tm.network -@td.skip_if_no("lxml") -def test_url(): - url = "https://www.w3schools.com/xml/books.xml" - df_url = read_xml(url, xpath=".//book[count(*)=4]") - - df_expected = DataFrame( - { - "category": ["cooking", "children", "web"], - "title": ["Everyday Italian", "Harry Potter", "Learning XML"], - "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], - "year": [2005, 2005, 2003], - "price": [30.00, 29.99, 39.95], - "cover": [None, None, "paperback"], - } - ) - - tm.assert_frame_equal(df_url, df_expected) - - @tm.network def test_wrong_url(parser): with pytest.raises(HTTPError, match=("HTTP Error 404: Not Found")): @@ -466,6 +492,14 @@ def test_default_namespace(parser): parser=parser, ) + with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path: + with open(path, "w") as f: + f.write(xml_default_nmsp) + + df_iter = read_xml( + path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]} + ) + df_expected = DataFrame( { "shape": ["square", "circle", "triangle"], @@ -475,6 +509,7 @@ def test_default_namespace(parser): ) tm.assert_frame_equal(df_nmsp, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_prefix_namespace(parser): @@ -485,6 +520,14 @@ def test_prefix_namespace(parser): parser=parser, ) + with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path: + with open(path, "w") as f: + f.write(xml_prefix_nmsp) + + df_iter = read_xml( + path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]} + ) + df_expected = DataFrame( { "shape": ["square", "circle", "triangle"], @@ -494,6 +537,7 @@ def test_prefix_namespace(parser): ) tm.assert_frame_equal(df_nmsp, df_expected) + tm.assert_frame_equal(df_iter, df_expected) @td.skip_if_no("lxml") @@ -594,14 +638,21 @@ def test_file_elems_and_attrs(datapath, 
parser): def test_file_only_attrs(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, attrs_only=True, parser=parser) + df_iter = read_xml(filename, parser=parser, iterparse={"book": ["category"]}) df_expected = DataFrame({"category": ["cooking", "children", "web"]}) tm.assert_frame_equal(df_file, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_file_only_elems(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, elems_only=True, parser=parser) + df_iter = read_xml( + filename, + parser=parser, + iterparse={"book": ["title", "author", "year", "price"]}, + ) df_expected = DataFrame( { "title": ["Everyday Italian", "Harry Potter", "Learning XML"], @@ -612,6 +663,7 @@ def test_file_only_elems(datapath, parser): ) tm.assert_frame_equal(df_file, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_elem_and_attrs_only(datapath, parser): @@ -659,6 +711,12 @@ def test_names_option_output(datapath, parser): df_file = read_xml( filename, names=["Col1", "Col2", "Col3", "Col4", "Col5"], parser=parser ) + df_iter = read_xml( + filename, + parser=parser, + names=["Col1", "Col2", "Col3", "Col4", "Col5"], + iterparse={"book": ["category", "title", "author", "year", "price"]}, + ) df_expected = DataFrame( { @@ -671,6 +729,7 @@ def test_names_option_output(datapath, parser): ) tm.assert_frame_equal(df_file, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_names_option_wrong_length(datapath, parser): @@ -723,10 +782,25 @@ def test_ascii_encoding(datapath, parser): @td.skip_if_no("lxml") def test_parser_consistency_with_encoding(datapath): filename = datapath("io", "data", "xml", "baby_names.xml") - df_lxml = read_xml(filename, parser="lxml", encoding="ISO-8859-1") - df_etree = read_xml(filename, parser="etree", encoding="iso-8859-1") + df_xpath_lxml = read_xml(filename, parser="lxml", encoding="ISO-8859-1") + df_xpath_etree = 
read_xml(filename, parser="etree", encoding="iso-8859-1") - tm.assert_frame_equal(df_lxml, df_etree) + df_iter_lxml = read_xml( + filename, + parser="lxml", + encoding="ISO-8859-1", + iterparse={"row": ["rank", "malename", "femalename"]}, + ) + df_iter_etree = read_xml( + filename, + parser="lxml", + encoding="ISO-8859-1", + iterparse={"row": ["rank", "malename", "femalename"]}, + ) + + tm.assert_frame_equal(df_xpath_lxml, df_xpath_etree) + tm.assert_frame_equal(df_xpath_etree, df_iter_etree) + tm.assert_frame_equal(df_iter_lxml, df_iter_etree) @td.skip_if_no("lxml") @@ -792,7 +866,22 @@ def test_stylesheet_file(datapath): stylesheet=xsl, ) + df_iter = read_xml( + kml, + iterparse={ + "Placemark": [ + "id", + "name", + "styleUrl", + "extrude", + "altitudeMode", + "coordinates", + ] + }, + ) + tm.assert_frame_equal(df_kml, df_style) + tm.assert_frame_equal(df_kml, df_iter) def test_read_xml_passing_as_positional_deprecated(datapath, parser): @@ -1016,6 +1105,132 @@ def test_empty_stylesheet(val): read_xml(kml, stylesheet=val) +# ITERPARSE + + +def test_string_error(parser): + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + read_xml( + xml_default_nmsp, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + + +def test_file_like_error(datapath, parser, mode): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + with open(filename) as f: + read_xml( + f, + parser=parser, + iterparse={"book": ["category", "title", "year", "author", "price"]}, + ) + + +@tm.network +def test_url_path_error(parser): + url = "https://www.w3schools.com/xml/books.xml" + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + read_xml( + url, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + + +def test_compression_error(parser, compression_only): + 
with tm.ensure_clean(filename="geom_xml.zip") as path: + geom_df.to_xml(path, parser=parser, compression=compression_only) + + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + read_xml( + path, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + compression=compression_only, + ) + + +def test_wrong_dict_type(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises(TypeError, match="list is not a valid type for iterparse"): + read_xml( + filename, + parser=parser, + iterparse=["category", "title", "year", "author", "price"], + ) + + +def test_wrong_dict_value(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises( + TypeError, match=" is not a valid type for value in iterparse" + ): + read_xml(filename, parser=parser, iterparse={"book": "category"}) + + +def test_bad_xml(datapath, parser): + bad_xml = """\ + + + square + 00360 + 4.0 + 2020-01-01 + + + circle + 00360 + + 2021-01-01 + + + triangle + 00180 + 3.0 + 2022-01-01 + +""" + with tm.ensure_clean(filename="bad.xml") as path: + with open(path, "w") as f: + f.write(bad_xml) + + with pytest.raises( + SyntaxError, + match=( + "Extra content at the end of the document|" + "junk after document element" + ), + ): + read_xml( + path, + parser=parser, + parse_dates=["date"], + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + + +def test_no_result(datapath, parser): + filename = datapath("io", "data", "xml", "books.xml") + with pytest.raises( + ParserError, match="No result from selected items in iterparse." 
+ ): + read_xml( + filename, + parser=parser, + iterparse={"node": ["attr1", "elem1", "elem2", "elem3"]}, + ) + + @tm.network @td.skip_if_no("lxml") def test_online_stylesheet(): @@ -1055,12 +1270,26 @@ def test_online_stylesheet(): def test_compression_read(parser, compression_only): - with tm.ensure_clean() as path: - geom_df.to_xml(path, index=False, parser=parser, compression=compression_only) + with tm.ensure_clean() as comp_path, tm.ensure_clean() as ext_path: + geom_df.to_xml( + comp_path, index=False, parser=parser, compression=compression_only + ) + + df_xpath = read_xml(comp_path, parser=parser, compression=compression_only) + + with get_handle(comp_path, "r", compression=compression_only) as handles: + with open(ext_path, "w") as f: + f.write(handles.handle.read()) - xml_df = read_xml(path, parser=parser, compression=compression_only) + df_iter = read_xml( + ext_path, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides"]}, + compression=compression_only, + ) - tm.assert_frame_equal(xml_df, geom_df) + tm.assert_frame_equal(df_xpath, geom_df) + tm.assert_frame_equal(df_iter, geom_df) def test_wrong_compression(parser, compression, compression_only): diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index 801461ed4288a..afe1d5de720d8 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -68,6 +68,16 @@ def parser(request): def test_dtype_single_str(parser): df_result = read_xml(xml_types, dtype={"degrees": "str"}, parser=parser) + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(xml_types) + + df_iter = read_xml( + path, + parser=parser, + dtype={"degrees": "str"}, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) df_expected = DataFrame( { @@ -78,10 +88,21 @@ def test_dtype_single_str(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_dtypes_all_str(parser): df_result 
= read_xml(xml_dates, dtype="string", parser=parser) + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(xml_dates) + + df_iter = read_xml( + path, + parser=parser, + dtype="string", + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -94,6 +115,7 @@ def test_dtypes_all_str(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_dtypes_with_names(parser): @@ -103,6 +125,17 @@ def test_dtypes_with_names(parser): dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64"}, parser=parser, ) + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(xml_dates) + + df_iter = read_xml( + path, + parser=parser, + names=["Col1", "Col2", "Col3", "Col4"], + dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64"}, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -114,10 +147,21 @@ def test_dtypes_with_names(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_dtype_nullable_int(parser): df_result = read_xml(xml_types, dtype={"sides": "Int64"}, parser=parser) + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(xml_types) + + df_iter = read_xml( + path, + parser=parser, + dtype={"sides": "Int64"}, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) df_expected = DataFrame( { @@ -128,10 +172,21 @@ def test_dtype_nullable_int(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_dtype_float(parser): df_result = read_xml(xml_types, dtype={"degrees": "float"}, parser=parser) + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(xml_types) + + df_iter = read_xml( + path, + parser=parser, + dtype={"degrees": "float"}, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) df_expected = DataFrame( { @@ -142,6 +197,7 @@ def 
test_dtype_float(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_wrong_dtype(parser): @@ -176,6 +232,16 @@ def test_both_dtype_converters(parser): def test_converters_str(parser): df_result = read_xml(xml_types, converters={"degrees": str}, parser=parser) + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(xml_types) + + df_iter = read_xml( + path, + parser=parser, + converters={"degrees": str}, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) df_expected = DataFrame( { @@ -186,6 +252,7 @@ def test_converters_str(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_converters_date(parser): @@ -193,6 +260,16 @@ def test_converters_date(parser): df_result = read_xml( xml_dates, converters={"date": convert_to_datetime}, parser=parser ) + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(xml_dates) + + df_iter = read_xml( + path, + parser=parser, + converters={"date": convert_to_datetime}, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -204,6 +281,7 @@ def test_converters_date(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_wrong_converters_type(parser): @@ -226,6 +304,16 @@ def test_callable_str_converters(parser): def test_parse_dates_column_name(parser): df_result = read_xml(xml_dates, parse_dates=["date"], parser=parser) + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(xml_dates) + + df_iter = read_xml( + path, + parser=parser, + parse_dates=["date"], + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -237,10 +325,21 @@ def test_parse_dates_column_name(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_parse_dates_column_index(parser): df_result = 
read_xml(xml_dates, parse_dates=[3], parser=parser) + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(xml_dates) + + df_iter = read_xml( + path, + parser=parser, + parse_dates=[3], + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -252,10 +351,21 @@ def test_parse_dates_column_index(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_parse_dates_true(parser): df_result = read_xml(xml_dates, parse_dates=True, parser=parser) + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(xml_dates) + + df_iter = read_xml( + path, + parser=parser, + parse_dates=True, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -267,6 +377,7 @@ def test_parse_dates_true(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_parse_dates_dictionary(parser): @@ -301,6 +412,16 @@ def test_parse_dates_dictionary(parser): df_result = read_xml( xml, parse_dates={"date_end": ["year", "month", "day"]}, parser=parser ) + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(xml) + + df_iter = read_xml( + path, + parser=parser, + parse_dates={"date_end": ["year", "month", "day"]}, + iterparse={"row": ["shape", "degrees", "sides", "year", "month", "day"]}, + ) df_expected = DataFrame( { @@ -312,6 +433,7 @@ def test_parse_dates_dictionary(parser): ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_day_first_parse_dates(parser): diff --git a/pandas/tests/io/xml/test_xml_iterparse.py b/pandas/tests/io/xml/test_xml_iterparse.py deleted file mode 100644 index 4398786c87f3b..0000000000000 --- a/pandas/tests/io/xml/test_xml_iterparse.py +++ /dev/null @@ -1,746 +0,0 @@ -from __future__ import annotations - -import pytest - -from pandas.errors import ParserError -import pandas.util._test_decorators 
as td - -from pandas import ( - DataFrame, - Series, - to_datetime, -) -import pandas._testing as tm - -from pandas.io.common import get_handle -from pandas.io.xml import read_xml - - -@pytest.fixture(params=[pytest.param("lxml", marks=td.skip_if_no("lxml")), "etree"]) -def parser(request): - return request.param - - -@pytest.fixture(params=["rb", "r"]) -def mode(request): - return request.param - - -geom_df = DataFrame( - { - "shape": ["square", "circle", "triangle"], - "degrees": [360, 360, 180], - "sides": [4, float("nan"), 3], - } -) - -xml_str = """\ - - - - square - 00360 - 4.0 - 2020-01-01 - - - circle - 00360 - - 2021-01-01 - - - triangle - 00180 - 3.0 - 2022-01-01 - -""" - -xml_prefix_nmsp = """\ - - - - square - 360 - 4.0 - - - circle - 360 - - - - triangle - 180 - 3.0 - -""" - -bad_xml = """\ - - - square - 00360 - 4.0 - 2020-01-01 - - - circle - 00360 - - 2021-01-01 - - - triangle - 00180 - 3.0 - 2022-01-01 - -""" - -# FILE - - -def test_file(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") - df_iter = read_xml( - filename, - parser=parser, - iterparse={"book": ["category", "title", "year", "author", "price"]}, - ) - - df_expected = DataFrame( - { - "category": ["cooking", "children", "web"], - "title": ["Everyday Italian", "Harry Potter", "Learning XML"], - "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. 
Ray"], - "year": [2005, 2005, 2003], - "price": [30.00, 29.99, 39.95], - } - ) - - tm.assert_frame_equal(df_iter, df_expected) - - -def test_file_xpath_compare(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") - df_xpath = read_xml(filename, parser=parser) - df_iter = read_xml( - filename, - parser=parser, - iterparse={"book": ["category", "title", "author", "year", "price"]}, - ) - - tm.assert_frame_equal(df_xpath, df_iter) - - -# LARGE FILE - - -@tm.network -@pytest.mark.slow -def test_large_url_xpath_compare(parser): - with tm.ensure_clean(filename="cta.xml") as path: - url = ( - "https://data.cityofchicago.org/api/views/" - "8pix-ypme/rows.xml?accessType=DOWNLOAD" - ) - (read_xml(url, xpath=".//row/row", parser=parser).to_xml(path, index=False)) - - df_xpath = read_xml(path, parser=parser) - df_iter = read_xml( - path, - parser=parser, - iterparse={ - "row": [ - "_id", - "_uuid", - "_position", - "_address", - "stop_id", - "direction_id", - "stop_name", - "station_name", - "station_descriptive_name", - "map_id", - "ada", - "red", - "blue", - "g", - "brn", - "p", - "pexp", - "y", - "pnk", - "o", - "location", - ] - }, - ) - - tm.assert_frame_equal(df_xpath, df_iter) - - -# NAMESPACES - - -def test_namespace_prefix(parser): - with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path: - with open(path, "w") as f: - f.write(xml_prefix_nmsp) - - df_iter = read_xml( - path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]} - ) - - df_expected = DataFrame( - { - "shape": ["square", "circle", "triangle"], - "degrees": [360, 360, 180], - "sides": [4.0, float("nan"), 3.0], - } - ) - - tm.assert_frame_equal(df_iter, df_expected) - - -def test_namespace_prefix_xpath_compare(parser): - with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path: - with open(path, "w") as f: - f.write(xml_prefix_nmsp) - - df_xpath = read_xml( - path, - xpath=".//ns:row", - namespaces={"ns": "http://example.com"}, - parser=parser, - ) - df_iter = 
read_xml( - path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]} - ) - - tm.assert_frame_equal(df_xpath, df_iter) - - -def test_default_namespace_xpath_compare(datapath): - kml = datapath("io", "data", "xml", "cta_rail_lines.kml") - - df_xpath = read_xml( - kml, xpath=".//k:Placemark", namespaces={"k": "http://www.opengis.net/kml/2.2"} - ) - - df_iter = read_xml( - kml, - iterparse={ - "Placemark": [ - "id", - "name", - "Snippet", - "description", - "styleUrl", - "MultiGeometry", - ] - }, - ) - - tm.assert_frame_equal(df_xpath, df_iter) - - -# ELEMS_ONLY - - -def test_elems_only(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") - - df_iter = read_xml( - filename, - parser=parser, - iterparse={"book": ["title", "author", "year", "price"]}, - ) - - df_expected = DataFrame( - { - "title": ["Everyday Italian", "Harry Potter", "Learning XML"], - "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], - "year": [2005, 2005, 2003], - "price": [30.00, 29.99, 39.95], - } - ) - - tm.assert_frame_equal(df_iter, df_expected) - - -def test_elems_only_xpath_compare(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") - df_xpath = read_xml(filename, elems_only=True, parser=parser) - df_iter = read_xml( - filename, - parser=parser, - iterparse={"book": ["title", "author", "year", "price"]}, - ) - - tm.assert_frame_equal(df_xpath, df_iter) - - -# ATTRS_ONLY - - -def test_attrs_only(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") - df_iter = read_xml(filename, parser=parser, iterparse={"book": ["category"]}) - df_expected = DataFrame({"category": ["cooking", "children", "web"]}) - - tm.assert_frame_equal(df_iter, df_expected) - - -def test_attrs_only_xpath_compare(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") - df_xpath = read_xml(filename, attrs_only=True, parser=parser) - df_iter = read_xml(filename, parser=parser, iterparse={"book": ["category"]}) - - 
tm.assert_frame_equal(df_xpath, df_iter) - - -# NAMES - - -def test_names(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") - - df_iter = read_xml( - filename, - parser=parser, - names=["b_category", "b_title", "b_author", "b_year", "b_price"], - iterparse={"book": ["category", "title", "author", "year", "price"]}, - ) - - df_expected = DataFrame( - { - "b_category": ["cooking", "children", "web"], - "b_title": ["Everyday Italian", "Harry Potter", "Learning XML"], - "b_author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"], - "b_year": [2005, 2005, 2003], - "b_price": [30.00, 29.99, 39.95], - } - ) - - tm.assert_frame_equal(df_iter, df_expected) - - -def test_names_xpath_compare(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") - df_xpath = read_xml( - filename, - parser=parser, - names=["b_category", "b_title", "b_author", "b_year", "b_price"], - ) - df_iter = read_xml( - filename, - parser=parser, - names=["b_category", "b_title", "b_author", "b_year", "b_price"], - iterparse={"book": ["category", "title", "author", "year", "price"]}, - ) - - tm.assert_frame_equal(df_xpath, df_iter) - - -# DTYPE - - -def test_dtypes(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") - - df_iter = read_xml( - filename, - parser=parser, - dtype={"year": "Int64", "price": "Float64"}, - iterparse={"book": ["category", "title", "year", "author", "price"]}, - ) - - df_expected = DataFrame( - { - "category": ["cooking", "children", "web"], - "title": ["Everyday Italian", "Harry Potter", "Learning XML"], - "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. 
Ray"], - "year": Series([2005, 2005, 2003]).astype("Int64"), - "price": Series([30.00, 29.99, 39.95]).astype("Float64"), - } - ) - - tm.assert_frame_equal(df_iter, df_expected) - - -def test_dtypes_xpath_compare(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") - - df_xpath = read_xml( - filename, parser=parser, dtype={"year": "Int64", "price": "Float64"} - ) - - df_iter = read_xml( - filename, - parser=parser, - dtype={"year": "Int64", "price": "Float64"}, - iterparse={"book": ["category", "title", "year", "author", "price"]}, - ) - - tm.assert_frame_equal(df_xpath, df_iter) - - -# CONVERTERS - - -def test_converters(parser): - convert_to_datetime = lambda x: to_datetime(x) - with tm.ensure_clean(filename="xml_string.xml") as path: - with open(path, "w") as f: - f.write(xml_str) - - df_iter = read_xml( - path, - converters={"date": convert_to_datetime}, - parser=parser, - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) - - df_expected = DataFrame( - { - "shape": ["square", "circle", "triangle"], - "degrees": [360, 360, 180], - "sides": [4.0, float("nan"), 3.0], - "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), - } - ) - - tm.assert_frame_equal(df_iter, df_expected) - - -def test_converters_xpath_compare(parser): - convert_to_datetime = lambda x: to_datetime(x) - with tm.ensure_clean(filename="xml_string.xml") as path: - with open(path, "w") as f: - f.write(xml_str) - - df_xpath = read_xml( - path, converters={"date": convert_to_datetime}, parser=parser - ) - - df_iter = read_xml( - path, - converters={"date": convert_to_datetime}, - parser=parser, - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) - - tm.assert_frame_equal(df_xpath, df_iter) - - -# PARSE_DATES - - -def test_date_parse(parser): - with tm.ensure_clean(filename="xml_string.xml") as path: - with open(path, "w") as f: - f.write(xml_str) - - df_iter = read_xml( - path, - parse_dates=["date"], - parser=parser, - iterparse={"row": 
["shape", "degrees", "sides", "date"]}, - ) - - df_expected = DataFrame( - { - "shape": ["square", "circle", "triangle"], - "degrees": [360, 360, 180], - "sides": [4.0, float("nan"), 3.0], - "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), - } - ) - - tm.assert_frame_equal(df_iter, df_expected) - - -def test_date_parse_xpath_compare(parser): - with tm.ensure_clean(filename="xml_string.xml") as path: - with open(path, "w") as f: - f.write(xml_str) - - df_xpath = read_xml(path, parse_dates=["date"], parser=parser) - - df_iter = read_xml( - path, - parse_dates=["date"], - parser=parser, - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) - - tm.assert_frame_equal(df_xpath, df_iter) - - -# ENCODING - - -def test_encoding(datapath, parser): - filename = datapath("io", "data", "xml", "baby_names.xml") - - df_iter = read_xml( - filename, - parser=parser, - encoding="ISO-8859-1", - iterparse={"row": ["rank", "malename", "femalename"]}, - ) - - df_expected = DataFrame( - { - "rank": [1, 2, 3, 4, 5], - "malename": ["José", "Luis", "Carlos", "Juan", "Jorge"], - "femalename": ["Sofía", "Valentina", "Isabella", "Camila", "Valeria"], - } - ) - - tm.assert_frame_equal(df_iter.head(), df_expected) - - -def test_encoding_xpath_compare(datapath, parser): - filename = datapath("io", "data", "xml", "baby_names.xml") - df_xpath = read_xml(filename, parser=parser, encoding="ISO-8859-1") - - df_iter = read_xml( - filename, - parser=parser, - encoding="ISO-8859-1", - iterparse={"row": ["rank", "malename", "femalename"]}, - ) - - tm.assert_frame_equal(df_xpath, df_iter) - - -# STYLESHEET - - -@td.skip_if_no("lxml") -def test_stylesheet_xpath_compare(datapath): - kml = datapath("io", "data", "xml", "cta_rail_lines.kml") - xsl = datapath("io", "data", "xml", "flatten_doc.xsl") - - df_style = read_xml( - kml, - xpath=".//k:Placemark", - namespaces={"k": "http://www.opengis.net/kml/2.2"}, - stylesheet=xsl, - ) - - df_iter = read_xml( - kml, - iterparse={ - 
"Placemark": [ - "id", - "name", - "styleUrl", - "extrude", - "altitudeMode", - "coordinates", - ] - }, - ) - - tm.assert_frame_equal(df_style, df_iter) - - -# COMPRESSION - - -def test_compression_compare(parser, compression_only): - with tm.ensure_clean() as comp_path, tm.ensure_clean() as ext_path: - geom_df.to_xml(comp_path, parser=parser, compression=compression_only) - - with get_handle(comp_path, "r", compression=compression_only) as handles: - with open(ext_path, "w") as f: - f.write(handles.handle.read()) - - df_iter = read_xml( - ext_path, - parser=parser, - iterparse={"row": ["shape", "degrees", "sides"]}, - compression=compression_only, - ) - - tm.assert_frame_equal(geom_df, df_iter) - - -# STORAGE OPTIONS - - -@tm.network -@pytest.mark.slow -def test_s3_xpath_compare(parser): - # Python Software Foundation (2019 IRS-990 RETURN) - s3_path = "s3://irs-form-990/201923199349319487_public.xml" - - df_xpath = read_xml( - s3_path, - xpath=".//irs:Form990PartVIISectionAGrp", - namespaces={"irs": "http://www.irs.gov/efile"}, - parser=parser, - storage_options={"anon": True}, - ) - - with tm.ensure_clean(filename="irs990.xml") as path: - with get_handle(s3_path, "rb", is_text=False) as handles: - with open(path, "wb") as f: - f.write(handles.handle.read()) - - df_iter = read_xml( - path, - parser=parser, - iterparse={ - "Form990PartVIISectionAGrp": [ - "PersonNm", - "TitleTxt", - "AverageHoursPerWeekRt", - "AverageHoursPerWeekRltdOrgRt", - "IndividualTrusteeOrDirectorInd", - "OfficerInd", - "ReportableCompFromOrgAmt", - "ReportableCompFromRltdOrgAmt", - "OtherCompensationAmt", - "HighestCompensatedEmployeeInd", - ] - }, - ) - - tm.assert_frame_equal(df_xpath, df_iter) - - -# PARSER ERROR - - -def test_string_error(parser): - with pytest.raises( - ParserError, match=("iterparse is designed for large XML files") - ): - read_xml( - xml_str, - parser=parser, - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) - - -def test_file_like_error(datapath, 
parser, mode): - filename = datapath("io", "data", "xml", "books.xml") - with pytest.raises( - ParserError, match=("iterparse is designed for large XML files") - ): - with open(filename) as f: - read_xml( - f, - parser=parser, - iterparse={"book": ["category", "title", "year", "author", "price"]}, - ) - - -@tm.network -def test_url_path_error(parser): - url = "https://www.w3schools.com/xml/books.xml" - with pytest.raises( - ParserError, match=("iterparse is designed for large XML files") - ): - read_xml( - url, - parser=parser, - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) - - -def test_compression_error(parser, compression_only): - with tm.ensure_clean(filename="geom_xml.zip") as path: - geom_df.to_xml(path, parser=parser, compression=compression_only) - - with pytest.raises( - ParserError, match=("iterparse is designed for large XML files") - ): - read_xml( - path, - parser=parser, - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - compression=compression_only, - ) - - -@tm.network -@td.skip_if_no("s3fs") -def test_storage_options_error(parser): - # Python Software Foundation (2019 IRS-990 RETURN) - s3 = "s3://irs-form-990/201923199349319487_public.xml" - with pytest.raises( - ParserError, match=("iterparse is designed for large XML files") - ): - read_xml( - s3, - parser=parser, - iterparse={ - "Form990PartVIISectionAGrp": [ - "PersonNm", - "TitleTxt", - "AverageHoursPerWeekRt", - "AverageHoursPerWeekRltdOrgRt", - "IndividualTrusteeOrDirectorInd", - "OfficerInd", - "ReportableCompFromOrgAmt", - "ReportableCompFromRltdOrgAmt", - "OtherCompensationAmt", - ] - }, - storage_options={"anon": True}, - ) - - -# OTHER EXCEPTIONS - - -def test_wrong_dict_type(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") - with pytest.raises(TypeError, match="list is not a valid type for iterparse"): - read_xml( - filename, - parser=parser, - iterparse=["category", "title", "year", "author", "price"], - ) - - -def 
test_wrong_dict_value(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") - with pytest.raises( - TypeError, match=" is not a valid type for value in iterparse" - ): - read_xml(filename, parser=parser, iterparse={"book": "category"}) - - -def test_bad_xml(datapath, parser): - with tm.ensure_clean(filename="bad.xml") as path: - with open(path, "w") as f: - f.write(bad_xml) - - with pytest.raises( - SyntaxError, match="Extra content at the end of the document" - ): - read_xml( - path, - parse_dates=["date"], - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) - - -def test_no_result(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") - with pytest.raises( - ParserError, match="No result from selected items in iterparse." - ): - read_xml( - filename, - parser=parser, - iterparse={"node": ["attr1", "elem1", "elem2", "elem3"]}, - ) From 5514025c29144b648675d3d67d4a7eb1f7f87984 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 27 Feb 2022 22:54:21 -0600 Subject: [PATCH 3/6] Adjust pytest decorator on URL test; fix doc strings --- pandas/io/xml.py | 8 +++++--- pandas/tests/io/xml/test_xml.py | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 2850a98f08430..76780fc6b2241 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -102,8 +102,9 @@ class _XMLFrameParser: `etree` does not support XSLT but retained for consistency. iterparse : dict, optional - Dict of row element and descendant elements and/or attributes to - retrieve in iterparsing of XML document. + Dict with row element as key and list of descendant elements + and/or attributes as value to be retrieved in iterparsing of + XML document. .. 
versionadded:: 1.5.0 @@ -123,6 +124,7 @@ class _XMLFrameParser: To subclass this class effectively you must override the following methods:` * :func:`parse_data` * :func:`_parse_nodes` + * :func:`_iterparse_nodes` * :func:`_parse_doc` * :func:`_validate_names` * :func:`_validate_path` @@ -217,7 +219,7 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]: ----- Namespace URIs will be removed from return node values. Also, elements with missing children or attributes in submitted list - will have optional keys filled withi None values. + will have optional keys filled with None values. """ raise AbstractMethodError(self) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 79040debd9fc8..1d34b0ecf2731 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1165,7 +1165,8 @@ def test_file_like_error(datapath, parser, mode): ) -@tm.network +@pytest.mark.network +@tm.network(url="https://www.w3schools.com/xml/books.xml", check_before_test=True) def test_url_path_error(parser): url = "https://www.w3schools.com/xml/books.xml" with pytest.raises( From 2c4d81f55d4f9d36cd79ffbf4e8ca255835cb707 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Mon, 28 Feb 2022 22:52:15 -0600 Subject: [PATCH 4/6] Adjust tests for helper function --- pandas/tests/io/xml/test_xml.py | 57 ++++---- pandas/tests/io/xml/test_xml_dtypes.py | 184 ++++++++++--------------- 2 files changed, 105 insertions(+), 136 deletions(-) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 1d34b0ecf2731..77f90d88614b1 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -245,6 +245,21 @@ def parser(request): return request.param +def read_xml_iterparse(data, **kwargs): + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(data) + return read_xml(path, **kwargs) + + +def read_xml_iterparse_comp(comp_path, compression_only, **kwargs): + with get_handle(comp_path, 
"r", compression=compression_only) as handles: + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(handles.handle.read()) + return read_xml(path, **kwargs) + + # FILE / URL @@ -525,13 +540,11 @@ def test_default_namespace(parser): parser=parser, ) - with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path: - with open(path, "w") as f: - f.write(xml_default_nmsp) - - df_iter = read_xml( - path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]} - ) + df_iter = read_xml_iterparse( + xml_default_nmsp, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) df_expected = DataFrame( { @@ -552,14 +565,9 @@ def test_prefix_namespace(parser): namespaces={"doc": "http://example.com"}, parser=parser, ) - - with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path: - with open(path, "w") as f: - f.write(xml_prefix_nmsp) - - df_iter = read_xml( - path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]} - ) + df_iter = read_xml_iterparse( + xml_prefix_nmsp, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]} + ) df_expected = DataFrame( { @@ -1307,23 +1315,20 @@ def test_online_stylesheet(): def test_compression_read(parser, compression_only): - with tm.ensure_clean() as comp_path, tm.ensure_clean() as ext_path: + with tm.ensure_clean() as comp_path: geom_df.to_xml( comp_path, index=False, parser=parser, compression=compression_only ) df_xpath = read_xml(comp_path, parser=parser, compression=compression_only) - with get_handle(comp_path, "r", compression=compression_only) as handles: - with open(ext_path, "w") as f: - f.write(handles.handle.read()) - - df_iter = read_xml( - ext_path, - parser=parser, - iterparse={"row": ["shape", "degrees", "sides"]}, - compression=compression_only, - ) + df_iter = read_xml_iterparse_comp( + comp_path, + compression_only, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides"]}, + compression=compression_only, + ) tm.assert_frame_equal(df_xpath, 
geom_df) tm.assert_frame_equal(df_iter, geom_df) diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index afe1d5de720d8..bbf5545052584 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -20,6 +20,13 @@ def parser(request): return request.param +def read_xml_iterparse(data, **kwargs): + with tm.ensure_clean() as path: + with open(path, "w") as f: + f.write(data) + return read_xml(path, **kwargs) + + xml_types = """\ @@ -68,16 +75,12 @@ def parser(request): def test_dtype_single_str(parser): df_result = read_xml(xml_types, dtype={"degrees": "str"}, parser=parser) - with tm.ensure_clean() as path: - with open(path, "w") as f: - f.write(xml_types) - - df_iter = read_xml( - path, - parser=parser, - dtype={"degrees": "str"}, - iterparse={"row": ["shape", "degrees", "sides"]}, - ) + df_iter = read_xml_iterparse( + xml_types, + parser=parser, + dtype={"degrees": "str"}, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) df_expected = DataFrame( { @@ -93,16 +96,12 @@ def test_dtype_single_str(parser): def test_dtypes_all_str(parser): df_result = read_xml(xml_dates, dtype="string", parser=parser) - with tm.ensure_clean() as path: - with open(path, "w") as f: - f.write(xml_dates) - - df_iter = read_xml( - path, - parser=parser, - dtype="string", - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) + df_iter = read_xml_iterparse( + xml_dates, + parser=parser, + dtype="string", + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -125,17 +124,13 @@ def test_dtypes_with_names(parser): dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64"}, parser=parser, ) - with tm.ensure_clean() as path: - with open(path, "w") as f: - f.write(xml_dates) - - df_iter = read_xml( - path, - parser=parser, - names=["Col1", "Col2", "Col3", "Col4"], - dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64"}, - iterparse={"row": ["shape", 
"degrees", "sides", "date"]}, - ) + df_iter = read_xml_iterparse( + xml_dates, + parser=parser, + names=["Col1", "Col2", "Col3", "Col4"], + dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64"}, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -152,16 +147,12 @@ def test_dtypes_with_names(parser): def test_dtype_nullable_int(parser): df_result = read_xml(xml_types, dtype={"sides": "Int64"}, parser=parser) - with tm.ensure_clean() as path: - with open(path, "w") as f: - f.write(xml_types) - - df_iter = read_xml( - path, - parser=parser, - dtype={"sides": "Int64"}, - iterparse={"row": ["shape", "degrees", "sides"]}, - ) + df_iter = read_xml_iterparse( + xml_types, + parser=parser, + dtype={"sides": "Int64"}, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) df_expected = DataFrame( { @@ -177,16 +168,12 @@ def test_dtype_nullable_int(parser): def test_dtype_float(parser): df_result = read_xml(xml_types, dtype={"degrees": "float"}, parser=parser) - with tm.ensure_clean() as path: - with open(path, "w") as f: - f.write(xml_types) - - df_iter = read_xml( - path, - parser=parser, - dtype={"degrees": "float"}, - iterparse={"row": ["shape", "degrees", "sides"]}, - ) + df_iter = read_xml_iterparse( + xml_types, + parser=parser, + dtype={"degrees": "float"}, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) df_expected = DataFrame( { @@ -232,16 +219,12 @@ def test_both_dtype_converters(parser): def test_converters_str(parser): df_result = read_xml(xml_types, converters={"degrees": str}, parser=parser) - with tm.ensure_clean() as path: - with open(path, "w") as f: - f.write(xml_types) - - df_iter = read_xml( - path, - parser=parser, - converters={"degrees": str}, - iterparse={"row": ["shape", "degrees", "sides"]}, - ) + df_iter = read_xml_iterparse( + xml_types, + parser=parser, + converters={"degrees": str}, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) df_expected = DataFrame( { @@ -260,16 +243,12 
@@ def test_converters_date(parser): df_result = read_xml( xml_dates, converters={"date": convert_to_datetime}, parser=parser ) - with tm.ensure_clean() as path: - with open(path, "w") as f: - f.write(xml_dates) - - df_iter = read_xml( - path, - parser=parser, - converters={"date": convert_to_datetime}, - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) + df_iter = read_xml_iterparse( + xml_dates, + parser=parser, + converters={"date": convert_to_datetime}, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -304,16 +283,12 @@ def test_callable_str_converters(parser): def test_parse_dates_column_name(parser): df_result = read_xml(xml_dates, parse_dates=["date"], parser=parser) - with tm.ensure_clean() as path: - with open(path, "w") as f: - f.write(xml_dates) - - df_iter = read_xml( - path, - parser=parser, - parse_dates=["date"], - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) + df_iter = read_xml_iterparse( + xml_dates, + parser=parser, + parse_dates=["date"], + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -330,16 +305,12 @@ def test_parse_dates_column_name(parser): def test_parse_dates_column_index(parser): df_result = read_xml(xml_dates, parse_dates=[3], parser=parser) - with tm.ensure_clean() as path: - with open(path, "w") as f: - f.write(xml_dates) - - df_iter = read_xml( - path, - parser=parser, - parse_dates=[3], - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) + df_iter = read_xml_iterparse( + xml_dates, + parser=parser, + parse_dates=[3], + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -356,16 +327,13 @@ def test_parse_dates_column_index(parser): def test_parse_dates_true(parser): df_result = read_xml(xml_dates, parse_dates=True, parser=parser) - with tm.ensure_clean() as path: - with open(path, "w") as f: - f.write(xml_dates) - df_iter = read_xml( - path, - parser=parser, - 
parse_dates=True, - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) + df_iter = read_xml_iterparse( + xml_dates, + parser=parser, + parse_dates=True, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) df_expected = DataFrame( { @@ -412,16 +380,12 @@ def test_parse_dates_dictionary(parser): df_result = read_xml( xml, parse_dates={"date_end": ["year", "month", "day"]}, parser=parser ) - with tm.ensure_clean() as path: - with open(path, "w") as f: - f.write(xml) - - df_iter = read_xml( - path, - parser=parser, - parse_dates={"date_end": ["year", "month", "day"]}, - iterparse={"row": ["shape", "degrees", "sides", "year", "month", "day"]}, - ) + df_iter = read_xml_iterparse( + xml, + parser=parser, + parse_dates={"date_end": ["year", "month", "day"]}, + iterparse={"row": ["shape", "degrees", "sides", "year", "month", "day"]}, + ) df_expected = DataFrame( { From 3d065b5ba5d8fbf1fed29b3d9755054e8ee13976 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Tue, 8 Mar 2022 20:58:59 -0600 Subject: [PATCH 5/6] Add iterparse feature to some tests --- pandas/io/xml.py | 2 +- pandas/tests/io/xml/test_xml.py | 29 +++++++++++++- pandas/tests/io/xml/test_xml_dtypes.py | 54 ++++++++++++++++++++------ 3 files changed, 71 insertions(+), 14 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 76780fc6b2241..d5178f17e2bf7 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -1072,7 +1072,7 @@ def read_xml( The nodes or attributes to retrieve in iterparsing of XML document as a dict with key being the name of repeating element and value being list of elements or attribute names that are descendants of the repeated - element. Note: If this option is used, it will replace xpath parsing + element. Note: If this option is used, it will replace ``xpath`` parsing and unlike xpath, descendants do not need to relate to each other but can exist any where in document under the repeating element. 
This memory- efficient method should be used for very large XML files (500MB, 1GB, or 5GB+). diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 77f90d88614b1..bfb6bb19452bd 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -14,7 +14,10 @@ from pandas.compat import is_ci_environment from pandas.compat._optional import import_optional_dependency -from pandas.errors import ParserError +from pandas.errors import ( + EmptyDataError, + ParserError, +) import pandas.util._test_decorators as td from pandas import DataFrame @@ -663,6 +666,11 @@ def test_none_namespace_prefix(key): def test_file_elems_and_attrs(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") df_file = read_xml(filename, parser=parser) + df_iter = read_xml( + filename, + parser=parser, + iterparse={"book": ["category", "title", "author", "year", "price"]}, + ) df_expected = DataFrame( { "category": ["cooking", "children", "web"], @@ -674,6 +682,7 @@ def test_file_elems_and_attrs(datapath, parser): ) tm.assert_frame_equal(df_file, df_expected) + tm.assert_frame_equal(df_iter, df_expected) def test_file_only_attrs(datapath, parser): @@ -741,7 +750,13 @@ def test_attribute_centric_xml(): df_lxml = read_xml(xml, xpath=".//station") df_etree = read_xml(xml, xpath=".//station", parser="etree") + df_iter_lx = read_xml_iterparse(xml, iterparse={"station": ["Name", "coords"]}) + df_iter_et = read_xml_iterparse( + xml, parser="etree", iterparse={"station": ["Name", "coords"]} + ) + tm.assert_frame_equal(df_lxml, df_etree) + tm.assert_frame_equal(df_iter_lx, df_iter_et) # NAMES @@ -834,7 +849,7 @@ def test_parser_consistency_with_encoding(datapath): ) df_iter_etree = read_xml( filename, - parser="lxml", + parser="etree", encoding="ISO-8859-1", iterparse={"row": ["rank", "malename", "femalename"]}, ) @@ -1273,6 +1288,16 @@ def test_no_result(datapath, parser): ) +def test_empty_data(datapath, parser): + filename = 
datapath("io", "data", "xml", "books.xml") + with pytest.raises(EmptyDataError, match="No columns to parse from file"): + read_xml( + filename, + parser=parser, + iterparse={"book": ["attr1", "elem1", "elem2", "elem3"]}, + ) + + @pytest.mark.network @td.skip_if_no("lxml") @tm.network( diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index bbf5545052584..6aa4ddfac7628 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -20,6 +20,13 @@ def parser(request): return request.param +@pytest.fixture( + params=[None, {"book": ["category", "title", "author", "year", "price"]}] +) +def iterparse(request): + return request.param + + def read_xml_iterparse(data, **kwargs): with tm.ensure_clean() as path: with open(path, "w") as f: @@ -187,11 +194,12 @@ def test_dtype_float(parser): tm.assert_frame_equal(df_iter, df_expected) -def test_wrong_dtype(parser): +def test_wrong_dtype(datapath, parser, iterparse): + filename = datapath("io", "data", "xml", "books.xml") with pytest.raises( - ValueError, match=('Unable to parse string "square" at position 0') + ValueError, match=('Unable to parse string "Everyday Italian" at position 0') ): - read_xml(xml_types, dtype={"shape": "Int64"}, parser=parser) + read_xml(filename, dtype={"title": "Int64"}, parser=parser, iterparse=iterparse) def test_both_dtype_converters(parser): @@ -210,8 +218,16 @@ def test_both_dtype_converters(parser): converters={"degrees": str}, parser=parser, ) + df_iter = read_xml_iterparse( + xml_types, + dtype={"degrees": "str"}, + converters={"degrees": str}, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) # CONVERTERS @@ -263,19 +279,26 @@ def test_converters_date(parser): tm.assert_frame_equal(df_iter, df_expected) -def test_wrong_converters_type(parser): +def test_wrong_converters_type(datapath, parser, 
iterparse): + filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(TypeError, match=("Type converters must be a dict or subclass")): - read_xml(xml_types, converters={"degrees", str}, parser=parser) + read_xml(filename, converters={"year", str}, parser=parser, iterparse=iterparse) -def test_callable_func_converters(parser): +def test_callable_func_converters(datapath, parser, iterparse): + filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(TypeError, match=("'float' object is not callable")): - read_xml(xml_types, converters={"degrees": float()}, parser=parser) + read_xml( + filename, converters={"year": float()}, parser=parser, iterparse=iterparse + ) -def test_callable_str_converters(parser): +def test_callable_str_converters(datapath, parser, iterparse): + filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(TypeError, match=("'str' object is not callable")): - read_xml(xml_types, converters={"degrees": "float"}, parser=parser) + read_xml( + filename, converters={"year": "float"}, parser=parser, iterparse=iterparse + ) # PARSE DATES @@ -437,11 +460,20 @@ def test_day_first_parse_dates(parser): UserWarning, match="Parsing '31/12/2020' in DD/MM/YYYY format" ): df_result = read_xml(xml, parse_dates=["date"], parser=parser) + df_iter = read_xml_iterparse( + xml, + parse_dates=["date"], + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) + tm.assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_iter, df_expected) -def test_wrong_parse_dates_type(parser): +def test_wrong_parse_dates_type(datapath, parser, iterparse): + filename = datapath("io", "data", "xml", "books.xml") with pytest.raises( TypeError, match=("Only booleans, lists, and dictionaries are accepted") ): - read_xml(xml_dates, parse_dates={"date"}, parser=parser) + read_xml(filename, parse_dates={"date"}, parser=parser, iterparse=iterparse) From e37c20a2820966ea0e6dee0d1566eefd125862d8 Mon Sep 17 
00:00:00 2001 From: Parfait Gasana Date: Fri, 18 Mar 2022 12:03:24 -0500 Subject: [PATCH 6/6] Add IO docs link in docstring --- pandas/io/xml.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 0a2a144af3309..181b0fe115f4c 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -1130,6 +1130,10 @@ def read_xml( exceptions due to issues with XML document, ``xpath``, or other parameters. + See the :ref:`read_xml documentation in the IO section of the docs + <io.read_xml>` for more information on using this method to parse XML + files to DataFrames. + Examples -------- >>> xml = '''