From 640c70e094b533d45b21ceb1c0c799e7843b82ce Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Sun, 30 Jan 2022 16:44:09 -0600
Subject: [PATCH 1/6] ENH: Add large file support for read_xml

---
 doc/source/user_guide/io.rst              |  40 ++
 doc/source/whatsnew/v1.5.0.rst            |  38 ++
 pandas/io/xml.py                          | 232 ++++++-
 pandas/tests/io/xml/test_xml_iterparse.py | 746 ++++++++++++++++++++++
 4 files changed, 1031 insertions(+), 25 deletions(-)
 create mode 100644 pandas/tests/io/xml/test_xml_iterparse.py

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 34f10c1b3ec28..43baaaebecd11 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -3287,6 +3287,46 @@ output (as shown below for demonstration) for easier parse into ``DataFrame``:
    df = pd.read_xml(xml, stylesheet=xsl)
    df
 
+For very large XML files that can range from hundreds of megabytes to gigabytes, :func:`pandas.read_xml`
+supports parsing such sizeable files using `lxml's iterparse`_ and `etree's iterparse`_
+which are memory-efficient methods to iterate through an XML tree and extract specific elements and attributes
+without holding the entire tree in memory.
+
+.. _`lxml's iterparse`: https://lxml.de/3.2/parsing.html#iterparse-and-iterwalk
+.. _`etree's iterparse`: https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.iterparse
+
+To use, you must pass the XML file path into ``read_xml`` and use the ``iterparse`` argument. Files should
+not be compressed or from online sources but stored on local disk. Also, ``iterparse`` should be a dictionary
+where the key is the repeating node in the document (each occurrence becomes a row) and the value is a list of any
+element or attribute that is a descendant (i.e., child, grandchild) of the repeating node. Since XPath is not
+used in this method, descendants do not need to share the same relationship with one another. Below is an example
+of reading in Wikipedia's very large (10 GB+) latest article data dump.
+
+.. code-block:: ipython
+
+    In [1]: df = pd.read_xml(
+    ...      "/path/to/downloaded/enwikisource-latest-pages-articles.xml",
+    ...      iterparse = {"page": ["title", "ns", "id"]}
+    ...  )
+    ...  df
+    Out[2]:
+                                                         title   ns        id
+    0                                       Gettysburg Address    0     21450
+    1                                                Main Page    0     42950
+    2                            Declaration by United Nations    0      8435
+    3             Constitution of the United States of America    0      8435
+    4                     Declaration of Independence (Israel)    0     17858
+    ...                                                    ...  ...       ...
+    3578760               Page:Black cat 1897 07 v2 n10.pdf/17  104    219649
+    3578761               Page:Black cat 1897 07 v2 n10.pdf/43  104    219649
+    3578762               Page:Black cat 1897 07 v2 n10.pdf/44  104    219649
+    3578763      The History of Tom Jones, a Foundling/Book IX    0  12084291
+    3578764  Page:Shakespeare of Stratford (1926) Yale.djvu/91  104     21450
+
+    [3578765 rows x 3 columns]
+
+.. versionadded:: 1.5.0
+
 
 .. _io.xml:
 
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 0f5b4a16d2f01..aa6c41eedf974 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -131,6 +131,44 @@ apply converter methods, and parse dates (:issue:`43567`).
     df
     df.dtypes
 
+.. _whatsnew_150.read_xml_iterparse:
+
+read_xml now supports large XML using iterparse
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For very large XML files that can range from hundreds of megabytes to gigabytes, :func:`pandas.read_xml`
+now supports parsing such sizeable files using `lxml's iterparse`_ and `etree's iterparse`_
+which are memory-efficient methods to iterate through XML trees and extract specific elements
+and attributes without holding the entire tree in memory.
+
+.. code-block:: ipython
+
+    In [1]: df = pd.read_xml(
+    ...      "/path/to/downloaded/enwikisource-latest-pages-articles.xml",
+    ...      iterparse = {"page": ["title", "ns", "id"]}
+    ...  )
+    ...  df
+    Out[2]:
+                                                         title   ns        id
+    0                                       Gettysburg Address    0     21450
+    1                                                Main Page    0     42950
+    2                            Declaration by United Nations    0      8435
+    3             Constitution of the United States of America    0      8435
+    4                     Declaration of Independence (Israel)    0     17858
+    ...                                                    ...  ...       ...
+    3578760               Page:Black cat 1897 07 v2 n10.pdf/17  104    219649
+    3578761               Page:Black cat 1897 07 v2 n10.pdf/43  104    219649
+    3578762               Page:Black cat 1897 07 v2 n10.pdf/44  104    219649
+    3578763      The History of Tom Jones, a Foundling/Book IX    0  12084291
+    3578764  Page:Shakespeare of Stratford (1926) Yale.djvu/91  104     21450
+
+    [3578765 rows x 3 columns]
+
+
+.. _`lxml's iterparse`: https://lxml.de/3.2/parsing.html#iterparse-and-iterwalk
+.. _`etree's iterparse`: https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.iterparse
+
+
 .. _whatsnew_150.api_breaking.other:
 
 Other API changes
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 8e463c94340c8..f80a48ff8a32d 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -5,7 +5,10 @@
 from __future__ import annotations
 
 import io
-from typing import Sequence
+from typing import (
+    Any,
+    Sequence,
+)
 
 from pandas._typing import (
     CompressionOptions,
@@ -35,6 +38,7 @@
 from pandas.io.common import (
     file_exists,
     get_handle,
+    infer_compression,
     is_fsspec_url,
     is_url,
     stringify_path,
@@ -134,6 +138,7 @@ def __init__(
         parse_dates: ParseDatesArg | None,
         encoding: str | None,
         stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
+        iterparse: dict[str, list[str]] | None,
         compression: CompressionOptions,
         storage_options: StorageOptions,
     ):
@@ -148,6 +153,7 @@ def __init__(
         self.parse_dates = parse_dates
         self.encoding = encoding
         self.stylesheet = stylesheet
+        self.iterparse = iterparse
         self.is_style = None
         self.compression = compression
         self.storage_options = storage_options
@@ -184,6 +190,28 @@ def _parse_nodes(self) -> list[dict[str, str | None]]:
 
         raise AbstractMethodError(self)
 
+    def _iterparse_nodes(self) -> list[dict[str, str | None]]:
+        """
+        Parse xml nodes.
+
+        This method will parse elements and underlying descendants
+        and attributes by iterparse, a method to iterate through an XML
+        tree without holding the entire XML tree in memory.
+
+        Raises
+        ------
+        ValueError
+            * If only elements and only attributes are specified.
+
+        Notes
+        -----
+        Namespace URIs will be removed from return node values. Also,
+        elements with missing children or attributes compared to siblings
+        will have optional keys filled with None values.
+        """
+
+        raise AbstractMethodError(self)
+
     def _validate_path(self) -> None:
         """
         Validate xpath.
@@ -239,12 +267,17 @@ def parse_data(self) -> list[dict[str, str | None]]:
                 "To use stylesheet, you need lxml installed and selected as parser."
             )
 
-        self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
+        if self.iterparse is None:
+            self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
+            self._validate_path()
 
-        self._validate_path()
         self._validate_names()
 
-        return self._parse_nodes()
+        xml_dicts: list[dict[str, str | None]] = (
+            self._parse_nodes() if self.iterparse is None else self._iterparse_nodes()
+        )
+
+        return xml_dicts
 
     def _parse_nodes(self) -> list[dict[str, str | None]]:
         elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
@@ -330,6 +363,67 @@ def _parse_nodes(self) -> list[dict[str, str | None]]:
 
         return dicts
 
+    def _iterparse_nodes(self) -> list[dict[str, str | None]]:
+        from xml.etree.ElementTree import iterparse
+
+        dicts: list[dict[str, str | None]] = []
+        row: dict[str, str | None] | None = None
+
+        if not isinstance(self.iterparse, dict):
+            raise TypeError(
+                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
+            )
+
+        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
+        if not is_list_like(self.iterparse[row_node]):
+            raise TypeError(
+                f"{type(self.iterparse[row_node])} is not a valid type "
+                "for value in iterparse"
+            )
+
+        if (
+            not isinstance(self.path_or_buffer, str)
+            or is_url(self.path_or_buffer)
+            or is_fsspec_url(self.path_or_buffer)
+            or self.path_or_buffer.startswith(("<?xml", "<"))
+            or infer_compression(self.path_or_buffer, "infer") is not None
+        ):
+            raise ParserError(
+                "iterparse is designed for large XML files that are fully extracted on "
+                "local disk and not as compressed files or online sources."
+            )
+
+        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
+            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag
+
+            if event == "start":
+                if curr_elem == row_node:
+                    row = {}
+
+            if row is not None:
+                for col in self.iterparse[row_node]:
+                    if curr_elem == col:
+                        row[col] = elem.text.strip() if elem.text else None
+                    if col in elem.attrib:
+                        row[col] = elem.attrib[col]
+
+            if event == "end":
+                if curr_elem == row_node and row is not None:
+                    dicts.append(row)
+                    row = None
+                elem.clear()
+
+        if dicts == []:
+            raise ParserError("No result from selected items in iterparse.")
+
+        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
+        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
+
+        if self.names:
+            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]
+
+        return dicts
+
     def _validate_path(self) -> None:
         """
         Notes
@@ -360,9 +454,14 @@ def _validate_path(self) -> None:
             )
 
     def _validate_names(self) -> None:
+        children: list[Any]
+
         if self.names:
-            parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
-            children = parent.findall("*") if parent else []
+            if self.iterparse:
+                children = self.iterparse[next(iter(self.iterparse))]
+            else:
+                parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
+                children = parent.findall("*") if parent else []
 
             if is_list_like(self.names):
                 if len(self.names) < len(children):
@@ -412,16 +511,22 @@ def parse_data(self) -> list[dict[str, str | None]]:
         """
         from lxml.etree import XML
 
-        self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
+        if self.iterparse is None:
+            self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
 
-        if self.stylesheet is not None:
-            self.xsl_doc = XML(self._parse_doc(self.stylesheet))
-            self.xml_doc = XML(self._transform_doc())
+            if self.stylesheet:
+                self.xsl_doc = XML(self._parse_doc(self.stylesheet))
+                self.xml_doc = XML(self._transform_doc())
+
+            self._validate_path()
 
-        self._validate_path()
         self._validate_names()
 
-        return self._parse_nodes()
+        xml_dicts: list[dict[str, str | None]] = (
+            self._parse_nodes() if self.iterparse is None else self._iterparse_nodes()
+        )
+
+        return xml_dicts
 
     def _parse_nodes(self) -> list[dict[str, str | None]]:
         elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
@@ -506,6 +611,70 @@ def _parse_nodes(self) -> list[dict[str, str | None]]:
 
         return dicts
 
+    def _iterparse_nodes(self) -> list[dict[str, str | None]]:
+        from lxml.etree import iterparse
+
+        dicts: list[dict[str, str | None]] = []
+        row: dict[str, str | None] | None = None
+
+        if not isinstance(self.iterparse, dict):
+            raise TypeError(
+                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
+            )
+
+        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
+        if not is_list_like(self.iterparse[row_node]):
+            raise TypeError(
+                f"{type(self.iterparse[row_node])} is not a valid type "
+                "for value in iterparse"
+            )
+
+        if (
+            not isinstance(self.path_or_buffer, str)
+            or is_url(self.path_or_buffer)
+            or is_fsspec_url(self.path_or_buffer)
+            or self.path_or_buffer.startswith(("<?xml", "<"))
+            or infer_compression(self.path_or_buffer, "infer") is not None
+        ):
+            raise ParserError(
+                "iterparse is designed for large XML files that are fully extracted on "
+                "local disk and not as compressed files or online sources."
+            )
+
+        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
+            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag
+
+            if event == "start":
+                if curr_elem == row_node:
+                    row = {}
+
+            if row is not None:
+                for col in self.iterparse[row_node]:
+                    if curr_elem == col:
+                        row[col] = elem.text.strip() if elem.text else None
+                    if col in elem.attrib:
+                        row[col] = elem.attrib[col]
+
+            if event == "end":
+                if curr_elem == row_node and row is not None:
+                    dicts.append(row)
+                    row = None
+
+                elem.clear()
+                while elem.getprevious() is not None:
+                    del elem.getparent()[0]
+
+        if dicts == []:
+            raise ParserError("No result from selected items in iterparse.")
+
+        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
+        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
+
+        if self.names:
+            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]
+
+        return dicts
+
     def _validate_path(self) -> None:
 
         msg = (
@@ -527,21 +696,15 @@ def _validate_path(self) -> None:
             raise ValueError(msg)
 
     def _validate_names(self) -> None:
-        """
-        Validate names.
-
-        This method will check if names is a list and aligns with
-        length of parse nodes.
+        children: list[Any]
 
-        Raises
-        ------
-        ValueError
-            * If value is not a list and less then length of nodes.
-        """
         if self.names:
-            children = self.xml_doc.xpath(
-                self.xpath + "[1]/*", namespaces=self.namespaces
-            )
+            if self.iterparse:
+                children = self.iterparse[next(iter(self.iterparse))]
+            else:
+                children = self.xml_doc.xpath(
+                    self.xpath + "[1]/*", namespaces=self.namespaces
+                )
 
             if is_list_like(self.names):
                 if len(self.names) < len(children):
@@ -703,6 +866,7 @@ def _parse(
     encoding: str | None,
     parser: XMLParsers,
     stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
+    iterparse: dict[str, list[str]] | None,
     compression: CompressionOptions,
     storage_options: StorageOptions,
     **kwargs,
@@ -740,6 +904,7 @@ def _parse(
                 parse_dates,
                 encoding,
                 stylesheet,
+                iterparse,
                 compression,
                 storage_options,
             )
@@ -759,6 +924,7 @@ def _parse(
             parse_dates,
             encoding,
             stylesheet,
+            iterparse,
             compression,
             storage_options,
         )
@@ -797,6 +963,7 @@ def read_xml(
     encoding: str | None = "utf-8",
     parser: XMLParsers = "lxml",
     stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
+    iterparse: dict[str, list[str]] | None = None,
     compression: CompressionOptions = "infer",
     storage_options: StorageOptions = None,
 ) -> DataFrame:
@@ -889,6 +1056,20 @@ def read_xml(
         transformation and not the original XML document. Only XSLT 1.0
         scripts and not later versions is currently supported.
 
+    iterparse : dict, optional
+        The nodes or attributes to retrieve in iterparsing of XML document
+        as a dict with key being the name of repeating element and value being
+        list of elements or attribute names that are descendants of the repeated
+        element. Note: If this option is used, it will replace xpath parsing
+        and unlike xpath, descendants do not need to relate to each other but can
+        exist anywhere in the document under the repeating element. This memory-
+        efficient method should be used for very large XML files (500MB, 1GB, or 5GB+).
+        For example, ::
+
+            iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}}
+
+        .. versionadded:: 1.5.0
+
     {decompression_options}
 
         .. versionchanged:: 1.4.0 Zstandard support.
@@ -1021,6 +1202,7 @@ def read_xml(
         encoding=encoding,
         parser=parser,
         stylesheet=stylesheet,
+        iterparse=iterparse,
         compression=compression,
         storage_options=storage_options,
     )
diff --git a/pandas/tests/io/xml/test_xml_iterparse.py b/pandas/tests/io/xml/test_xml_iterparse.py
new file mode 100644
index 0000000000000..4398786c87f3b
--- /dev/null
+++ b/pandas/tests/io/xml/test_xml_iterparse.py
@@ -0,0 +1,746 @@
+from __future__ import annotations
+
+import pytest
+
+from pandas.errors import ParserError
+import pandas.util._test_decorators as td
+
+from pandas import (
+    DataFrame,
+    Series,
+    to_datetime,
+)
+import pandas._testing as tm
+
+from pandas.io.common import get_handle
+from pandas.io.xml import read_xml
+
+
+@pytest.fixture(params=[pytest.param("lxml", marks=td.skip_if_no("lxml")), "etree"])
+def parser(request):
+    return request.param
+
+
+@pytest.fixture(params=["rb", "r"])
+def mode(request):
+    return request.param
+
+
+geom_df = DataFrame(
+    {
+        "shape": ["square", "circle", "triangle"],
+        "degrees": [360, 360, 180],
+        "sides": [4, float("nan"), 3],
+    }
+)
+
+xml_str = """\
+<?xml version='1.0' encoding='utf-8'?>
+<data>
+  <row>
+    <shape>square</shape>
+    <degrees>00360</degrees>
+    <sides>4.0</sides>
+    <date>2020-01-01</date>
+   </row>
+  <row>
+    <shape>circle</shape>
+    <degrees>00360</degrees>
+    <sides/>
+    <date>2021-01-01</date>
+  </row>
+  <row>
+    <shape>triangle</shape>
+    <degrees>00180</degrees>
+    <sides>3.0</sides>
+    <date>2022-01-01</date>
+  </row>
+</data>"""
+
+xml_prefix_nmsp = """\
+<?xml version='1.0' encoding='utf-8'?>
+<doc:data xmlns:doc="http://example.com">
+  <doc:row>
+    <doc:shape>square</doc:shape>
+    <doc:degrees>360</doc:degrees>
+    <doc:sides>4.0</doc:sides>
+  </doc:row>
+  <doc:row>
+    <doc:shape>circle</doc:shape>
+    <doc:degrees>360</doc:degrees>
+    <doc:sides/>
+  </doc:row>
+  <doc:row>
+    <doc:shape>triangle</doc:shape>
+    <doc:degrees>180</doc:degrees>
+    <doc:sides>3.0</doc:sides>
+  </doc:row>
+</doc:data>"""
+
+bad_xml = """\
+<?xml version='1.0' encoding='utf-8'?>
+  <row>
+    <shape>square</shape>
+    <degrees>00360</degrees>
+    <sides>4.0</sides>
+    <date>2020-01-01</date>
+   </row>
+  <row>
+    <shape>circle</shape>
+    <degrees>00360</degrees>
+    <sides/>
+    <date>2021-01-01</date>
+  </row>
+  <row>
+    <shape>triangle</shape>
+    <degrees>00180</degrees>
+    <sides>3.0</sides>
+    <date>2022-01-01</date>
+  </row>
+"""
+
+# FILE
+
+
+def test_file(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+    df_iter = read_xml(
+        filename,
+        parser=parser,
+        iterparse={"book": ["category", "title", "year", "author", "price"]},
+    )
+
+    df_expected = DataFrame(
+        {
+            "category": ["cooking", "children", "web"],
+            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
+            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
+            "year": [2005, 2005, 2003],
+            "price": [30.00, 29.99, 39.95],
+        }
+    )
+
+    tm.assert_frame_equal(df_iter, df_expected)
+
+
+def test_file_xpath_compare(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+    df_xpath = read_xml(filename, parser=parser)
+    df_iter = read_xml(
+        filename,
+        parser=parser,
+        iterparse={"book": ["category", "title", "author", "year", "price"]},
+    )
+
+    tm.assert_frame_equal(df_xpath, df_iter)
+
+
+# LARGE FILE
+
+
+@tm.network
+@pytest.mark.slow
+def test_large_url_xpath_compare(parser):
+    with tm.ensure_clean(filename="cta.xml") as path:
+        url = (
+            "https://data.cityofchicago.org/api/views/"
+            "8pix-ypme/rows.xml?accessType=DOWNLOAD"
+        )
+        (read_xml(url, xpath=".//row/row", parser=parser).to_xml(path, index=False))
+
+        df_xpath = read_xml(path, parser=parser)
+        df_iter = read_xml(
+            path,
+            parser=parser,
+            iterparse={
+                "row": [
+                    "_id",
+                    "_uuid",
+                    "_position",
+                    "_address",
+                    "stop_id",
+                    "direction_id",
+                    "stop_name",
+                    "station_name",
+                    "station_descriptive_name",
+                    "map_id",
+                    "ada",
+                    "red",
+                    "blue",
+                    "g",
+                    "brn",
+                    "p",
+                    "pexp",
+                    "y",
+                    "pnk",
+                    "o",
+                    "location",
+                ]
+            },
+        )
+
+    tm.assert_frame_equal(df_xpath, df_iter)
+
+
+# NAMESPACES
+
+
+def test_namespace_prefix(parser):
+    with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path:
+        with open(path, "w") as f:
+            f.write(xml_prefix_nmsp)
+
+        df_iter = read_xml(
+            path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]}
+        )
+
+    df_expected = DataFrame(
+        {
+            "shape": ["square", "circle", "triangle"],
+            "degrees": [360, 360, 180],
+            "sides": [4.0, float("nan"), 3.0],
+        }
+    )
+
+    tm.assert_frame_equal(df_iter, df_expected)
+
+
+def test_namespace_prefix_xpath_compare(parser):
+    with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path:
+        with open(path, "w") as f:
+            f.write(xml_prefix_nmsp)
+
+        df_xpath = read_xml(
+            path,
+            xpath=".//ns:row",
+            namespaces={"ns": "http://example.com"},
+            parser=parser,
+        )
+        df_iter = read_xml(
+            path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]}
+        )
+
+        tm.assert_frame_equal(df_xpath, df_iter)
+
+
+def test_default_namespace_xpath_compare(datapath):
+    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
+
+    df_xpath = read_xml(
+        kml, xpath=".//k:Placemark", namespaces={"k": "http://www.opengis.net/kml/2.2"}
+    )
+
+    df_iter = read_xml(
+        kml,
+        iterparse={
+            "Placemark": [
+                "id",
+                "name",
+                "Snippet",
+                "description",
+                "styleUrl",
+                "MultiGeometry",
+            ]
+        },
+    )
+
+    tm.assert_frame_equal(df_xpath, df_iter)
+
+
+# ELEMS_ONLY
+
+
+def test_elems_only(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+
+    df_iter = read_xml(
+        filename,
+        parser=parser,
+        iterparse={"book": ["title", "author", "year", "price"]},
+    )
+
+    df_expected = DataFrame(
+        {
+            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
+            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
+            "year": [2005, 2005, 2003],
+            "price": [30.00, 29.99, 39.95],
+        }
+    )
+
+    tm.assert_frame_equal(df_iter, df_expected)
+
+
+def test_elems_only_xpath_compare(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+    df_xpath = read_xml(filename, elems_only=True, parser=parser)
+    df_iter = read_xml(
+        filename,
+        parser=parser,
+        iterparse={"book": ["title", "author", "year", "price"]},
+    )
+
+    tm.assert_frame_equal(df_xpath, df_iter)
+
+
+# ATTRS_ONLY
+
+
+def test_attrs_only(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+    df_iter = read_xml(filename, parser=parser, iterparse={"book": ["category"]})
+    df_expected = DataFrame({"category": ["cooking", "children", "web"]})
+
+    tm.assert_frame_equal(df_iter, df_expected)
+
+
+def test_attrs_only_xpath_compare(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+    df_xpath = read_xml(filename, attrs_only=True, parser=parser)
+    df_iter = read_xml(filename, parser=parser, iterparse={"book": ["category"]})
+
+    tm.assert_frame_equal(df_xpath, df_iter)
+
+
+# NAMES
+
+
+def test_names(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+
+    df_iter = read_xml(
+        filename,
+        parser=parser,
+        names=["b_category", "b_title", "b_author", "b_year", "b_price"],
+        iterparse={"book": ["category", "title", "author", "year", "price"]},
+    )
+
+    df_expected = DataFrame(
+        {
+            "b_category": ["cooking", "children", "web"],
+            "b_title": ["Everyday Italian", "Harry Potter", "Learning XML"],
+            "b_author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
+            "b_year": [2005, 2005, 2003],
+            "b_price": [30.00, 29.99, 39.95],
+        }
+    )
+
+    tm.assert_frame_equal(df_iter, df_expected)
+
+
+def test_names_xpath_compare(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+    df_xpath = read_xml(
+        filename,
+        parser=parser,
+        names=["b_category", "b_title", "b_author", "b_year", "b_price"],
+    )
+    df_iter = read_xml(
+        filename,
+        parser=parser,
+        names=["b_category", "b_title", "b_author", "b_year", "b_price"],
+        iterparse={"book": ["category", "title", "author", "year", "price"]},
+    )
+
+    tm.assert_frame_equal(df_xpath, df_iter)
+
+
+# DTYPE
+
+
+def test_dtypes(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+
+    df_iter = read_xml(
+        filename,
+        parser=parser,
+        dtype={"year": "Int64", "price": "Float64"},
+        iterparse={"book": ["category", "title", "year", "author", "price"]},
+    )
+
+    df_expected = DataFrame(
+        {
+            "category": ["cooking", "children", "web"],
+            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
+            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
+            "year": Series([2005, 2005, 2003]).astype("Int64"),
+            "price": Series([30.00, 29.99, 39.95]).astype("Float64"),
+        }
+    )
+
+    tm.assert_frame_equal(df_iter, df_expected)
+
+
+def test_dtypes_xpath_compare(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+
+    df_xpath = read_xml(
+        filename, parser=parser, dtype={"year": "Int64", "price": "Float64"}
+    )
+
+    df_iter = read_xml(
+        filename,
+        parser=parser,
+        dtype={"year": "Int64", "price": "Float64"},
+        iterparse={"book": ["category", "title", "year", "author", "price"]},
+    )
+
+    tm.assert_frame_equal(df_xpath, df_iter)
+
+
+# CONVERTERS
+
+
+def test_converters(parser):
+    convert_to_datetime = lambda x: to_datetime(x)
+    with tm.ensure_clean(filename="xml_string.xml") as path:
+        with open(path, "w") as f:
+            f.write(xml_str)
+
+        df_iter = read_xml(
+            path,
+            converters={"date": convert_to_datetime},
+            parser=parser,
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
+
+    df_expected = DataFrame(
+        {
+            "shape": ["square", "circle", "triangle"],
+            "degrees": [360, 360, 180],
+            "sides": [4.0, float("nan"), 3.0],
+            "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]),
+        }
+    )
+
+    tm.assert_frame_equal(df_iter, df_expected)
+
+
+def test_converters_xpath_compare(parser):
+    convert_to_datetime = lambda x: to_datetime(x)
+    with tm.ensure_clean(filename="xml_string.xml") as path:
+        with open(path, "w") as f:
+            f.write(xml_str)
+
+        df_xpath = read_xml(
+            path, converters={"date": convert_to_datetime}, parser=parser
+        )
+
+        df_iter = read_xml(
+            path,
+            converters={"date": convert_to_datetime},
+            parser=parser,
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
+
+    tm.assert_frame_equal(df_xpath, df_iter)
+
+
+# PARSE_DATES
+
+
+def test_date_parse(parser):
+    with tm.ensure_clean(filename="xml_string.xml") as path:
+        with open(path, "w") as f:
+            f.write(xml_str)
+
+        df_iter = read_xml(
+            path,
+            parse_dates=["date"],
+            parser=parser,
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
+
+    df_expected = DataFrame(
+        {
+            "shape": ["square", "circle", "triangle"],
+            "degrees": [360, 360, 180],
+            "sides": [4.0, float("nan"), 3.0],
+            "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]),
+        }
+    )
+
+    tm.assert_frame_equal(df_iter, df_expected)
+
+
+def test_date_parse_xpath_compare(parser):
+    with tm.ensure_clean(filename="xml_string.xml") as path:
+        with open(path, "w") as f:
+            f.write(xml_str)
+
+        df_xpath = read_xml(path, parse_dates=["date"], parser=parser)
+
+        df_iter = read_xml(
+            path,
+            parse_dates=["date"],
+            parser=parser,
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
+
+    tm.assert_frame_equal(df_xpath, df_iter)
+
+
+# ENCODING
+
+
+def test_encoding(datapath, parser):
+    filename = datapath("io", "data", "xml", "baby_names.xml")
+
+    df_iter = read_xml(
+        filename,
+        parser=parser,
+        encoding="ISO-8859-1",
+        iterparse={"row": ["rank", "malename", "femalename"]},
+    )
+
+    df_expected = DataFrame(
+        {
+            "rank": [1, 2, 3, 4, 5],
+            "malename": ["José", "Luis", "Carlos", "Juan", "Jorge"],
+            "femalename": ["Sofía", "Valentina", "Isabella", "Camila", "Valeria"],
+        }
+    )
+
+    tm.assert_frame_equal(df_iter.head(), df_expected)
+
+
+def test_encoding_xpath_compare(datapath, parser):
+    filename = datapath("io", "data", "xml", "baby_names.xml")
+    df_xpath = read_xml(filename, parser=parser, encoding="ISO-8859-1")
+
+    df_iter = read_xml(
+        filename,
+        parser=parser,
+        encoding="ISO-8859-1",
+        iterparse={"row": ["rank", "malename", "femalename"]},
+    )
+
+    tm.assert_frame_equal(df_xpath, df_iter)
+
+
+# STYLESHEET
+
+
+@td.skip_if_no("lxml")
+def test_stylesheet_xpath_compare(datapath):
+    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
+    xsl = datapath("io", "data", "xml", "flatten_doc.xsl")
+
+    df_style = read_xml(
+        kml,
+        xpath=".//k:Placemark",
+        namespaces={"k": "http://www.opengis.net/kml/2.2"},
+        stylesheet=xsl,
+    )
+
+    df_iter = read_xml(
+        kml,
+        iterparse={
+            "Placemark": [
+                "id",
+                "name",
+                "styleUrl",
+                "extrude",
+                "altitudeMode",
+                "coordinates",
+            ]
+        },
+    )
+
+    tm.assert_frame_equal(df_style, df_iter)
+
+
+# COMPRESSION
+
+
+def test_compression_compare(parser, compression_only):
+    with tm.ensure_clean() as comp_path, tm.ensure_clean() as ext_path:
+        geom_df.to_xml(comp_path, parser=parser, compression=compression_only)
+
+        with get_handle(comp_path, "r", compression=compression_only) as handles:
+            with open(ext_path, "w") as f:
+                f.write(handles.handle.read())
+
+            df_iter = read_xml(
+                ext_path,
+                parser=parser,
+                iterparse={"row": ["shape", "degrees", "sides"]},
+                compression=compression_only,
+            )
+
+    tm.assert_frame_equal(geom_df, df_iter)
+
+
+# STORAGE OPTIONS
+
+
+@tm.network
+@pytest.mark.slow
+def test_s3_xpath_compare(parser):
+    # Python Software Foundation (2019 IRS-990 RETURN)
+    s3_path = "s3://irs-form-990/201923199349319487_public.xml"
+
+    df_xpath = read_xml(
+        s3_path,
+        xpath=".//irs:Form990PartVIISectionAGrp",
+        namespaces={"irs": "http://www.irs.gov/efile"},
+        parser=parser,
+        storage_options={"anon": True},
+    )
+
+    with tm.ensure_clean(filename="irs990.xml") as path:
+        with get_handle(s3_path, "rb", is_text=False) as handles:
+            with open(path, "wb") as f:
+                f.write(handles.handle.read())
+
+        df_iter = read_xml(
+            path,
+            parser=parser,
+            iterparse={
+                "Form990PartVIISectionAGrp": [
+                    "PersonNm",
+                    "TitleTxt",
+                    "AverageHoursPerWeekRt",
+                    "AverageHoursPerWeekRltdOrgRt",
+                    "IndividualTrusteeOrDirectorInd",
+                    "OfficerInd",
+                    "ReportableCompFromOrgAmt",
+                    "ReportableCompFromRltdOrgAmt",
+                    "OtherCompensationAmt",
+                    "HighestCompensatedEmployeeInd",
+                ]
+            },
+        )
+
+    tm.assert_frame_equal(df_xpath, df_iter)
+
+
+# PARSER ERROR
+
+
+def test_string_error(parser):
+    with pytest.raises(
+        ParserError, match=("iterparse is designed for large XML files")
+    ):
+        read_xml(
+            xml_str,
+            parser=parser,
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
+
+
+def test_file_like_error(datapath, parser, mode):
+    filename = datapath("io", "data", "xml", "books.xml")
+    with pytest.raises(
+        ParserError, match=("iterparse is designed for large XML files")
+    ):
+        with open(filename) as f:
+            read_xml(
+                f,
+                parser=parser,
+                iterparse={"book": ["category", "title", "year", "author", "price"]},
+            )
+
+
+@tm.network
+def test_url_path_error(parser):
+    url = "https://www.w3schools.com/xml/books.xml"
+    with pytest.raises(
+        ParserError, match=("iterparse is designed for large XML files")
+    ):
+        read_xml(
+            url,
+            parser=parser,
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
+
+
+def test_compression_error(parser, compression_only):
+    with tm.ensure_clean(filename="geom_xml.zip") as path:
+        geom_df.to_xml(path, parser=parser, compression=compression_only)
+
+        with pytest.raises(
+            ParserError, match=("iterparse is designed for large XML files")
+        ):
+            read_xml(
+                path,
+                parser=parser,
+                iterparse={"row": ["shape", "degrees", "sides", "date"]},
+                compression=compression_only,
+            )
+
+
+@tm.network
+@td.skip_if_no("s3fs")
+def test_storage_options_error(parser):
+    # Python Software Foundation (2019 IRS-990 RETURN)
+    s3 = "s3://irs-form-990/201923199349319487_public.xml"
+    with pytest.raises(
+        ParserError, match=("iterparse is designed for large XML files")
+    ):
+        read_xml(
+            s3,
+            parser=parser,
+            iterparse={
+                "Form990PartVIISectionAGrp": [
+                    "PersonNm",
+                    "TitleTxt",
+                    "AverageHoursPerWeekRt",
+                    "AverageHoursPerWeekRltdOrgRt",
+                    "IndividualTrusteeOrDirectorInd",
+                    "OfficerInd",
+                    "ReportableCompFromOrgAmt",
+                    "ReportableCompFromRltdOrgAmt",
+                    "OtherCompensationAmt",
+                ]
+            },
+            storage_options={"anon": True},
+        )
+
+
+# OTHER EXCEPTIONS
+
+
+def test_wrong_dict_type(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+    with pytest.raises(TypeError, match="list is not a valid type for iterparse"):
+        read_xml(
+            filename,
+            parser=parser,
+            iterparse=["category", "title", "year", "author", "price"],
+        )
+
+
+def test_wrong_dict_value(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+    with pytest.raises(
+        TypeError, match="<class 'str'> is not a valid type for value in iterparse"
+    ):
+        read_xml(filename, parser=parser, iterparse={"book": "category"})
+
+
+def test_bad_xml(datapath, parser):
+    with tm.ensure_clean(filename="bad.xml") as path:
+        with open(path, "w") as f:
+            f.write(bad_xml)
+
+        with pytest.raises(
+            SyntaxError, match="Extra content at the end of the document"
+        ):
+            read_xml(
+                path,
+                parse_dates=["date"],
+                iterparse={"row": ["shape", "degrees", "sides", "date"]},
+            )
+
+
+def test_no_result(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+    with pytest.raises(
+        ParserError, match="No result from selected items in iterparse."
+    ):
+        read_xml(
+            filename,
+            parser=parser,
+            iterparse={"node": ["attr1", "elem1", "elem2", "elem3"]},
+        )

From 4011b4b467b0788fcb203ca10d4d5c9ccaad6374 Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Mon, 31 Jan 2022 08:03:34 -0600
Subject: [PATCH 2/6] Combine tests, slightly fix docs

---
 doc/source/user_guide/io.rst              |  23 +-
 doc/source/whatsnew/v1.5.0.rst            |   4 +-
 pandas/io/xml.py                          |  29 +-
 pandas/tests/io/xml/test_xml.py           | 293 ++++++++-
 pandas/tests/io/xml/test_xml_dtypes.py    | 122 ++++
 pandas/tests/io/xml/test_xml_iterparse.py | 746 ----------------------
 6 files changed, 415 insertions(+), 802 deletions(-)
 delete mode 100644 pandas/tests/io/xml/test_xml_iterparse.py

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 43baaaebecd11..f6411d6041871 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -3292,23 +3292,25 @@ supports parsing such sizeable files using `lxml's iterparse`_ and `etree's iter
 which are memory-efficient methods to iterate through an XML tree and extract specific elements and attributes.
 without holding entire tree in memory.
 
+    .. versionadded:: 1.5.0
+
 .. _`lxml's iterparse`: https://lxml.de/3.2/parsing.html#iterparse-and-iterwalk
 .. _`etree's iterparse`: https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.iterparse
 
-To use, you must pass the XML file path into ``read_xml`` and use the ``iterparse`` argument. Files should
-not be compressed or from online sources but stored on local disk. Also, ``iterparse`` should be a dictionary
-where the key is the repeating nodes in document (which become the rows) and the value is a list of any
-element or attribute that is a descendant (i.e., child, grandchild) of repeating node. Since XPath is not
+To use this feature, you must pass a physical XML file path into ``read_xml`` and use the ``iterparse`` argument.
+Files should not be compressed or point to online sources but should be stored on local disk. Also, ``iterparse``
+should be a dictionary where the key is the repeating node in the document (each occurrence becomes a row) and the
+value is a list of any element or attribute that is a descendant (i.e., child, grandchild) of the repeating node. Since XPath is not
 used in this method, descendants do not need to share same relationship with one another. Below shows example
-of reading in Wikipedia's very large (10 GB+) latest article data dump.
+of reading in Wikipedia's very large (12 GB+) latest article data dump.
 
 .. code-block:: ipython
 
     In [1]: df = pd.read_xml(
-    ...      "/path/to/downloaded/enwikisource-latest-pages-articles.xml,
-    ...      iterparse = {"page": ["title", "ns", "id"]})
-    ...  )
-    ...  df
+    ...         "/path/to/downloaded/enwikisource-latest-pages-articles.xml",
+    ...         iterparse = {"page": ["title", "ns", "id"]}
+    ...     )
+    In [2]: df
     Out[2]:
                                                          title   ns        id
     0                                       Gettysburg Address    0     21450
@@ -3325,9 +3327,6 @@ of reading in Wikipedia's very large (10 GB+) latest article data dump.
 
     [3578765 rows x 3 columns]
 
-    .. versionadded:: 1.5.0
-
-
 .. _io.xml:
 
 Writing XML
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index aa6c41eedf974..f1691b1bee331 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -139,12 +139,12 @@ read_xml now supports large XML using iterparse
 For very large XML files that can range in hundreds of megabytes to gigabytes, :func:`pandas.read_xml`
 now supports parsing such sizeable files using `lxml's iterparse`_ and `etree's iterparse`_
 which are memory-efficient methods to iterate through XML trees and extract specific elements
-and attributes without holding entire tree in memory.
+and attributes without holding entire tree in memory (:issue:`45442`).
 
 .. code-block:: ipython
 
     In [1]: df = pd.read_xml(
-    ...      "/path/to/downloaded/enwikisource-latest-pages-articles.xml,
+    ...      "/path/to/downloaded/enwikisource-latest-pages-articles.xml",
-    ...      iterparse = {"page": ["title", "ns", "id"]})
+    ...      iterparse = {"page": ["title", "ns", "id"]}
     ...  )
     df
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index f80a48ff8a32d..b8aa6980cde94 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -98,6 +98,12 @@ class _XMLFrameParser:
         URL, file, file-like object, or a raw string containing XSLT,
         `etree` does not support XSLT but retained for consistency.
 
+    iterparse : dict, optional
+        Dict of row element and descendant elements and/or attributes to
+        retrieve in iterparsing of XML document.
+
+        .. versionadded:: 1.5.0
+
     {decompression_options}
 
         .. versionchanged:: 1.4.0 Zstandard support.
@@ -183,30 +189,33 @@ def _parse_nodes(self) -> list[dict[str, str | None]]:
 
         Notes
         -----
-        Namespace URIs will be removed from return node values.Also,
+        Namespace URIs will be removed from return node values. Also,
         elements with missing children or attributes compared to siblings
-        will have optional keys filled withi None values.
+        will have optional keys filled with None values.
         """
 
         raise AbstractMethodError(self)
 
     def _iterparse_nodes(self) -> list[dict[str, str | None]]:
         """
-        Parse xml nodes.
+        Iterparse xml nodes.
 
-        This method will parse elements and underlying descendants
-        and attributes by iterparse, a method to iterate through an XML
-        tree without parsing entire XML tree in memory.
+        This method will read in local disk, decompressed XML files for elements
+        and underlying descendants using iterparse, a method to iterate through
+        an XML tree without holding entire XML tree in memory.
 
         Raises
         ------
-        ValueError
-            * If only elements and only attributes are specified.
+        TypeError
+            * If `iterparse` is not a dict or its dict value is not list-like.
+        ParserError
+            * If `path_or_buffer` is not a physical, decompressed file on disk.
+            * If no data is returned from selected items in `iterparse`.
 
         Notes
         -----
-        Namespace URIs will be removed from return node values.Also,
-        elements with missing children or attributes compared to siblings
+        Namespace URIs will be removed from return node values. Also,
+        elements with missing children or attributes in submitted list
-        will have optional keys filled withi None values.
+        will have optional keys filled with None values.
         """
 
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 03c176fd7bc8b..2be8dd32f2c76 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -13,11 +13,13 @@
 import pytest
 
 from pandas.compat._optional import import_optional_dependency
+from pandas.errors import ParserError
 import pandas.util._test_decorators as td
 
 from pandas import DataFrame
 import pandas._testing as tm
 
+from pandas.io.common import get_handle
 from pandas.io.xml import read_xml
 
 """
@@ -251,21 +253,65 @@ def test_parser_consistency_file(datapath):
     df_file_lxml = read_xml(filename, parser="lxml")
     df_file_etree = read_xml(filename, parser="etree")
 
+    df_iter_lxml = read_xml(
+        filename,
+        parser="lxml",
+        iterparse={"book": ["category", "title", "year", "author", "price"]},
+    )
+    df_iter_etree = read_xml(
+        filename,
+        parser="etree",
+        iterparse={"book": ["category", "title", "year", "author", "price"]},
+    )
+
     tm.assert_frame_equal(df_file_lxml, df_file_etree)
+    tm.assert_frame_equal(df_file_lxml, df_iter_lxml)
+    tm.assert_frame_equal(df_iter_lxml, df_iter_etree)
 
 
 @tm.network
 @pytest.mark.slow
-@td.skip_if_no("lxml")
-def test_parser_consistency_url():
+def test_parser_consistency_url(parser):
     url = (
         "https://data.cityofchicago.org/api/views/"
         "8pix-ypme/rows.xml?accessType=DOWNLOAD"
     )
-    df_url_lxml = read_xml(url, xpath=".//row/row", parser="lxml")
-    df_url_etree = read_xml(url, xpath=".//row/row", parser="etree")
 
-    tm.assert_frame_equal(df_url_lxml, df_url_etree)
+    with tm.ensure_clean(filename="cta.xml") as path:
+        (read_xml(url, xpath=".//row/row", parser=parser).to_xml(path, index=False))
+
+        df_xpath = read_xml(path, parser=parser)
+        df_iter = read_xml(
+            path,
+            parser=parser,
+            iterparse={
+                "row": [
+                    "_id",
+                    "_uuid",
+                    "_position",
+                    "_address",
+                    "stop_id",
+                    "direction_id",
+                    "stop_name",
+                    "station_name",
+                    "station_descriptive_name",
+                    "map_id",
+                    "ada",
+                    "red",
+                    "blue",
+                    "g",
+                    "brn",
+                    "p",
+                    "pexp",
+                    "y",
+                    "pnk",
+                    "o",
+                    "location",
+                ]
+            },
+        )
+
+    tm.assert_frame_equal(df_xpath, df_iter)
 
 
 def test_file_like(datapath, parser, mode):
@@ -401,26 +447,6 @@ def test_wrong_file_path_etree():
         read_xml(filename, parser="etree")
 
 
-@tm.network
-@td.skip_if_no("lxml")
-def test_url():
-    url = "https://www.w3schools.com/xml/books.xml"
-    df_url = read_xml(url, xpath=".//book[count(*)=4]")
-
-    df_expected = DataFrame(
-        {
-            "category": ["cooking", "children", "web"],
-            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
-            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
-            "year": [2005, 2005, 2003],
-            "price": [30.00, 29.99, 39.95],
-            "cover": [None, None, "paperback"],
-        }
-    )
-
-    tm.assert_frame_equal(df_url, df_expected)
-
-
 @tm.network
 def test_wrong_url(parser):
     with pytest.raises(HTTPError, match=("HTTP Error 404: Not Found")):
@@ -466,6 +492,14 @@ def test_default_namespace(parser):
         parser=parser,
     )
 
+    with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path:
+        with open(path, "w") as f:
+            f.write(xml_default_nmsp)
+
+        df_iter = read_xml(
+            path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]}
+        )
+
     df_expected = DataFrame(
         {
             "shape": ["square", "circle", "triangle"],
@@ -475,6 +509,7 @@ def test_default_namespace(parser):
     )
 
     tm.assert_frame_equal(df_nmsp, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_prefix_namespace(parser):
@@ -485,6 +520,14 @@ def test_prefix_namespace(parser):
         parser=parser,
     )
 
+    with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path:
+        with open(path, "w") as f:
+            f.write(xml_prefix_nmsp)
+
+        df_iter = read_xml(
+            path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]}
+        )
+
     df_expected = DataFrame(
         {
             "shape": ["square", "circle", "triangle"],
@@ -494,6 +537,7 @@ def test_prefix_namespace(parser):
     )
 
     tm.assert_frame_equal(df_nmsp, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 @td.skip_if_no("lxml")
@@ -594,14 +638,21 @@ def test_file_elems_and_attrs(datapath, parser):
 def test_file_only_attrs(datapath, parser):
     filename = datapath("io", "data", "xml", "books.xml")
     df_file = read_xml(filename, attrs_only=True, parser=parser)
+    df_iter = read_xml(filename, parser=parser, iterparse={"book": ["category"]})
     df_expected = DataFrame({"category": ["cooking", "children", "web"]})
 
     tm.assert_frame_equal(df_file, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_file_only_elems(datapath, parser):
     filename = datapath("io", "data", "xml", "books.xml")
     df_file = read_xml(filename, elems_only=True, parser=parser)
+    df_iter = read_xml(
+        filename,
+        parser=parser,
+        iterparse={"book": ["title", "author", "year", "price"]},
+    )
     df_expected = DataFrame(
         {
             "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
@@ -612,6 +663,7 @@ def test_file_only_elems(datapath, parser):
     )
 
     tm.assert_frame_equal(df_file, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_elem_and_attrs_only(datapath, parser):
@@ -659,6 +711,12 @@ def test_names_option_output(datapath, parser):
     df_file = read_xml(
         filename, names=["Col1", "Col2", "Col3", "Col4", "Col5"], parser=parser
     )
+    df_iter = read_xml(
+        filename,
+        parser=parser,
+        names=["Col1", "Col2", "Col3", "Col4", "Col5"],
+        iterparse={"book": ["category", "title", "author", "year", "price"]},
+    )
 
     df_expected = DataFrame(
         {
@@ -671,6 +729,7 @@ def test_names_option_output(datapath, parser):
     )
 
     tm.assert_frame_equal(df_file, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_names_option_wrong_length(datapath, parser):
@@ -723,10 +782,25 @@ def test_ascii_encoding(datapath, parser):
 @td.skip_if_no("lxml")
 def test_parser_consistency_with_encoding(datapath):
     filename = datapath("io", "data", "xml", "baby_names.xml")
-    df_lxml = read_xml(filename, parser="lxml", encoding="ISO-8859-1")
-    df_etree = read_xml(filename, parser="etree", encoding="iso-8859-1")
+    df_xpath_lxml = read_xml(filename, parser="lxml", encoding="ISO-8859-1")
+    df_xpath_etree = read_xml(filename, parser="etree", encoding="iso-8859-1")
 
-    tm.assert_frame_equal(df_lxml, df_etree)
+    df_iter_lxml = read_xml(
+        filename,
+        parser="lxml",
+        encoding="ISO-8859-1",
+        iterparse={"row": ["rank", "malename", "femalename"]},
+    )
+    df_iter_etree = read_xml(
+        filename,
+        parser="etree",  # must differ from df_iter_lxml to compare parsers
+        encoding="ISO-8859-1",
+        iterparse={"row": ["rank", "malename", "femalename"]},
+    )
+
+    tm.assert_frame_equal(df_xpath_lxml, df_xpath_etree)
+    tm.assert_frame_equal(df_xpath_etree, df_iter_etree)
+    tm.assert_frame_equal(df_iter_lxml, df_iter_etree)
 
 
 @td.skip_if_no("lxml")
@@ -792,7 +866,22 @@ def test_stylesheet_file(datapath):
         stylesheet=xsl,
     )
 
+    df_iter = read_xml(
+        kml,
+        iterparse={
+            "Placemark": [
+                "id",
+                "name",
+                "styleUrl",
+                "extrude",
+                "altitudeMode",
+                "coordinates",
+            ]
+        },
+    )
+
     tm.assert_frame_equal(df_kml, df_style)
+    tm.assert_frame_equal(df_kml, df_iter)
 
 
 def test_read_xml_passing_as_positional_deprecated(datapath, parser):
@@ -1016,6 +1105,132 @@ def test_empty_stylesheet(val):
         read_xml(kml, stylesheet=val)
 
 
+# ITERPARSE
+
+
+def test_string_error(parser):
+    with pytest.raises(
+        ParserError, match=("iterparse is designed for large XML files")
+    ):
+        read_xml(
+            xml_default_nmsp,
+            parser=parser,
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
+
+
+def test_file_like_error(datapath, parser, mode):
+    filename = datapath("io", "data", "xml", "books.xml")
+    with pytest.raises(
+        ParserError, match=("iterparse is designed for large XML files")
+    ):
+        with open(filename, mode) as f:  # honor the requested ``mode`` fixture
+            read_xml(
+                f,
+                parser=parser,
+                iterparse={"book": ["category", "title", "year", "author", "price"]},
+            )
+
+
+@tm.network
+def test_url_path_error(parser):
+    url = "https://www.w3schools.com/xml/books.xml"
+    with pytest.raises(
+        ParserError, match=("iterparse is designed for large XML files")
+    ):
+        read_xml(
+            url,
+            parser=parser,
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
+
+
+def test_compression_error(parser, compression_only):
+    with tm.ensure_clean(filename="geom_xml.zip") as path:
+        geom_df.to_xml(path, parser=parser, compression=compression_only)
+
+        with pytest.raises(
+            ParserError, match=("iterparse is designed for large XML files")
+        ):
+            read_xml(
+                path,
+                parser=parser,
+                iterparse={"row": ["shape", "degrees", "sides", "date"]},
+                compression=compression_only,
+            )
+
+
+def test_wrong_dict_type(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+    with pytest.raises(TypeError, match="list is not a valid type for iterparse"):
+        read_xml(
+            filename,
+            parser=parser,
+            iterparse=["category", "title", "year", "author", "price"],
+        )
+
+
+def test_wrong_dict_value(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+    with pytest.raises(
+        TypeError, match="<class 'str'> is not a valid type for value in iterparse"
+    ):
+        read_xml(filename, parser=parser, iterparse={"book": "category"})
+
+
+def test_bad_xml(datapath, parser):
+    bad_xml = """\
+<?xml version='1.0' encoding='utf-8'?>
+  <row>
+    <shape>square</shape>
+    <degrees>00360</degrees>
+    <sides>4.0</sides>
+    <date>2020-01-01</date>
+   </row>
+  <row>
+    <shape>circle</shape>
+    <degrees>00360</degrees>
+    <sides/>
+    <date>2021-01-01</date>
+  </row>
+  <row>
+    <shape>triangle</shape>
+    <degrees>00180</degrees>
+    <sides>3.0</sides>
+    <date>2022-01-01</date>
+  </row>
+"""
+    with tm.ensure_clean(filename="bad.xml") as path:
+        with open(path, "w") as f:
+            f.write(bad_xml)
+
+        with pytest.raises(
+            SyntaxError,
+            match=(
+                "Extra content at the end of the document|"
+                "junk after document element"
+            ),
+        ):
+            read_xml(
+                path,
+                parser=parser,
+                parse_dates=["date"],
+                iterparse={"row": ["shape", "degrees", "sides", "date"]},
+            )
+
+
+def test_no_result(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+    with pytest.raises(
+        ParserError, match="No result from selected items in iterparse."
+    ):
+        read_xml(
+            filename,
+            parser=parser,
+            iterparse={"node": ["attr1", "elem1", "elem2", "elem3"]},
+        )
+
+
 @tm.network
 @td.skip_if_no("lxml")
 def test_online_stylesheet():
@@ -1055,12 +1270,26 @@ def test_online_stylesheet():
 
 
 def test_compression_read(parser, compression_only):
-    with tm.ensure_clean() as path:
-        geom_df.to_xml(path, index=False, parser=parser, compression=compression_only)
+    with tm.ensure_clean() as comp_path, tm.ensure_clean() as ext_path:
+        geom_df.to_xml(
+            comp_path, index=False, parser=parser, compression=compression_only
+        )
+
+        df_xpath = read_xml(comp_path, parser=parser, compression=compression_only)
+
+        with get_handle(comp_path, "r", compression=compression_only) as handles:
+            with open(ext_path, "w") as f:
+                f.write(handles.handle.read())
 
-        xml_df = read_xml(path, parser=parser, compression=compression_only)
+            df_iter = read_xml(
+                ext_path,
+                parser=parser,
+                iterparse={"row": ["shape", "degrees", "sides"]},
+                compression=compression_only,
+            )
 
-    tm.assert_frame_equal(xml_df, geom_df)
+    tm.assert_frame_equal(df_xpath, geom_df)
+    tm.assert_frame_equal(df_iter, geom_df)
 
 
 def test_wrong_compression(parser, compression, compression_only):
diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py
index 801461ed4288a..afe1d5de720d8 100644
--- a/pandas/tests/io/xml/test_xml_dtypes.py
+++ b/pandas/tests/io/xml/test_xml_dtypes.py
@@ -68,6 +68,16 @@ def parser(request):
 
 def test_dtype_single_str(parser):
     df_result = read_xml(xml_types, dtype={"degrees": "str"}, parser=parser)
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            f.write(xml_types)
+
+        df_iter = read_xml(
+            path,
+            parser=parser,
+            dtype={"degrees": "str"},
+            iterparse={"row": ["shape", "degrees", "sides"]},
+        )
 
     df_expected = DataFrame(
         {
@@ -78,10 +88,21 @@ def test_dtype_single_str(parser):
     )
 
     tm.assert_frame_equal(df_result, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_dtypes_all_str(parser):
     df_result = read_xml(xml_dates, dtype="string", parser=parser)
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            f.write(xml_dates)
+
+        df_iter = read_xml(
+            path,
+            parser=parser,
+            dtype="string",
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
 
     df_expected = DataFrame(
         {
@@ -94,6 +115,7 @@ def test_dtypes_all_str(parser):
     )
 
     tm.assert_frame_equal(df_result, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_dtypes_with_names(parser):
@@ -103,6 +125,17 @@ def test_dtypes_with_names(parser):
         dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64"},
         parser=parser,
     )
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            f.write(xml_dates)
+
+        df_iter = read_xml(
+            path,
+            parser=parser,
+            names=["Col1", "Col2", "Col3", "Col4"],
+            dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64"},
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
 
     df_expected = DataFrame(
         {
@@ -114,10 +147,21 @@ def test_dtypes_with_names(parser):
     )
 
     tm.assert_frame_equal(df_result, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_dtype_nullable_int(parser):
     df_result = read_xml(xml_types, dtype={"sides": "Int64"}, parser=parser)
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            f.write(xml_types)
+
+        df_iter = read_xml(
+            path,
+            parser=parser,
+            dtype={"sides": "Int64"},
+            iterparse={"row": ["shape", "degrees", "sides"]},
+        )
 
     df_expected = DataFrame(
         {
@@ -128,10 +172,21 @@ def test_dtype_nullable_int(parser):
     )
 
     tm.assert_frame_equal(df_result, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_dtype_float(parser):
     df_result = read_xml(xml_types, dtype={"degrees": "float"}, parser=parser)
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            f.write(xml_types)
+
+        df_iter = read_xml(
+            path,
+            parser=parser,
+            dtype={"degrees": "float"},
+            iterparse={"row": ["shape", "degrees", "sides"]},
+        )
 
     df_expected = DataFrame(
         {
@@ -142,6 +197,7 @@ def test_dtype_float(parser):
     )
 
     tm.assert_frame_equal(df_result, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_wrong_dtype(parser):
@@ -176,6 +232,16 @@ def test_both_dtype_converters(parser):
 
 def test_converters_str(parser):
     df_result = read_xml(xml_types, converters={"degrees": str}, parser=parser)
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            f.write(xml_types)
+
+        df_iter = read_xml(
+            path,
+            parser=parser,
+            converters={"degrees": str},
+            iterparse={"row": ["shape", "degrees", "sides"]},
+        )
 
     df_expected = DataFrame(
         {
@@ -186,6 +252,7 @@ def test_converters_str(parser):
     )
 
     tm.assert_frame_equal(df_result, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_converters_date(parser):
@@ -193,6 +260,16 @@ def test_converters_date(parser):
     df_result = read_xml(
         xml_dates, converters={"date": convert_to_datetime}, parser=parser
     )
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            f.write(xml_dates)
+
+        df_iter = read_xml(
+            path,
+            parser=parser,
+            converters={"date": convert_to_datetime},
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
 
     df_expected = DataFrame(
         {
@@ -204,6 +281,7 @@ def test_converters_date(parser):
     )
 
     tm.assert_frame_equal(df_result, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_wrong_converters_type(parser):
@@ -226,6 +304,16 @@ def test_callable_str_converters(parser):
 
 def test_parse_dates_column_name(parser):
     df_result = read_xml(xml_dates, parse_dates=["date"], parser=parser)
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            f.write(xml_dates)
+
+        df_iter = read_xml(
+            path,
+            parser=parser,
+            parse_dates=["date"],
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
 
     df_expected = DataFrame(
         {
@@ -237,10 +325,21 @@ def test_parse_dates_column_name(parser):
     )
 
     tm.assert_frame_equal(df_result, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_parse_dates_column_index(parser):
     df_result = read_xml(xml_dates, parse_dates=[3], parser=parser)
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            f.write(xml_dates)
+
+        df_iter = read_xml(
+            path,
+            parser=parser,
+            parse_dates=[3],
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
 
     df_expected = DataFrame(
         {
@@ -252,10 +351,21 @@ def test_parse_dates_column_index(parser):
     )
 
     tm.assert_frame_equal(df_result, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_parse_dates_true(parser):
     df_result = read_xml(xml_dates, parse_dates=True, parser=parser)
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            f.write(xml_dates)
+
+        df_iter = read_xml(
+            path,
+            parser=parser,
+            parse_dates=True,
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
 
     df_expected = DataFrame(
         {
@@ -267,6 +377,7 @@ def test_parse_dates_true(parser):
     )
 
     tm.assert_frame_equal(df_result, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_parse_dates_dictionary(parser):
@@ -301,6 +412,16 @@ def test_parse_dates_dictionary(parser):
     df_result = read_xml(
         xml, parse_dates={"date_end": ["year", "month", "day"]}, parser=parser
     )
+    with tm.ensure_clean() as path:
+        with open(path, "w") as f:
+            f.write(xml)
+
+        df_iter = read_xml(
+            path,
+            parser=parser,
+            parse_dates={"date_end": ["year", "month", "day"]},
+            iterparse={"row": ["shape", "degrees", "sides", "year", "month", "day"]},
+        )
 
     df_expected = DataFrame(
         {
@@ -312,6 +433,7 @@ def test_parse_dates_dictionary(parser):
     )
 
     tm.assert_frame_equal(df_result, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_day_first_parse_dates(parser):
diff --git a/pandas/tests/io/xml/test_xml_iterparse.py b/pandas/tests/io/xml/test_xml_iterparse.py
deleted file mode 100644
index 4398786c87f3b..0000000000000
--- a/pandas/tests/io/xml/test_xml_iterparse.py
+++ /dev/null
@@ -1,746 +0,0 @@
-from __future__ import annotations
-
-import pytest
-
-from pandas.errors import ParserError
-import pandas.util._test_decorators as td
-
-from pandas import (
-    DataFrame,
-    Series,
-    to_datetime,
-)
-import pandas._testing as tm
-
-from pandas.io.common import get_handle
-from pandas.io.xml import read_xml
-
-
-@pytest.fixture(params=[pytest.param("lxml", marks=td.skip_if_no("lxml")), "etree"])
-def parser(request):
-    return request.param
-
-
-@pytest.fixture(params=["rb", "r"])
-def mode(request):
-    return request.param
-
-
-geom_df = DataFrame(
-    {
-        "shape": ["square", "circle", "triangle"],
-        "degrees": [360, 360, 180],
-        "sides": [4, float("nan"), 3],
-    }
-)
-
-xml_str = """\
-<?xml version='1.0' encoding='utf-8'?>
-<data>
-  <row>
-    <shape>square</shape>
-    <degrees>00360</degrees>
-    <sides>4.0</sides>
-    <date>2020-01-01</date>
-   </row>
-  <row>
-    <shape>circle</shape>
-    <degrees>00360</degrees>
-    <sides/>
-    <date>2021-01-01</date>
-  </row>
-  <row>
-    <shape>triangle</shape>
-    <degrees>00180</degrees>
-    <sides>3.0</sides>
-    <date>2022-01-01</date>
-  </row>
-</data>"""
-
-xml_prefix_nmsp = """\
-<?xml version='1.0' encoding='utf-8'?>
-<doc:data xmlns:doc="http://example.com">
-  <doc:row>
-    <doc:shape>square</doc:shape>
-    <doc:degrees>360</doc:degrees>
-    <doc:sides>4.0</doc:sides>
-  </doc:row>
-  <doc:row>
-    <doc:shape>circle</doc:shape>
-    <doc:degrees>360</doc:degrees>
-    <doc:sides/>
-  </doc:row>
-  <doc:row>
-    <doc:shape>triangle</doc:shape>
-    <doc:degrees>180</doc:degrees>
-    <doc:sides>3.0</doc:sides>
-  </doc:row>
-</doc:data>"""
-
-bad_xml = """\
-<?xml version='1.0' encoding='utf-8'?>
-  <row>
-    <shape>square</shape>
-    <degrees>00360</degrees>
-    <sides>4.0</sides>
-    <date>2020-01-01</date>
-   </row>
-  <row>
-    <shape>circle</shape>
-    <degrees>00360</degrees>
-    <sides/>
-    <date>2021-01-01</date>
-  </row>
-  <row>
-    <shape>triangle</shape>
-    <degrees>00180</degrees>
-    <sides>3.0</sides>
-    <date>2022-01-01</date>
-  </row>
-"""
-
-# FILE
-
-
-def test_file(datapath, parser):
-    filename = datapath("io", "data", "xml", "books.xml")
-    df_iter = read_xml(
-        filename,
-        parser=parser,
-        iterparse={"book": ["category", "title", "year", "author", "price"]},
-    )
-
-    df_expected = DataFrame(
-        {
-            "category": ["cooking", "children", "web"],
-            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
-            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
-            "year": [2005, 2005, 2003],
-            "price": [30.00, 29.99, 39.95],
-        }
-    )
-
-    tm.assert_frame_equal(df_iter, df_expected)
-
-
-def test_file_xpath_compare(datapath, parser):
-    filename = datapath("io", "data", "xml", "books.xml")
-    df_xpath = read_xml(filename, parser=parser)
-    df_iter = read_xml(
-        filename,
-        parser=parser,
-        iterparse={"book": ["category", "title", "author", "year", "price"]},
-    )
-
-    tm.assert_frame_equal(df_xpath, df_iter)
-
-
-# LARGE FILE
-
-
-@tm.network
-@pytest.mark.slow
-def test_large_url_xpath_compare(parser):
-    with tm.ensure_clean(filename="cta.xml") as path:
-        url = (
-            "https://data.cityofchicago.org/api/views/"
-            "8pix-ypme/rows.xml?accessType=DOWNLOAD"
-        )
-        (read_xml(url, xpath=".//row/row", parser=parser).to_xml(path, index=False))
-
-        df_xpath = read_xml(path, parser=parser)
-        df_iter = read_xml(
-            path,
-            parser=parser,
-            iterparse={
-                "row": [
-                    "_id",
-                    "_uuid",
-                    "_position",
-                    "_address",
-                    "stop_id",
-                    "direction_id",
-                    "stop_name",
-                    "station_name",
-                    "station_descriptive_name",
-                    "map_id",
-                    "ada",
-                    "red",
-                    "blue",
-                    "g",
-                    "brn",
-                    "p",
-                    "pexp",
-                    "y",
-                    "pnk",
-                    "o",
-                    "location",
-                ]
-            },
-        )
-
-    tm.assert_frame_equal(df_xpath, df_iter)
-
-
-# NAMESPACES
-
-
-def test_namespace_prefix(parser):
-    with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path:
-        with open(path, "w") as f:
-            f.write(xml_prefix_nmsp)
-
-        df_iter = read_xml(
-            path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]}
-        )
-
-    df_expected = DataFrame(
-        {
-            "shape": ["square", "circle", "triangle"],
-            "degrees": [360, 360, 180],
-            "sides": [4.0, float("nan"), 3.0],
-        }
-    )
-
-    tm.assert_frame_equal(df_iter, df_expected)
-
-
-def test_namespace_prefix_xpath_compare(parser):
-    with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path:
-        with open(path, "w") as f:
-            f.write(xml_prefix_nmsp)
-
-        df_xpath = read_xml(
-            path,
-            xpath=".//ns:row",
-            namespaces={"ns": "http://example.com"},
-            parser=parser,
-        )
-        df_iter = read_xml(
-            path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]}
-        )
-
-        tm.assert_frame_equal(df_xpath, df_iter)
-
-
-def test_default_namespace_xpath_compare(datapath):
-    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
-
-    df_xpath = read_xml(
-        kml, xpath=".//k:Placemark", namespaces={"k": "http://www.opengis.net/kml/2.2"}
-    )
-
-    df_iter = read_xml(
-        kml,
-        iterparse={
-            "Placemark": [
-                "id",
-                "name",
-                "Snippet",
-                "description",
-                "styleUrl",
-                "MultiGeometry",
-            ]
-        },
-    )
-
-    tm.assert_frame_equal(df_xpath, df_iter)
-
-
-# ELEMS_ONLY
-
-
-def test_elems_only(datapath, parser):
-    filename = datapath("io", "data", "xml", "books.xml")
-
-    df_iter = read_xml(
-        filename,
-        parser=parser,
-        iterparse={"book": ["title", "author", "year", "price"]},
-    )
-
-    df_expected = DataFrame(
-        {
-            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
-            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
-            "year": [2005, 2005, 2003],
-            "price": [30.00, 29.99, 39.95],
-        }
-    )
-
-    tm.assert_frame_equal(df_iter, df_expected)
-
-
-def test_elems_only_xpath_compare(datapath, parser):
-    filename = datapath("io", "data", "xml", "books.xml")
-    df_xpath = read_xml(filename, elems_only=True, parser=parser)
-    df_iter = read_xml(
-        filename,
-        parser=parser,
-        iterparse={"book": ["title", "author", "year", "price"]},
-    )
-
-    tm.assert_frame_equal(df_xpath, df_iter)
-
-
-# ATTRS_ONLY
-
-
-def test_attrs_only(datapath, parser):
-    filename = datapath("io", "data", "xml", "books.xml")
-    df_iter = read_xml(filename, parser=parser, iterparse={"book": ["category"]})
-    df_expected = DataFrame({"category": ["cooking", "children", "web"]})
-
-    tm.assert_frame_equal(df_iter, df_expected)
-
-
-def test_attrs_only_xpath_compare(datapath, parser):
-    filename = datapath("io", "data", "xml", "books.xml")
-    df_xpath = read_xml(filename, attrs_only=True, parser=parser)
-    df_iter = read_xml(filename, parser=parser, iterparse={"book": ["category"]})
-
-    tm.assert_frame_equal(df_xpath, df_iter)
-
-
-# NAMES
-
-
-def test_names(datapath, parser):
-    filename = datapath("io", "data", "xml", "books.xml")
-
-    df_iter = read_xml(
-        filename,
-        parser=parser,
-        names=["b_category", "b_title", "b_author", "b_year", "b_price"],
-        iterparse={"book": ["category", "title", "author", "year", "price"]},
-    )
-
-    df_expected = DataFrame(
-        {
-            "b_category": ["cooking", "children", "web"],
-            "b_title": ["Everyday Italian", "Harry Potter", "Learning XML"],
-            "b_author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
-            "b_year": [2005, 2005, 2003],
-            "b_price": [30.00, 29.99, 39.95],
-        }
-    )
-
-    tm.assert_frame_equal(df_iter, df_expected)
-
-
-def test_names_xpath_compare(datapath, parser):
-    filename = datapath("io", "data", "xml", "books.xml")
-    df_xpath = read_xml(
-        filename,
-        parser=parser,
-        names=["b_category", "b_title", "b_author", "b_year", "b_price"],
-    )
-    df_iter = read_xml(
-        filename,
-        parser=parser,
-        names=["b_category", "b_title", "b_author", "b_year", "b_price"],
-        iterparse={"book": ["category", "title", "author", "year", "price"]},
-    )
-
-    tm.assert_frame_equal(df_xpath, df_iter)
-
-
-# DTYPE
-
-
-def test_dtypes(datapath, parser):
-    filename = datapath("io", "data", "xml", "books.xml")
-
-    df_iter = read_xml(
-        filename,
-        parser=parser,
-        dtype={"year": "Int64", "price": "Float64"},
-        iterparse={"book": ["category", "title", "year", "author", "price"]},
-    )
-
-    df_expected = DataFrame(
-        {
-            "category": ["cooking", "children", "web"],
-            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
-            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
-            "year": Series([2005, 2005, 2003]).astype("Int64"),
-            "price": Series([30.00, 29.99, 39.95]).astype("Float64"),
-        }
-    )
-
-    tm.assert_frame_equal(df_iter, df_expected)
-
-
-def test_dtypes_xpath_compare(datapath, parser):
-    filename = datapath("io", "data", "xml", "books.xml")
-
-    df_xpath = read_xml(
-        filename, parser=parser, dtype={"year": "Int64", "price": "Float64"}
-    )
-
-    df_iter = read_xml(
-        filename,
-        parser=parser,
-        dtype={"year": "Int64", "price": "Float64"},
-        iterparse={"book": ["category", "title", "year", "author", "price"]},
-    )
-
-    tm.assert_frame_equal(df_xpath, df_iter)
-
-
-# CONVERTERS
-
-
-def test_converters(parser):
-    convert_to_datetime = lambda x: to_datetime(x)
-    with tm.ensure_clean(filename="xml_string.xml") as path:
-        with open(path, "w") as f:
-            f.write(xml_str)
-
-        df_iter = read_xml(
-            path,
-            converters={"date": convert_to_datetime},
-            parser=parser,
-            iterparse={"row": ["shape", "degrees", "sides", "date"]},
-        )
-
-    df_expected = DataFrame(
-        {
-            "shape": ["square", "circle", "triangle"],
-            "degrees": [360, 360, 180],
-            "sides": [4.0, float("nan"), 3.0],
-            "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]),
-        }
-    )
-
-    tm.assert_frame_equal(df_iter, df_expected)
-
-
-def test_converters_xpath_compare(parser):
-    convert_to_datetime = lambda x: to_datetime(x)
-    with tm.ensure_clean(filename="xml_string.xml") as path:
-        with open(path, "w") as f:
-            f.write(xml_str)
-
-        df_xpath = read_xml(
-            path, converters={"date": convert_to_datetime}, parser=parser
-        )
-
-        df_iter = read_xml(
-            path,
-            converters={"date": convert_to_datetime},
-            parser=parser,
-            iterparse={"row": ["shape", "degrees", "sides", "date"]},
-        )
-
-    tm.assert_frame_equal(df_xpath, df_iter)
-
-
-# PARSE_DATES
-
-
-def test_date_parse(parser):
-    with tm.ensure_clean(filename="xml_string.xml") as path:
-        with open(path, "w") as f:
-            f.write(xml_str)
-
-        df_iter = read_xml(
-            path,
-            parse_dates=["date"],
-            parser=parser,
-            iterparse={"row": ["shape", "degrees", "sides", "date"]},
-        )
-
-    df_expected = DataFrame(
-        {
-            "shape": ["square", "circle", "triangle"],
-            "degrees": [360, 360, 180],
-            "sides": [4.0, float("nan"), 3.0],
-            "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]),
-        }
-    )
-
-    tm.assert_frame_equal(df_iter, df_expected)
-
-
-def test_date_parse_xpath_compare(parser):
-    with tm.ensure_clean(filename="xml_string.xml") as path:
-        with open(path, "w") as f:
-            f.write(xml_str)
-
-        df_xpath = read_xml(path, parse_dates=["date"], parser=parser)
-
-        df_iter = read_xml(
-            path,
-            parse_dates=["date"],
-            parser=parser,
-            iterparse={"row": ["shape", "degrees", "sides", "date"]},
-        )
-
-    tm.assert_frame_equal(df_xpath, df_iter)
-
-
-# ENCODING
-
-
-def test_encoding(datapath, parser):
-    filename = datapath("io", "data", "xml", "baby_names.xml")
-
-    df_iter = read_xml(
-        filename,
-        parser=parser,
-        encoding="ISO-8859-1",
-        iterparse={"row": ["rank", "malename", "femalename"]},
-    )
-
-    df_expected = DataFrame(
-        {
-            "rank": [1, 2, 3, 4, 5],
-            "malename": ["José", "Luis", "Carlos", "Juan", "Jorge"],
-            "femalename": ["Sofía", "Valentina", "Isabella", "Camila", "Valeria"],
-        }
-    )
-
-    tm.assert_frame_equal(df_iter.head(), df_expected)
-
-
-def test_encoding_xpath_compare(datapath, parser):
-    filename = datapath("io", "data", "xml", "baby_names.xml")
-    df_xpath = read_xml(filename, parser=parser, encoding="ISO-8859-1")
-
-    df_iter = read_xml(
-        filename,
-        parser=parser,
-        encoding="ISO-8859-1",
-        iterparse={"row": ["rank", "malename", "femalename"]},
-    )
-
-    tm.assert_frame_equal(df_xpath, df_iter)
-
-
-# STYLESHEET
-
-
-@td.skip_if_no("lxml")
-def test_stylesheet_xpath_compare(datapath):
-    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
-    xsl = datapath("io", "data", "xml", "flatten_doc.xsl")
-
-    df_style = read_xml(
-        kml,
-        xpath=".//k:Placemark",
-        namespaces={"k": "http://www.opengis.net/kml/2.2"},
-        stylesheet=xsl,
-    )
-
-    df_iter = read_xml(
-        kml,
-        iterparse={
-            "Placemark": [
-                "id",
-                "name",
-                "styleUrl",
-                "extrude",
-                "altitudeMode",
-                "coordinates",
-            ]
-        },
-    )
-
-    tm.assert_frame_equal(df_style, df_iter)
-
-
-# COMPRESSION
-
-
-def test_compression_compare(parser, compression_only):
-    with tm.ensure_clean() as comp_path, tm.ensure_clean() as ext_path:
-        geom_df.to_xml(comp_path, parser=parser, compression=compression_only)
-
-        with get_handle(comp_path, "r", compression=compression_only) as handles:
-            with open(ext_path, "w") as f:
-                f.write(handles.handle.read())
-
-            df_iter = read_xml(
-                ext_path,
-                parser=parser,
-                iterparse={"row": ["shape", "degrees", "sides"]},
-                compression=compression_only,
-            )
-
-    tm.assert_frame_equal(geom_df, df_iter)
-
-
-# STORAGE OPTIONS
-
-
-@tm.network
-@pytest.mark.slow
-def test_s3_xpath_compare(parser):
-    # Python Software Foundation (2019 IRS-990 RETURN)
-    s3_path = "s3://irs-form-990/201923199349319487_public.xml"
-
-    df_xpath = read_xml(
-        s3_path,
-        xpath=".//irs:Form990PartVIISectionAGrp",
-        namespaces={"irs": "http://www.irs.gov/efile"},
-        parser=parser,
-        storage_options={"anon": True},
-    )
-
-    with tm.ensure_clean(filename="irs990.xml") as path:
-        with get_handle(s3_path, "rb", is_text=False) as handles:
-            with open(path, "wb") as f:
-                f.write(handles.handle.read())
-
-        df_iter = read_xml(
-            path,
-            parser=parser,
-            iterparse={
-                "Form990PartVIISectionAGrp": [
-                    "PersonNm",
-                    "TitleTxt",
-                    "AverageHoursPerWeekRt",
-                    "AverageHoursPerWeekRltdOrgRt",
-                    "IndividualTrusteeOrDirectorInd",
-                    "OfficerInd",
-                    "ReportableCompFromOrgAmt",
-                    "ReportableCompFromRltdOrgAmt",
-                    "OtherCompensationAmt",
-                    "HighestCompensatedEmployeeInd",
-                ]
-            },
-        )
-
-    tm.assert_frame_equal(df_xpath, df_iter)
-
-
-# PARSER ERROR
-
-
-def test_string_error(parser):
-    with pytest.raises(
-        ParserError, match=("iterparse is designed for large XML files")
-    ):
-        read_xml(
-            xml_str,
-            parser=parser,
-            iterparse={"row": ["shape", "degrees", "sides", "date"]},
-        )
-
-
-def test_file_like_error(datapath, parser, mode):
-    filename = datapath("io", "data", "xml", "books.xml")
-    with pytest.raises(
-        ParserError, match=("iterparse is designed for large XML files")
-    ):
-        with open(filename) as f:
-            read_xml(
-                f,
-                parser=parser,
-                iterparse={"book": ["category", "title", "year", "author", "price"]},
-            )
-
-
-@tm.network
-def test_url_path_error(parser):
-    url = "https://www.w3schools.com/xml/books.xml"
-    with pytest.raises(
-        ParserError, match=("iterparse is designed for large XML files")
-    ):
-        read_xml(
-            url,
-            parser=parser,
-            iterparse={"row": ["shape", "degrees", "sides", "date"]},
-        )
-
-
-def test_compression_error(parser, compression_only):
-    with tm.ensure_clean(filename="geom_xml.zip") as path:
-        geom_df.to_xml(path, parser=parser, compression=compression_only)
-
-        with pytest.raises(
-            ParserError, match=("iterparse is designed for large XML files")
-        ):
-            read_xml(
-                path,
-                parser=parser,
-                iterparse={"row": ["shape", "degrees", "sides", "date"]},
-                compression=compression_only,
-            )
-
-
-@tm.network
-@td.skip_if_no("s3fs")
-def test_storage_options_error(parser):
-    # Python Software Foundation (2019 IRS-990 RETURN)
-    s3 = "s3://irs-form-990/201923199349319487_public.xml"
-    with pytest.raises(
-        ParserError, match=("iterparse is designed for large XML files")
-    ):
-        read_xml(
-            s3,
-            parser=parser,
-            iterparse={
-                "Form990PartVIISectionAGrp": [
-                    "PersonNm",
-                    "TitleTxt",
-                    "AverageHoursPerWeekRt",
-                    "AverageHoursPerWeekRltdOrgRt",
-                    "IndividualTrusteeOrDirectorInd",
-                    "OfficerInd",
-                    "ReportableCompFromOrgAmt",
-                    "ReportableCompFromRltdOrgAmt",
-                    "OtherCompensationAmt",
-                ]
-            },
-            storage_options={"anon": True},
-        )
-
-
-# OTHER EXCEPTIONS
-
-
-def test_wrong_dict_type(datapath, parser):
-    filename = datapath("io", "data", "xml", "books.xml")
-    with pytest.raises(TypeError, match="list is not a valid type for iterparse"):
-        read_xml(
-            filename,
-            parser=parser,
-            iterparse=["category", "title", "year", "author", "price"],
-        )
-
-
-def test_wrong_dict_value(datapath, parser):
-    filename = datapath("io", "data", "xml", "books.xml")
-    with pytest.raises(
-        TypeError, match="<class 'str'> is not a valid type for value in iterparse"
-    ):
-        read_xml(filename, parser=parser, iterparse={"book": "category"})
-
-
-def test_bad_xml(datapath, parser):
-    with tm.ensure_clean(filename="bad.xml") as path:
-        with open(path, "w") as f:
-            f.write(bad_xml)
-
-        with pytest.raises(
-            SyntaxError, match="Extra content at the end of the document"
-        ):
-            read_xml(
-                path,
-                parse_dates=["date"],
-                iterparse={"row": ["shape", "degrees", "sides", "date"]},
-            )
-
-
-def test_no_result(datapath, parser):
-    filename = datapath("io", "data", "xml", "books.xml")
-    with pytest.raises(
-        ParserError, match="No result from selected items in iterparse."
-    ):
-        read_xml(
-            filename,
-            parser=parser,
-            iterparse={"node": ["attr1", "elem1", "elem2", "elem3"]},
-        )

From 5514025c29144b648675d3d67d4a7eb1f7f87984 Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Sun, 27 Feb 2022 22:54:21 -0600
Subject: [PATCH 3/6] Adjust pytest decorator on URL test; fix doc strings

---
 pandas/io/xml.py                | 8 +++++---
 pandas/tests/io/xml/test_xml.py | 3 ++-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 2850a98f08430..76780fc6b2241 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -102,8 +102,9 @@ class _XMLFrameParser:
         `etree` does not support XSLT but retained for consistency.
 
     iterparse : dict, optional
-        Dict of row element and descendant elements and/or attributes to
-        retrieve in iterparsing of XML document.
+        Dict with row element as key and list of descendant elements
+        and/or attributes as value to be retrieved in iterparsing of
+        XML document.
 
         .. versionadded:: 1.5.0
 
@@ -123,6 +124,7 @@ class _XMLFrameParser:
     To subclass this class effectively you must override the following methods:`
         * :func:`parse_data`
         * :func:`_parse_nodes`
+        * :func:`_iterparse_nodes`
         * :func:`_parse_doc`
         * :func:`_validate_names`
         * :func:`_validate_path`
@@ -217,7 +219,7 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
         -----
         Namespace URIs will be removed from return node values. Also,
         elements with missing children or attributes in submitted list
-        will have optional keys filled withi None values.
+        will have optional keys filled with None values.
         """
 
         raise AbstractMethodError(self)
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 79040debd9fc8..1d34b0ecf2731 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -1165,7 +1165,8 @@ def test_file_like_error(datapath, parser, mode):
             )
 
 
-@tm.network
+@pytest.mark.network
+@tm.network(url="https://www.w3schools.com/xml/books.xml", check_before_test=True)
 def test_url_path_error(parser):
     url = "https://www.w3schools.com/xml/books.xml"
     with pytest.raises(

From 2c4d81f55d4f9d36cd79ffbf4e8ca255835cb707 Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Mon, 28 Feb 2022 22:52:15 -0600
Subject: [PATCH 4/6] Adjust tests for helper function

---
 pandas/tests/io/xml/test_xml.py        |  57 ++++----
 pandas/tests/io/xml/test_xml_dtypes.py | 184 ++++++++++---------------
 2 files changed, 105 insertions(+), 136 deletions(-)

diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 1d34b0ecf2731..77f90d88614b1 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -245,6 +245,25 @@ def parser(request):
     return request.param
 
 
+def read_xml_iterparse(data, **kwargs):
+    """Write ``data`` to a temporary file and parse that file with ``read_xml``."""
+    with tm.ensure_clean() as path:
+        # Write UTF-8 explicitly rather than the platform default encoding.
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(data)
+        return read_xml(path, **kwargs)
+
+
+def read_xml_iterparse_comp(comp_path, compression_only, **kwargs):
+    """Decompress ``comp_path`` to a temporary file and parse it with ``read_xml``."""
+    with get_handle(comp_path, "r", compression=compression_only) as handles:
+        with tm.ensure_clean() as path:
+            # Write UTF-8 explicitly rather than the platform default encoding.
+            with open(path, "w", encoding="utf-8") as f:
+                f.write(handles.handle.read())
+            return read_xml(path, **kwargs)
+
+
 # FILE / URL
 
 
@@ -525,13 +540,11 @@ def test_default_namespace(parser):
         parser=parser,
     )
 
-    with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path:
-        with open(path, "w") as f:
-            f.write(xml_default_nmsp)
-
-        df_iter = read_xml(
-            path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]}
-        )
+    df_iter = read_xml_iterparse(
+        xml_default_nmsp,
+        parser=parser,
+        iterparse={"row": ["shape", "degrees", "sides"]},
+    )
 
     df_expected = DataFrame(
         {
@@ -552,14 +565,9 @@ def test_prefix_namespace(parser):
         namespaces={"doc": "http://example.com"},
         parser=parser,
     )
-
-    with tm.ensure_clean(filename="xml_prefix_nmsp.xml") as path:
-        with open(path, "w") as f:
-            f.write(xml_prefix_nmsp)
-
-        df_iter = read_xml(
-            path, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]}
-        )
+    df_iter = read_xml_iterparse(
+        xml_prefix_nmsp, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]}
+    )
 
     df_expected = DataFrame(
         {
@@ -1307,23 +1315,20 @@ def test_online_stylesheet():
 
 
 def test_compression_read(parser, compression_only):
-    with tm.ensure_clean() as comp_path, tm.ensure_clean() as ext_path:
+    with tm.ensure_clean() as comp_path:
         geom_df.to_xml(
             comp_path, index=False, parser=parser, compression=compression_only
         )
 
         df_xpath = read_xml(comp_path, parser=parser, compression=compression_only)
 
-        with get_handle(comp_path, "r", compression=compression_only) as handles:
-            with open(ext_path, "w") as f:
-                f.write(handles.handle.read())
-
-            df_iter = read_xml(
-                ext_path,
-                parser=parser,
-                iterparse={"row": ["shape", "degrees", "sides"]},
-                compression=compression_only,
-            )
+        df_iter = read_xml_iterparse_comp(
+            comp_path,
+            compression_only,
+            parser=parser,
+            iterparse={"row": ["shape", "degrees", "sides"]},
+            compression=compression_only,
+        )
 
     tm.assert_frame_equal(df_xpath, geom_df)
     tm.assert_frame_equal(df_iter, geom_df)
diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py
index afe1d5de720d8..bbf5545052584 100644
--- a/pandas/tests/io/xml/test_xml_dtypes.py
+++ b/pandas/tests/io/xml/test_xml_dtypes.py
@@ -20,6 +20,15 @@ def parser(request):
     return request.param
 
 
+def read_xml_iterparse(data, **kwargs):
+    """Write ``data`` to a temporary file and parse that file with ``read_xml``."""
+    with tm.ensure_clean() as path:
+        # Write UTF-8 explicitly rather than the platform default encoding.
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(data)
+        return read_xml(path, **kwargs)
+
+
 xml_types = """\
 <?xml version='1.0' encoding='utf-8'?>
 <data>
@@ -68,16 +75,12 @@ def parser(request):
 
 def test_dtype_single_str(parser):
     df_result = read_xml(xml_types, dtype={"degrees": "str"}, parser=parser)
-    with tm.ensure_clean() as path:
-        with open(path, "w") as f:
-            f.write(xml_types)
-
-        df_iter = read_xml(
-            path,
-            parser=parser,
-            dtype={"degrees": "str"},
-            iterparse={"row": ["shape", "degrees", "sides"]},
-        )
+    df_iter = read_xml_iterparse(
+        xml_types,
+        parser=parser,
+        dtype={"degrees": "str"},
+        iterparse={"row": ["shape", "degrees", "sides"]},
+    )
 
     df_expected = DataFrame(
         {
@@ -93,16 +96,12 @@ def test_dtype_single_str(parser):
 
 def test_dtypes_all_str(parser):
     df_result = read_xml(xml_dates, dtype="string", parser=parser)
-    with tm.ensure_clean() as path:
-        with open(path, "w") as f:
-            f.write(xml_dates)
-
-        df_iter = read_xml(
-            path,
-            parser=parser,
-            dtype="string",
-            iterparse={"row": ["shape", "degrees", "sides", "date"]},
-        )
+    df_iter = read_xml_iterparse(
+        xml_dates,
+        parser=parser,
+        dtype="string",
+        iterparse={"row": ["shape", "degrees", "sides", "date"]},
+    )
 
     df_expected = DataFrame(
         {
@@ -125,17 +124,13 @@ def test_dtypes_with_names(parser):
         dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64"},
         parser=parser,
     )
-    with tm.ensure_clean() as path:
-        with open(path, "w") as f:
-            f.write(xml_dates)
-
-        df_iter = read_xml(
-            path,
-            parser=parser,
-            names=["Col1", "Col2", "Col3", "Col4"],
-            dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64"},
-            iterparse={"row": ["shape", "degrees", "sides", "date"]},
-        )
+    df_iter = read_xml_iterparse(
+        xml_dates,
+        parser=parser,
+        names=["Col1", "Col2", "Col3", "Col4"],
+        dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64"},
+        iterparse={"row": ["shape", "degrees", "sides", "date"]},
+    )
 
     df_expected = DataFrame(
         {
@@ -152,16 +147,12 @@ def test_dtypes_with_names(parser):
 
 def test_dtype_nullable_int(parser):
     df_result = read_xml(xml_types, dtype={"sides": "Int64"}, parser=parser)
-    with tm.ensure_clean() as path:
-        with open(path, "w") as f:
-            f.write(xml_types)
-
-        df_iter = read_xml(
-            path,
-            parser=parser,
-            dtype={"sides": "Int64"},
-            iterparse={"row": ["shape", "degrees", "sides"]},
-        )
+    df_iter = read_xml_iterparse(
+        xml_types,
+        parser=parser,
+        dtype={"sides": "Int64"},
+        iterparse={"row": ["shape", "degrees", "sides"]},
+    )
 
     df_expected = DataFrame(
         {
@@ -177,16 +168,12 @@ def test_dtype_nullable_int(parser):
 
 def test_dtype_float(parser):
     df_result = read_xml(xml_types, dtype={"degrees": "float"}, parser=parser)
-    with tm.ensure_clean() as path:
-        with open(path, "w") as f:
-            f.write(xml_types)
-
-        df_iter = read_xml(
-            path,
-            parser=parser,
-            dtype={"degrees": "float"},
-            iterparse={"row": ["shape", "degrees", "sides"]},
-        )
+    df_iter = read_xml_iterparse(
+        xml_types,
+        parser=parser,
+        dtype={"degrees": "float"},
+        iterparse={"row": ["shape", "degrees", "sides"]},
+    )
 
     df_expected = DataFrame(
         {
@@ -232,16 +219,12 @@ def test_both_dtype_converters(parser):
 
 def test_converters_str(parser):
     df_result = read_xml(xml_types, converters={"degrees": str}, parser=parser)
-    with tm.ensure_clean() as path:
-        with open(path, "w") as f:
-            f.write(xml_types)
-
-        df_iter = read_xml(
-            path,
-            parser=parser,
-            converters={"degrees": str},
-            iterparse={"row": ["shape", "degrees", "sides"]},
-        )
+    df_iter = read_xml_iterparse(
+        xml_types,
+        parser=parser,
+        converters={"degrees": str},
+        iterparse={"row": ["shape", "degrees", "sides"]},
+    )
 
     df_expected = DataFrame(
         {
@@ -260,16 +243,12 @@ def test_converters_date(parser):
     df_result = read_xml(
         xml_dates, converters={"date": convert_to_datetime}, parser=parser
     )
-    with tm.ensure_clean() as path:
-        with open(path, "w") as f:
-            f.write(xml_dates)
-
-        df_iter = read_xml(
-            path,
-            parser=parser,
-            converters={"date": convert_to_datetime},
-            iterparse={"row": ["shape", "degrees", "sides", "date"]},
-        )
+    df_iter = read_xml_iterparse(
+        xml_dates,
+        parser=parser,
+        converters={"date": convert_to_datetime},
+        iterparse={"row": ["shape", "degrees", "sides", "date"]},
+    )
 
     df_expected = DataFrame(
         {
@@ -304,16 +283,12 @@ def test_callable_str_converters(parser):
 
 def test_parse_dates_column_name(parser):
     df_result = read_xml(xml_dates, parse_dates=["date"], parser=parser)
-    with tm.ensure_clean() as path:
-        with open(path, "w") as f:
-            f.write(xml_dates)
-
-        df_iter = read_xml(
-            path,
-            parser=parser,
-            parse_dates=["date"],
-            iterparse={"row": ["shape", "degrees", "sides", "date"]},
-        )
+    df_iter = read_xml_iterparse(
+        xml_dates,
+        parser=parser,
+        parse_dates=["date"],
+        iterparse={"row": ["shape", "degrees", "sides", "date"]},
+    )
 
     df_expected = DataFrame(
         {
@@ -330,16 +305,12 @@ def test_parse_dates_column_name(parser):
 
 def test_parse_dates_column_index(parser):
     df_result = read_xml(xml_dates, parse_dates=[3], parser=parser)
-    with tm.ensure_clean() as path:
-        with open(path, "w") as f:
-            f.write(xml_dates)
-
-        df_iter = read_xml(
-            path,
-            parser=parser,
-            parse_dates=[3],
-            iterparse={"row": ["shape", "degrees", "sides", "date"]},
-        )
+    df_iter = read_xml_iterparse(
+        xml_dates,
+        parser=parser,
+        parse_dates=[3],
+        iterparse={"row": ["shape", "degrees", "sides", "date"]},
+    )
 
     df_expected = DataFrame(
         {
@@ -356,16 +327,13 @@ def test_parse_dates_column_index(parser):
 
 def test_parse_dates_true(parser):
     df_result = read_xml(xml_dates, parse_dates=True, parser=parser)
-    with tm.ensure_clean() as path:
-        with open(path, "w") as f:
-            f.write(xml_dates)
 
-        df_iter = read_xml(
-            path,
-            parser=parser,
-            parse_dates=True,
-            iterparse={"row": ["shape", "degrees", "sides", "date"]},
-        )
+    df_iter = read_xml_iterparse(
+        xml_dates,
+        parser=parser,
+        parse_dates=True,
+        iterparse={"row": ["shape", "degrees", "sides", "date"]},
+    )
 
     df_expected = DataFrame(
         {
@@ -412,16 +380,12 @@ def test_parse_dates_dictionary(parser):
     df_result = read_xml(
         xml, parse_dates={"date_end": ["year", "month", "day"]}, parser=parser
     )
-    with tm.ensure_clean() as path:
-        with open(path, "w") as f:
-            f.write(xml)
-
-        df_iter = read_xml(
-            path,
-            parser=parser,
-            parse_dates={"date_end": ["year", "month", "day"]},
-            iterparse={"row": ["shape", "degrees", "sides", "year", "month", "day"]},
-        )
+    df_iter = read_xml_iterparse(
+        xml,
+        parser=parser,
+        parse_dates={"date_end": ["year", "month", "day"]},
+        iterparse={"row": ["shape", "degrees", "sides", "year", "month", "day"]},
+    )
 
     df_expected = DataFrame(
         {

From 3d065b5ba5d8fbf1fed29b3d9755054e8ee13976 Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Tue, 8 Mar 2022 20:58:59 -0600
Subject: [PATCH 5/6] Add iterparse feature to some tests

---
 pandas/io/xml.py                       |  2 +-
 pandas/tests/io/xml/test_xml.py        | 29 +++++++++++++-
 pandas/tests/io/xml/test_xml_dtypes.py | 54 ++++++++++++++++++++------
 3 files changed, 71 insertions(+), 14 deletions(-)

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 76780fc6b2241..d5178f17e2bf7 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -1072,7 +1072,7 @@ def read_xml(
         The nodes or attributes to retrieve in iterparsing of XML document
         as a dict with key being the name of repeating element and value being
         list of elements or attribute names that are descendants of the repeated
-        element. Note: If this option is used, it will replace xpath parsing
+        element. Note: If this option is used, it will replace ``xpath`` parsing
         and unlike xpath, descendants do not need to relate to each other but can
         exist any where in document under the repeating element. This memory-
         efficient method should be used for very large XML files (500MB, 1GB, or 5GB+).
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 77f90d88614b1..bfb6bb19452bd 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -14,7 +14,10 @@
 
 from pandas.compat import is_ci_environment
 from pandas.compat._optional import import_optional_dependency
-from pandas.errors import ParserError
+from pandas.errors import (
+    EmptyDataError,
+    ParserError,
+)
 import pandas.util._test_decorators as td
 
 from pandas import DataFrame
@@ -663,6 +666,11 @@ def test_none_namespace_prefix(key):
 def test_file_elems_and_attrs(datapath, parser):
     filename = datapath("io", "data", "xml", "books.xml")
     df_file = read_xml(filename, parser=parser)
+    df_iter = read_xml(
+        filename,
+        parser=parser,
+        iterparse={"book": ["category", "title", "author", "year", "price"]},
+    )
     df_expected = DataFrame(
         {
             "category": ["cooking", "children", "web"],
@@ -674,6 +682,7 @@ def test_file_elems_and_attrs(datapath, parser):
     )
 
     tm.assert_frame_equal(df_file, df_expected)
+    tm.assert_frame_equal(df_iter, df_expected)
 
 
 def test_file_only_attrs(datapath, parser):
@@ -741,7 +750,13 @@ def test_attribute_centric_xml():
     df_lxml = read_xml(xml, xpath=".//station")
     df_etree = read_xml(xml, xpath=".//station", parser="etree")
 
+    df_iter_lx = read_xml_iterparse(xml, iterparse={"station": ["Name", "coords"]})
+    df_iter_et = read_xml_iterparse(
+        xml, parser="etree", iterparse={"station": ["Name", "coords"]}
+    )
+
     tm.assert_frame_equal(df_lxml, df_etree)
+    tm.assert_frame_equal(df_iter_lx, df_iter_et)
 
 
 # NAMES
@@ -834,7 +849,7 @@ def test_parser_consistency_with_encoding(datapath):
     )
     df_iter_etree = read_xml(
         filename,
-        parser="lxml",
+        parser="etree",
         encoding="ISO-8859-1",
         iterparse={"row": ["rank", "malename", "femalename"]},
     )
@@ -1273,6 +1288,16 @@ def test_no_result(datapath, parser):
         )
 
 
+def test_empty_data(datapath, parser):
+    filename = datapath("io", "data", "xml", "books.xml")
+    with pytest.raises(EmptyDataError, match="No columns to parse from file"):
+        read_xml(
+            filename,
+            parser=parser,
+            iterparse={"book": ["attr1", "elem1", "elem2", "elem3"]},
+        )
+
+
 @pytest.mark.network
 @td.skip_if_no("lxml")
 @tm.network(
diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py
index bbf5545052584..6aa4ddfac7628 100644
--- a/pandas/tests/io/xml/test_xml_dtypes.py
+++ b/pandas/tests/io/xml/test_xml_dtypes.py
@@ -20,6 +20,13 @@ def parser(request):
     return request.param
 
 
+@pytest.fixture(
+    params=[None, {"book": ["category", "title", "author", "year", "price"]}]
+)
+def iterparse(request):
+    return request.param
+
+
 def read_xml_iterparse(data, **kwargs):
     with tm.ensure_clean() as path:
         with open(path, "w") as f:
@@ -187,11 +194,12 @@ def test_dtype_float(parser):
     tm.assert_frame_equal(df_iter, df_expected)
 
 
-def test_wrong_dtype(parser):
+def test_wrong_dtype(datapath, parser, iterparse):
+    filename = datapath("io", "data", "xml", "books.xml")
     with pytest.raises(
-        ValueError, match=('Unable to parse string "square" at position 0')
+        ValueError, match=('Unable to parse string "Everyday Italian" at position 0')
     ):
-        read_xml(xml_types, dtype={"shape": "Int64"}, parser=parser)
+        read_xml(filename, dtype={"title": "Int64"}, parser=parser, iterparse=iterparse)
 
 
 def test_both_dtype_converters(parser):
@@ -210,8 +218,16 @@ def test_both_dtype_converters(parser):
             converters={"degrees": str},
             parser=parser,
         )
+        df_iter = read_xml_iterparse(
+            xml_types,
+            dtype={"degrees": "str"},
+            converters={"degrees": str},
+            parser=parser,
+            iterparse={"row": ["shape", "degrees", "sides"]},
+        )
 
         tm.assert_frame_equal(df_result, df_expected)
+        tm.assert_frame_equal(df_iter, df_expected)
 
 
 # CONVERTERS
@@ -263,19 +279,26 @@ def test_converters_date(parser):
     tm.assert_frame_equal(df_iter, df_expected)
 
 
-def test_wrong_converters_type(parser):
+def test_wrong_converters_type(datapath, parser, iterparse):
+    filename = datapath("io", "data", "xml", "books.xml")
     with pytest.raises(TypeError, match=("Type converters must be a dict or subclass")):
-        read_xml(xml_types, converters={"degrees", str}, parser=parser)
+        read_xml(filename, converters={"year", str}, parser=parser, iterparse=iterparse)
 
 
-def test_callable_func_converters(parser):
+def test_callable_func_converters(datapath, parser, iterparse):
+    filename = datapath("io", "data", "xml", "books.xml")
     with pytest.raises(TypeError, match=("'float' object is not callable")):
-        read_xml(xml_types, converters={"degrees": float()}, parser=parser)
+        read_xml(
+            filename, converters={"year": float()}, parser=parser, iterparse=iterparse
+        )
 
 
-def test_callable_str_converters(parser):
+def test_callable_str_converters(datapath, parser, iterparse):
+    filename = datapath("io", "data", "xml", "books.xml")
     with pytest.raises(TypeError, match=("'str' object is not callable")):
-        read_xml(xml_types, converters={"degrees": "float"}, parser=parser)
+        read_xml(
+            filename, converters={"year": "float"}, parser=parser, iterparse=iterparse
+        )
 
 
 # PARSE DATES
@@ -437,11 +460,20 @@ def test_day_first_parse_dates(parser):
         UserWarning, match="Parsing '31/12/2020' in DD/MM/YYYY format"
     ):
         df_result = read_xml(xml, parse_dates=["date"], parser=parser)
+        df_iter = read_xml_iterparse(
+            xml,
+            parse_dates=["date"],
+            parser=parser,
+            iterparse={"row": ["shape", "degrees", "sides", "date"]},
+        )
+
         tm.assert_frame_equal(df_result, df_expected)
+        tm.assert_frame_equal(df_iter, df_expected)
 
 
-def test_wrong_parse_dates_type(parser):
+def test_wrong_parse_dates_type(datapath, parser, iterparse):
+    filename = datapath("io", "data", "xml", "books.xml")
     with pytest.raises(
         TypeError, match=("Only booleans, lists, and dictionaries are accepted")
     ):
-        read_xml(xml_dates, parse_dates={"date"}, parser=parser)
+        read_xml(filename, parse_dates={"date"}, parser=parser, iterparse=iterparse)

From e37c20a2820966ea0e6dee0d1566eefd125862d8 Mon Sep 17 00:00:00 2001
From: Parfait Gasana <parfait.gasana@gmail.com>
Date: Fri, 18 Mar 2022 12:03:24 -0500
Subject: [PATCH 6/6] Add IO docs link in docstring

---
 pandas/io/xml.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 0a2a144af3309..181b0fe115f4c 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -1130,6 +1130,10 @@ def read_xml(
     exceptions due to issues with XML document, ``xpath``, or other
     parameters.
 
+    See the :ref:`read_xml documentation in the IO section of the docs
+    <io.read_xml>` for more information on using this method to parse XML
+    files to DataFrames.
+
     Examples
     --------
     >>> xml = '''<?xml version='1.0' encoding='utf-8'?>