DOC: added docstring for storage_options in read_html #54815

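The mechanics of the change: read_html is now decorated with @doc(storage_options=_shared_docs["storage_options"]), which formats the function's docstring and fills the {storage_options} placeholder with the shared parameter description used by the other pandas IO readers. Because the docstring becomes a format template, every literal brace in it has to be doubled ({'id': 'table'} becomes {{'id': 'table'}}), and the doc parameter of the _parse_tables methods is renamed to document, presumably so it no longer shadows the newly imported decorator. The following is a minimal sketch of the substitution idea only; the helper name doc_sketch and the shared text are illustrative, not pandas' actual implementation.

    # Simplified illustration of docstring substitution; the real
    # pandas.util._decorators.doc decorator is more general than this.
    from textwrap import dedent

    _shared_docs = {
        # Illustrative wording only, not the actual shared docstring in pandas.
        "storage_options": dedent(
            """\
            storage_options : dict, optional
                Extra options forwarded to the storage backend, for example
                HTTP headers or credentials for fsspec-backed URLs."""
        ),
    }

    def doc_sketch(**substitutions):
        # Fill placeholders like {storage_options} in the wrapped function's
        # docstring using str.format.
        def decorator(func):
            if func.__doc__:
                func.__doc__ = func.__doc__.format(**substitutions)
            return func

        return decorator

    @doc_sketch(storage_options=_shared_docs["storage_options"])
    def read_html_stub(io, *, storage_options=None):
        """Read HTML tables (stub for illustration).

        Parameters
        ----------
        {storage_options}

        Literal braces must be doubled, e.g. attrs = {{'id': 'table'}},
        because str.format would otherwise treat them as placeholders.
        """

    print(read_html_stub.__doc__)

Printing the stub's docstring shows the placeholder replaced and the doubled braces rendered as single braces, which is what the diff below prepares the read_html docstring for.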
Merged 4 commits on Aug 29, 2023
27 changes: 17 additions & 10 deletions pandas/io/html.py
@@ -23,6 +23,7 @@
     AbstractMethodError,
     EmptyDataError,
 )
+from pandas.util._decorators import doc
 from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import check_dtype_backend
 
@@ -32,6 +33,7 @@
 from pandas.core.indexes.base import Index
 from pandas.core.indexes.multi import MultiIndex
 from pandas.core.series import Series
+from pandas.core.shared_docs import _shared_docs
 
 from pandas.io.common import (
     file_exists,
@@ -363,13 +365,13 @@ def _parse_tfoot_tr(self, table):
         """
         raise AbstractMethodError(self)
 
-    def _parse_tables(self, doc, match, attrs):
+    def _parse_tables(self, document, match, attrs):
         """
         Return all tables from the parsed DOM.
 
         Parameters
         ----------
-        doc : the DOM from which to parse the table element.
+        document : the DOM from which to parse the table element.
 
         match : str or regular expression
             The text to search for in the DOM tree.
@@ -594,9 +596,9 @@ def __init__(self, *args, **kwargs) -> None:
 
         self._strainer = SoupStrainer("table")
 
-    def _parse_tables(self, doc, match, attrs):
+    def _parse_tables(self, document, match, attrs):
         element_name = self._strainer.name
-        tables = doc.find_all(element_name, attrs=attrs)
+        tables = document.find_all(element_name, attrs=attrs)
         if not tables:
             raise ValueError("No tables found")
 
@@ -726,7 +728,7 @@ def _parse_td(self, row):
         # <thead> or <tfoot> (see _parse_thead_tr).
         return row.xpath("./td|./th")
 
-    def _parse_tables(self, doc, match, kwargs):
+    def _parse_tables(self, document, match, kwargs):
         pattern = match.pattern
 
         # 1. check all descendants for the given pattern and only search tables
@@ -738,7 +740,7 @@ def _parse_tables(self, doc, match, kwargs):
         if kwargs:
             xpath_expr += _build_xpath_expr(kwargs)
 
-        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
+        tables = document.xpath(xpath_expr, namespaces=_re_namespace)
 
         tables = self._handle_hidden_tables(tables, "attrib")
         if self.displayed_only:
@@ -1026,6 +1028,7 @@ def _parse(
     return ret
 
 
+@doc(storage_options=_shared_docs["storage_options"])
 def read_html(
     io: FilePath | ReadBuffer[str],
     *,
@@ -1096,13 +1099,13 @@ def read_html(
         passed to lxml or Beautiful Soup. However, these attributes must be
         valid HTML table attributes to work correctly. For example, ::
 
-            attrs = {'id': 'table'}
+            attrs = {{'id': 'table'}}
 
         is a valid attribute dictionary because the 'id' HTML tag attribute is
         a valid HTML attribute for *any* HTML tag as per `this document
         <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::
 
-            attrs = {'asdf': 'table'}
+            attrs = {{'asdf': 'table'}}
 
         is *not* a valid attribute dictionary because 'asdf' is not a valid
         HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
@@ -1144,13 +1147,13 @@ def read_html(
     displayed_only : bool, default True
         Whether elements with "display: none" should be parsed.
 
-    extract_links : {None, "all", "header", "body", "footer"}
+    extract_links : {{None, "all", "header", "body", "footer"}}
         Table elements in the specified section(s) with <a> tags will have their
         href extracted.
 
         .. versionadded:: 1.5.0
 
-    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
         Back-end data type applied to the resultant :class:`DataFrame`
         (still experimental). Behaviour is as follows:
 
@@ -1161,6 +1164,10 @@
 
         .. versionadded:: 2.0
 
+    {storage_options}
+
+        .. versionadded:: 2.1.0
+
     Returns
     -------
     dfs
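With the placeholder rendered, the read_html docstring now documents storage_options the same way as the other IO readers, starting with pandas 2.1.0. A short usage sketch; the URL is a placeholder, and for plain http(s) paths pandas forwards the storage_options mapping as request headers (fsspec-backed paths such as s3:// hand it to the filesystem instead).

    import pandas as pd

    # Placeholder URL; requires pandas >= 2.1.0 for storage_options in read_html.
    tables = pd.read_html(
        "https://example.com/reports/tables.html",
        storage_options={"User-Agent": "pandas-read-html-example/0.1"},
    )
    print(f"parsed {len(tables)} table(s)")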