Skip to content

Commit

Permalink
API: Make most arguments for read_html and read_json keyword-only (#27573
Browse files Browse the repository at this point in the history
)

* Deprecate use of most positional arguments for read_html and read_json

* Import pandas._testing instead of pandas.util.testing

* Import pandas._testing instead of pandas.util.testing

* Update pandas/util/_decorators.py

Co-Authored-By: Joris Van den Bossche <jorisvandenbossche@gmail.com>

* Change displayed warning message

* Update pandas/io/html.py

Co-Authored-By: Joris Van den Bossche <jorisvandenbossche@gmail.com>

* Update pandas/io/json/_json.py

Co-Authored-By: Joris Van den Bossche <jorisvandenbossche@gmail.com>

* Restore permissions to v1.0.0.rst

* Fix expected warning message in tests for deprecate_nonkeyword_arguments

* Reformat too long line

* Remove a test too similar to another one.

* Update Whatsnew

* Fix linting

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
alexitkes and jorisvandenbossche committed Apr 7, 2020
1 parent 25e5a74 commit 467e1c2
Show file tree
Hide file tree
Showing 7 changed files with 318 additions and 49 deletions.
13 changes: 13 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -250,14 +250,27 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns

Deprecations
~~~~~~~~~~~~

- Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated, will raise in a future version. Either convert the list to tuple, or pass the slice directly instead (:issue:`31333`)

- :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`)
- Setting values with ``.loc`` using a positional slice is deprecated and will raise in a future version. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`)
- :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` in future versions (:issue:`32515`)
- :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, use ``np.asarray(cat)`` instead (:issue:`32639`)
- The ``fastpath`` keyword in the ``SingleBlockManager`` constructor is deprecated and will be removed in a future version (:issue:`33092`)
- :meth:`Index.is_mixed` is deprecated and will be removed in a future version, check ``index.inferred_type`` directly instead (:issue:`32922`)

- Passing any arguments other than the first one to :func:`read_html` as
  positional arguments is deprecated since version 1.1. All other
  arguments should be given as keyword arguments (:issue:`27573`).

- Passing any arguments other than ``path_or_buf`` (the first one) to
  :func:`read_json` as positional arguments is deprecated since
  version 1.1. All other arguments should be given as keyword
  arguments (:issue:`27573`).

-

.. ---------------------------------------------------------------------------
Expand Down
2 changes: 2 additions & 0 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError, EmptyDataError
from pandas.util._decorators import deprecate_nonkeyword_arguments

from pandas.core.dtypes.common import is_list_like

Expand Down Expand Up @@ -921,6 +922,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
return ret


@deprecate_nonkeyword_arguments(version="2.0")
def read_html(
io,
match=".+",
Expand Down
5 changes: 4 additions & 1 deletion pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pandas._libs.tslibs import iNaT
from pandas._typing import JSONSerializable
from pandas.errors import AbstractMethodError
from pandas.util._decorators import deprecate_kwarg
from pandas.util._decorators import deprecate_kwarg, deprecate_nonkeyword_arguments

from pandas.core.dtypes.common import ensure_str, is_period_dtype

Expand Down Expand Up @@ -345,6 +345,9 @@ def _write(


@deprecate_kwarg(old_arg_name="numpy", new_arg_name=None)
@deprecate_nonkeyword_arguments(
version="2.0", allowed_args=["path_or_buf"], stacklevel=3
)
def read_json(
path_or_buf=None,
orient=None,
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/io/json/test_deprecated_kwargs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""
Tests for the deprecated keyword arguments for `read_json`.
"""

import pandas as pd
import pandas._testing as tm

from pandas.io.json import read_json


def test_deprecated_kwargs():
df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])
buf = df.to_json(orient="split")
with tm.assert_produces_warning(FutureWarning):
tm.assert_frame_equal(df, read_json(buf, "split"))
buf = df.to_json(orient="columns")
with tm.assert_produces_warning(FutureWarning):
tm.assert_frame_equal(df, read_json(buf, "columns"))
buf = df.to_json(orient="index")
with tm.assert_produces_warning(FutureWarning):
tm.assert_frame_equal(df, read_json(buf, "index"))


def test_good_kwargs():
    """Keyword-only usage of ``read_json`` must not emit any warning."""
    df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])
    # All orients go through one warning-free context, matching the
    # original test's single assert_produces_warning(None) block.
    with tm.assert_produces_warning(None):
        for orient in ("split", "columns", "index"):
            tm.assert_frame_equal(
                df, read_json(df.to_json(orient=orient), orient=orient)
            )
116 changes: 68 additions & 48 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def test_invalid_flavor():
msg = r"\{" + flavor + r"\} is not a valid set of flavors"

with pytest.raises(ValueError, match=msg):
read_html(url, "google", flavor=flavor)
read_html(url, match="google", flavor=flavor)


@td.skip_if_no("bs4")
Expand Down Expand Up @@ -121,13 +121,26 @@ def test_to_html_compat(self):
res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0]
tm.assert_frame_equal(res, df)

@tm.network
def test_banklist_url_positional_match(self):
    """Positional ``match`` to ``read_html`` must raise a FutureWarning.

    Fetches the live FDIC failed-bank list and parses it twice with the
    match pattern passed positionally (the deprecated call style); each
    call must warn, and both parses must yield the same tables.
    """
    url = "http://www.fdic.gov/bank/individual/failed/banklist.html"
    # Passing match argument as positional should cause a FutureWarning.
    with tm.assert_produces_warning(FutureWarning):
        df1 = self.read_html(
            url, "First Federal Bank of Florida", attrs={"id": "table"}
        )
    with tm.assert_produces_warning(FutureWarning):
        df2 = self.read_html(url, "Metcalf Bank", attrs={"id": "table"})

    # Both match strings select the same table, so results must agree.
    assert_framelist_equal(df1, df2)

@tm.network
def test_banklist_url(self):
url = "http://www.fdic.gov/bank/individual/failed/banklist.html"
df1 = self.read_html(
url, "First Federal Bank of Florida", attrs={"id": "table"}
url, match="First Federal Bank of Florida", attrs={"id": "table"}
)
df2 = self.read_html(url, "Metcalf Bank", attrs={"id": "table"})
df2 = self.read_html(url, match="Metcalf Bank", attrs={"id": "table"})

assert_framelist_equal(df1, df2)

Expand All @@ -137,21 +150,25 @@ def test_spam_url(self):
"https://raw.githubusercontent.com/pandas-dev/pandas/master/"
"pandas/tests/io/data/html/spam.html"
)
df1 = self.read_html(url, ".*Water.*")
df2 = self.read_html(url, "Unit")
df1 = self.read_html(url, match=".*Water.*")
df2 = self.read_html(url, match="Unit")

assert_framelist_equal(df1, df2)

@pytest.mark.slow
def test_banklist(self):
df1 = self.read_html(self.banklist_data, ".*Florida.*", attrs={"id": "table"})
df2 = self.read_html(self.banklist_data, "Metcalf Bank", attrs={"id": "table"})
df1 = self.read_html(
self.banklist_data, match=".*Florida.*", attrs={"id": "table"}
)
df2 = self.read_html(
self.banklist_data, match="Metcalf Bank", attrs={"id": "table"}
)

assert_framelist_equal(df1, df2)

def test_spam(self):
df1 = self.read_html(self.spam_data, ".*Water.*")
df2 = self.read_html(self.spam_data, "Unit")
df1 = self.read_html(self.spam_data, match=".*Water.*")
df2 = self.read_html(self.spam_data, match="Unit")
assert_framelist_equal(df1, df2)

assert df1[0].iloc[0, 0] == "Proximates"
Expand All @@ -168,81 +185,82 @@ def test_banklist_no_match(self):
assert isinstance(df, DataFrame)

def test_spam_header(self):
df = self.read_html(self.spam_data, ".*Water.*", header=2)[0]
df = self.read_html(self.spam_data, match=".*Water.*", header=2)[0]
assert df.columns[0] == "Proximates"
assert not df.empty

def test_skiprows_int(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=1)
df2 = self.read_html(self.spam_data, "Unit", skiprows=1)
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=1)
df2 = self.read_html(self.spam_data, match="Unit", skiprows=1)

assert_framelist_equal(df1, df2)

def test_skiprows_range(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=range(2))[0]
df2 = self.read_html(self.spam_data, "Unit", skiprows=range(2))[0]
tm.assert_frame_equal(df1, df2)
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=range(2))
df2 = self.read_html(self.spam_data, match="Unit", skiprows=range(2))

assert_framelist_equal(df1, df2)

def test_skiprows_list(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=[1, 2])
df2 = self.read_html(self.spam_data, "Unit", skiprows=[2, 1])
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=[1, 2])
df2 = self.read_html(self.spam_data, match="Unit", skiprows=[2, 1])

assert_framelist_equal(df1, df2)

def test_skiprows_set(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows={1, 2})
df2 = self.read_html(self.spam_data, "Unit", skiprows={2, 1})
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows={1, 2})
df2 = self.read_html(self.spam_data, match="Unit", skiprows={2, 1})

assert_framelist_equal(df1, df2)

def test_skiprows_slice(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=1)
df2 = self.read_html(self.spam_data, "Unit", skiprows=1)
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=1)
df2 = self.read_html(self.spam_data, match="Unit", skiprows=1)

assert_framelist_equal(df1, df2)

def test_skiprows_slice_short(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=slice(2))
df2 = self.read_html(self.spam_data, "Unit", skiprows=slice(2))
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=slice(2))
df2 = self.read_html(self.spam_data, match="Unit", skiprows=slice(2))

assert_framelist_equal(df1, df2)

def test_skiprows_slice_long(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=slice(2, 5))
df2 = self.read_html(self.spam_data, "Unit", skiprows=slice(4, 1, -1))
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=slice(2, 5))
df2 = self.read_html(self.spam_data, match="Unit", skiprows=slice(4, 1, -1))

assert_framelist_equal(df1, df2)

def test_skiprows_ndarray(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=np.arange(2))
df2 = self.read_html(self.spam_data, "Unit", skiprows=np.arange(2))
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=np.arange(2))
df2 = self.read_html(self.spam_data, match="Unit", skiprows=np.arange(2))

assert_framelist_equal(df1, df2)

def test_skiprows_invalid(self):
with pytest.raises(TypeError, match=("is not a valid type for skipping rows")):
self.read_html(self.spam_data, ".*Water.*", skiprows="asdf")
self.read_html(self.spam_data, match=".*Water.*", skiprows="asdf")

def test_index(self):
df1 = self.read_html(self.spam_data, ".*Water.*", index_col=0)
df2 = self.read_html(self.spam_data, "Unit", index_col=0)
df1 = self.read_html(self.spam_data, match=".*Water.*", index_col=0)
df2 = self.read_html(self.spam_data, match="Unit", index_col=0)
assert_framelist_equal(df1, df2)

def test_header_and_index_no_types(self):
df1 = self.read_html(self.spam_data, ".*Water.*", header=1, index_col=0)
df2 = self.read_html(self.spam_data, "Unit", header=1, index_col=0)
df1 = self.read_html(self.spam_data, match=".*Water.*", header=1, index_col=0)
df2 = self.read_html(self.spam_data, match="Unit", header=1, index_col=0)
assert_framelist_equal(df1, df2)

def test_header_and_index_with_types(self):
df1 = self.read_html(self.spam_data, ".*Water.*", header=1, index_col=0)
df2 = self.read_html(self.spam_data, "Unit", header=1, index_col=0)
df1 = self.read_html(self.spam_data, match=".*Water.*", header=1, index_col=0)
df2 = self.read_html(self.spam_data, match="Unit", header=1, index_col=0)
assert_framelist_equal(df1, df2)

def test_infer_types(self):

# 10892 infer_types removed
df1 = self.read_html(self.spam_data, ".*Water.*", index_col=0)
df2 = self.read_html(self.spam_data, "Unit", index_col=0)
df1 = self.read_html(self.spam_data, match=".*Water.*", index_col=0)
df2 = self.read_html(self.spam_data, match="Unit", index_col=0)
assert_framelist_equal(df1, df2)

def test_string_io(self):
Expand All @@ -252,25 +270,25 @@ def test_string_io(self):
with open(self.spam_data, **self.spam_data_kwargs) as f:
data2 = StringIO(f.read())

df1 = self.read_html(data1, ".*Water.*")
df2 = self.read_html(data2, "Unit")
df1 = self.read_html(data1, match=".*Water.*")
df2 = self.read_html(data2, match="Unit")
assert_framelist_equal(df1, df2)

def test_string(self):
with open(self.spam_data, **self.spam_data_kwargs) as f:
data = f.read()

df1 = self.read_html(data, ".*Water.*")
df2 = self.read_html(data, "Unit")
df1 = self.read_html(data, match=".*Water.*")
df2 = self.read_html(data, match="Unit")

assert_framelist_equal(df1, df2)

def test_file_like(self):
with open(self.spam_data, **self.spam_data_kwargs) as f:
df1 = self.read_html(f, ".*Water.*")
df1 = self.read_html(f, match=".*Water.*")

with open(self.spam_data, **self.spam_data_kwargs) as f:
df2 = self.read_html(f, "Unit")
df2 = self.read_html(f, match="Unit")

assert_framelist_equal(df1, df2)

Expand All @@ -292,7 +310,7 @@ def test_invalid_url(self):
def test_file_url(self):
url = self.banklist_data
dfs = self.read_html(
file_path_to_url(os.path.abspath(url)), "First", attrs={"id": "table"}
file_path_to_url(os.path.abspath(url)), match="First", attrs={"id": "table"}
)
assert isinstance(dfs, list)
for df in dfs:
Expand All @@ -308,7 +326,7 @@ def test_invalid_table_attrs(self):

def _bank_data(self, *args, **kwargs):
return self.read_html(
self.banklist_data, "Metcalf", attrs={"id": "table"}, *args, **kwargs
self.banklist_data, match="Metcalf", attrs={"id": "table"}, *args, **kwargs
)

@pytest.mark.slow
Expand Down Expand Up @@ -358,7 +376,7 @@ def test_regex_idempotency(self):
def test_negative_skiprows(self):
msg = r"\(you passed a negative value\)"
with pytest.raises(ValueError, match=msg):
self.read_html(self.spam_data, "Water", skiprows=-1)
self.read_html(self.spam_data, match="Water", skiprows=-1)

@tm.network
def test_multiple_matches(self):
Expand Down Expand Up @@ -600,7 +618,9 @@ def test_gold_canyon(self):
raw_text = f.read()

assert gc in raw_text
df = self.read_html(self.banklist_data, "Gold Canyon", attrs={"id": "table"})[0]
df = self.read_html(
self.banklist_data, match="Gold Canyon", attrs={"id": "table"}
)[0]
assert gc in df.to_string()

def test_different_number_of_cols(self):
Expand Down Expand Up @@ -855,7 +875,7 @@ def test_wikipedia_states_table(self, datapath):
data = datapath("io", "data", "html", "wikipedia_states.html")
assert os.path.isfile(data), f"{repr(data)} is not a file"
assert os.path.getsize(data), f"{repr(data)} is an empty file"
result = self.read_html(data, "Arizona", header=1)[0]
result = self.read_html(data, match="Arizona", header=1)[0]
assert result.shape == (60, 12)
assert "Unnamed" in result.columns[-1]
assert result["sq mi"].dtype == np.dtype("float64")
Expand Down Expand Up @@ -1065,7 +1085,7 @@ def test_works_on_valid_markup(self, datapath):
@pytest.mark.slow
def test_fallback_success(self, datapath):
banklist_data = datapath("io", "data", "html", "banklist.html")
self.read_html(banklist_data, ".*Water.*", flavor=["lxml", "html5lib"])
self.read_html(banklist_data, match=".*Water.*", flavor=["lxml", "html5lib"])

def test_to_html_timestamp(self):
rng = date_range("2000-01-01", periods=10)
Expand Down

0 comments on commit 467e1c2

Please sign in to comment.