Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: casting in datetimelike isin #56427

Merged
merged 3 commits into from
Dec 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,7 @@ Other Deprecations
- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer``. (:issue:`54229`)
- Deprecated allowing passing :class:`BlockManager` objects to :class:`DataFrame` or :class:`SingleBlockManager` objects to :class:`Series` (:issue:`52419`)
- Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`)
- Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`)
- Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`)
- Deprecated including the groups in computations when using :meth:`.DataFrameGroupBy.apply` and :meth:`.DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`)
- Deprecated indexing an :class:`Index` with a boolean indexer of length zero (:issue:`55820`)
Expand Down Expand Up @@ -525,6 +526,7 @@ Datetimelike
^^^^^^^^^^^^
- Bug in :class:`DatetimeIndex` construction when passing both a ``tz`` and either ``dayfirst`` or ``yearfirst`` ignoring dayfirst/yearfirst (:issue:`55813`)
- Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`55780`)
- Bug in :meth:`Series.isin` with :class:`DatetimeTZDtype` dtype and comparison values that are all ``NaT`` incorrectly returning all-``False`` even if the series contains ``NaT`` entries (:issue:`56427`)
- Bug in :func:`concat` raising ``AttributeError`` when concatenating an all-NA DataFrame with a :class:`DatetimeTZDtype` dtype DataFrame (:issue:`52093`)
- Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`)
- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`)
Expand Down
7 changes: 5 additions & 2 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2756,8 +2756,11 @@ def maybe_convert_objects(ndarray[object] objects,
res[:] = NPY_NAT
return res
elif dtype is not None:
# EA, we don't expect to get here, but _could_ implement
raise NotImplementedError(dtype)
# i.e. PeriodDtype, DatetimeTZDtype
cls = dtype.construct_array_type()
obj = cls._from_sequence([], dtype=dtype)
taker = -np.ones((<object>objects).shape, dtype=np.intp)
return obj.take(taker, allow_fill=True)
else:
# we don't guess
seen.object_ = True
Expand Down
21 changes: 21 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,6 +754,8 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
# TODO: de-duplicate with equals, validate_comparison_value
return np.zeros(self.shape, dtype=bool)

values = ensure_wrapped_if_datetimelike(values)

if not isinstance(values, type(self)):
inferable = [
"timedelta",
Expand All @@ -764,6 +766,14 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
"period",
]
if values.dtype == object:
values = lib.maybe_convert_objects(
values,
convert_non_numeric=True,
dtype_if_all_nat=self.dtype,
)
if values.dtype != object:
return self.isin(values)

inferred = lib.infer_dtype(values, skipna=False)
if inferred not in inferable:
if inferred == "string":
Expand All @@ -778,6 +788,17 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
values = type(self)._from_sequence(values)
except ValueError:
return isin(self.astype(object), values)
else:
warnings.warn(
# GH#53111
f"The behavior of 'isin' with dtype={self.dtype} and "
"castable values (e.g. strings) is deprecated. In a "
"future version, these will not be considered matching "
"by isin. Explicitly cast to the appropriate dtype before "
"calling isin instead.",
FutureWarning,
stacklevel=find_stack_level(),
)

if self.dtype.kind in "mM":
self = cast("DatetimeArray | TimedeltaArray", self)
Expand Down
12 changes: 0 additions & 12 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6534,18 +6534,6 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]:

>>> midx.isin([(1, 'red'), (3, 'red')])
array([ True, False, False])

For a DatetimeIndex, string values in `values` are converted to
Timestamps.

>>> dates = ['2000-03-11', '2000-03-12', '2000-03-13']
>>> dti = pd.to_datetime(dates)
>>> dti
DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'],
dtype='datetime64[ns]', freq=None)

>>> dti.isin(['2000-03-11'])
array([ True, False, False])
"""
if level is not None:
self._validate_index_level(level)
Expand Down
39 changes: 39 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -992,6 +992,45 @@ def test_large(self):
expected[1] = True
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]", "period[D]"])
def test_isin_datetimelike_all_nat(self, dtype):
    """Check that an all-NaT ``values`` list matches the NaT entry of a datetimelike array."""
    # GH#56427
    # Build the array from a shared i8 payload so every parametrized dtype
    # starts from the same three underlying values.
    i8_values = date_range("2013-01-01", periods=3)._values.view("i8")
    arr = Series(i8_values).array.view(dtype)
    arr[0] = NaT

    # Only the first position was set to NaT, so only it should match.
    expected = np.array([True, False, False], dtype=bool)
    result = algos.isin(arr, [NaT])
    tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]"])
def test_isin_datetimelike_strings_deprecated(self, dtype):
    """Check that string values passed to isin warn and are still treated as matching."""
    # GH#53111
    i8_values = date_range("2013-01-01", periods=3)._values.view("i8")
    arr = Series(i8_values).array.view(dtype)

    as_strings = [str(x) for x in arr]
    msg = "The behavior of 'isin' with dtype=.* is deprecated"

    # Exercise both input forms: a plain list of str, then a str-dtype ndarray.
    for candidates in (as_strings, np.array(as_strings, dtype=str)):
        with tm.assert_produces_warning(FutureWarning, match=msg):
            matched = algos.isin(arr, candidates)
        assert matched.all()

def test_isin_dt64tz_with_nat(self):
    """Check that a tz-aware Series containing NaT matches an all-NaT ``values`` list."""
    # the all-NaT values used to get inferred to tznaive, which was evaluated
    #  as non-matching GH#56427
    ser = Series(date_range("2016-01-01", periods=3, tz="UTC"))
    ser[0] = NaT

    expected = np.array([True, False, False], dtype=bool)
    result = algos.isin(ser._values, [NaT])
    tm.assert_numpy_array_equal(result, expected)

def test_categorical_from_codes(self):
# GH 16639
vals = np.array([0, 1, 2, 0])
Expand Down