Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: Index.reindex with duplicate index #42568

Merged
merged 10 commits into from
Aug 8, 2021
1 change: 1 addition & 0 deletions doc/source/user_guide/duplicates.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ duplicates present. The output can't be determined, and so pandas raises.

.. ipython:: python
:okexcept:
:okwarning:
s1 = pd.Series([0, 1, 2], index=["a", "b", "b"])
s1.reindex(["a", "b", "c"])
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ Deprecations
- Deprecated treating ``numpy.datetime64`` objects as UTC times when passed to the :class:`Timestamp` constructor along with a timezone. In a future version, these will be treated as wall-times. To retain the old behavior, use ``Timestamp(dt64).tz_localize("UTC").tz_convert(tz)`` (:issue:`24559`)
- Deprecated ignoring missing labels when indexing with a sequence of labels on a level of a MultiIndex (:issue:`42351`)
- Creating an empty Series without a dtype will now raise a more visible ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`30017`)
- Deprecated :meth:`Index.reindex` with a non-unique index (:issue:`42568`)
-

.. ---------------------------------------------------------------------------

Expand Down
9 changes: 9 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3888,6 +3888,15 @@ def reindex(
if self.equals(target):
indexer = None
else:
if not self.is_unique:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't you need to do this on L3912? e.g. raise if those conditions are true first. also don't we know that it's unique at this point?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't you need to do this on L3912? e.g. raise if those conditions are true first

sure, will move this down

also don't we know that it's unique at this point?

nope

# GH#42568
warnings.warn(
"reindexing with a non-unique Index is deprecated and "
"will raise in a future version",
FutureWarning,
stacklevel=2,
)

if self._index_as_unique:
indexer = self.get_indexer(
target, method=method, limit=limit, tolerance=tolerance
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,14 @@ def reindex(
missing = np.array([], dtype=np.intp)
else:
indexer, missing = self.get_indexer_non_unique(target)
if not self.is_unique:
# GH#42568
warnings.warn(
"reindexing with a non-unique Index is deprecated and will "
"raise in a future version",
FutureWarning,
stacklevel=2,
)

if len(self) and indexer is not None:
new_target = self.take(indexer)
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/frame/indexing/test_getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,8 @@ def test_getitem_boolean_frame_unaligned_with_duplicate_columns(self, df_dup_col
df = df_dup_cols
msg = "cannot reindex on an axis with duplicate labels"
with pytest.raises(ValueError, match=msg):
df[df.A > 6]
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
df[df.A > 6]

def test_getitem_boolean_series_with_duplicate_columns(self, df_dup_cols):
# boolean indexing
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ def test_setitem_error_msmgs(self):
)
msg = "cannot reindex on an axis with duplicate labels"
with pytest.raises(ValueError, match=msg):
df["newcol"] = ser
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
df["newcol"] = ser

# GH 4107, more descriptive error message
df = DataFrame(np.random.randint(0, 2, (4, 4)), columns=["a", "b", "c", "d"])
Expand Down
12 changes: 8 additions & 4 deletions pandas/tests/frame/methods/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,8 @@ def test_reindex_dups(self):
# reindex fails
msg = "cannot reindex on an axis with duplicate labels"
with pytest.raises(ValueError, match=msg):
df.reindex(index=list(range(len(df))))
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
df.reindex(index=list(range(len(df))))

def test_reindex_with_duplicate_columns(self):

Expand All @@ -670,9 +671,11 @@ def test_reindex_with_duplicate_columns(self):
)
msg = "cannot reindex on an axis with duplicate labels"
with pytest.raises(ValueError, match=msg):
df.reindex(columns=["bar"])
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
df.reindex(columns=["bar"])
with pytest.raises(ValueError, match=msg):
df.reindex(columns=["bar", "foo"])
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
df.reindex(columns=["bar", "foo"])

def test_reindex_axis_style(self):
# https://github.com/pandas-dev/pandas/issues/12392
Expand Down Expand Up @@ -944,7 +947,8 @@ def test_reindex_with_categoricalindex(self):
# passed duplicate indexers are not allowed
msg = "cannot reindex on an axis with duplicate labels"
with pytest.raises(ValueError, match=msg):
df2.reindex(["a", "b"])
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
df2.reindex(["a", "b"])

# args NotImplemented ATM
msg = r"argument {} is not implemented for CategoricalIndex\.reindex"
Expand Down
22 changes: 14 additions & 8 deletions pandas/tests/indexes/categorical/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,32 @@

class TestReindex:
def test_reindex_dtype(self):
c = CategoricalIndex(["a", "b", "c", "a"])
res, indexer = c.reindex(["a", "c"])
# GH#11586
ci = CategoricalIndex(["a", "b", "c", "a"])
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
res, indexer = ci.reindex(["a", "c"])

tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True)
tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))

c = CategoricalIndex(["a", "b", "c", "a"])
res, indexer = c.reindex(Categorical(["a", "c"]))
ci = CategoricalIndex(["a", "b", "c", "a"])
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
res, indexer = ci.reindex(Categorical(["a", "c"]))

exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"])
tm.assert_index_equal(res, exp, exact=True)
tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))

c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
res, indexer = c.reindex(["a", "c"])
ci = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
res, indexer = ci.reindex(["a", "c"])
exp = Index(["a", "a", "c"], dtype="object")
tm.assert_index_equal(res, exp, exact=True)
tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))

c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
res, indexer = c.reindex(Categorical(["a", "c"]))
ci = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
res, indexer = ci.reindex(Categorical(["a", "c"]))
exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"])
tm.assert_index_equal(res, exp, exact=True)
tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/indexes/multi/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ def test_reindex_non_unique():

msg = "cannot handle a non-unique multi-index!"
with pytest.raises(ValueError, match=msg):
a.reindex(new_idx)
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
a.reindex(new_idx)


@pytest.mark.parametrize("values", [[["a"], ["x"]], [[], []]])
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,7 +694,8 @@ def test_asfreq_non_unique():

msg = "cannot reindex on an axis with duplicate labels"
with pytest.raises(ValueError, match=msg):
ts.asfreq("B")
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
ts.asfreq("B")


def test_resample_axis1():
Expand Down