Skip to content

Commit

Permalink
Fixed reindexing arith with duplicates (#35303)
Browse files Browse the repository at this point in the history
Closes #35194
  • Loading branch information
TomAugspurger committed Jul 16, 2020
1 parent 697a538 commit d396111
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 4 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -953,6 +953,7 @@ Numeric
- Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causing a segmentation fault (:issue:`21824`)
- Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
- Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
- Bug in arithmetic operations between ``DataFrame`` objects with non-overlapping columns with duplicate labels causing an infinite loop (:issue:`35194`)
- Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`)
- Bug in :meth:`Index.difference` returning incorrect results when comparing a :class:`Float64Index` and an object :class:`Index` (:issue:`35217`)
- Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`)
Expand Down
23 changes: 19 additions & 4 deletions pandas/core/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna

from pandas.core import algorithms
from pandas.core.construction import extract_array
from pandas.core.ops.array_ops import (
arithmetic_op,
Expand Down Expand Up @@ -562,18 +563,32 @@ def _frame_arith_method_with_reindex(
DataFrame
"""
# GH#31623, only operate on shared columns
cols = left.columns.intersection(right.columns)
cols, lcols, rcols = left.columns.join(
right.columns, how="inner", level=None, return_indexers=True
)

new_left = left[cols]
new_right = right[cols]
new_left = left.iloc[:, lcols]
new_right = right.iloc[:, rcols]
result = op(new_left, new_right)

# Do the join on the columns instead of using _align_method_FRAME
# to avoid constructing two potentially large/sparse DataFrames
join_columns, _, _ = left.columns.join(
right.columns, how="outer", level=None, return_indexers=True
)
return result.reindex(join_columns, axis=1)

if result.columns.has_duplicates:
# Avoid reindexing with a duplicate axis.
# https://github.com/pandas-dev/pandas/issues/35194
indexer, _ = result.columns.get_indexer_non_unique(join_columns)
indexer = algorithms.unique1d(indexer)
result = result._reindex_with_indexers(
{1: [join_columns, indexer]}, allow_dups=True
)
else:
result = result.reindex(join_columns, axis=1)

return result


def _maybe_align_series_as_frame(frame: "DataFrame", series: "Series", axis: int):
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1552,3 +1552,12 @@ def test_dataframe_operation_with_non_numeric_types(df, col_dtype):
expected = expected.astype({"b": col_dtype})
result = df + pd.Series([-1.0], index=list("a"))
tm.assert_frame_equal(result, expected)


def test_arith_reindex_with_duplicates():
    """Check frame addition when one operand repeats a shared column label.

    Regression test for https://github.com/pandas-dev/pandas/issues/35194.
    The left frame has the single column "second"; the right frame has
    "first" plus "second" twice.  The sum is expected to carry all three
    right-hand labels, with NaN under "first" and 0 under both "second"
    columns.
    """
    wide_labels = ["first", "second", "second"]

    narrow = pd.DataFrame(data=[[0]], columns=["second"])
    wide = pd.DataFrame(data=[[0, 0, 0]], columns=wide_labels)

    expected = pd.DataFrame([[np.nan, 0, 0]], columns=wide_labels)
    result = narrow + wide

    tm.assert_frame_equal(result, expected)

0 comments on commit d396111

Please sign in to comment.