Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add result_names argument to DataFrame.compare #44354 #47643

Merged
merged 37 commits into from
Jul 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
54c5068
DOC #45443 edited the documentation of where/mask functions
ahmedibrhm Jul 8, 2022
2951fb1
DOC #45443 edited the documentation of where/mask functions
ahmedibrhm Jul 8, 2022
6335204
Merge branch 'main' into main
ahmedibrhm Jul 8, 2022
2245def
Merge branch 'main' of https://github.com/ahmedibrhm/pandas
ahmedibrhm Jul 8, 2022
330eda8
Merge branch 'main' of https://github.com/pandas-dev/pandas
ahmedibrhm Jul 8, 2022
8afd6a1
Update generic.py
ahmedibrhm Jul 8, 2022
edc9ff4
Merge branch 'main' of https://github.com/ahmedibrhm/pandas
ahmedibrhm Jul 8, 2022
a326359
ENH: add suffixes argument to DataFrame.compare #44354
ahmedibrhm Jul 8, 2022
d9c4ca9
Edited the tests
ahmedibrhm Jul 8, 2022
1c54472
space fixing
ahmedibrhm Jul 8, 2022
ad7fb76
Merge branch 'main' into issue2
ahmedibrhm Jul 8, 2022
4d34821
Update shared_docs.py
ahmedibrhm Jul 8, 2022
8fb6aa2
Update series.py
ahmedibrhm Jul 8, 2022
1e33dea
Update series.py
ahmedibrhm Jul 8, 2022
ae6c75a
invalid argument tests
ahmedibrhm Jul 8, 2022
ee10dd3
issue reference
ahmedibrhm Jul 8, 2022
077d274
syntax editing
ahmedibrhm Jul 8, 2022
3b2ef9e
Merge branch 'main' into issue2
ahmedibrhm Jul 9, 2022
d0289e5
grammar fixing
ahmedibrhm Jul 10, 2022
bd45e06
edit doc
ahmedibrhm Jul 10, 2022
a13b319
editting doc
ahmedibrhm Jul 10, 2022
f32d7cf
Update 02_read_write.rst
ahmedibrhm Jul 10, 2022
6396583
Update 02_read_write.rst
ahmedibrhm Jul 10, 2022
e754e15
Update v1.5.0.rst
ahmedibrhm Jul 10, 2022
8f67c9f
Update v1.5.0.rst
ahmedibrhm Jul 10, 2022
fa4e26b
Merge branch 'main' into issue2
ahmedibrhm Jul 10, 2022
580773d
np
ahmedibrhm Jul 13, 2022
15a9a57
Merge branch 'pandas-dev:main' into issue2
ahmedibrhm Jul 13, 2022
a4fca56
1.5.0 rst
ahmedibrhm Jul 13, 2022
a468714
Merge branch 'main' into issue2
ahmedibrhm Jul 14, 2022
bc209bb
created tests for invalid input
ahmedibrhm Jul 16, 2022
ff014e3
space
ahmedibrhm Jul 16, 2022
32d1c5e
space
ahmedibrhm Jul 16, 2022
0daa3e8
space
ahmedibrhm Jul 16, 2022
be1a9ad
Merge branch 'main' into issue2
ahmedibrhm Jul 16, 2022
9cb23b8
editing test
ahmedibrhm Jul 16, 2022
7ad93e1
Merge branch 'issue2' of https://github.com/ahmedibrhm/pandas into is…
ahmedibrhm Jul 16, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ Other enhancements
- :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`)
- :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`)
- :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`)
- :meth:`DataFrame.compare` now accepts a ``result_names`` argument that lets the user specify the names given in the result to the left and right DataFrames being compared. The defaults are ``'self'`` and ``'other'`` (:issue:`44354`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.notable_bug_fixes:
Expand Down
10 changes: 10 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7776,6 +7776,14 @@ def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
0 a c NaN NaN
2 NaN NaN 3.0 4.0

Assign result_names

>>> df.compare(df2, result_names=("left", "right"))
col1 col3
left right left right
0 a c NaN NaN
2 NaN NaN 3.0 4.0

Stack the differences on rows

>>> df.compare(df2, align_axis=0)
Expand Down Expand Up @@ -7823,12 +7831,14 @@ def compare(
align_axis: Axis = 1,
keep_shape: bool = False,
keep_equal: bool = False,
result_names: Suffixes = ("self", "other"),
) -> DataFrame:
return super().compare(
other=other,
align_axis=align_axis,
keep_shape=keep_shape,
keep_equal=keep_equal,
result_names=result_names,
)

def combine(
Expand Down
10 changes: 8 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
Renamer,
SortKind,
StorageOptions,
Suffixes,
T,
TimedeltaConvertibleTypes,
TimestampConvertibleTypes,
Expand Down Expand Up @@ -8970,6 +8971,7 @@ def compare(
align_axis: Axis = 1,
keep_shape: bool_t = False,
keep_equal: bool_t = False,
result_names: Suffixes = ("self", "other"),
):
from pandas.core.reshape.concat import concat

Expand All @@ -8980,7 +8982,6 @@ def compare(
)

mask = ~((self == other) | (self.isna() & other.isna()))
keys = ["self", "other"]

if not keep_equal:
self = self.where(mask)
Expand All @@ -8995,13 +8996,18 @@ def compare(
else:
self = self[mask]
other = other[mask]
if not isinstance(result_names, tuple):
raise TypeError(
f"Passing 'result_names' as a {type(result_names)} is not "
"supported. Provide 'result_names' as a tuple instead."
)

if align_axis in (1, "columns"): # This is needed for Series
axis = 1
else:
axis = self._get_axis_number(align_axis)

diff = concat([self, other], axis=axis, keys=keys)
diff = concat([self, other], axis=axis, keys=result_names)

if axis >= self.ndim:
# No need to reorganize data if stacking on new axis
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@
from pandas._typing import (
NumpySorter,
NumpyValueArrayLike,
Suffixes,
)

from pandas.core.frame import DataFrame
Expand Down Expand Up @@ -3237,12 +3238,14 @@ def compare(
align_axis: Axis = 1,
keep_shape: bool = False,
keep_equal: bool = False,
result_names: Suffixes = ("self", "other"),
) -> DataFrame | Series:
return super().compare(
other=other,
align_axis=align_axis,
keep_shape=keep_shape,
keep_equal=keep_equal,
result_names=result_names,
)

def combine(self, other, func, fill_value=None) -> Series:
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/shared_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@
keep_equal : bool, default False
If true, the result keeps values that are equal.
Otherwise, equal values are shown as NaNs.

result_names : tuple, default ('self', 'other')
Set the names used in the result for the two objects being compared.

.. versionadded:: 1.5.0
"""

_shared_docs[
Expand Down
56 changes: 56 additions & 0 deletions pandas/tests/frame/methods/test_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,59 @@ def test_compare_unaligned_objects():
df1 = pd.DataFrame(np.ones((3, 3)))
df2 = pd.DataFrame(np.zeros((2, 1)))
df1.compare(df2)


def test_compare_result_names():
    """Check that ``result_names`` labels the compared columns in the result."""
    # GH 44354
    left_frame = pd.DataFrame(
        {
            "col1": ["a", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, 3.0],
        },
    )
    right_frame = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )

    # Only rows 0 and 2 differ, and only in col1/col3; each differing column
    # is expected to appear once per supplied name.
    expected = pd.DataFrame(
        {
            ("col1", "left"): {0: "a", 2: np.nan},
            ("col1", "right"): {0: "c", 2: np.nan},
            ("col3", "left"): {0: np.nan, 2: 3.0},
            ("col3", "right"): {0: np.nan, 2: np.nan},
        }
    )

    labels = ("left", "right")
    result = left_frame.compare(right_frame, result_names=labels)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "result_names",
    [
        [1, 2],
        "HK",
        {"2": 2, "3": 3},
        3,
        3.0,
    ],
)
def test_invalid_input_result_names(result_names):
    """Check that a non-tuple ``result_names`` raises ``TypeError``.

    Parameters
    ----------
    result_names : object
        A value that is not a tuple (list, str, dict, int, float).
    """
    # GH 44354
    # Local import: the file-level import block is not guaranteed to have ``re``.
    import re

    df1 = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    df2 = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )

    # ``match`` is interpreted as a regular expression, so escape the message;
    # otherwise the "." in "supported. Provide" would match any character.
    expected_msg = re.escape(
        f"Passing 'result_names' as a {type(result_names)} is not "
        "supported. Provide 'result_names' as a tuple instead."
    )
    with pytest.raises(TypeError, match=expected_msg):
        df1.compare(df2, result_names=result_names)