diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 09626be713c4f..80c76d14a5938 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -181,6 +181,7 @@ Groupby/Resample/Rolling Reshaping ^^^^^^^^^ +- Bug in :func:`pandas.merge` adds a string of ``None`` if ``None`` is assigned in suffixes instead of remain the column name as-is (:issue:`24782`). - Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) - :func:`to_records` now accepts dtypes to its `column_dtypes` parameter (:issue:`24895`) - diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 050c3d3e87fc6..5cae6e1a89170 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1971,16 +1971,28 @@ def items_overlap_with_suffix(left, lsuffix, right, rsuffix): raise ValueError('columns overlap but no suffix specified: ' '{rename}'.format(rename=to_rename)) - def lrenamer(x): - if x in to_rename: - return '{x}{lsuffix}'.format(x=x, lsuffix=lsuffix) - return x + def renamer(x, suffix): + """Rename the left and right indices. + + If there is overlap, and suffix is not None, add + suffix, otherwise, leave it as-is. + + Parameters + ---------- + x : original column name + suffix : str or None - def rrenamer(x): - if x in to_rename: - return '{x}{rsuffix}'.format(x=x, rsuffix=rsuffix) + Returns + ------- + x : renamed column name + """ + if x in to_rename and suffix is not None: + return '{x}{suffix}'.format(x=x, suffix=suffix) return x + lrenamer = partial(renamer, suffix=lsuffix) + rrenamer = partial(renamer, suffix=rsuffix) + return (_transform_index(left, lrenamer), _transform_index(right, rrenamer)) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1dd19a7c1514e..ad3327e694b67 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -159,9 +159,15 @@ def merge_ordered(left, right, on=None, left DataFrame fill_method : {'ffill', None}, default None Interpolation method for data - suffixes : 2-length sequence (tuple, list, ...) - Suffix to apply to overlapping column names in the left and right - side, respectively + suffixes : Sequence, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. + + .. versionchanged:: 0.25.0 how : {'left', 'right', 'outer', 'inner'}, default 'outer' * left: use only keys from left frame (SQL: left outer join) * right: use only keys from right frame (SQL: right outer join) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index a0a20d1da6cef..25487ccc76e62 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1526,3 +1526,65 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm): with pytest.raises(ValueError, match=msg): result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on, left_index=left_index, right_index=right_index) + + +@pytest.mark.parametrize("col1, col2, kwargs, expected_cols", [ + (0, 0, dict(suffixes=("", "_dup")), ["0", "0_dup"]), + (0, 0, dict(suffixes=(None, "_dup")), [0, "0_dup"]), + (0, 0, dict(suffixes=("_x", "_y")), ["0_x", "0_y"]), + ("a", 0, dict(suffixes=(None, "_y")), ["a", 0]), + (0.0, 0.0, dict(suffixes=("_x", None)), ["0.0_x", 0.0]), + ("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]), + ("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]), + ("a", "b", dict(suffixes=("_x", None)), ["a", "b"]), + ("a", "a", dict(suffixes=[None, "_x"]), ["a", "a_x"]), + (0, 0, dict(suffixes=["_a", None]), ["0_a", 0]), + ("a", "a", dict(), ["a_x", "a_y"]), + (0, 0, dict(), ["0_x", "0_y"]) +]) +def test_merge_suffix(col1, col2, kwargs, expected_cols): + # issue: 24782 + a = pd.DataFrame({col1: [1, 2, 3]}) + b = pd.DataFrame({col2: [4, 5, 6]}) + + expected = pd.DataFrame([[1, 4], [2, 5], [3, 6]], + columns=expected_cols) + + result = a.merge(b, left_index=True, right_index=True, **kwargs) + tm.assert_frame_equal(result, expected) + + result = pd.merge(a, b, left_index=True, right_index=True, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("col1, col2, suffixes", [ + ("a", "a", [None, None]), + ("a", "a", (None, None)), + ("a", "a", ("", None)), + (0, 0, [None, None]), + (0, 0, (None, "")) +]) +def test_merge_suffix_error(col1, col2, suffixes): + # issue: 24782 + a = pd.DataFrame({col1: [1, 2, 3]}) + b = pd.DataFrame({col2: [3, 4, 5]}) + + # TODO: might reconsider current raise behaviour, see issue 24782 + msg = "columns overlap but no suffix specified" + with pytest.raises(ValueError, match=msg): + pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) + + +@pytest.mark.parametrize("col1, col2, suffixes", [ + ("a", "a", None), + (0, 0, None) +]) +def test_merge_suffix_none_error(col1, col2, suffixes): + # issue: 24782 + a = pd.DataFrame({col1: [1, 2, 3]}) + b = pd.DataFrame({col2: [3, 4, 5]}) + + # TODO: might reconsider current raise behaviour, see GH24782 + msg = "iterable" + with pytest.raises(TypeError, match=msg): + pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)