From b366c3d3925009d587dc9f9eed00433c1bf45af4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 24 Nov 2023 23:51:29 +0100 Subject: [PATCH 1/7] BUG: Index.str.cat casting result always to object --- doc/source/whatsnew/v2.1.4.rst | 2 +- pandas/core/strings/accessor.py | 7 ++- pandas/tests/strings/test_cat.py | 85 +++++++++++++++++--------------- 3 files changed, 52 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 543a9864ced26..0f4d3a22f5129 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -25,7 +25,7 @@ Bug fixes - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) -- +- Fixed bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`) .. --------------------------------------------------------------------------- .. 
_whatsnew_214.other: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 58b904fd31b6a..a05fae1524ffd 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -393,7 +393,7 @@ def cons_row(x): else: dtype = vdtype else: - dtype = vdtype + _dtype = vdtype if expand: cons = self._orig._constructor_expanddim @@ -689,8 +689,11 @@ def cat( out: Index | Series if isinstance(self._orig, ABCIndex): # add dtype for case that result is all-NA + dtype = None + if isna(result).all(): + dtype = object - out = Index(result, dtype=object, name=self._orig.name) + out = Index(result, dtype=dtype, name=self._orig.name) else: # Series if isinstance(self._orig.dtype, CategoricalDtype): # We need to infer the new categories. diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py index 3e620b7664335..497f87e245ba3 100644 --- a/pandas/tests/strings/test_cat.py +++ b/pandas/tests/strings/test_cat.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Index, @@ -10,6 +12,7 @@ Series, _testing as tm, concat, + option_context, ) @@ -26,45 +29,49 @@ def test_str_cat_name(index_or_series, other): assert result.name == "name" -def test_str_cat(index_or_series): - box = index_or_series - # test_cat above tests "str_cat" from ndarray; - # here testing "str.cat" from Series/Index to ndarray/list - s = box(["a", "a", "b", "b", "c", np.nan]) - - # single array - result = s.str.cat() - expected = "aabbc" - assert result == expected - - result = s.str.cat(na_rep="-") - expected = "aabbc-" - assert result == expected - - result = s.str.cat(sep="_", na_rep="NA") - expected = "a_a_b_b_c_NA" - assert result == expected - - t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object) - expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"]) - - # Series/Index with array - result = s.str.cat(t, na_rep="-") - tm.assert_equal(result, expected) - 
- # Series/Index with list - result = s.str.cat(list(t), na_rep="-") - tm.assert_equal(result, expected) - - # errors for incorrect lengths - rgx = r"If `others` contains arrays or lists \(or other list-likes.*" - z = Series(["1", "2", "3"]) - - with pytest.raises(ValueError, match=rgx): - s.str.cat(z.values) - - with pytest.raises(ValueError, match=rgx): - s.str.cat(list(z)) +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) +def test_str_cat(index_or_series, infer_string): + with option_context("future.infer_string", infer_string): + box = index_or_series + # test_cat above tests "str_cat" from ndarray; + # here testing "str.cat" from Series/Index to ndarray/list + s = box(["a", "a", "b", "b", "c", np.nan]) + + # single array + result = s.str.cat() + expected = "aabbc" + assert result == expected + + result = s.str.cat(na_rep="-") + expected = "aabbc-" + assert result == expected + + result = s.str.cat(sep="_", na_rep="NA") + expected = "a_a_b_b_c_NA" + assert result == expected + + t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object) + expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"]) + + # Series/Index with array + result = s.str.cat(t, na_rep="-") + tm.assert_equal(result, expected) + + # Series/Index with list + result = s.str.cat(list(t), na_rep="-") + tm.assert_equal(result, expected) + + # errors for incorrect lengths + rgx = r"If `others` contains arrays or lists \(or other list-likes.*" + z = Series(["1", "2", "3"]) + + with pytest.raises(ValueError, match=rgx): + s.str.cat(z.values) + + with pytest.raises(ValueError, match=rgx): + s.str.cat(list(z)) def test_str_cat_raises_intuitive_error(index_or_series): From d43c3be8a0564975e57c2c5effdd6dc98aae8e2a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 24 Nov 2023 23:52:25 +0100 Subject: [PATCH 2/7] Update accessor.py --- pandas/core/strings/accessor.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index a05fae1524ffd..62f6a576db24f 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -393,7 +393,7 @@ def cons_row(x): else: dtype = vdtype else: - _dtype = vdtype + dtype = vdtype if expand: cons = self._orig._constructor_expanddim From 35bc604a8c1525ae887423547e3f7c9cd55cc941 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 25 Nov 2023 00:22:37 +0100 Subject: [PATCH 3/7] Fix further bugs --- pandas/core/strings/accessor.py | 5 ++- pandas/tests/strings/test_cat.py | 64 ++++++++++++++++++++------------ 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 62f6a576db24f..35bfb3a1ad2f1 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -44,6 +44,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.arrays import ExtensionArray from pandas.core.base import NoNewAttributesMixin from pandas.core.construction import extract_array @@ -455,7 +456,7 @@ def _get_series_list(self, others): # in case of list-like `others`, all elements must be # either Series/Index/np.ndarray (1-dim)... if all( - isinstance(x, (ABCSeries, ABCIndex)) + isinstance(x, (ABCSeries, ABCIndex, ExtensionArray)) or (isinstance(x, np.ndarray) and x.ndim == 1) for x in others ): @@ -697,7 +698,7 @@ def cat( else: # Series if isinstance(self._orig.dtype, CategoricalDtype): # We need to infer the new categories. 
- dtype = None + dtype = self._orig.dtype.categories.dtype else: dtype = self._orig.dtype res_ser = Series( diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py index 497f87e245ba3..284932491a65e 100644 --- a/pandas/tests/strings/test_cat.py +++ b/pandas/tests/strings/test_cat.py @@ -85,39 +85,54 @@ def test_str_cat_raises_intuitive_error(index_or_series): s.str.cat(" ") +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) @pytest.mark.parametrize("sep", ["", None]) @pytest.mark.parametrize("dtype_target", ["object", "category"]) @pytest.mark.parametrize("dtype_caller", ["object", "category"]) -def test_str_cat_categorical(index_or_series, dtype_caller, dtype_target, sep): +def test_str_cat_categorical( + index_or_series, dtype_caller, dtype_target, sep, infer_string +): box = index_or_series - s = Index(["a", "a", "b", "a"], dtype=dtype_caller) - s = s if box == Index else Series(s, index=s) - t = Index(["b", "a", "b", "c"], dtype=dtype_target) - - expected = Index(["ab", "aa", "bb", "ac"]) - expected = expected if box == Index else Series(expected, index=s) + with option_context("future.infer_string", infer_string): + s = Index(["a", "a", "b", "a"], dtype=dtype_caller) + s = s if box == Index else Series(s, index=s) + t = Index(["b", "a", "b", "c"], dtype=dtype_target) + + expected = Index(["ab", "aa", "bb", "ac"]) + expected = ( + expected + if box == Index + else Series(expected, index=Index(s, dtype=dtype_caller)) + ) - # Series/Index with unaligned Index -> t.values - result = s.str.cat(t.values, sep=sep) - tm.assert_equal(result, expected) + # Series/Index with unaligned Index -> t.values + result = s.str.cat(t.values, sep=sep) + tm.assert_equal(result, expected) - # Series/Index with Series having matching Index - t = Series(t.values, index=s) - result = s.str.cat(t, sep=sep) - tm.assert_equal(result, expected) + # Series/Index with Series having matching Index + t = 
Series(t.values, index=Index(s, dtype=dtype_caller)) + result = s.str.cat(t, sep=sep) + tm.assert_equal(result, expected) - # Series/Index with Series.values - result = s.str.cat(t.values, sep=sep) - tm.assert_equal(result, expected) + # Series/Index with Series.values + result = s.str.cat(t.values, sep=sep) + tm.assert_equal(result, expected) - # Series/Index with Series having different Index - t = Series(t.values, index=t.values) - expected = Index(["aa", "aa", "bb", "bb", "aa"]) - expected = expected if box == Index else Series(expected, index=expected.str[:1]) + # Series/Index with Series having different Index + t = Series(t.values, index=t.values) + expected = Index(["aa", "aa", "bb", "bb", "aa"]) + dtype = object if dtype_caller == "object" else s.dtype.categories.dtype + expected = ( + expected + if box == Index + else Series(expected, index=Index(expected.str[:1], dtype=dtype)) + ) - result = s.str.cat(t, sep=sep) - tm.assert_equal(result, expected) + result = s.str.cat(t, sep=sep) + tm.assert_equal(result, expected) @pytest.mark.parametrize( @@ -328,8 +343,9 @@ def test_str_cat_all_na(index_or_series, index_or_series2): # all-NA target if box == Series: - expected = Series([np.nan] * 4, index=s.index, dtype=object) + expected = Series([np.nan] * 4, index=s.index, dtype=s.dtype) else: # box == Index + # TODO: String option, this should return string dtype expected = Index([np.nan] * 4, dtype=object) result = s.str.cat(t, join="left") tm.assert_equal(result, expected) From fa99d733259cbad3638769637e00095f7cd62c6a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 25 Nov 2023 00:49:23 +0100 Subject: [PATCH 4/7] Fix --- pandas/tests/strings/test_api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index 2914b22a52e94..fd2501835318d 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -2,6 +2,7 @@ import
pytest from pandas import ( + CategoricalDtype, DataFrame, Index, MultiIndex, @@ -178,6 +179,7 @@ def test_api_for_categorical(any_string_method, any_string_dtype): s = Series(list("aabb"), dtype=any_string_dtype) s = s + " " + s c = s.astype("category") + c = c.astype(CategoricalDtype(c.dtype.categories.astype("object"))) assert isinstance(c.str, StringMethods) method_name, args, kwargs = any_string_method From 9fd924b63af733d5b3078c2c6770648a210cd631 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 25 Nov 2023 18:21:04 +0100 Subject: [PATCH 5/7] Update accessor.py --- pandas/core/strings/accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 35bfb3a1ad2f1..1d118cada16c3 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -698,7 +698,7 @@ def cat( else: # Series if isinstance(self._orig.dtype, CategoricalDtype): # We need to infer the new categories. 
- dtype = self._orig.dtype.categories.dtype + dtype = self._orig.dtype.categories.dtype # type: ignore[assignment] else: dtype = self._orig.dtype res_ser = Series( From 5b560687a3f1a815601a4f0c1b40096d61e46bc1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 8 Dec 2023 22:55:56 +0100 Subject: [PATCH 6/7] Update v2.1.4.rst --- doc/source/whatsnew/v2.1.4.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 83ec9bcd6a519..ee2d8efdbcc6b 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -30,7 +30,6 @@ Bug fixes - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) -- Fixed bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`) - Fixed bug in :meth:`Series.__ne__` resulting in False for comparison between ``NA`` and string value for ``dtype="string[pyarrow_numpy]"`` (:issue:`56122`) - Fixed bug in :meth:`Series.mode` not keeping object dtype when ``infer_string`` is set (:issue:`56183`) - Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`) From 55784175b35f82f7011aed9d3138b175fc8ecd77 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 8 Dec 2023 22:56:41 +0100 Subject: [PATCH 7/7] Update v2.2.0.rst --- doc/source/whatsnew/v2.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index c878fd2664dc4..99faad8aff986 100644 --- 
a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -576,6 +576,7 @@ Strings ^^^^^^^ - Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`) - Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`) +- Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`) - Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)