From 3c89432d1adf277d785145f5c62abca1157e19dc Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 9 Jan 2024 10:57:14 +0100 Subject: [PATCH] Backport PR #56772 on branch 2.2.x (Support large strings in interchange protocol) (#56795) Backport PR #56772: Support large strings in interchange protocol Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/dtypes/dtypes.py | 4 +++- pandas/core/interchange/column.py | 9 +++------ pandas/core/interchange/utils.py | 1 + pandas/tests/interchange/test_impl.py | 9 +++++++++ 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index dbb11d3d0788d..36e677fa2a7a9 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -906,6 +906,7 @@ Sparse Other ^^^^^ +- :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`) - Bug in :func:`DataFrame.describe` when formatting percentiles in the resulting percentile 99.999% is rounded to 100% (:issue:`55765`) - Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`) - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index ed5256922377a..e90e92fa0ee1c 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2190,7 +2190,9 @@ def numpy_dtype(self) -> np.dtype: # This can be removed if/when pyarrow addresses it: # https://github.com/apache/arrow/issues/34462 return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") - if pa.types.is_string(self.pyarrow_dtype): + if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( + self.pyarrow_dtype + ): # pa.string().to_pandas_dtype() = object which we don't want return np.dtype(str) try: diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index acfbc5d9e6c62..7f524d6823f30 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -301,12 +301,9 @@ def _get_data_buffer( buffer = PandasBuffer(np.frombuffer(b, dtype="uint8")) # Define the dtype for the returned buffer - dtype = ( - DtypeKind.STRING, - 8, - ArrowCTypes.STRING, - Endianness.NATIVE, - ) # note: currently only support native endianness + # TODO: this will need correcting + # https://github.com/pandas-dev/pandas/issues/54781 + dtype = self.dtype else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py index 4ac063080e62d..2e73e560e5740 100644 --- a/pandas/core/interchange/utils.py +++ b/pandas/core/interchange/utils.py @@ -37,6 +37,7 @@ "float": "f", # float32 "double": "g", # float64 "string": "u", + "large_string": "U", "binary": "z", "time32[s]": "tts", "time32[ms]": "ttm", diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 15c2b8d000b37..27ea8ccdd17b1 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -362,3 +362,12 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: interchange.get_column_by_name = lambda _: column monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange) pd.api.interchange.from_dataframe(df) + + +def test_large_string(): + # GH#56702 + pytest.importorskip("pyarrow") + df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") + result = pd.api.interchange.from_dataframe(df.__dataframe__()) + expected = pd.DataFrame({"a": ["x"]}, dtype="object") + tm.assert_frame_equal(result, expected)