Backport PR #52800 on branch 2.0.x (BUG: interchange.from_dataframe d…

…oesn't work with large_string) (#52822) Backport PR #52800: BUG: interchange.from_dataframe doesn't work with large_string Co-authored-by: Marco Edward Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
pandas-dev · Apr 21, 2023 · 6da8e61 · 6da8e61
1 parent 6812451
commit 6da8e61
Show file tree

Hide file tree

Showing 4 changed files with 23 additions and 3 deletions.
diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst
@@ -28,7 +28,8 @@ Bug fixes
 - Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`)
 - Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`)
 - Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`)
-- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on-categorical dtypes (:issue:`49889`)
+- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`)
+- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on large string dtypes (:issue:`52795`)
 - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`)
 - Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set causing a performance bottleneck in the process (:issue:`52546`)
 - Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`)

diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
@@ -238,8 +238,11 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
     # Retrieve the data buffer containing the UTF-8 code units
     data_buff, protocol_data_dtype = buffers["data"]
     # We're going to reinterpret the buffer as uint8, so make sure we can do it safely
-    assert protocol_data_dtype[1] == 8  # bitwidth == 8
-    assert protocol_data_dtype[2] == ArrowCTypes.STRING  # format_str == utf-8
+    assert protocol_data_dtype[1] == 8
+    assert protocol_data_dtype[2] in (
+        ArrowCTypes.STRING,
+        ArrowCTypes.LARGE_STRING,
+    )  # format_str == utf-8
     # Convert the buffers to NumPy arrays. In order to go from STRING to
     # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
     data_dtype = (

diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py
@@ -37,6 +37,7 @@ class ArrowCTypes:
     FLOAT32 = "f"
     FLOAT64 = "g"
     STRING = "u"  # utf-8
+    LARGE_STRING = "U"  # utf-8
     DATE32 = "tdD"
     DATE64 = "tdm"
     # Resoulution:

diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
@@ -89,6 +89,21 @@ def test_categorical_pyarrow():
     tm.assert_frame_equal(result, expected)
 
 
+def test_large_string_pyarrow():
+    # GH 52795
+    pa = pytest.importorskip("pyarrow", "11.0.0")
+
+    arr = ["Mon", "Tue"]
+    table = pa.table({"weekday": pa.array(arr, "large_string")})
+    exchange_df = table.__dataframe__()
+    result = from_dataframe(exchange_df)
+    expected = pd.DataFrame({"weekday": ["Mon", "Tue"]})
+    tm.assert_frame_equal(result, expected)
+
+    # check round-trip
+    assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
+
+
 @pytest.mark.parametrize(
     "data", [int_data, uint_data, float_data, bool_data, datetime_data]
 )