From 8521bbe63f15fbfc6c86a9d5a3c99112738ce7fd Mon Sep 17 00:00:00 2001 From: Karthik Velayutham Date: Mon, 1 Aug 2022 13:01:44 -0500 Subject: [PATCH] FIX-#4652: Support categorical data in `from_dataframe` (#4737) Signed-off-by: Karthik Velayutham --- docs/release_notes/release_notes-0.16.0.rst | 3 ++- .../pandas/exchange/dataframe_protocol/column.py | 15 ++++++++++----- .../dataframe_protocol/pandas/test_protocol.py | 13 +++++++++++-- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/docs/release_notes/release_notes-0.16.0.rst b/docs/release_notes/release_notes-0.16.0.rst index 37b8b11edf6..59e0c3de56c 100644 --- a/docs/release_notes/release_notes-0.16.0.rst +++ b/docs/release_notes/release_notes-0.16.0.rst @@ -25,7 +25,8 @@ Key Features and Updates * FIX-#4358: MultiIndex `loc` shouldn't drop levels for full-key lookups (#4608) * FIX-#4658: Expand exception handling for `read_*` functions from s3 storages (#4659) * FIX-#4672: Fix incorrect warning when setting `frame.index` or `frame.columns` (#4721) - * FIX-#4686: Propagate metadata and drain call queue in unwrap_partitions (#4697) + * FIX-#4686: Propagate metadata and drain call queue in unwrap_partitions (#4697) + * FIX-#4652: Support categorical data in `from_dataframe` (#4737) * Performance enhancements * PERF-#4182: Add cell-wise execution for binary ops, fix bin ops for empty dataframes (#4391) * PERF-#4288: Improve perf of `groupby.mean` for narrow data (#4591) diff --git a/modin/core/dataframe/pandas/exchange/dataframe_protocol/column.py b/modin/core/dataframe/pandas/exchange/dataframe_protocol/column.py index ba462d11815..473d0eb5b66 100644 --- a/modin/core/dataframe/pandas/exchange/dataframe_protocol/column.py +++ b/modin/core/dataframe/pandas/exchange/dataframe_protocol/column.py @@ -43,6 +43,13 @@ from .exception import NoValidityBuffer, NoOffsetsBuffer +_NO_VALIDITY_BUFFER = { + ColumnNullType.NON_NULLABLE: "This column is non-nullable so does not have a mask", + ColumnNullType.USE_NAN: "This column uses NaN as null so does not have a separate mask", + ColumnNullType.USE_SENTINEL: "This column uses a sentinel value so does not have a mask", +} + + @_inherit_docstrings(ProtocolColumn) class PandasProtocolColumn(ProtocolColumn): """ @@ -414,11 +421,9 @@ def _get_validity_buffer(self) -> Tuple[PandasProtocolBuffer, Any]: self._validity_buffer_cache = (buffer, dtype) return self._validity_buffer_cache - if null == ColumnNullType.NON_NULLABLE: - msg = "This column is non-nullable so does not have a mask" - elif null == ColumnNullType.USE_NAN: - msg = "This column uses NaN as null so does not have a separate mask" - else: + try: + msg = _NO_VALIDITY_BUFFER[null] + except KeyError: raise NotImplementedError("See self.describe_null") raise NoValidityBuffer(msg) diff --git a/modin/test/exchange/dataframe_protocol/pandas/test_protocol.py b/modin/test/exchange/dataframe_protocol/pandas/test_protocol.py index c5cdb9eb01e..a2d4379f761 100644 --- a/modin/test/exchange/dataframe_protocol/pandas/test_protocol.py +++ b/modin/test/exchange/dataframe_protocol/pandas/test_protocol.py @@ -19,8 +19,7 @@ from modin.test.test_utils import warns_that_defaulting_to_pandas -def test_simple_import(): - modin_df_producer = pd.DataFrame(test_data["int_data"]) +def eval_df_protocol(modin_df_producer): internal_modin_df_producer = modin_df_producer.__dataframe__() # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, this one raises a warning on `.from_dataframe` @@ -43,3 +42,13 @@ def test_simple_import(): df_equals(modin_df_producer, modin_df_consumer) df_equals(modin_df_producer, internal_modin_df_consumer) + + +def test_simple_import(): + modin_df = pd.DataFrame(test_data["int_data"]) + eval_df_protocol(modin_df) + + +def test_categorical_from_dataframe(): + modin_df = pd.DataFrame({"foo": pd.Series([0, 1, 2, 3], dtype="category")}) + eval_df_protocol(modin_df)