Skip to content

Commit

Permalink
FIX-#4652: Support categorical data in from_dataframe (#4737)
Browse files Browse the repository at this point in the history
Signed-off-by: Karthik Velayutham <vkarthik@ponder.io>
  • Loading branch information
Karthik Velayutham committed Aug 1, 2022
1 parent 8864bc1 commit 8521bbe
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 8 deletions.
3 changes: 2 additions & 1 deletion docs/release_notes/release_notes-0.16.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ Key Features and Updates
* FIX-#4358: MultiIndex `loc` shouldn't drop levels for full-key lookups (#4608)
* FIX-#4658: Expand exception handling for `read_*` functions from s3 storages (#4659)
* FIX-#4672: Fix incorrect warning when setting `frame.index` or `frame.columns` (#4721)
* FIX-#4686: Propagate metadata and drain call queue in unwrap_partitions (#4697)
* FIX-#4686: Propagate metadata and drain call queue in unwrap_partitions (#4697)
* FIX-#4652: Support categorical data in `from_dataframe` (#4737)
* Performance enhancements
* PERF-#4182: Add cell-wise execution for binary ops, fix bin ops for empty dataframes (#4391)
* PERF-#4288: Improve perf of `groupby.mean` for narrow data (#4591)
Expand Down
15 changes: 10 additions & 5 deletions modin/core/dataframe/pandas/exchange/dataframe_protocol/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@
from .exception import NoValidityBuffer, NoOffsetsBuffer


# Error messages for null representations that carry no separate validity
# mask, keyed by ``ColumnNullType``. ``_get_validity_buffer`` looks the
# column's null kind up here and raises ``NoValidityBuffer`` with the message;
# a kind missing from this table ends in ``NotImplementedError`` instead.
_NO_VALIDITY_BUFFER = {
ColumnNullType.NON_NULLABLE: "This column is non-nullable so does not have a mask",
ColumnNullType.USE_NAN: "This column uses NaN as null so does not have a separate mask",
ColumnNullType.USE_SENTINEL: "This column uses a sentinel value so does not have a mask",
}


@_inherit_docstrings(ProtocolColumn)
class PandasProtocolColumn(ProtocolColumn):
"""
Expand Down Expand Up @@ -414,11 +421,9 @@ def _get_validity_buffer(self) -> Tuple[PandasProtocolBuffer, Any]:
self._validity_buffer_cache = (buffer, dtype)
return self._validity_buffer_cache

if null == ColumnNullType.NON_NULLABLE:
msg = "This column is non-nullable so does not have a mask"
elif null == ColumnNullType.USE_NAN:
msg = "This column uses NaN as null so does not have a separate mask"
else:
try:
msg = _NO_VALIDITY_BUFFER[null]
except KeyError:
raise NotImplementedError("See self.describe_null")

raise NoValidityBuffer(msg)
Expand Down
13 changes: 11 additions & 2 deletions modin/test/exchange/dataframe_protocol/pandas/test_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@
from modin.test.test_utils import warns_that_defaulting_to_pandas


def test_simple_import():
modin_df_producer = pd.DataFrame(test_data["int_data"])
def eval_df_protocol(modin_df_producer):
internal_modin_df_producer = modin_df_producer.__dataframe__()
# Our configuration in pytest.ini requires that we explicitly catch all
# instances of defaulting to pandas, this one raises a warning on `.from_dataframe`
Expand All @@ -43,3 +42,13 @@ def test_simple_import():

df_equals(modin_df_producer, modin_df_consumer)
df_equals(modin_df_producer, internal_modin_df_consumer)


def test_simple_import():
    """Run the dataframe-protocol check on a frame built from ``test_data["int_data"]``."""
    eval_df_protocol(pd.DataFrame(test_data["int_data"]))


def test_categorical_from_dataframe():
    """Run the dataframe-protocol check on a frame with one categorical column."""
    categorical_values = pd.Series([0, 1, 2, 3], dtype="category")
    frame = pd.DataFrame({"foo": categorical_values})
    eval_df_protocol(frame)

0 comments on commit 8521bbe

Please sign in to comment.