FIX-#3736: Don't use a 2-d array to set a categorical column. (#3777)

Co-authored-by: Vasily Litvinov <vasilij.n.litvinov@intel.com> Signed-off-by: mvashishtha <mahesh@ponder.io>
modin-project · Dec 7, 2021 · 0e36e22 · 0e36e22
1 parent 6d51921
commit 0e36e22
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 5 deletions.
diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py
@@ -1201,11 +1201,23 @@ def compute_part_size(indexer, remote_part, part_idx, axis):
                     col_internal_idx, remote_part, col_idx, axis=1
                 )
 
+                # We want to eventually make item_to_distribute an np.ndarray,
+                # but that doesn't work for setting a subset of a categorical
+                # column, as per https://github.com/modin-project/modin/issues/3736.
+                # In that case, `item` is not an ndarray but instead some
+                # categorical variable, which we we don't need to distribute
+                # at all. Note that np.ndarray is not hashable, so it can't
+                # be a categorical variable.
+                # TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
+                # this special case once the pandas bug is fixed.
                 if item_to_distribute is not None:
-                    item = item_to_distribute[
-                        row_position_counter : row_position_counter + row_offset,
-                        col_position_counter : col_position_counter + col_offset,
-                    ]
+                    if isinstance(item_to_distribute, np.ndarray):
+                        item = item_to_distribute[
+                            row_position_counter : row_position_counter + row_offset,
+                            col_position_counter : col_position_counter + col_offset,
+                        ]
+                    else:
+                        item = item_to_distribute
                     item = {"item": item}
                 else:
                     item = {}

diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py
@@ -385,7 +385,31 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None):
             else:
                 new_col_len = len(col_lookup)
             to_shape = new_row_len, new_col_len
-            item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)
+            # If we are asssigning to a single categorical column, we won't be
+            # able to set the column to a 2-d array due to a bug in Pandas.
+            # Until that bug is fixed, don't convert `item` to a 2-d array
+            # in that case.
+            # TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
+            # this special case once the pandas bug is fixed.
+            assigning_to_single_category_column = new_col_len == 1 and (
+                # Case 1: df = pd.DataFrame({"status": ["a", "b", "c"]},
+                #                           dtype="category")
+                # Then type(df.dtypes) is pandas.core.series.Series
+                (
+                    isinstance(self.df.dtypes, pandas.core.series.Series)
+                    and self.df.dtypes[col_lookup][0].name == "category"
+                )
+                # Case 2: df = pd.Series( ["a", "b", "c"],  dtype="category")
+                # Then df.dtypes ==  CategoricalDtype(categories=['a', 'b', 'c'],
+                #                                     ordered=False)
+                # and type(df.dtypes) is pandas.core.dtypes.dtypes.CategoricalDtype.
+                # Case 3:  df = pd.Series([0, 1, 2], index=['a', 'b', 'c'])
+                # Then df.dtypes == dtype('int64') and type(df.dtypes) is
+                # numpy.dtype
+                or (getattr(self.df.dtypes, "name", "") == "category")
+            )
+            if not assigning_to_single_category_column:
+                item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)
             self._write_items(row_lookup, col_lookup, item)
 
     def _broadcast_item(self, row_lookup, col_lookup, item, to_shape):

diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py
@@ -372,6 +372,15 @@ def test_loc(data):
         modin_df.loc["NO_EXIST"]
 
 
+# This tests the bug from https://github.com/modin-project/modin/issues/3736
+def test_loc_setting_single_categorical_column():
+    modin_df = pd.DataFrame({"status": ["a", "b", "c"]}, dtype="category")
+    pandas_df = pandas.DataFrame({"status": ["a", "b", "c"]}, dtype="category")
+    modin_df.loc[1:3, "status"] = "a"
+    pandas_df.loc[1:3, "status"] = "a"
+    df_equals(modin_df, pandas_df)
+
+
 def test_loc_multi_index():
     modin_df = pd.read_csv(
         "modin/pandas/test/data/blah.csv", header=[0, 1, 2, 3], index_col=0

diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
@@ -2105,6 +2105,15 @@ def test_loc(data):
     df_equals(modin_result, pandas_result)
 
 
+# This tests the bug from https://github.com/modin-project/modin/issues/3736
+def test_loc_setting_categorical_series():
+    modin_series = pd.Series(["a", "b", "c"], dtype="category")
+    pandas_series = pandas.Series(["a", "b", "c"], dtype="category")
+    modin_series.loc[1:3] = "a"
+    pandas_series.loc[1:3] = "a"
+    df_equals(modin_series, pandas_series)
+
+
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_lt(data):
     modin_series, pandas_series = create_test_series(data)