Skip to content

Commit

Permalink
FIX-#3736: Don't use a 2-d array to set a categorical column. (#3777)
Browse files Browse the repository at this point in the history
Co-authored-by: Vasily Litvinov <vasilij.n.litvinov@intel.com>
Signed-off-by: mvashishtha <mahesh@ponder.io>
  • Loading branch information
mvashishtha and vnlitvinov committed Dec 7, 2021
1 parent 6d51921 commit 0e36e22
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 5 deletions.
20 changes: 16 additions & 4 deletions modin/core/dataframe/pandas/partitioning/partition_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1201,11 +1201,23 @@ def compute_part_size(indexer, remote_part, part_idx, axis):
col_internal_idx, remote_part, col_idx, axis=1
)

# We want to eventually make item_to_distribute an np.ndarray,
# but that doesn't work for setting a subset of a categorical
# column, as per https://github.com/modin-project/modin/issues/3736.
# In that case, `item` is not an ndarray but instead some
# categorical variable, which we we don't need to distribute
# at all. Note that np.ndarray is not hashable, so it can't
# be a categorical variable.
# TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
# this special case once the pandas bug is fixed.
if item_to_distribute is not None:
item = item_to_distribute[
row_position_counter : row_position_counter + row_offset,
col_position_counter : col_position_counter + col_offset,
]
if isinstance(item_to_distribute, np.ndarray):
item = item_to_distribute[
row_position_counter : row_position_counter + row_offset,
col_position_counter : col_position_counter + col_offset,
]
else:
item = item_to_distribute
item = {"item": item}
else:
item = {}
Expand Down
26 changes: 25 additions & 1 deletion modin/pandas/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,31 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None):
else:
new_col_len = len(col_lookup)
to_shape = new_row_len, new_col_len
item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)
# If we are asssigning to a single categorical column, we won't be
# able to set the column to a 2-d array due to a bug in Pandas.
# Until that bug is fixed, don't convert `item` to a 2-d array
# in that case.
# TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
# this special case once the pandas bug is fixed.
assigning_to_single_category_column = new_col_len == 1 and (
# Case 1: df = pd.DataFrame({"status": ["a", "b", "c"]},
# dtype="category")
# Then type(df.dtypes) is pandas.core.series.Series
(
isinstance(self.df.dtypes, pandas.core.series.Series)
and self.df.dtypes[col_lookup][0].name == "category"
)
# Case 2: df = pd.Series( ["a", "b", "c"], dtype="category")
# Then df.dtypes == CategoricalDtype(categories=['a', 'b', 'c'],
# ordered=False)
# and type(df.dtypes) is pandas.core.dtypes.dtypes.CategoricalDtype.
# Case 3: df = pd.Series([0, 1, 2], index=['a', 'b', 'c'])
# Then df.dtypes == dtype('int64') and type(df.dtypes) is
# numpy.dtype
or (getattr(self.df.dtypes, "name", "") == "category")
)
if not assigning_to_single_category_column:
item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)
self._write_items(row_lookup, col_lookup, item)

def _broadcast_item(self, row_lookup, col_lookup, item, to_shape):
Expand Down
9 changes: 9 additions & 0 deletions modin/pandas/test/dataframe/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,15 @@ def test_loc(data):
modin_df.loc["NO_EXIST"]


# This tests the bug from https://github.com/modin-project/modin/issues/3736
def test_loc_setting_single_categorical_column():
modin_df = pd.DataFrame({"status": ["a", "b", "c"]}, dtype="category")
pandas_df = pandas.DataFrame({"status": ["a", "b", "c"]}, dtype="category")
modin_df.loc[1:3, "status"] = "a"
pandas_df.loc[1:3, "status"] = "a"
df_equals(modin_df, pandas_df)


def test_loc_multi_index():
modin_df = pd.read_csv(
"modin/pandas/test/data/blah.csv", header=[0, 1, 2, 3], index_col=0
Expand Down
9 changes: 9 additions & 0 deletions modin/pandas/test/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2105,6 +2105,15 @@ def test_loc(data):
df_equals(modin_result, pandas_result)


# This tests the bug from https://github.com/modin-project/modin/issues/3736
def test_loc_setting_categorical_series():
modin_series = pd.Series(["a", "b", "c"], dtype="category")
pandas_series = pandas.Series(["a", "b", "c"], dtype="category")
modin_series.loc[1:3] = "a"
pandas_series.loc[1:3] = "a"
df_equals(modin_series, pandas_series)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_lt(data):
modin_series, pandas_series = create_test_series(data)
Expand Down

0 comments on commit 0e36e22

Please sign in to comment.