FIX-#3981, FIX-#3801, FIX-#4149: Stop broadcasting scalars to set ite…

…ms. (#4160) Signed-off-by: mvashishtha <mahesh@ponder.io>
modin-project · Mar 17, 2022 · 515d3e3 · 515d3e3
1 parent b59dc1d
commit 515d3e3
Show file tree

Hide file tree

Showing 5 changed files with 28 additions and 43 deletions.
diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -23,6 +23,7 @@
 import datetime
 from pandas.core.indexes.api import ensure_index, Index, RangeIndex
 from pandas.core.dtypes.common import is_numeric_dtype, is_list_like
+from pandas._libs.lib import no_default
 from typing import List, Hashable, Optional, Callable, Union, Dict
 
 from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler
@@ -31,7 +32,10 @@
     find_common_type_cat as find_common_type,
 )
 from modin.core.dataframe.base.dataframe.dataframe import ModinDataframe
-from modin.core.dataframe.base.dataframe.utils import Axis, JoinType
+from modin.core.dataframe.base.dataframe.utils import (
+    Axis,
+    JoinType,
+)
 from modin.pandas.indexing import is_range_like
 from modin.pandas.utils import is_full_grab_slice, check_both_not_none
 
@@ -1915,7 +1919,7 @@ def apply_select_indices(
         new_index=None,
         new_columns=None,
         keep_remaining=False,
-        item_to_distribute=None,
+        item_to_distribute=no_default,
     ):
         """
         Apply a function for a subset of the data.
@@ -1942,7 +1946,7 @@ def apply_select_indices(
             advance, and if not provided it must be computed.
         keep_remaining : boolean, default: False
             Whether or not to drop the data that is not computed over.
-        item_to_distribute : (optional)
+        item_to_distribute : np.ndarray or scalar, default: no_default
             The item to split up so it can be applied over both axes.
 
         Returns
@@ -1989,7 +1993,7 @@ def apply_select_indices(
             # variables set.
             assert row_labels is not None and col_labels is not None
             assert keep_remaining
-            assert item_to_distribute is not None
+            assert item_to_distribute is not no_default
             row_partitions_list = self._get_dict_of_block_index(0, row_labels).items()
             col_partitions_list = self._get_dict_of_block_index(1, col_labels).items()
             new_partitions = self._partition_mgr_cls.apply_func_to_indices_both_axis(

diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py
@@ -21,6 +21,7 @@
 from functools import wraps
 import numpy as np
 import pandas
+from pandas._libs.lib import no_default
 import warnings
 
 from modin.error_message import ErrorMessage
@@ -1114,7 +1115,7 @@ def apply_func_to_indices_both_axis(
         func,
         row_partitions_list,
         col_partitions_list,
-        item_to_distribute=None,
+        item_to_distribute=no_default,
         row_lengths=None,
         col_widths=None,
     ):
@@ -1135,7 +1136,7 @@ def apply_func_to_indices_both_axis(
             Iterable of tuples, containing 2 values:
                 1. Integer column partition index.
                 2. Internal column indexer of this partition.
-        item_to_distribute : item, default: None
+        item_to_distribute : np.ndarray or scalar, default: no_default
             The item to split up so it can be applied over both axes.
         row_lengths : list of ints, optional
             Lengths of partitions for every row. If not specified this information
@@ -1190,16 +1191,7 @@ def compute_part_size(indexer, remote_part, part_idx, axis):
                     col_internal_idx, remote_part, col_idx, axis=1
                 )
 
-                # We want to eventually make item_to_distribute an np.ndarray,
-                # but that doesn't work for setting a subset of a categorical
-                # column, as per https://github.com/modin-project/modin/issues/3736.
-                # In that case, `item` is not an ndarray but instead some
-                # categorical variable, which we we don't need to distribute
-                # at all. Note that np.ndarray is not hashable, so it can't
-                # be a categorical variable.
-                # TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
-                # this special case once the pandas bug is fixed.
-                if item_to_distribute is not None:
+                if item_to_distribute is not no_default:
                     if isinstance(item_to_distribute, np.ndarray):
                         item = item_to_distribute[
                             row_position_counter : row_position_counter + row_offset,

diff --git a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py
@@ -32,6 +32,7 @@
 import pandas
 
 import ray
+from pandas._libs.lib import no_default
 
 
 def progress_bar_wrapper(f):
@@ -550,7 +551,7 @@ def apply_func_to_indices_both_axis(
         func,
         row_partitions_list,
         col_partitions_list,
-        item_to_distribute=None,
+        item_to_distribute=no_default,
         row_lengths=None,
         col_widths=None,
     ):
@@ -567,7 +568,7 @@ def apply_func_to_indices_both_axis(
             List of row partitions.
         col_partitions_list : list
             List of column partitions.
-        item_to_distribute : item, optional
+        item_to_distribute : np.ndarray or scalar, default: no_default
             The item to split up so it can be applied over both axes.
         row_lengths : list of ints, optional
             Lengths of partitions for every row. If not specified this information

diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py
@@ -385,30 +385,7 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None):
             else:
                 new_col_len = len(col_lookup)
             to_shape = new_row_len, new_col_len
-            # If we are asssigning to a single categorical column, we won't be
-            # able to set the column to a 2-d array due to a bug in Pandas.
-            # Until that bug is fixed, don't convert `item` to a 2-d array
-            # in that case.
-            # TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
-            # this special case once the pandas bug is fixed.
-            assigning_to_single_category_column = new_col_len == 1 and (
-                # Case 1: df = pd.DataFrame({"status": ["a", "b", "c"]},
-                #                           dtype="category")
-                # Then type(df.dtypes) is pandas.core.series.Series
-                (
-                    isinstance(self.df.dtypes, pandas.core.series.Series)
-                    and self.df.dtypes[col_lookup][0].name == "category"
-                )
-                # Case 2: df = pd.Series( ["a", "b", "c"],  dtype="category")
-                # Then df.dtypes ==  CategoricalDtype(categories=['a', 'b', 'c'],
-                #                                     ordered=False)
-                # and type(df.dtypes) is pandas.core.dtypes.dtypes.CategoricalDtype.
-                # Case 3:  df = pd.Series([0, 1, 2], index=['a', 'b', 'c'])
-                # Then df.dtypes == dtype('int64') and type(df.dtypes) is
-                # numpy.dtype
-                or (getattr(self.df.dtypes, "name", "") == "category")
-            )
-            if not assigning_to_single_category_column:
+            if not is_scalar(item):
                 item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)
             self._write_items(row_lookup, col_lookup, item)
 
@@ -422,7 +399,7 @@ def _broadcast_item(self, row_lookup, col_lookup, item, to_shape):
             The global row index to locate inside of `item`.
         col_lookup : slice or scalar
             The global col index to locate inside of `item`.
-        item : DataFrame, Series, query_compiler or scalar
+        item : DataFrame, Series, or query_compiler
             Value that should be broadcast to a new shape of `to_shape`.
         to_shape : tuple of two int
             Shape of dataset that `item` should be broadcasted to.

diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py
@@ -1765,6 +1765,17 @@ def test___setitem__assigning_single_categorical_sets_correct_dtypes():
     df_equals(modin_df, pandas_df)
 
 
+def test_iloc_assigning_scalar_none_to_string_frame():
+    # This test case comes from
+    # https://github.com/modin-project/modin/issues/3981
+    data = [["A"]]
+    modin_df = pd.DataFrame(data, dtype="string")
+    modin_df.iloc[0, 0] = None
+    pandas_df = pandas.DataFrame(data, dtype="string")
+    pandas_df.iloc[0, 0] = None
+    df_equals(modin_df, pandas_df)
+
+
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test___len__(data):
     modin_df = pd.DataFrame(data)