Skip to content

Commit

Permalink
FIX-#3981, FIX-#3801, FIX-#4149: Stop broadcasting scalars to set ite…
Browse files Browse the repository at this point in the history
…ms. (#4160)

Signed-off-by: mvashishtha <mahesh@ponder.io>
  • Loading branch information
mvashishtha authored and vnlitvinov committed Mar 17, 2022
1 parent b59dc1d commit 515d3e3
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 43 deletions.
12 changes: 8 additions & 4 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import datetime
from pandas.core.indexes.api import ensure_index, Index, RangeIndex
from pandas.core.dtypes.common import is_numeric_dtype, is_list_like
from pandas._libs.lib import no_default
from typing import List, Hashable, Optional, Callable, Union, Dict

from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler
Expand All @@ -31,7 +32,10 @@
find_common_type_cat as find_common_type,
)
from modin.core.dataframe.base.dataframe.dataframe import ModinDataframe
from modin.core.dataframe.base.dataframe.utils import Axis, JoinType
from modin.core.dataframe.base.dataframe.utils import (
Axis,
JoinType,
)
from modin.pandas.indexing import is_range_like
from modin.pandas.utils import is_full_grab_slice, check_both_not_none

Expand Down Expand Up @@ -1915,7 +1919,7 @@ def apply_select_indices(
new_index=None,
new_columns=None,
keep_remaining=False,
item_to_distribute=None,
item_to_distribute=no_default,
):
"""
Apply a function for a subset of the data.
Expand All @@ -1942,7 +1946,7 @@ def apply_select_indices(
advance, and if not provided it must be computed.
keep_remaining : boolean, default: False
Whether or not to drop the data that is not computed over.
item_to_distribute : (optional)
item_to_distribute : np.ndarray or scalar, default: no_default
The item to split up so it can be applied over both axes.
Returns
Expand Down Expand Up @@ -1989,7 +1993,7 @@ def apply_select_indices(
# variables set.
assert row_labels is not None and col_labels is not None
assert keep_remaining
assert item_to_distribute is not None
assert item_to_distribute is not no_default
row_partitions_list = self._get_dict_of_block_index(0, row_labels).items()
col_partitions_list = self._get_dict_of_block_index(1, col_labels).items()
new_partitions = self._partition_mgr_cls.apply_func_to_indices_both_axis(
Expand Down
16 changes: 4 additions & 12 deletions modin/core/dataframe/pandas/partitioning/partition_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from functools import wraps
import numpy as np
import pandas
from pandas._libs.lib import no_default
import warnings

from modin.error_message import ErrorMessage
Expand Down Expand Up @@ -1114,7 +1115,7 @@ def apply_func_to_indices_both_axis(
func,
row_partitions_list,
col_partitions_list,
item_to_distribute=None,
item_to_distribute=no_default,
row_lengths=None,
col_widths=None,
):
Expand All @@ -1135,7 +1136,7 @@ def apply_func_to_indices_both_axis(
Iterable of tuples, containing 2 values:
1. Integer column partition index.
2. Internal column indexer of this partition.
item_to_distribute : item, default: None
item_to_distribute : np.ndarray or scalar, default: no_default
The item to split up so it can be applied over both axes.
row_lengths : list of ints, optional
Lengths of partitions for every row. If not specified this information
Expand Down Expand Up @@ -1190,16 +1191,7 @@ def compute_part_size(indexer, remote_part, part_idx, axis):
col_internal_idx, remote_part, col_idx, axis=1
)

# We want to eventually make item_to_distribute an np.ndarray,
# but that doesn't work for setting a subset of a categorical
# column, as per https://github.com/modin-project/modin/issues/3736.
# In that case, `item` is not an ndarray but instead some
# categorical variable, which we we don't need to distribute
# at all. Note that np.ndarray is not hashable, so it can't
# be a categorical variable.
# TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
# this special case once the pandas bug is fixed.
if item_to_distribute is not None:
if item_to_distribute is not no_default:
if isinstance(item_to_distribute, np.ndarray):
item = item_to_distribute[
row_position_counter : row_position_counter + row_offset,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import pandas

import ray
from pandas._libs.lib import no_default


def progress_bar_wrapper(f):
Expand Down Expand Up @@ -550,7 +551,7 @@ def apply_func_to_indices_both_axis(
func,
row_partitions_list,
col_partitions_list,
item_to_distribute=None,
item_to_distribute=no_default,
row_lengths=None,
col_widths=None,
):
Expand All @@ -567,7 +568,7 @@ def apply_func_to_indices_both_axis(
List of row partitions.
col_partitions_list : list
List of column partitions.
item_to_distribute : item, optional
item_to_distribute : np.ndarray or scalar, default: no_default
The item to split up so it can be applied over both axes.
row_lengths : list of ints, optional
Lengths of partitions for every row. If not specified this information
Expand Down
27 changes: 2 additions & 25 deletions modin/pandas/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,30 +385,7 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None):
else:
new_col_len = len(col_lookup)
to_shape = new_row_len, new_col_len
# If we are asssigning to a single categorical column, we won't be
# able to set the column to a 2-d array due to a bug in Pandas.
# Until that bug is fixed, don't convert `item` to a 2-d array
# in that case.
# TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
# this special case once the pandas bug is fixed.
assigning_to_single_category_column = new_col_len == 1 and (
# Case 1: df = pd.DataFrame({"status": ["a", "b", "c"]},
# dtype="category")
# Then type(df.dtypes) is pandas.core.series.Series
(
isinstance(self.df.dtypes, pandas.core.series.Series)
and self.df.dtypes[col_lookup][0].name == "category"
)
# Case 2: df = pd.Series( ["a", "b", "c"], dtype="category")
# Then df.dtypes == CategoricalDtype(categories=['a', 'b', 'c'],
# ordered=False)
# and type(df.dtypes) is pandas.core.dtypes.dtypes.CategoricalDtype.
# Case 3: df = pd.Series([0, 1, 2], index=['a', 'b', 'c'])
# Then df.dtypes == dtype('int64') and type(df.dtypes) is
# numpy.dtype
or (getattr(self.df.dtypes, "name", "") == "category")
)
if not assigning_to_single_category_column:
if not is_scalar(item):
item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)
self._write_items(row_lookup, col_lookup, item)

Expand All @@ -422,7 +399,7 @@ def _broadcast_item(self, row_lookup, col_lookup, item, to_shape):
The global row index to locate inside of `item`.
col_lookup : slice or scalar
The global col index to locate inside of `item`.
item : DataFrame, Series, query_compiler or scalar
item : DataFrame, Series, or query_compiler
Value that should be broadcast to a new shape of `to_shape`.
to_shape : tuple of two int
Shape of dataset that `item` should be broadcasted to.
Expand Down
11 changes: 11 additions & 0 deletions modin/pandas/test/dataframe/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1765,6 +1765,17 @@ def test___setitem__assigning_single_categorical_sets_correct_dtypes():
df_equals(modin_df, pandas_df)


def test_iloc_assigning_scalar_none_to_string_frame():
# This test case comes from
# https://github.com/modin-project/modin/issues/3981
data = [["A"]]
modin_df = pd.DataFrame(data, dtype="string")
modin_df.iloc[0, 0] = None
pandas_df = pandas.DataFrame(data, dtype="string")
pandas_df.iloc[0, 0] = None
df_equals(modin_df, pandas_df)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___len__(data):
modin_df = pd.DataFrame(data)
Expand Down

0 comments on commit 515d3e3

Please sign in to comment.