Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX-#3981, FIX-#3801, FIX-#4149: Stop broadcasting scalars to set items. #4160

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
12 changes: 8 additions & 4 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import datetime
from pandas.core.indexes.api import ensure_index, Index, RangeIndex
from pandas.core.dtypes.common import is_numeric_dtype, is_list_like
from pandas._libs.lib import no_default
from typing import List, Hashable, Optional, Callable, Union, Dict

from modin.core.storage_formats.pandas.query_compiler import PandasQueryCompiler
Expand All @@ -31,7 +32,10 @@
find_common_type_cat as find_common_type,
)
from modin.core.dataframe.base.dataframe.dataframe import ModinDataframe
from modin.core.dataframe.base.dataframe.utils import Axis, JoinType
from modin.core.dataframe.base.dataframe.utils import (
Axis,
JoinType,
)
from modin.pandas.indexing import is_range_like
from modin.pandas.utils import is_full_grab_slice, check_both_not_none

Expand Down Expand Up @@ -1904,7 +1908,7 @@ def apply_select_indices(
new_index=None,
new_columns=None,
keep_remaining=False,
item_to_distribute=None,
item_to_distribute=no_default,
):
"""
Apply a function for a subset of the data.
Expand All @@ -1931,7 +1935,7 @@ def apply_select_indices(
advance, and if not provided it must be computed.
keep_remaining : boolean, default: False
Whether or not to drop the data that is not computed over.
item_to_distribute : (optional)
item_to_distribute : np.ndarray, Sentinel, or scalar, default: no_default
mvashishtha marked this conversation as resolved.
Show resolved Hide resolved
The item to split up so it can be applied over both axes.

Returns
Expand Down Expand Up @@ -1978,7 +1982,7 @@ def apply_select_indices(
# variables set.
assert row_labels is not None and col_labels is not None
assert keep_remaining
assert item_to_distribute is not None
assert item_to_distribute is not no_default
row_partitions_list = self._get_dict_of_block_index(0, row_labels).items()
col_partitions_list = self._get_dict_of_block_index(1, col_labels).items()
new_partitions = self._partition_mgr_cls.apply_func_to_indices_both_axis(
Expand Down
16 changes: 4 additions & 12 deletions modin/core/dataframe/pandas/partitioning/partition_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from functools import wraps
import numpy as np
import pandas
from pandas._libs.lib import no_default
import warnings

from modin.error_message import ErrorMessage
Expand Down Expand Up @@ -1114,7 +1115,7 @@ def apply_func_to_indices_both_axis(
func,
row_partitions_list,
col_partitions_list,
item_to_distribute=None,
item_to_distribute=no_default,
row_lengths=None,
col_widths=None,
):
Expand All @@ -1135,7 +1136,7 @@ def apply_func_to_indices_both_axis(
Iterable of tuples, containing 2 values:
1. Integer column partition index.
2. Internal column indexer of this partition.
item_to_distribute : item, default: None
item_to_distribute : np.ndarray, Sentinel, or scalar, default: no_default
mvashishtha marked this conversation as resolved.
Show resolved Hide resolved
The item to split up so it can be applied over both axes.
row_lengths : list of ints, optional
Lengths of partitions for every row. If not specified this information
Expand Down Expand Up @@ -1190,16 +1191,7 @@ def compute_part_size(indexer, remote_part, part_idx, axis):
col_internal_idx, remote_part, col_idx, axis=1
)

# We want to eventually make item_to_distribute an np.ndarray,
# but that doesn't work for setting a subset of a categorical
# column, as per https://github.com/modin-project/modin/issues/3736.
# In that case, `item` is not an ndarray but instead some
# categorical variable, which we we don't need to distribute
# at all. Note that np.ndarray is not hashable, so it can't
# be a categorical variable.
# TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
# this special case once the pandas bug is fixed.
if item_to_distribute is not None:
if item_to_distribute is not no_default:
if isinstance(item_to_distribute, np.ndarray):
item = item_to_distribute[
row_position_counter : row_position_counter + row_offset,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from modin.core.storage_formats.pandas.utils import compute_chunksize
from modin.error_message import ErrorMessage
import pandas
from pandas._libs.lib import no_default

import ray

Expand Down Expand Up @@ -550,7 +551,7 @@ def apply_func_to_indices_both_axis(
func,
row_partitions_list,
col_partitions_list,
item_to_distribute=None,
item_to_distribute=no_default,
row_lengths=None,
col_widths=None,
):
Expand All @@ -567,7 +568,7 @@ def apply_func_to_indices_both_axis(
List of row partitions.
col_partitions_list : list
List of column partitions.
item_to_distribute : item, optional
item_to_distribute : np.ndarray, Sentinel, or scalar, default: no_default
mvashishtha marked this conversation as resolved.
Show resolved Hide resolved
The item to split up so it can be applied over both axes.
row_lengths : list of ints, optional
Lengths of partitions for every row. If not specified this information
Expand Down
27 changes: 2 additions & 25 deletions modin/pandas/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,30 +385,7 @@ def __setitem__(self, row_lookup, col_lookup, item, axis=None):
else:
new_col_len = len(col_lookup)
to_shape = new_row_len, new_col_len
# If we are asssigning to a single categorical column, we won't be
# able to set the column to a 2-d array due to a bug in Pandas.
# Until that bug is fixed, don't convert `item` to a 2-d array
# in that case.
# TODO(https://github.com/pandas-dev/pandas/issues/44703): Delete
# this special case once the pandas bug is fixed.
assigning_to_single_category_column = new_col_len == 1 and (
# Case 1: df = pd.DataFrame({"status": ["a", "b", "c"]},
# dtype="category")
# Then type(df.dtypes) is pandas.core.series.Series
(
isinstance(self.df.dtypes, pandas.core.series.Series)
and self.df.dtypes[col_lookup][0].name == "category"
)
# Case 2: df = pd.Series( ["a", "b", "c"], dtype="category")
# Then df.dtypes == CategoricalDtype(categories=['a', 'b', 'c'],
# ordered=False)
# and type(df.dtypes) is pandas.core.dtypes.dtypes.CategoricalDtype.
# Case 3: df = pd.Series([0, 1, 2], index=['a', 'b', 'c'])
# Then df.dtypes == dtype('int64') and type(df.dtypes) is
# numpy.dtype
or (getattr(self.df.dtypes, "name", "") == "category")
)
if not assigning_to_single_category_column:
if not is_scalar(item):
item = self._broadcast_item(row_lookup, col_lookup, item, to_shape)
self._write_items(row_lookup, col_lookup, item)

Expand All @@ -422,7 +399,7 @@ def _broadcast_item(self, row_lookup, col_lookup, item, to_shape):
The global row index to locate inside of `item`.
col_lookup : slice or scalar
The global col index to locate inside of `item`.
item : DataFrame, Series, query_compiler or scalar
item : DataFrame, Series, or query_compiler
Value that should be broadcast to a new shape of `to_shape`.
to_shape : tuple of two int
Shape of dataset that `item` should be broadcasted to.
Expand Down
11 changes: 11 additions & 0 deletions modin/pandas/test/dataframe/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1748,6 +1748,17 @@ def test___setitem__assigning_single_categorical_sets_correct_dtypes():
df_equals(modin_df, pandas_df)


def test_iloc_assigning_scalar_none_to_string_frame():
# This test case comes from
# https://github.com/modin-project/modin/issues/3981
data = [["A"]]
modin_df = pd.DataFrame(data, dtype="string")
modin_df.iloc[0, 0] = None
pandas_df = pandas.DataFrame(data, dtype="string")
pandas_df.iloc[0, 0] = None
df_equals(modin_df, pandas_df)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___len__(data):
modin_df = pd.DataFrame(data)
Expand Down
9 changes: 9 additions & 0 deletions modin/pandas/test/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2122,6 +2122,15 @@ def test_loc_setting_categorical_series():
df_equals(modin_series, pandas_series)


# This tests the bug from https://github.com/modin-project/modin/issues/3736
def test_iloc_assigning_scalar_none_to_string_series():
data = ["A"]
modin_series, pandas_series = create_test_series(data, dtype="string")
modin_series.iloc[0] = None
pandas_series.iloc[0] = None
df_equals(modin_series, pandas_series)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_lt(data):
modin_series, pandas_series = create_test_series(data)
Expand Down
2 changes: 1 addition & 1 deletion requirements/env_omnisci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ dependencies:
- openpyxl
- xlrd
- sqlalchemy
- scipy
- scipy
mvashishtha marked this conversation as resolved.
Show resolved Hide resolved
7 changes: 6 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,12 @@
url="https://github.com/modin-project/modin",
long_description=long_description,
long_description_content_type="text/markdown",
install_requires=["pandas==1.4.0", "packaging", "numpy>=1.18.5", "fsspec"],
install_requires=[
"pandas==1.4.0",
"packaging",
"numpy>=1.18.5",
"fsspec",
],
extras_require={
# can be installed by pip install modin[dask]
"dask": dask_deps,
Expand Down