Skip to content

Commit

Permalink
FIX-#3895: Fix assigning a Categorical to a column. (#3919)
Browse files Browse the repository at this point in the history
Co-authored-by: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com>
Co-authored-by: Devin Petersohn <devin-petersohn@users.noreply.github.com>
Signed-off-by: mvashishtha <mahesh@ponder.io>
  • Loading branch information
3 people committed Jan 24, 2022
1 parent 67228ef commit cb4e727
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -253,4 +253,23 @@ def run_modin(
constructor_kwargs=constructor_kwargs,
**kwargs,
)

# Currently, strings are converted to categories when exported from OmniSci,
# which makes the equality comparison fail. Convert string columns back to
# their original dtypes until the issue is resolved:
# https://github.com/modin-project/modin/issues/2747
if isinstance(exp_res, pd.DataFrame):
external_dtypes = exp_res.dtypes
exp_res = try_cast_to_pandas(exp_res)
internal_dtypes = exp_res.dtypes

new_schema = {}
for col in exp_res.columns:
if (
internal_dtypes[col] == "category"
and external_dtypes[col] != "category"
):
new_schema[col] = external_dtypes[col]
exp_res = exp_res.astype(new_schema)

comparator(ref_res, exp_res)
3 changes: 2 additions & 1 deletion modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from typing import IO, Optional, Union, Mapping, Iterator
import warnings

from modin.pandas import Categorical
from modin.error_message import ErrorMessage
from modin.utils import _inherit_docstrings, to_pandas, hashable
from modin.config import Engine, IsExperimental, PersistentPickle
Expand Down Expand Up @@ -2533,7 +2534,7 @@ def setitem_unhashable_key(df, value):
value = value.T.reshape(-1)
if len(self) > 0:
value = value[: len(self)]
if not isinstance(value, Series):
if not isinstance(value, (Series, Categorical)):
value = list(value)

if not self._query_compiler.lazy_execution and len(self.index) == 0:
Expand Down
10 changes: 10 additions & 0 deletions modin/pandas/test/dataframe/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1738,6 +1738,16 @@ def test___setitem__single_item_in_series():
df_equals(modin_series, pandas_series)


def test___setitem__assigning_single_categorical_sets_correct_dtypes():
    """Check that overwriting a column with a one-element Categorical matches pandas.

    Regression test for https://github.com/modin-project/modin/issues/3895
    """
    column = "categories"

    # Build identical single-row frames in both libraries.
    modin_df = pd.DataFrame({column: ["A"]})
    pandas_df = pandas.DataFrame({column: ["A"]})

    # Replace the string column with a Categorical in each frame.
    modin_df[column] = pd.Categorical(["A"])
    pandas_df[column] = pandas.Categorical(["A"])

    df_equals(modin_df, pandas_df)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___len__(data):
modin_df = pd.DataFrame(data)
Expand Down
6 changes: 4 additions & 2 deletions modin/pandas/test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,8 +480,10 @@ def df_categories_equals(df1, df2):
else:
return True

categories_columns = df1.select_dtypes(include="category").columns
for column in categories_columns:
df1_categorical_columns = df1.select_dtypes(include="category").columns
df2_categorical_columns = df2.select_dtypes(include="category").columns
assert df1_categorical_columns.equals(df2_categorical_columns)
for column in df1_categorical_columns:
assert_extension_array_equal(
df1[column].values,
df2[column].values,
Expand Down

0 comments on commit cb4e727

Please sign in to comment.