diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index fd7d88bd52383..0c2a176869829 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -853,6 +853,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
 - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
 - :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`).
 - Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`)
+- :meth:`DataFrame.stack` no longer converts to object dtype for DataFrames where each column has the same extension dtype. The output Series will have the same dtype as the columns (:issue:`23077`).
 - :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`).
 - Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`).
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 7a55b652054ed..1f2a1ee52159e 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -35,9 +35,9 @@
     is_numeric_v_string_like, is_extension_type,
     is_extension_array_dtype,
     is_list_like,
-    is_sparse,
     is_re,
     is_re_compilable,
+    is_sparse,
     pandas_dtype)
 from pandas.core.dtypes.cast import (
     maybe_downcast_to_dtype,
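A rough illustration of the user-visible change described in the whatsnew entries above (this snippet is not part of the diff; the printed dtype is what the new behaviour is expected to produce):

    import pandas as pd

    # Every column shares one extension dtype (categorical here), so the
    # stacked Series keeps that dtype instead of being cast to object.
    cat = pd.Categorical(['a', 'b', 'c'])
    df = pd.DataFrame({"A": cat, "B": cat})
    result = df.stack()
    print(result.dtype)  # expected: category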
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 2dca7cf0e6aa3..065728fb239ae 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -494,8 +494,9 @@ def factorize(index):
         if is_extension_array_dtype(dtype):
             arr = dtype.construct_array_type()
             new_values = arr._concat_same_type([
-                col for _, col in frame.iteritems()
+                col._values for _, col in frame.iteritems()
             ])
+            new_values = _reorder_for_extension_array_stack(new_values, N, K)
         else:
             # homogeneous, non-EA
             new_values = frame.values.ravel()
@@ -624,16 +625,32 @@ def _convert_level_number(level_num, columns):
             slice_len = loc.stop - loc.start

         if slice_len != levsize:
-            chunk = this.loc[:, this.columns[loc]]
+            chunk = this[this.columns[loc]]
             chunk.columns = level_vals.take(chunk.columns.labels[-1])
             value_slice = chunk.reindex(columns=level_vals_used).values
         else:
-            if frame._is_mixed_type:
-                value_slice = this.loc[:, this.columns[loc]].values
+            if (frame._is_homogeneous_type and
+                    is_extension_array_dtype(frame.dtypes.iloc[0])):
+                dtype = this[this.columns[loc]].dtypes.iloc[0]
+                subset = this[this.columns[loc]]
+
+                value_slice = dtype.construct_array_type()._concat_same_type(
+                    [x._values for _, x in subset.iteritems()]
+                )
+                N, K = this.shape
+                idx = np.arange(N * K).reshape(K, N).T.ravel()
+                value_slice = value_slice.take(idx)
+
+            elif frame._is_mixed_type:
+                value_slice = this[this.columns[loc]].values
             else:
                 value_slice = this.values[:, loc]

-        new_data[key] = value_slice.ravel()
+        if value_slice.ndim > 1:
+            # i.e. not extension
+            value_slice = value_slice.ravel()
+
+        new_data[key] = value_slice

     if len(drop_cols) > 0:
         new_columns = new_columns.difference(drop_cols)
@@ -971,3 +988,38 @@ def make_axis_dummies(frame, axis='minor', transform=None):
         values = values.take(labels, axis=0)

     return DataFrame(values, columns=items, index=frame.index)
+
+
+def _reorder_for_extension_array_stack(arr, n_rows, n_columns):
+    """
+    Re-orders the values when stacking multiple extension-arrays.
+
+    The indirect stacking method used for EAs requires a followup
+    take to get the order correct.
+
+    Parameters
+    ----------
+    arr : ExtensionArray
+    n_rows, n_columns : int
+        The number of rows and columns in the original DataFrame.
+
+    Returns
+    -------
+    taken : ExtensionArray
+        The original `arr` with elements re-ordered appropriately
+
+    Examples
+    --------
+    >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
+    >>> _reorder_for_extension_array_stack(arr, 2, 3)
+    array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')
+
+    >>> _reorder_for_extension_array_stack(arr, 3, 2)
+    array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
+    """
+    # final take to get the order correct.
+    # idx is an indexer like
+    # [c0r0, c1r0, c2r0, ...,
+    #  c0r1, c1r1, c2r1, ...]
+    idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
+    return arr.take(idx)
diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
--- a/pandas/tests/extension/base/reshaping.py
+++ b/pandas/tests/extension/base/reshaping.py
+    @pytest.mark.parametrize("columns", [
+        ["A", "B"],
+        pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b')],
+                                  names=['outer', 'inner']),
+    ])
+    def test_stack(self, data, columns):
+        df = pd.DataFrame({"A": data[:5], "B": data[:5]})
+        df.columns = columns
+        result = df.stack()
+        expected = df.astype(object).stack()
+        # we need a second astype(object), in case the constructor inferred
+        # object -> specialized, as is done for period.
+        expected = expected.astype(object)
+
+        if isinstance(expected, pd.Series):
+            assert result.dtype == df.iloc[:, 0].dtype
+        else:
+            assert all(result.dtypes == df.iloc[:, 0].dtype)
+
+        result = result.astype(object)
+        self.assert_equal(result, expected)
+
     @pytest.mark.parametrize("index", [
         # Two levels, uniform.
         pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']]),
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index 2b1bfecdf8f28..b7c61496f0bf0 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -139,6 +139,15 @@ def test_from_dtype(self, data):


 class TestReshaping(BaseJSON, base.BaseReshapingTests):
+
+    @pytest.mark.skip(reason="Different definitions of NA")
+    def test_stack(self):
+        """
+        The test does .astype(object).stack(). If we happen to have
+        any missing values in `data`, then we'll end up with different
+        rows since we consider `{}` NA, but `.astype(object)` doesn't.
+        """
+
     @pytest.mark.xfail(reason="dict for NA", strict=True)
     def test_unstack(self, data, index):
         # The base test has NaN for the expected NA value.
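The `_reorder_for_extension_array_stack` helper added in reshape.py restores row-major order after the column-by-column concatenation with a single take; a minimal standalone sketch of that reindexing trick (using a plain numpy array in place of an extension array, mirroring the helper's doctest):

    import numpy as np

    # After concatenating column-by-column the values are laid out as
    # column 0 (['a', 'b']), column 1 (['c', 'd']), column 2 (['e', 'f']).
    arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
    n_rows, n_columns = 2, 3

    # Walk that column-major layout row-first to recover the stacked order.
    idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
    print(arr.take(idx))  # ['a' 'c' 'e' 'b' 'd' 'f']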
diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py
index 54511df4effad..ab3d6ca3b19f7 100644
--- a/pandas/tests/frame/test_reshape.py
+++ b/pandas/tests/frame/test_reshape.py
@@ -874,6 +874,17 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels):

         tm.assert_series_equal(result, expected)

+    def test_stack_preserve_categorical_dtype_values(self):
+        # GH-23077
+        cat = pd.Categorical(['a', 'a', 'b', 'c'])
+        df = pd.DataFrame({"A": cat, "B": cat})
+        result = df.stack()
+        index = pd.MultiIndex.from_product([[0, 1, 2, 3], ['A', 'B']])
+        expected = pd.Series(pd.Categorical(['a', 'a', 'a', 'a',
+                                             'b', 'b', 'c', 'c']),
+                             index=index)
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize('level', [0, 1])
     def test_unstack_mixed_extension_types(self, level):
         index = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 1)],
diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py
index 03143488c3874..10074a2e5ad99 100644
--- a/pandas/tests/sparse/frame/test_frame.py
+++ b/pandas/tests/sparse/frame/test_frame.py
@@ -736,6 +736,16 @@ def test_astype_bool(self):
         assert res['A'].dtype == SparseDtype(np.bool)
         assert res['B'].dtype == SparseDtype(np.bool)

+    def test_astype_object(self):
+        # This may change in GH-23125
+        df = pd.DataFrame({"A": SparseArray([0, 1]),
+                           "B": SparseArray([0, 1])})
+        result = df.astype(object)
+        dtype = SparseDtype(object, 0)
+        expected = pd.DataFrame({"A": SparseArray([0, 1], dtype=dtype),
+                                 "B": SparseArray([0, 1], dtype=dtype)})
+        tm.assert_frame_equal(result, expected)
+
     def test_fillna(self, float_frame_fill0, float_frame_fill0_dense):
         df = float_frame_fill0.reindex(lrange(5))
         dense = float_frame_fill0_dense.reindex(lrange(5))
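A hedged sketch of the behaviour the new sparse test exercises (the dtype repr shown is indicative only, and the test itself notes this behaviour may change under GH-23125): astype(object) on a frame of SparseArray columns keeps the columns sparse with an object subtype rather than densifying them.

    import pandas as pd

    # astype(object) preserves sparseness; only the subtype becomes object.
    df = pd.DataFrame({"A": pd.SparseArray([0, 1]),
                       "B": pd.SparseArray([0, 1])})
    result = df.astype(object)
    print(result.dtypes)  # expected: Sparse[object, 0] for both columns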