From 28a42da41ca8e13efaa2ceb3939e576d08c232c8 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Wed, 7 Nov 2018 09:34:57 -0600
Subject: [PATCH] ENH: Support EAs in Series.unstack (#23284)

---
 asv_bench/benchmarks/reshape.py              | 20 ++++-
 doc/source/whatsnew/v0.24.0.txt              |  3 +-
 pandas/core/internals/blocks.py              | 83 +++++++++++++++++--
 pandas/core/internals/managers.py            | 10 ++-
 pandas/core/reshape/reshape.py               | 81 ++++++++++++------
 pandas/tests/extension/base/reshaping.py     | 45 ++++++++++
 pandas/tests/extension/decimal/array.py      |  2 +-
 .../tests/extension/decimal/test_decimal.py  | 21 ++++-
 pandas/tests/extension/json/test_json.py     |  6 +-
 pandas/tests/frame/test_reshape.py           | 27 ++++--
 pandas/tests/sparse/test_pivot.py            |  1 +
 11 files changed, 248 insertions(+), 51 deletions(-)

diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
index bda486dba3b0f..67fdfb82e72c0 100644
--- a/asv_bench/benchmarks/reshape.py
+++ b/asv_bench/benchmarks/reshape.py
@@ -49,21 +49,33 @@ def time_unstack(self):


 class Unstack(object):

-    def setup(self):
+    params = ['int', 'category']
+
+    def setup(self, dtype):
         m = 100
         n = 1000

         levels = np.arange(m)
         index = MultiIndex.from_product([levels] * 2)
         columns = np.arange(n)
-        values = np.arange(m * m * n).reshape(m * m, n)
+        if dtype == 'int':
+            values = np.arange(m * m * n).reshape(m * m, n)
+        else:
+            # the category branch is ~20x slower than int. So we
+            # cut down the size a bit. Now it's only ~3x slower.
+            n = 50
+            columns = columns[:n]
+            indices = np.random.randint(0, 52, size=(m * m, n))
+            values = np.take(list(string.ascii_letters), indices)
+            values = [pd.Categorical(v) for v in values.T]
+
         self.df = DataFrame(values, index, columns)
         self.df2 = self.df.iloc[:-1]

-    def time_full_product(self):
+    def time_full_product(self, dtype):
         self.df.unstack()

-    def time_without_last_row(self):
+    def time_without_last_row(self, dtype):
         self.df2.unstack()

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 0c1477007dc7d..eb7a11e4ba17e 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -853,7 +853,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
 - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
 - :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`).
 - Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`)
-- :meth:`Series.unstack` no longer converts extension arrays to object-dtype ndarrays. The output ``DataFrame`` will now have the same dtype as the input. This changes behavior for Categorical and Sparse data (:issue:`23077`).
+- :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`).
 - Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`).

 .. _whatsnew_0240.api.incompatibilities:
@@ -1090,6 +1090,7 @@ Categorical
 - Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
 - Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
 - Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`).
+- In :meth:`Series.unstack`, specifying a ``fill_value`` not present in the categories now raises a ``TypeError`` rather than ignoring the ``fill_value`` (:issue:`23284`)
 - Bug when resampling :meth:`Dataframe.resample()` and aggregating on categorical data, the categorical dtype was getting lost. (:issue:`23227`)

 Datetimelike
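Concretely, the two whatsnew entries above amount to the following behaviour. This is a minimal sketch assuming a 0.24-era pandas, included only to illustrate the documented change:

import pandas as pd

# Categorical data on a two-level index; two combinations from the
# product of the levels are missing from the data.
s = pd.Series(['a', 'b', 'c', 'a'], dtype='category',
              index=pd.MultiIndex.from_tuples(
                  [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]))

result = s.unstack()
# Each output column keeps the categorical dtype instead of being
# coerced to object.
assert (result.dtypes == 'category').all()

# A fill_value outside the categories now raises instead of being
# silently ignored.
try:
    s.unstack(fill_value='d')
except TypeError:
    pass

# Filling with an existing category still works.
s.unstack(fill_value='c')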
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index e84953f3dab56..7a55b652054ed 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import functools
 import warnings
 import inspect
 import re
@@ -34,6 +35,7 @@
     is_numeric_v_string_like, is_extension_type,
     is_extension_array_dtype,
     is_list_like,
+    is_sparse,
     is_re,
     is_re_compilable,
     pandas_dtype)
@@ -632,7 +634,10 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
             return self

         if klass is None:
-            if dtype == np.object_:
+            if is_sparse(self.values):
+                # special case sparse, Series[Sparse].astype(object) is sparse
+                klass = ExtensionBlock
+            elif is_object_dtype(dtype):
                 klass = ObjectBlock
             elif is_extension_array_dtype(dtype):
                 klass = ExtensionBlock
@@ -1429,7 +1434,7 @@ def equals(self, other):
             return False
         return array_equivalent(self.values, other.values)

-    def _unstack(self, unstacker_func, new_columns):
+    def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
         """Return a list of unstacked blocks of self

         Parameters
@@ -1438,6 +1443,10 @@ def _unstack(self, unstacker_func, new_columns):
             Partially applied unstacker.
         new_columns : Index
             All columns of the unstacked BlockManager.
+        n_rows : int
+            Only used in ExtensionBlock.unstack
+        fill_value : int
+            Only used in ExtensionBlock.unstack

         Returns
         -------
@@ -1731,7 +1740,7 @@ def _slice(self, slicer):
     def _try_cast_result(self, result, dtype=None):
         return result

-    def _unstack(self, unstacker_func, new_columns):
+    def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
         """Return a list of unstacked blocks of self

         Parameters
@@ -1740,6 +1749,10 @@ def _unstack(self, unstacker_func, new_columns):
             Partially applied unstacker.
         new_columns : Index
             All columns of the unstacked BlockManager.
+        n_rows : int
+            Only used in ExtensionBlock.unstack
+        fill_value : int
+            Only used in ExtensionBlock.unstack

         Returns
         -------
@@ -1751,11 +1764,11 @@ def _unstack(self, unstacker_func, new_columns):
         # NonConsolidatable blocks can have a single item only, so we return
         # one block per item
         unstacker = unstacker_func(self.values.T)
-        new_items = unstacker.get_new_columns()
-        new_placement = new_columns.get_indexer(new_items)
-        new_values, mask = unstacker.get_new_values()

-        mask = mask.any(0)
+        new_placement, new_values, mask = self._get_unstack_items(
+            unstacker, new_columns
+        )
+
         new_values = new_values.T[mask]
         new_placement = new_placement[mask]

@@ -1763,6 +1776,38 @@ def _unstack(self, unstacker_func, new_columns):
                   for vals, place in zip(new_values, new_placement)]
         return blocks, mask

+    def _get_unstack_items(self, unstacker, new_columns):
+        """
+        Get the placement, values, and mask for a Block unstack.
+
+        This is shared between ObjectBlock and ExtensionBlock. They
+        differ in that ObjectBlock passes the values, while ExtensionBlock
+        passes the dummy ndarray of positions to be used by a take
+        later.
+
+        Parameters
+        ----------
+        unstacker : pandas.core.reshape.reshape._Unstacker
+        new_columns : Index
+            All columns of the unstacked BlockManager.
+
+        Returns
+        -------
+        new_placement : ndarray[int]
+            The placement of the new columns in `new_columns`.
+        new_values : Union[ndarray, ExtensionArray]
+            The first return value from _Unstacker.get_new_values.
+        mask : ndarray[bool]
+            The second return value from _Unstacker.get_new_values.
+        """
+        # shared with ExtensionBlock
+        new_items = unstacker.get_new_columns()
+        new_placement = new_columns.get_indexer(new_items)
+        new_values, mask = unstacker.get_new_values()
+
+        mask = mask.any(0)
+        return new_placement, new_values, mask
+

 class ExtensionBlock(NonConsolidatableMixIn, Block):
     """Block for holding extension types.
@@ -1950,6 +1995,30 @@ def shift(self, periods, axis=0):
     def _ftype(self):
         return getattr(self.values, '_pandas_ftype', Block._ftype)

+    def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
+        # ExtensionArray-safe unstack.
+        # We override ObjectBlock._unstack, which unstacks directly on the
+        # values of the array. For EA-backed blocks, this would require
+        # converting to a 2-D ndarray of objects.
+        # Instead, we unstack an ndarray of integer positions, followed by
+        # a `take` on the actual values.
+        dummy_arr = np.arange(n_rows)
+        dummy_unstacker = functools.partial(unstacker_func, fill_value=-1)
+        unstacker = dummy_unstacker(dummy_arr)
+
+        new_placement, new_values, mask = self._get_unstack_items(
+            unstacker, new_columns
+        )
+
+        blocks = [
+            self.make_block_same_class(
+                self.values.take(indices, allow_fill=True,
+                                 fill_value=fill_value),
+                [place])
+            for indices, place in zip(new_values.T, new_placement)
+        ]
+        return blocks, mask
+

 class NumericBlock(Block):
     __slots__ = ()
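The comment block in ExtensionBlock._unstack above is the core idea of the patch: unstack cheap integer positions first, then rebuild each output column with a single take on the extension array. The same idea can be sketched with public APIs only; this is a simplified illustration, not the code path the block manager actually runs:

import numpy as np
import pandas as pd

cat = pd.Categorical(['a', 'b', 'c', 'a'])
index = pd.MultiIndex.from_tuples(
    [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

# 1. Unstack a dummy array of integer positions; -1 marks the cells
#    that the reshape introduces.
positions = pd.Series(np.arange(len(cat)), index=index)
unstacked = positions.unstack(fill_value=-1)

# 2. Rebuild each output column with one take per column on the
#    original ExtensionArray; allow_fill=True maps the -1 markers to
#    the array's NA value (or to a caller-supplied fill_value).
result = pd.DataFrame(
    {col: cat.take(unstacked[col].values, allow_fill=True)
     for col in unstacked.columns},
    index=unstacked.index)

# No object-dtype round trip: every column is still categorical.
assert all(str(dtype) == 'category' for dtype in result.dtypes)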
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index fc3a12a9da82a..0519c5e5abe33 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1405,18 +1405,21 @@ def canonicalize(block):
         return all(block.equals(oblock)
                    for block, oblock in zip(self_blocks, other_blocks))

-    def unstack(self, unstacker_func):
+    def unstack(self, unstacker_func, fill_value):
         """Return a blockmanager with all blocks unstacked.

         Parameters
         ----------
         unstacker_func : callable
             A (partially-applied) ``pd.core.reshape._Unstacker`` class.
+        fill_value : Any
+            fill_value for newly introduced missing values.

         Returns
         -------
         unstacked : BlockManager
         """
+        n_rows = self.shape[-1]
         dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
         new_columns = dummy.get_new_columns()
         new_index = dummy.get_new_index()
@@ -1427,7 +1430,10 @@ def unstack(self, unstacker_func):
             blocks, mask = blk._unstack(
                 partial(unstacker_func,
                         value_columns=self.items[blk.mgr_locs.indexer]),
-                new_columns)
+                new_columns,
+                n_rows,
+                fill_value
+            )

             new_blocks.extend(blocks)
             columns_mask.extend(mask)
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index d3b677a1df2a3..2dca7cf0e6aa3 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -12,12 +12,12 @@
 from pandas.core.dtypes.cast import maybe_promote
 from pandas.core.dtypes.common import (
     ensure_platform_int, is_bool_dtype, is_extension_array_dtype, is_list_like,
-    is_object_dtype, is_sparse, needs_i8_conversion)
+    is_object_dtype, needs_i8_conversion)
 from pandas.core.dtypes.missing import notna

 from pandas import compat
 import pandas.core.algorithms as algos
-from pandas.core.arrays import Categorical, SparseArray
+from pandas.core.arrays import SparseArray
 from pandas.core.arrays.categorical import _factorize_from_iterable
 from pandas.core.frame import DataFrame
 from pandas.core.index import Index, MultiIndex
@@ -82,28 +82,15 @@ class _Unstacker(object):

     def __init__(self, values, index, level=-1, value_columns=None,
                  fill_value=None, constructor=None):
-        self.is_categorical = None
-        self.is_sparse = is_sparse(values)
         if values.ndim == 1:
-            if isinstance(values, Categorical):
-                self.is_categorical = values
-                values = np.array(values)
-            elif self.is_sparse:
-                # XXX: Makes SparseArray *dense*, but it's supposedly
-                # a single column at a time, so it's "doable"
-                values = values.values
             values = values[:, np.newaxis]
         self.values = values
         self.value_columns = value_columns
         self.fill_value = fill_value

         if constructor is None:
-            if self.is_sparse:
-                self.constructor = SparseDataFrame
-            else:
-                self.constructor = DataFrame
-        else:
-            self.constructor = constructor
+            constructor = DataFrame
+        self.constructor = constructor

         if value_columns is None and values.shape[1] != 1:  # pragma: no cover
             raise ValueError('must pass column labels for multi-column data')
@@ -174,14 +161,6 @@ def get_result(self):
         columns = self.get_new_columns()
         index = self.get_new_index()

-        # may need to coerce categoricals here
-        if self.is_categorical is not None:
-            categories = self.is_categorical.categories
-            ordered = self.is_categorical.ordered
-            values = [Categorical(values[:, i], categories=categories,
-                                  ordered=ordered)
-                      for i in range(values.shape[-1])]
-
         return self.constructor(values, index=index, columns=columns)

     def get_new_values(self):
@@ -339,6 +318,7 @@ def _unstack_multiple(data, clocs, fill_value=None):
     if isinstance(data, Series):
         dummy = data.copy()
         dummy.index = dummy_index
+
         unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
         new_levels = clevels
         new_names = cnames
@@ -394,6 +374,8 @@ def unstack(obj, level, fill_value=None):
         else:
             return obj.T.stack(dropna=False)
     else:
+        if is_extension_array_dtype(obj.dtype):
+            return _unstack_extension_series(obj, level, fill_value)
         unstacker = _Unstacker(obj.values, obj.index, level=level,
                                fill_value=fill_value,
                                constructor=obj._constructor_expanddim)
@@ -404,7 +386,8 @@ def _unstack_frame(obj, level, fill_value=None):
     if obj._is_mixed_type:
         unstacker = partial(_Unstacker, index=obj.index, level=level,
                             fill_value=fill_value)
-        blocks = obj._data.unstack(unstacker)
+        blocks = obj._data.unstack(unstacker,
+                                   fill_value=fill_value)
         return obj._constructor(blocks)
     else:
         unstacker = _Unstacker(obj.values, obj.index, level=level,
@@ -414,6 +397,52 @@ def _unstack_frame(obj, level, fill_value=None):
     return unstacker.get_result()


+def _unstack_extension_series(series, level, fill_value):
+    """
+    Unstack an ExtensionArray-backed Series.
+
+    The ExtensionDtype is preserved.
+
+    Parameters
+    ----------
+    series : Series
+        A Series with an ExtensionArray for values
+    level : Any
+        The level name or number.
+    fill_value : Any
+        The user-level (not physical storage) fill value to use for
+        missing values introduced by the reshape. Passed to
+        ``series.values.take``.
+
+    Returns
+    -------
+    DataFrame
+        Each column of the DataFrame will have the same dtype as
+        the input Series.
+    """
+    # Implementation note: the basic idea is to
+    # 1. Do a regular unstack on a dummy array of integers
+    # 2. Follow up with a columnwise take.
+    # We use the dummy take to discover newly-created missing values
+    # introduced by the reshape.
+    from pandas.core.reshape.concat import concat
+
+    dummy_arr = np.arange(len(series))
+    # fill_value=-1, since we will do a series.values.take later
+    result = _Unstacker(dummy_arr, series.index,
+                        level=level, fill_value=-1).get_result()
+
+    out = []
+    values = series.values
+
+    for col, indices in result.iteritems():
+        out.append(Series(values.take(indices.values,
+                                      allow_fill=True,
+                                      fill_value=fill_value),
+                          name=col, index=result.index))
+    return concat(out, axis='columns', copy=False, keys=result.columns)
+
+
 def stack(frame, level=-1, dropna=True):
     """
     Convert DataFrame to Series with multi-level Index. Columns become the
diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
index 446912b66bf33..d0e42e69e300f 100644
--- a/pandas/tests/extension/base/reshaping.py
+++ b/pandas/tests/extension/base/reshaping.py
@@ -1,3 +1,5 @@
+import itertools
+
 import numpy as np
 import pytest

@@ -170,3 +172,45 @@ def test_merge(self, data, na_value):
             [data[0], data[0], data[1], data[2], na_value],
             dtype=data.dtype)})
         self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])
+
+    @pytest.mark.parametrize("index", [
+        # Two levels, uniform.
+        pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']]),
+                                    names=['a', 'b']),
+
+        # non-uniform
+        pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('B', 'b')]),
+
+        # three levels, non-uniform
+        pd.MultiIndex.from_product([('A', 'B'), ('a', 'b', 'c'), (0, 1, 2)]),
+        pd.MultiIndex.from_tuples([
+            ('A', 'a', 1),
+            ('A', 'b', 0),
+            ('A', 'a', 0),
+            ('B', 'a', 0),
+            ('B', 'c', 1),
+        ]),
+    ])
+    @pytest.mark.parametrize("obj", ["series", "frame"])
+    def test_unstack(self, data, index, obj):
+        data = data[:len(index)]
+        if obj == "series":
+            ser = pd.Series(data, index=index)
+        else:
+            ser = pd.DataFrame({"A": data, "B": data}, index=index)
+
+        n = index.nlevels
+        levels = list(range(n))
+        # [0, 1, 2]
+        # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
+        combinations = itertools.chain.from_iterable(
+            itertools.permutations(levels, i) for i in range(1, n)
+        )
+
+        for level in combinations:
+            result = ser.unstack(level=level)
+            assert all(isinstance(result[col].values, type(data))
+                       for col in result.columns)
+            expected = ser.astype(object).unstack(level=level)
+            result = result.astype(object)
+
+            self.assert_frame_equal(result, expected)
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
index c14bfa359bc64..3c8905c578c4f 100644
--- a/pandas/tests/extension/decimal/array.py
+++ b/pandas/tests/extension/decimal/array.py
@@ -102,7 +102,7 @@ def copy(self, deep=False):
     def astype(self, dtype, copy=True):
         if isinstance(dtype, type(self.dtype)):
             return type(self)(self._data, context=dtype.context)
-        return super(DecimalArray, self).astype(dtype, copy)
+        return np.asarray(self, dtype=dtype)

     def __setitem__(self, key, value):
         if pd.api.types.is_list_like(value):
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index 1c9beefe9e542..af5f6bf0a2f65 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -1,4 +1,5 @@
 import decimal
+import math
 import operator

 import numpy as np
@@ -63,9 +64,23 @@ def data_for_grouping():
 class BaseDecimal(object):

     def assert_series_equal(self, left, right, *args, **kwargs):
-
-        left_na = left.isna()
-        right_na = right.isna()
+        def convert(x):
+            # need to convert array([Decimal(NaN)], dtype='object') to np.NaN
+            # because Series[object].isna doesn't recognize Decimal(NaN) as
+            # NA.
+            try:
+                return math.isnan(x)
+            except TypeError:
+                return False
+
+        if left.dtype == 'object':
+            left_na = left.apply(convert)
+        else:
+            left_na = left.isna()
+        if right.dtype == 'object':
+            right_na = right.apply(convert)
+        else:
+            right_na = right.isna()

         tm.assert_series_equal(left_na, right_na)
         return tm.assert_series_equal(left[~left_na],
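The convert helper added above works around a mismatch between pandas' object-dtype NA detection and math.isnan for Decimal('NaN'). A quick standalone check of that mismatch (illustrative only, not part of the test suite):

import decimal
import math

import pandas as pd

nan_dec = decimal.Decimal('NaN')

# pandas does not treat Decimal('NaN') stored in an object-dtype
# Series as missing ...
assert not pd.Series([nan_dec], dtype=object).isna().any()

# ... while math.isnan does recognize it, and raises TypeError for
# non-numeric objects, which is exactly what convert() relies on.
assert math.isnan(nan_dec)
try:
    math.isnan('not a number')
except TypeError:
    pass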
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index 778432376e092..2b1bfecdf8f28 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -139,7 +139,11 @@ def test_from_dtype(self, data):


 class TestReshaping(BaseJSON, base.BaseReshapingTests):
-    pass
+    @pytest.mark.xfail(reason="dict for NA", strict=True)
+    def test_unstack(self, data, index):
+        # The base test has NaN for the expected NA value.
+        # This matches the base test otherwise.
+        return super().test_unstack(data, index)


 class TestGetitem(BaseJSON, base.BaseGetitemTests):
diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py
index ed3cc39052183..54511df4effad 100644
--- a/pandas/tests/frame/test_reshape.py
+++ b/pandas/tests/frame/test_reshape.py
@@ -277,8 +277,6 @@ def test_unstack_fill_frame_timedelta(self):
                              index=['x', 'y', 'z'])
         assert_frame_equal(result, expected)

-    @pytest.mark.xfail(reason="GH-23077",
-                       strict=True)
     def test_unstack_fill_frame_period(self):

         # Test unstacking with period
@@ -305,7 +303,8 @@ def test_unstack_fill_frame_categorical(self):
         # Test unstacking with categorical
         data = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
         data.index = pd.MultiIndex.from_tuples(
-            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
+            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')],
+        )

         # By default missing values will be NaN
         result = data.unstack()
@@ -316,9 +315,10 @@ def test_unstack_fill_frame_categorical(self):
                                 index=list('xyz'))
         assert_frame_equal(result, expected)

-        # Fill with non-category results in NaN entries similar to above
-        result = data.unstack(fill_value='d')
-        assert_frame_equal(result, expected)
+        # Fill with non-category results in a TypeError
+        msg = r"'fill_value' \('d'\) is not in"
+        with tm.assert_raises_regex(TypeError, msg):
+            data.unstack(fill_value='d')

         # Fill with category value replaces missing values as expected
         result = data.unstack(fill_value='c')
@@ -874,6 +874,21 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels):

         tm.assert_series_equal(result, expected)

+    @pytest.mark.parametrize('level', [0, 1])
+    def test_unstack_mixed_extension_types(self, level):
+        index = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 1)],
+                                          names=['a', 'b'])
+        df = pd.DataFrame({"A": pd.core.arrays.integer_array([0, 1, None]),
+                           "B": pd.Categorical(['a', 'a', 'b'])}, index=index)
+
+        result = df.unstack(level=level)
+        expected = df.astype(object).unstack(level=level)
+
+        expected_dtypes = pd.Series([df.A.dtype] * 2 + [df.B.dtype] * 2,
+                                    index=result.columns)
+        tm.assert_series_equal(result.dtypes, expected_dtypes)
+        tm.assert_frame_equal(result.astype(object), expected)
+
     @pytest.mark.parametrize("level", [0, 'baz'])
     def test_unstack_swaplevel_sortlevel(self, level):
         # GH 20994
diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py
index e7eba63e4e0b3..0e71048f51177 100644
--- a/pandas/tests/sparse/test_pivot.py
+++ b/pandas/tests/sparse/test_pivot.py
@@ -47,4 +47,5 @@ def test_pivot_table_multi(self):
                                      values=['D', 'E'])
         res_dense = pd.pivot_table(self.dense, index='A', columns='B',
                                    values=['D', 'E'])
+        res_dense = res_dense.apply(lambda x: x.astype("Sparse[float64]"))
         tm.assert_frame_equal(res_sparse, res_dense)
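End to end, the new test_unstack_mixed_extension_types above exercises the behaviour most users will notice: unstacking a frame whose columns hold different extension arrays keeps each column's dtype. A small sketch mirroring that test, assuming the 0.24-era nullable integer array:

import pandas as pd
from pandas.core.arrays import integer_array

index = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 1)],
                                  names=['a', 'b'])
df = pd.DataFrame({'A': integer_array([0, 1, None]),
                   'B': pd.Categorical(['a', 'a', 'b'])},
                  index=index)

result = df.unstack(level='b')
# The two 'A' output columns stay Int64 and the two 'B' output columns
# stay categorical; previously everything came back as object dtype.
print(result.dtypes)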