ENH: Support EAs in Series.unstack (#23284)
TomAugspurger authored and jorisvandenbossche committed Nov 7, 2018
1 parent d6820ac commit 28a42da
Showing 11 changed files with 248 additions and 51 deletions.
20 changes: 16 additions & 4 deletions asv_bench/benchmarks/reshape.py
@@ -49,21 +49,33 @@ def time_unstack(self):

class Unstack(object):

def setup(self):
params = ['int', 'category']

def setup(self, dtype):
m = 100
n = 1000

levels = np.arange(m)
index = MultiIndex.from_product([levels] * 2)
columns = np.arange(n)
values = np.arange(m * m * n).reshape(m * m, n)
if dtype == 'int':
values = np.arange(m * m * n).reshape(m * m, n)
else:
# the category branch is ~20x slower than int. So we
# cut down the size a bit. Now it's only ~3x slower.
n = 50
columns = columns[:n]
indices = np.random.randint(0, 52, size=(m * m, n))
values = np.take(list(string.ascii_letters), indices)
values = [pd.Categorical(v) for v in values.T]

self.df = DataFrame(values, index, columns)
self.df2 = self.df.iloc[:-1]

def time_full_product(self):
def time_full_product(self, dtype):
self.df.unstack()

def time_without_last_row(self):
def time_without_last_row(self, dtype):
self.df2.unstack()


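As a rough standalone companion to the parametrized benchmark above, the sketch below times the int and category paths outside asv, using timeit in place of asv's machinery. It keeps n=50 for both dtypes so the two timings are directly comparable, and builds the categorical frame from a dict (rather than the benchmark's positional constructor) purely to keep the sketch unambiguous.

# Sketch only: compares int vs. category unstack timings outside asv.
import string
import timeit

import numpy as np
import pandas as pd

m, n = 100, 50
index = pd.MultiIndex.from_product([np.arange(m)] * 2)

# Integer-dtype frame, as in the 'int' branch of the benchmark.
int_df = pd.DataFrame(np.arange(m * m * n).reshape(m * m, n), index=index)

# Categorical frame, as in the 'category' branch.
letters = np.take(list(string.ascii_letters),
                  np.random.randint(0, 52, size=(m * m, n)))
cat_df = pd.DataFrame({i: pd.Categorical(col) for i, col in enumerate(letters.T)},
                      index=index)

print('int     :', timeit.timeit(int_df.unstack, number=3))
print('category:', timeit.timeit(cat_df.unstack, number=3))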
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.24.0.txt
@@ -853,7 +853,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`).
- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`)
- :meth:`Series.unstack` no longer converts extension arrays to object-dtype ndarrays. The output ``DataFrame`` will now have the same dtype as the input. This changes behavior for Categorical and Sparse data (:issue:`23077`).
- :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`).
- Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`).

.. _whatsnew_0240.api.incompatibilities:
@@ -1090,6 +1090,7 @@ Categorical
- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
- Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`).
- In :meth:`Series.unstack`, specifying a ``fill_value`` not present in the categories now raises a ``TypeError`` rather than ignoring the ``fill_value`` (:issue:`23284`)
- Bug when resampling :meth:`Dataframe.resample()` and aggregating on categorical data, the categorical dtype was getting lost. (:issue:`23227`)

Datetimelike
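A short sketch of the two behavior changes recorded in the whatsnew entries above, assuming pandas 0.24 or later (the data is illustrative): unstacking a categorical Series now preserves the dtype per column, and a fill_value outside the categories is rejected rather than silently ignored.

import pandas as pd

ser = pd.Series(
    pd.Categorical(['a', 'b', 'a']),
    index=pd.MultiIndex.from_tuples([(1, 'x'), (1, 'y'), (2, 'x')]),
)

result = ser.unstack()
print(result.dtypes)  # both columns are 'category' (previously object)

# The reshape introduces a hole at (2, 'y'); a fill_value that is not an
# existing category now raises instead of being ignored.
try:
    ser.unstack(fill_value='z')
except TypeError as exc:
    print('TypeError:', exc)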
83 changes: 76 additions & 7 deletions pandas/core/internals/blocks.py
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
import functools
import warnings
import inspect
import re
@@ -34,6 +35,7 @@
is_numeric_v_string_like, is_extension_type,
is_extension_array_dtype,
is_list_like,
is_sparse,
is_re,
is_re_compilable,
pandas_dtype)
@@ -632,7 +634,10 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
return self

if klass is None:
if dtype == np.object_:
if is_sparse(self.values):
# special case sparse, Series[Sparse].astype(object) is sparse
klass = ExtensionBlock
elif is_object_dtype(dtype):
klass = ObjectBlock
elif is_extension_array_dtype(dtype):
klass = ExtensionBlock
@@ -1429,7 +1434,7 @@ def equals(self, other):
return False
return array_equivalent(self.values, other.values)

def _unstack(self, unstacker_func, new_columns):
def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
"""Return a list of unstacked blocks of self
Parameters
@@ -1438,6 +1443,10 @@ def _unstack(self, unstacker_func, new_columns):
Partially applied unstacker.
new_columns : Index
All columns of the unstacked BlockManager.
n_rows : int
Only used in ExtensionBlock.unstack
fill_value : int
Only used in ExtensionBlock.unstack
Returns
-------
@@ -1731,7 +1740,7 @@ def _slice(self, slicer):
def _try_cast_result(self, result, dtype=None):
return result

def _unstack(self, unstacker_func, new_columns):
def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
"""Return a list of unstacked blocks of self
Parameters
@@ -1740,6 +1749,10 @@ def _unstack(self, unstacker_func, new_columns):
Partially applied unstacker.
new_columns : Index
All columns of the unstacked BlockManager.
n_rows : int
Only used in ExtensionBlock.unstack
fill_value : int
Only used in ExtensionBlock.unstack
Returns
-------
@@ -1751,18 +1764,50 @@ def _unstack(self, unstacker_func, new_columns):
# NonConsolidatable blocks can have a single item only, so we return
# one block per item
unstacker = unstacker_func(self.values.T)
new_items = unstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = unstacker.get_new_values()

mask = mask.any(0)
new_placement, new_values, mask = self._get_unstack_items(
unstacker, new_columns
)

new_values = new_values.T[mask]
new_placement = new_placement[mask]

blocks = [self.make_block_same_class(vals, [place])
for vals, place in zip(new_values, new_placement)]
return blocks, mask

def _get_unstack_items(self, unstacker, new_columns):
"""
Get the placement, values, and mask for a Block unstack.
This is shared between ObjectBlock and ExtensionBlock. They
differ in that ObjectBlock passes the values, while ExtensionBlock
passes the dummy ndarray of positions to be used by a take
later.
Parameters
----------
unstacker : pandas.core.reshape.reshape._Unstacker
new_columns : Index
All columns of the unstacked BlockManager.
Returns
-------
new_placement : ndarray[int]
The placement of the new columns in `new_columns`.
new_values : Union[ndarray, ExtensionArray]
The first return value from _Unstacker.get_new_values.
mask : ndarray[bool]
The second return value from _Unstacker.get_new_values.
"""
# shared with ExtensionBlock
new_items = unstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = unstacker.get_new_values()

mask = mask.any(0)
return new_placement, new_values, mask


class ExtensionBlock(NonConsolidatableMixIn, Block):
"""Block for holding extension types.
@@ -1950,6 +1995,30 @@ def shift(self, periods, axis=0):
def _ftype(self):
return getattr(self.values, '_pandas_ftype', Block._ftype)

def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
# ExtensionArray-safe unstack.
# We override ObjectBlock._unstack, which unstacks directly on the
# values of the array. For EA-backed blocks, this would require
# converting to a 2-D ndarray of objects.
# Instead, we unstack an ndarray of integer positions, followed by
# a `take` on the actual values.
dummy_arr = np.arange(n_rows)
dummy_unstacker = functools.partial(unstacker_func, fill_value=-1)
unstacker = dummy_unstacker(dummy_arr)

new_placement, new_values, mask = self._get_unstack_items(
unstacker, new_columns
)

blocks = [
self.make_block_same_class(
self.values.take(indices, allow_fill=True,
fill_value=fill_value),
[place])
for indices, place in zip(new_values.T, new_placement)
]
return blocks, mask


class NumericBlock(Block):
__slots__ = ()
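The new ExtensionBlock._unstack above leans on the ExtensionArray.take contract: with allow_fill=True, -1 in the indexer is a sentinel meaning "insert fill_value" (NaN by default), which is why the dummy unstacker is built with fill_value=-1. A minimal sketch of that primitive, assuming pandas 0.24+:

import pandas as pd

cat = pd.Categorical(['a', 'b', 'c'])

# allow_fill=True: -1 marks a missing slot to be filled with fill_value.
print(cat.take([0, -1, 1], allow_fill=True))    # ['a', NaN, 'b']

# allow_fill=False: -1 is ordinary negative (positional) indexing.
print(cat.take([0, -1, 1], allow_fill=False))   # ['a', 'c', 'b']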
10 changes: 8 additions & 2 deletions pandas/core/internals/managers.py
@@ -1405,18 +1405,21 @@ def canonicalize(block):
return all(block.equals(oblock)
for block, oblock in zip(self_blocks, other_blocks))

def unstack(self, unstacker_func):
def unstack(self, unstacker_func, fill_value):
"""Return a blockmanager with all blocks unstacked.
Parameters
----------
unstacker_func : callable
A (partially-applied) ``pd.core.reshape._Unstacker`` class.
fill_value : Any
fill_value for newly introduced missing values.
Returns
-------
unstacked : BlockManager
"""
n_rows = self.shape[-1]
dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
new_columns = dummy.get_new_columns()
new_index = dummy.get_new_index()
@@ -1427,7 +1430,10 @@ def unstack(self, unstacker_func):
blocks, mask = blk._unstack(
partial(unstacker_func,
value_columns=self.items[blk.mgr_locs.indexer]),
new_columns)
new_columns,
n_rows,
fill_value
)

new_blocks.extend(blocks)
columns_mask.extend(mask)
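At the manager level the change is mostly plumbing: n_rows and fill_value are threaded through to each block's _unstack, so a mixed-dtype frame keeps per-block dtypes when unstacked. A quick sketch of the user-visible effect, with illustrative data and assuming pandas 0.24+:

import pandas as pd

df = pd.DataFrame(
    {'cat': pd.Categorical(['a', 'b', 'a']), 'num': [1, 2, 3]},
    index=pd.MultiIndex.from_tuples([(1, 'x'), (1, 'y'), (2, 'x')]),
)

res = df.unstack()
print(res.dtypes)
# The 'cat' block keeps the categorical dtype in both unstacked columns; the
# integer block is upcast to float64 only because the reshape introduces NaN.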
81 changes: 55 additions & 26 deletions pandas/core/reshape/reshape.py
@@ -12,12 +12,12 @@
from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.common import (
ensure_platform_int, is_bool_dtype, is_extension_array_dtype, is_list_like,
is_object_dtype, is_sparse, needs_i8_conversion)
is_object_dtype, needs_i8_conversion)
from pandas.core.dtypes.missing import notna

from pandas import compat
import pandas.core.algorithms as algos
from pandas.core.arrays import Categorical, SparseArray
from pandas.core.arrays import SparseArray
from pandas.core.arrays.categorical import _factorize_from_iterable
from pandas.core.frame import DataFrame
from pandas.core.index import Index, MultiIndex
@@ -82,28 +82,15 @@ class _Unstacker(object):
def __init__(self, values, index, level=-1, value_columns=None,
fill_value=None, constructor=None):

self.is_categorical = None
self.is_sparse = is_sparse(values)
if values.ndim == 1:
if isinstance(values, Categorical):
self.is_categorical = values
values = np.array(values)
elif self.is_sparse:
# XXX: Makes SparseArray *dense*, but it's supposedly
# a single column at a time, so it's "doable"
values = values.values
values = values[:, np.newaxis]
self.values = values
self.value_columns = value_columns
self.fill_value = fill_value

if constructor is None:
if self.is_sparse:
self.constructor = SparseDataFrame
else:
self.constructor = DataFrame
else:
self.constructor = constructor
constructor = DataFrame
self.constructor = constructor

if value_columns is None and values.shape[1] != 1: # pragma: no cover
raise ValueError('must pass column labels for multi-column data')
@@ -174,14 +161,6 @@ def get_result(self):
columns = self.get_new_columns()
index = self.get_new_index()

# may need to coerce categoricals here
if self.is_categorical is not None:
categories = self.is_categorical.categories
ordered = self.is_categorical.ordered
values = [Categorical(values[:, i], categories=categories,
ordered=ordered)
for i in range(values.shape[-1])]

return self.constructor(values, index=index, columns=columns)

def get_new_values(self):
@@ -339,6 +318,7 @@ def _unstack_multiple(data, clocs, fill_value=None):
if isinstance(data, Series):
dummy = data.copy()
dummy.index = dummy_index

unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
new_levels = clevels
new_names = cnames
@@ -394,6 +374,8 @@ def unstack(obj, level, fill_value=None):
else:
return obj.T.stack(dropna=False)
else:
if is_extension_array_dtype(obj.dtype):
return _unstack_extension_series(obj, level, fill_value)
unstacker = _Unstacker(obj.values, obj.index, level=level,
fill_value=fill_value,
constructor=obj._constructor_expanddim)
@@ -404,7 +386,8 @@ def _unstack_frame(obj, level, fill_value=None):
if obj._is_mixed_type:
unstacker = partial(_Unstacker, index=obj.index,
level=level, fill_value=fill_value)
blocks = obj._data.unstack(unstacker)
blocks = obj._data.unstack(unstacker,
fill_value=fill_value)
return obj._constructor(blocks)
else:
unstacker = _Unstacker(obj.values, obj.index, level=level,
@@ -414,6 +397,52 @@
return unstacker.get_result()


def _unstack_extension_series(series, level, fill_value):
"""
Unstack an ExtensionArray-backed Series.
The ExtensionDtype is preserved.
Parameters
----------
series : Series
A Series with an ExtensionArray for values
level : Any
The level name or number.
fill_value : Any
The user-level (not physical storage) fill value to use for
missing values introduced by the reshape. Passed to
``series.values.take``.
Returns
-------
DataFrame
Each column of the DataFrame will have the same dtype as
the input Series.
"""
# Implementation note: the basic idea is to
# 1. Do a regular unstack on a dummy array of integers
# 2. Followup with a columnwise take.
# We use the dummy take to discover newly-created missing values
# introduced by the reshape.
from pandas.core.reshape.concat import concat

dummy_arr = np.arange(len(series))
# fill_value=-1, since we will do a series.values.take later
result = _Unstacker(dummy_arr, series.index,
level=level, fill_value=-1).get_result()

out = []
values = series.values

for col, indices in result.iteritems():
out.append(Series(values.take(indices.values,
allow_fill=True,
fill_value=fill_value),
name=col, index=result.index))
return concat(out, axis='columns', copy=False, keys=result.columns)


def stack(frame, level=-1, dropna=True):
"""
Convert DataFrame to Series with multi-level Index. Columns become the
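_unstack_extension_series above is the Series-level entry point for this approach: unstack a dummy array of integer positions, then take column-wise on the original extension array and concat the pieces. The net effect, sketched with public API only (the nullable-integer 'Int64' dtype is used here simply as a non-categorical extension dtype; assumes pandas 0.24+):

import pandas as pd

ser = pd.Series(
    pd.array([10, 20, 30], dtype='Int64'),
    index=pd.MultiIndex.from_tuples([(1, 'x'), (1, 'y'), (2, 'x')]),
)

res = ser.unstack()
print(res.dtypes)  # both columns keep the Int64 extension dtype
print(res)         # the missing cell at (2, 'y') stays inside the Int64 column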