ENH: Support EAs in Series.unstack (#23284)
TomAugspurger authored and jorisvandenbossche committed Nov 7, 2018
1 parent d6820ac commit 28a42da
Showing 11 changed files with 248 additions and 51 deletions.
20 changes: 16 additions & 4 deletions asv_bench/benchmarks/reshape.py
@@ -49,21 +49,33 @@ def time_unstack(self):

class Unstack(object):

def setup(self):
params = ['int', 'category']

def setup(self, dtype):
m = 100
n = 1000

levels = np.arange(m)
index = MultiIndex.from_product([levels] * 2)
columns = np.arange(n)
values = np.arange(m * m * n).reshape(m * m, n)
if dtype == 'int':
values = np.arange(m * m * n).reshape(m * m, n)
else:
# the category branch is ~20x slower than int. So we
# cut down the size a bit. Now it's only ~3x slower.
n = 50
columns = columns[:n]
indices = np.random.randint(0, 52, size=(m * m, n))
values = np.take(list(string.ascii_letters), indices)
values = [pd.Categorical(v) for v in values.T]

self.df = DataFrame(values, index, columns)
self.df2 = self.df.iloc[:-1]

def time_full_product(self):
def time_full_product(self, dtype):
self.df.unstack()

def time_without_last_row(self):
def time_without_last_row(self, dtype):
self.df2.unstack()


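As a rough standalone companion to the parametrized benchmark above, the sketch below times the int and category paths outside asv, using timeit in place of asv's machinery. It keeps n=50 for both dtypes so the two timings are directly comparable, and builds the categorical frame from a dict (rather than the benchmark's positional constructor) purely to keep the sketch unambiguous.

# Sketch only: compares int vs. category unstack timings outside asv.
import string
import timeit

import numpy as np
import pandas as pd

m, n = 100, 50
index = pd.MultiIndex.from_product([np.arange(m)] * 2)

# Integer-dtype frame, as in the 'int' branch of the benchmark.
int_df = pd.DataFrame(np.arange(m * m * n).reshape(m * m, n), index=index)

# Categorical frame, as in the 'category' branch.
letters = np.take(list(string.ascii_letters),
                  np.random.randint(0, 52, size=(m * m, n)))
cat_df = pd.DataFrame({i: pd.Categorical(col) for i, col in enumerate(letters.T)},
                      index=index)

print('int     :', timeit.timeit(int_df.unstack, number=3))
print('category:', timeit.timeit(cat_df.unstack, number=3))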
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.24.0.txt
@@ -853,7 +853,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`).
- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`)
- :meth:`Series.unstack` no longer converts extension arrays to object-dtype ndarrays. The output ``DataFrame`` will now have the same dtype as the input. This changes behavior for Categorical and Sparse data (:issue:`23077`).
- :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`).
- Bug when grouping :meth:`Dataframe.groupby()` and aggregating on ``ExtensionArray`` it was not returning the actual ``ExtensionArray`` dtype (:issue:`23227`).

.. _whatsnew_0240.api.incompatibilities:
@@ -1090,6 +1090,7 @@ Categorical
- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
- Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`).
- In :meth:`Series.unstack`, specifying a ``fill_value`` not present in the categories now raises a ``TypeError`` rather than ignoring the ``fill_value`` (:issue:`23284`)
- Bug when resampling :meth:`Dataframe.resample()` and aggregating on categorical data, the categorical dtype was getting lost. (:issue:`23227`)

Datetimelike
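A short sketch of the two behavior changes recorded in the whatsnew entries above, assuming pandas 0.24 or later (the data is illustrative): unstacking a categorical Series now preserves the dtype per column, and a fill_value outside the categories is rejected rather than silently ignored.

import pandas as pd

ser = pd.Series(
    pd.Categorical(['a', 'b', 'a']),
    index=pd.MultiIndex.from_tuples([(1, 'x'), (1, 'y'), (2, 'x')]),
)

result = ser.unstack()
print(result.dtypes)  # both columns are 'category' (previously object)

# The reshape introduces a hole at (2, 'y'); a fill_value that is not an
# existing category now raises instead of being ignored.
try:
    ser.unstack(fill_value='z')
except TypeError as exc:
    print('TypeError:', exc)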
83 changes: 76 additions & 7 deletions pandas/core/internals/blocks.py
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
import functools
import warnings
import inspect
import re
@@ -34,6 +35,7 @@
is_numeric_v_string_like, is_extension_type,
is_extension_array_dtype,
is_list_like,
is_sparse,
is_re,
is_re_compilable,
pandas_dtype)
@@ -632,7 +634,10 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
return self

if klass is None:
if dtype == np.object_:
if is_sparse(self.values):
# special case sparse, Series[Sparse].astype(object) is sparse
klass = ExtensionBlock
elif is_object_dtype(dtype):
klass = ObjectBlock
elif is_extension_array_dtype(dtype):
klass = ExtensionBlock
@@ -1429,7 +1434,7 @@ def equals(self, other):
return False
return array_equivalent(self.values, other.values)

def _unstack(self, unstacker_func, new_columns):
def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
"""Return a list of unstacked blocks of self
Parameters
@@ -1438,6 +1443,10 @@ def _unstack(self, unstacker_func, new_columns):
Partially applied unstacker.
new_columns : Index
All columns of the unstacked BlockManager.
n_rows : int
Only used in ExtensionBlock.unstack
fill_value : int
Only used in ExtensionBlock.unstack
Returns
-------
@@ -1731,7 +1740,7 @@ def _slice(self, slicer):
def _try_cast_result(self, result, dtype=None):
return result

def _unstack(self, unstacker_func, new_columns):
def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
"""Return a list of unstacked blocks of self
Parameters
@@ -1740,6 +1749,10 @@ def _unstack(self, unstacker_func, new_columns):
Partially applied unstacker.
new_columns : Index
All columns of the unstacked BlockManager.
n_rows : int
Only used in ExtensionBlock.unstack
fill_value : int
Only used in ExtensionBlock.unstack
Returns
-------
@@ -1751,18 +1764,50 @@ def _unstack(self, unstacker_func, new_columns):
# NonConsolidatable blocks can have a single item only, so we return
# one block per item
unstacker = unstacker_func(self.values.T)
new_items = unstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = unstacker.get_new_values()

mask = mask.any(0)
new_placement, new_values, mask = self._get_unstack_items(
unstacker, new_columns
)

new_values = new_values.T[mask]
new_placement = new_placement[mask]

blocks = [self.make_block_same_class(vals, [place])
for vals, place in zip(new_values, new_placement)]
return blocks, mask

def _get_unstack_items(self, unstacker, new_columns):
"""
Get the placement, values, and mask for a Block unstack.
This is shared between ObjectBlock and ExtensionBlock. They
differ in that ObjectBlock passes the values, while ExtensionBlock
passes the dummy ndarray of positions to be used by a take
later.
Parameters
----------
unstacker : pandas.core.reshape.reshape._Unstacker
new_columns : Index
All columns of the unstacked BlockManager.
Returns
-------
new_placement : ndarray[int]
The placement of the new columns in `new_columns`.
new_values : Union[ndarray, ExtensionArray]
The first return value from _Unstacker.get_new_values.
mask : ndarray[bool]
The second return value from _Unstacker.get_new_values.
"""
# shared with ExtensionBlock
new_items = unstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = unstacker.get_new_values()

mask = mask.any(0)
return new_placement, new_values, mask


class ExtensionBlock(NonConsolidatableMixIn, Block):
"""Block for holding extension types.
@@ -1950,6 +1995,30 @@ def shift(self, periods, axis=0):
def _ftype(self):
return getattr(self.values, '_pandas_ftype', Block._ftype)

def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
# ExtensionArray-safe unstack.
# We override ObjectBlock._unstack, which unstacks directly on the
# values of the array. For EA-backed blocks, this would require
# converting to a 2-D ndarray of objects.
# Instead, we unstack an ndarray of integer positions, followed by
# a `take` on the actual values.
dummy_arr = np.arange(n_rows)
dummy_unstacker = functools.partial(unstacker_func, fill_value=-1)
unstacker = dummy_unstacker(dummy_arr)

new_placement, new_values, mask = self._get_unstack_items(
unstacker, new_columns
)

blocks = [
self.make_block_same_class(
self.values.take(indices, allow_fill=True,
fill_value=fill_value),
[place])
for indices, place in zip(new_values.T, new_placement)
]
return blocks, mask


class NumericBlock(Block):
__slots__ = ()
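The new ExtensionBlock._unstack above leans on the ExtensionArray.take contract: with allow_fill=True, -1 in the indexer is a sentinel meaning "insert fill_value" (NaN by default), which is why the dummy unstacker is built with fill_value=-1. A minimal sketch of that primitive, assuming pandas 0.24+:

import pandas as pd

cat = pd.Categorical(['a', 'b', 'c'])

# allow_fill=True: -1 marks a missing slot to be filled with fill_value.
print(cat.take([0, -1, 1], allow_fill=True))    # ['a', NaN, 'b']

# allow_fill=False: -1 is ordinary negative (positional) indexing.
print(cat.take([0, -1, 1], allow_fill=False))   # ['a', 'c', 'b']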
10 changes: 8 additions & 2 deletions pandas/core/internals/managers.py
@@ -1405,18 +1405,21 @@ def canonicalize(block):
return all(block.equals(oblock)
for block, oblock in zip(self_blocks, other_blocks))

def unstack(self, unstacker_func):
def unstack(self, unstacker_func, fill_value):
"""Return a blockmanager with all blocks unstacked.
Parameters
----------
unstacker_func : callable
A (partially-applied) ``pd.core.reshape._Unstacker`` class.
fill_value : Any
fill_value for newly introduced missing values.
Returns
-------
unstacked : BlockManager
"""
n_rows = self.shape[-1]
dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
new_columns = dummy.get_new_columns()
new_index = dummy.get_new_index()
@@ -1427,7 +1430,10 @@ def unstack(self, unstacker_func):
blocks, mask = blk._unstack(
partial(unstacker_func,
value_columns=self.items[blk.mgr_locs.indexer]),
new_columns)
new_columns,
n_rows,
fill_value
)

new_blocks.extend(blocks)
columns_mask.extend(mask)
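At the manager level the change is mostly plumbing: n_rows and fill_value are threaded through to each block's _unstack, so a mixed-dtype frame keeps per-block dtypes when unstacked. A quick sketch of the user-visible effect, with illustrative data and assuming pandas 0.24+:

import pandas as pd

df = pd.DataFrame(
    {'cat': pd.Categorical(['a', 'b', 'a']), 'num': [1, 2, 3]},
    index=pd.MultiIndex.from_tuples([(1, 'x'), (1, 'y'), (2, 'x')]),
)

res = df.unstack()
print(res.dtypes)
# The 'cat' block keeps the categorical dtype in both unstacked columns; the
# integer block is upcast to float64 only because the reshape introduces NaN.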
81 changes: 55 additions & 26 deletions pandas/core/reshape/reshape.py
@@ -12,12 +12,12 @@
from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.common import (
ensure_platform_int, is_bool_dtype, is_extension_array_dtype, is_list_like,
is_object_dtype, is_sparse, needs_i8_conversion)
is_object_dtype, needs_i8_conversion)
from pandas.core.dtypes.missing import notna

from pandas import compat
import pandas.core.algorithms as algos
from pandas.core.arrays import Categorical, SparseArray
from pandas.core.arrays import SparseArray
from pandas.core.arrays.categorical import _factorize_from_iterable
from pandas.core.frame import DataFrame
from pandas.core.index import Index, MultiIndex
@@ -82,28 +82,15 @@ class _Unstacker(object):
def __init__(self, values, index, level=-1, value_columns=None,
fill_value=None, constructor=None):

self.is_categorical = None
self.is_sparse = is_sparse(values)
if values.ndim == 1:
if isinstance(values, Categorical):
self.is_categorical = values
values = np.array(values)
elif self.is_sparse:
# XXX: Makes SparseArray *dense*, but it's supposedly
# a single column at a time, so it's "doable"
values = values.values
values = values[:, np.newaxis]
self.values = values
self.value_columns = value_columns
self.fill_value = fill_value

if constructor is None:
if self.is_sparse:
self.constructor = SparseDataFrame
else:
self.constructor = DataFrame
else:
self.constructor = constructor
constructor = DataFrame
self.constructor = constructor

if value_columns is None and values.shape[1] != 1: # pragma: no cover
raise ValueError('must pass column labels for multi-column data')
@@ -174,14 +161,6 @@ def get_result(self):
columns = self.get_new_columns()
index = self.get_new_index()

# may need to coerce categoricals here
if self.is_categorical is not None:
categories = self.is_categorical.categories
ordered = self.is_categorical.ordered
values = [Categorical(values[:, i], categories=categories,
ordered=ordered)
for i in range(values.shape[-1])]

return self.constructor(values, index=index, columns=columns)

def get_new_values(self):
@@ -339,6 +318,7 @@ def _unstack_multiple(data, clocs, fill_value=None):
if isinstance(data, Series):
dummy = data.copy()
dummy.index = dummy_index

unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
new_levels = clevels
new_names = cnames
@@ -394,6 +374,8 @@ def unstack(obj, level, fill_value=None):
else:
return obj.T.stack(dropna=False)
else:
if is_extension_array_dtype(obj.dtype):
return _unstack_extension_series(obj, level, fill_value)
unstacker = _Unstacker(obj.values, obj.index, level=level,
fill_value=fill_value,
constructor=obj._constructor_expanddim)
@@ -404,7 +386,8 @@ def _unstack_frame(obj, level, fill_value=None):
if obj._is_mixed_type:
unstacker = partial(_Unstacker, index=obj.index,
level=level, fill_value=fill_value)
blocks = obj._data.unstack(unstacker)
blocks = obj._data.unstack(unstacker,
fill_value=fill_value)
return obj._constructor(blocks)
else:
unstacker = _Unstacker(obj.values, obj.index, level=level,
@@ -414,6 +397,52 @@
return unstacker.get_result()


def _unstack_extension_series(series, level, fill_value):
"""
Unstack an ExtensionArray-backed Series.
The ExtensionDtype is preserved.
Parameters
----------
series : Series
A Series with an ExtensionArray for values
level : Any
The level name or number.
fill_value : Any
The user-level (not physical storage) fill value to use for
missing values introduced by the reshape. Passed to
``series.values.take``.
Returns
-------
DataFrame
Each column of the DataFrame will have the same dtype as
the input Series.
"""
# Implementation note: the basic idea is to
# 1. Do a regular unstack on a dummy array of integers
# 2. Followup with a columnwise take.
# We use the dummy take to discover newly-created missing values
# introduced by the reshape.
from pandas.core.reshape.concat import concat

dummy_arr = np.arange(len(series))
# fill_value=-1, since we will do a series.values.take later
result = _Unstacker(dummy_arr, series.index,
level=level, fill_value=-1).get_result()

out = []
values = series.values

for col, indices in result.iteritems():
out.append(Series(values.take(indices.values,
allow_fill=True,
fill_value=fill_value),
name=col, index=result.index))
return concat(out, axis='columns', copy=False, keys=result.columns)


def stack(frame, level=-1, dropna=True):
"""
Convert DataFrame to Series with multi-level Index. Columns become the
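_unstack_extension_series above is the Series-level entry point for this approach: unstack a dummy array of integer positions, then take column-wise on the original extension array and concat the pieces. The net effect, sketched with public API only (the nullable-integer 'Int64' dtype is used here simply as a non-categorical extension dtype; assumes pandas 0.24+):

import pandas as pd

ser = pd.Series(
    pd.array([10, 20, 30], dtype='Int64'),
    index=pd.MultiIndex.from_tuples([(1, 'x'), (1, 'y'), (2, 'x')]),
)

res = ser.unstack()
print(res.dtypes)  # both columns keep the Int64 extension dtype
print(res)         # the missing cell at (2, 'y') stays inside the Int64 column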