Implement DataFrame.__array_ufunc__
For some cases, this will preserve extension types of arrays by calling
the ufunc blockwise.

```python
In [1]: import pandas as pd; import numpy as np
In [2]: df = pd.DataFrame({"A": pd.array([0, 1], dtype="Sparse")})

In [3]: np.sin(df).dtypes
Out[3]:
A    Sparse[float64, nan]
dtype: object
```

We don't currently handle the multi-input case well (aside from ufuncs that
are implemented as dunder ops like `np.add`). For these, we fall back to
the old implementation of converting to an ndarray.
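As a rough illustration of the multi-input behavior (a minimal sketch; `df1` and `df2` are hypothetical frames, not taken from the commit): a binary ufunc with a dunder equivalent such as `np.add` dispatches to the corresponding operator and aligns the inputs, while a ufunc without one, such as `np.logaddexp`, takes the ndarray fallback and therefore does not preserve extension dtypes.

```python
import numpy as np
import pandas as pd

df1 = pd.DataFrame({"A": pd.array([1, 2], dtype="Sparse")}, index=[0, 1])
df2 = pd.DataFrame({"A": pd.array([3, 4], dtype="Sparse")}, index=[1, 2])

# np.add maps to DataFrame.__add__, so the inputs are aligned on the
# union of their indexes/columns before the op is applied.
aligned = np.add(df1, df2)

# np.logaddexp has no dunder equivalent, so the (aligned) inputs are
# converted to ndarrays first and the Sparse dtype is lost in the result.
fallback = np.logaddexp(df1, df2)
```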
TomAugspurger committed Oct 7, 2020
1 parent da3a2d3 commit 01024ef
Showing 7 changed files with 272 additions and 79 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
@@ -189,6 +189,8 @@ Other enhancements
- :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)
- :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`)
- Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
- Calling a NumPy ufunc on a ``DataFrame`` with extension types now preserves the extension types when possible (:issue:`23743`).
- Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns the inputs, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`).
- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)

.. _whatsnew_120.api_breaking.python:
@@ -289,6 +291,7 @@ Deprecations
- Deprecated :meth:`Index.is_all_dates` (:issue:`27744`)
- Deprecated automatic alignment on comparison operations between :class:`DataFrame` and :class:`Series`, do ``frame, ser = frame.align(ser, axis=1, copy=False)`` before e.g. ``frame == ser`` (:issue:`28759`)
- :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`)
- Using "outer" ufuncs on DataFrames to return 4d ndarray is now deprecated. Convert to an ndarray first (:issue:`23743`)
- :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`)

.. ---------------------------------------------------------------------------
4 changes: 4 additions & 0 deletions pandas/core/frame.py
@@ -556,6 +556,10 @@ def __init__(

NDFrame.__init__(self, mgr)

# ----------------------------------------------------------------------
# Array interface
_HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)

# ----------------------------------------------------------------------

@property
100 changes: 98 additions & 2 deletions pandas/core/generic.py
@@ -87,11 +87,11 @@
from pandas.core.dtypes.missing import isna, notna

import pandas as pd
from pandas.core import missing, nanops
from pandas.core import missing, nanops, ops
import pandas.core.algorithms as algos
from pandas.core.base import PandasObject, SelectionMixin
import pandas.core.common as com
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.construction import create_series_with_explicit_dtype, extract_array
from pandas.core.flags import Flags
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index
@@ -1912,6 +1912,102 @@ def __array_wrap__(
self, method="__array_wrap__"
)

@ops.defer_or_dispatch_ufunc
def __array_ufunc__(
self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any
):
# XXX: check outer
# align all the inputs.
types = tuple(type(x) for x in inputs)
alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)]

if len(alignable) > 1:
# This triggers alignment.
# At the moment, there aren't any ufuncs with more than two inputs
# so this ends up just being x1.index | x2.index, but we write
# it to handle *args.

if len(set(types)) > 1:
# We currently don't handle ufunc(DataFrame, Series)
# well. Previously this raised an internal ValueError. We might
# support it someday, so raise a NotImplementedError.
raise NotImplementedError(
"Cannot apply ufunc {} to mixed DataFrame and Series "
"inputs.".format(ufunc)
)
axes = self.axes
for obj in alignable[1:]:
# this relies on the fact that we aren't handling mixed
# series / frame ufuncs.
for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)):
axes[i] = ax1 | ax2

reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes))
inputs = tuple(
x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x
for x, t in zip(inputs, types)
)
else:
reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))

if self.ndim == 1:
names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
name = names[0] if len(set(names)) == 1 else None
reconstruct_kwargs = {"name": name}
else:
reconstruct_kwargs = {}

def reconstruct(result):
if lib.is_scalar(result):
return result
if result.ndim != self.ndim:
if method == "outer":
if self.ndim == 2:
# we already deprecated for Series
msg = (
"outer method for ufunc {} is not implemented on "
"pandas objects. Returning an ndarray, but in the "
"future this will raise a 'NotImplementedError'. "
"Consider explicitly converting the DataFrame "
"to an array with '.to_numpy()' first."
)
warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=4)
return result
raise NotImplementedError
return result
if isinstance(result, BlockManager):
# we went through BlockManager.apply
return self._constructor(result, **reconstruct_kwargs, copy=False)
else:
# we converted an array, lost our axes
return self._constructor(
result, **reconstruct_axes, **reconstruct_kwargs, copy=False
)

if self.ndim > 1 and (len(inputs) > 1 or ufunc.nout > 1):
# Just give up on preserving types in the complex case.
# In theory we could preserve them in some of these cases:
# * nout>1 is doable if BlockManager.apply took nout and
# returned a Tuple[BlockManager].
# * len(inputs) > 1 is doable when we know that we have
# aligned blocks / dtypes.
inputs = tuple(np.asarray(x) for x in inputs)
result = getattr(ufunc, method)(*inputs, **kwargs)
elif self.ndim == 1:
# ufunc(series, ...)
inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
result = getattr(ufunc, method)(*inputs, **kwargs)
else:
# ufunc(dataframe)
mgr = inputs[0]._mgr
result = mgr.apply(getattr(ufunc, method))

if ufunc.nout > 1:
result = tuple(reconstruct(x) for x in result)
else:
result = reconstruct(result)
return result

# ideally we would define this to avoid the getattr checks, but
# is slower
# @property
5 changes: 4 additions & 1 deletion pandas/core/ops/__init__.py
@@ -27,7 +27,10 @@
logical_op,
)
from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.ops.common import ( # noqa:F401
defer_or_dispatch_ufunc,
unpack_zerodim_and_defer,
)
from pandas.core.ops.docstrings import (
_arith_doc_FRAME,
_flex_comp_doc_FRAME,
53 changes: 52 additions & 1 deletion pandas/core/ops/common.py
@@ -2,9 +2,12 @@
Boilerplate functions used in defining binary operations.
"""
from functools import wraps
from typing import Callable
from typing import Any, Callable

import numpy as np

from pandas._libs.lib import item_from_zerodim
from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
from pandas._typing import F

from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
@@ -65,3 +68,51 @@ def new_method(self, other):
return method(self, other)

return new_method


def defer_or_dispatch_ufunc(meth):
"""
Boilerplate for pandas conventions in ``__array_ufunc__``.

Ensure the method returns NotImplemented when operating against "senior"
classes, and dispatch binary ufuncs to the corresponding dunder ops
where possible.

Parameters
----------
meth : callable
    The ``__array_ufunc__`` implementation to wrap.

Returns
-------
callable
"""

@wraps(meth)
def new_method(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any):
cls = type(self)

# for binary ops, use our custom dunder methods
result = maybe_dispatch_ufunc_to_dunder_op(
self, ufunc, method, *inputs, **kwargs
)
if result is not NotImplemented:
return result

# Determine if we should defer.
no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__)

for item in inputs:
higher_priority = (
hasattr(item, "__array_priority__")
and item.__array_priority__ > self.__array_priority__
)
has_array_ufunc = (
hasattr(item, "__array_ufunc__")
and type(item).__array_ufunc__ not in no_defer
and not isinstance(item, self._HANDLED_TYPES)
)
if higher_priority or has_array_ufunc:
return NotImplemented

return meth(self, ufunc, method, *inputs, **kwargs)

return new_method
75 changes: 0 additions & 75 deletions pandas/core/series.py
@@ -682,81 +682,6 @@ def view(self, dtype=None) -> "Series":
# NDArray Compat
_HANDLED_TYPES = (Index, ExtensionArray, np.ndarray)

def __array_ufunc__(
self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any
):
# TODO: handle DataFrame
cls = type(self)

# for binary ops, use our custom dunder methods
result = ops.maybe_dispatch_ufunc_to_dunder_op(
self, ufunc, method, *inputs, **kwargs
)
if result is not NotImplemented:
return result

# Determine if we should defer.
no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__)

for item in inputs:
higher_priority = (
hasattr(item, "__array_priority__")
and item.__array_priority__ > self.__array_priority__
)
has_array_ufunc = (
hasattr(item, "__array_ufunc__")
and type(item).__array_ufunc__ not in no_defer
and not isinstance(item, self._HANDLED_TYPES)
)
if higher_priority or has_array_ufunc:
return NotImplemented

# align all the inputs.
names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
types = tuple(type(x) for x in inputs)
# TODO: dataframe
alignable = [x for x, t in zip(inputs, types) if issubclass(t, Series)]

if len(alignable) > 1:
# This triggers alignment.
# At the moment, there aren't any ufuncs with more than two inputs
# so this ends up just being x1.index | x2.index, but we write
# it to handle *args.
index = alignable[0].index
for s in alignable[1:]:
index |= s.index
inputs = tuple(
x.reindex(index) if issubclass(t, Series) else x
for x, t in zip(inputs, types)
)
else:
index = self.index

inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
result = getattr(ufunc, method)(*inputs, **kwargs)

name = names[0] if len(set(names)) == 1 else None

def construct_return(result):
if lib.is_scalar(result):
return result
elif result.ndim > 1:
# e.g. np.subtract.outer
if method == "outer":
# GH#27198
raise NotImplementedError
return result
return self._constructor(result, index=index, name=name, copy=False)

if type(result) is tuple:
# multiple return values
return tuple(construct_return(x) for x in result)
elif method == "at":
# no return value
return None
else:
return construct_return(result)

def __array__(self, dtype=None) -> np.ndarray:
"""
Return the values as a NumPy array.