Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: clarify DataFrame.apply reduction on empty frames #6007

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ API Changes
when detecting chained assignment, related (:issue:`5938`)
- DataFrame.head(0) returns self instead of empty frame (:issue:`5846`)
- ``autocorrelation_plot`` now accepts ``**kwargs``. (:issue:`5623`)
- ``DataFrame.apply`` will use the ``reduce`` argument to determine whether a
``Series`` or a ``DataFrame`` should be returned when the ``DataFrame`` is
empty (:issue:`6007`).

Experimental Features
~~~~~~~~~~~~~~~~~~~~~
Expand Down
31 changes: 31 additions & 0 deletions doc/source/v0.13.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,37 @@ There are several new or updated docs sections including:
API changes
~~~~~~~~~~~

- ``DataFrame.apply`` will use the ``reduce`` argument to determine whether a
``Series`` or a ``DataFrame`` should be returned when the ``DataFrame`` is
empty (:issue:`6007`).

Previously, calling ``DataFrame.apply`` on an empty ``DataFrame`` would return
either a ``DataFrame`` if there were no columns, or the function being
applied would be called with an empty ``Series`` to guess whether a
``Series`` or ``DataFrame`` should be returned:

.. ipython:: python

def applied_func(col):
print "Apply function being called with:", col
return col.sum()

import pandas as pd
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Take out the ``import pandas`` line and the ``pd.`` prefix (pandas is auto-imported in the docs, and ``DataFrame`` is too).

empty = pd.DataFrame(columns=['a', 'b'])
empty.apply(applied_func)

Now, when ``apply`` is called on an empty ``DataFrame``: if the ``reduce``
argument is ``True`` a ``Series`` will be returned, if it is ``False`` a
``DataFrame`` will be returned, and if it is ``None`` (the default) the
function being applied will be called with an empty series to try and guess
the return type.

.. ipython:: python

empty.apply(applied_func, reduce=True)
empty.apply(applied_func, reduce=False)


Prior Version Deprecations/Changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
48 changes: 30 additions & 18 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3362,14 +3362,15 @@ def diff(self, periods=1):
#----------------------------------------------------------------------
# Function application

def apply(self, func, axis=0, broadcast=False, raw=False, reduce=True,
def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None,
args=(), **kwds):
"""
Applies function along input axis of DataFrame.

Objects passed to functions are Series objects having index
either the DataFrame's index (axis=0) or the columns (axis=1).
Return type depends on whether passed function aggregates
Return type depends on whether passed function aggregates, or the
reduce argument if the DataFrame is empty.

Parameters
----------
Expand All @@ -3381,8 +3382,14 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=True,
broadcast : boolean, default False
For aggregation functions, return object of same size with values
propagated
reduce : boolean, default True
Try to apply reduction procedures
reduce : boolean or None, default None
Try to apply reduction procedures. If the DataFrame is empty,
apply will use reduce to determine whether the result should be a
Series or a DataFrame. If reduce is None (the default), apply's
return value will be guessed by calling func on an empty Series (note:
while guessing, exceptions raised by func will be ignored). If
reduce is True a Series will always be returned, and if False a
DataFrame will always be returned.
raw : boolean, default False
If False, convert each row or column into a Series. If raw=True the
passed function will receive ndarray objects instead. If you are
Expand All @@ -3407,41 +3414,46 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=True,
-------
applied : Series or DataFrame
"""
if len(self.columns) == 0 and len(self.index) == 0:
return self

axis = self._get_axis_number(axis)
if kwds or args and not isinstance(func, np.ufunc):
f = lambda x: func(x, *args, **kwds)
else:
f = func

if len(self.columns) == 0 and len(self.index) == 0:
return self._apply_empty_result(func, axis, reduce)

if isinstance(f, np.ufunc):
results = f(self.values)
return self._constructor(data=results, index=self.index,
columns=self.columns, copy=False)
else:
if not broadcast:
if not all(self.shape):
# How to determine this better?
is_reduction = False
try:
is_reduction = not isinstance(f(_EMPTY_SERIES), Series)
except Exception:
pass

if is_reduction:
return Series(NA, index=self._get_agg_axis(axis))
else:
return self.copy()
return self._apply_empty_result(func, axis, reduce)

if raw and not self._is_mixed_type:
return self._apply_raw(f, axis)
else:
if reduce is None:
reduce = True
return self._apply_standard(f, axis, reduce=reduce)
else:
return self._apply_broadcast(f, axis)

def _apply_empty_result(self, func, axis, reduce):
    """
    Build the result of ``apply`` for an empty frame.

    Parameters
    ----------
    func : callable
        Only invoked when ``reduce`` is None, and then only on the
        module-level ``_EMPTY_SERIES`` to probe its return type.
    axis : int
        Axis number forwarded to ``self._get_agg_axis``.
    reduce : boolean or None
        True yields a Series, False yields a copy of the frame, and
        None lets the probe call to ``func`` decide.

    Returns
    -------
    applied : Series or DataFrame
    """
    if reduce is None:
        # Guess: a non-Series return value is taken to mean a reduction.
        # Any exception from func is deliberately ignored and treated as
        # "not a reduction".
        try:
            probe = func(_EMPTY_SERIES)
        except Exception:
            reduce = False
        else:
            reduce = not isinstance(probe, Series)

    if not reduce:
        return self.copy()
    return Series(NA, index=self._get_agg_axis(axis))

def _apply_raw(self, func, axis):
try:
result = lib.reduce(self.values, func, axis=axis)
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -8978,6 +8978,22 @@ def test_apply_empty(self):
rs = xp.apply(lambda x: x['a'], axis=1)
assert_frame_equal(xp, rs)

# reduce with an empty DataFrame
x = []
result = self.empty.apply(x.append, axis=1, reduce=False)
assert_frame_equal(result, self.empty)
result = self.empty.apply(x.append, axis=1, reduce=True)
assert_series_equal(result, Series([]))

empty_with_cols = DataFrame(columns=['a', 'b', 'c'])
result = empty_with_cols.apply(x.append, axis=1, reduce=False)
assert_frame_equal(result, empty_with_cols)
result = empty_with_cols.apply(x.append, axis=1, reduce=True)
assert_series_equal(result, Series([]))

# Ensure that x.append hasn't been called
self.assertEqual(x, [])

def test_apply_standard_nonunique(self):
df = DataFrame(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c'])
Expand Down