Skip to content

Commit

Permalink
DEPR: deprecate relabeling dictionaries in groupby.agg
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Apr 7, 2017
1 parent 0cfc08c commit ec6361f
Show file tree
Hide file tree
Showing 8 changed files with 335 additions and 84 deletions.
73 changes: 73 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,79 @@ Using ``.iloc``. Here we will get the location of the 'A' column, then use *posi
df.iloc[[0, 2], df.columns.get_loc('A')]


.. _whatsnew_0200.api_breaking.deprecate_agg_series:

Deprecate groupby.agg() with a dictionary when renaming
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The ``.groupby(..).agg(..)`` syntax can accept a variety of inputs, including scalars, lists, and a dictionary of column names to scalars or lists.
This provides a useful syntax for constructing multiple (potentially different) aggregations for a groupby.

1) We are deprecating passing a dictionary to a grouped ``Series``. This allowed one to ``rename`` the resulting aggregation, but this had a completely different
meaning than passing a dictionary to a grouped ``DataFrame``, which accepts column-to-aggregations.
2) We are deprecating passing a dict-of-dict to a grouped ``DataFrame`` in a similar manner.

First, create a ``DataFrame`` to use in the following examples:

.. ipython:: python

df = pd.DataFrame({'A': [1, 1, 1, 2, 2],
'B': range(5),
'C':range(5)})
df

Aggregating a DataFrame with column selection.

.. ipython:: python

df.groupby('A').agg({'B': ['sum', 'max'],
'C': ['count', 'min']})


We are deprecating the following, which is a combined aggregation & renaming:

.. code-block:: ipython

In [6]: df.groupby('A').B.agg({'foo': 'count'})
FutureWarning: using a dictionary on a Series for aggregation
is deprecated and will be removed in a future version

Out[6]:
foo
A
1 3
2 2

You can accomplish the same operation, more idiomatically by:

.. ipython:: python

df.groupby('A').B.agg(['count']).rename(columns={'count': 'foo'})


Here's an example of 2), passing a dict-of-dict to a grouped ``DataFrame``:

.. code-block:: python

In [23]: df.groupby('A').agg({'B': {'foo': ['sum', 'max']}, 'C': {'bar': ['count', 'min']}})
FutureWarning: using a dict with renaming
is deprecated and will be removed in a future version

Out[23]:
foo bar
sum max count min
A
1 3 2 3 0
2 7 4 2 3

You can accomplish the same by:

.. ipython:: python

r = df.groupby('A').agg({'B': ['sum', 'max'], 'C': ['count', 'min']})
r.columns = r.columns.set_levels(['foo', 'bar'], level=0)
r

.. _whatsnew.api_breaking.io_compat:

Possible incompat for HDF5 formats for pandas < 0.13.0
Expand Down
136 changes: 118 additions & 18 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Base and utility classes for pandas objects.
"""
import warnings
from pandas import compat
from pandas.compat import builtins
import numpy as np
Expand Down Expand Up @@ -290,7 +291,9 @@ class SelectionMixin(object):
}

@property
def name(self):
def _selection_name(self):
""" return a name for myself; this would ideally be the 'name' property, but
we cannot conflict with the Series.name property which can be set """
if self._selection is None:
return None # 'result'
else:
Expand Down Expand Up @@ -405,6 +408,26 @@ def aggregate(self, func, *args, **kwargs):

agg = aggregate

def _try_aggregate_string_function(self, arg, *args, **kwargs):
    """
    Resolve *arg* (a string) to a callable and invoke it.

    Lookup order:
    1. an attribute of this object itself, called as ``self.<arg>(...)``
    2. a top-level numpy function, called with ``self`` as first argument

    Raises
    ------
    ValueError
        If *arg* names neither of the above.
    """
    assert isinstance(arg, compat.string_types)

    # Search our own namespace first, then fall back to numpy; the
    # positional arguments differ because a numpy function needs the
    # data (``self``) passed in explicitly.
    for namespace, call_args in ((self, args), (np, (self,) + args)):
        func = getattr(namespace, arg, None)
        if func is not None:
            return func(*call_args, **kwargs)

    raise ValueError("{} is an unknown string function".format(arg))

def _aggregate(self, arg, *args, **kwargs):
"""
provide an implementation for the aggregators
Expand All @@ -428,14 +451,19 @@ def _aggregate(self, arg, *args, **kwargs):
is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
is_nested_renamer = False

_axis = kwargs.pop('_axis', None)
if _axis is None:
_axis = getattr(self, 'axis', 0)
_level = kwargs.pop('_level', None)

if isinstance(arg, compat.string_types):
return getattr(self, arg)(*args, **kwargs), None
return self._try_aggregate_string_function(arg, *args,
**kwargs), None

if isinstance(arg, dict):

# aggregate based on the passed dict
if self.axis != 0: # pragma: no cover
if _axis != 0: # pragma: no cover
raise ValueError('Can only pass dict with axis=0')

obj = self._selected_obj
Expand Down Expand Up @@ -505,6 +533,16 @@ def _agg(arg, func):
keys = list(compat.iterkeys(arg))
result = compat.OrderedDict()

# renaming keys
if isinstance(self._selected_obj, ABCDataFrame):
if len(self._selected_obj.columns.intersection(
keys)) != len(keys):
warnings.warn(
("using a dict with renaming"
"is deprecated and will be removed in a future "
"version"),
FutureWarning, stacklevel=3)

# nested renamer
if is_nested_renamer:
result = list(_agg(arg, _agg_1dim).values())
Expand Down Expand Up @@ -534,7 +572,7 @@ def _agg(arg, func):
agg_how: _agg_1dim(self._selection, agg_how))

# we are selecting the same set as we are aggregating
elif not len(sl - set(compat.iterkeys(arg))):
elif not len(sl - set(keys)):

result = _agg(arg, _agg_1dim)

Expand All @@ -555,32 +593,74 @@ def _agg(arg, func):
result = _agg(arg, _agg_2dim)

# combine results

def is_any_series():
    # return a boolean if we have *any* nested series
    # (closure: ``result`` comes from the enclosing _aggregate scope)
    return any([isinstance(r, ABCSeries)
                for r in compat.itervalues(result)])

def is_any_frame():
    # return a boolean if we have *any* nested DataFrame
    # (closure: ``result`` comes from the enclosing _aggregate scope)
    return any([isinstance(r, ABCDataFrame)
                for r in compat.itervalues(result)])

if isinstance(result, list):
result = concat(result, keys=keys, axis=1)
elif isinstance(list(compat.itervalues(result))[0],
ABCDataFrame):
result = concat([result[k] for k in keys], keys=keys, axis=1)
else:
from pandas import DataFrame
return concat(result, keys=keys, axis=1), True

elif is_any_frame():
# we have a dict of DataFrames
# return a MI DataFrame

return concat([result[k] for k in keys],
keys=keys, axis=1), True

elif isinstance(self, ABCSeries) and is_any_series():

# we have a dict of Series
# return a MI Series
try:
result = concat(result)
except TypeError:
# we want to give a nice error here if
# we have non-same sized objects, so
# we don't automatically broadcast

raise ValueError("cannot perform both aggregation "
"and transformation operations "
"simultaneously")

return result, True

# fall thru
from pandas import DataFrame, Series
try:
result = DataFrame(result)
except ValueError:

# we have a dict of scalars
result = Series(result,
name=getattr(self, 'name', None))

return result, True
elif hasattr(arg, '__iter__'):
return self._aggregate_multiple_funcs(arg, _level=_level), None
elif is_list_like(arg) and arg not in compat.string_types:
# we require a list, but not an 'str'
return self._aggregate_multiple_funcs(arg,
_level=_level,
_axis=_axis), None
else:
result = None

cy_func = self._is_cython_func(arg)
if cy_func and not args and not kwargs:
return getattr(self, cy_func)(), None
f = self._is_cython_func(arg)
if f and not args and not kwargs:
return getattr(self, f)(), None

# caller can react
return result, True

def _aggregate_multiple_funcs(self, arg, _level):
def _aggregate_multiple_funcs(self, arg, _level, _axis):
from pandas.tools.concat import concat

if self.axis != 0:
if _axis != 0:
raise NotImplementedError("axis other than 0 is not supported")

if self._selected_obj.ndim == 1:
Expand Down Expand Up @@ -615,10 +695,30 @@ def _aggregate_multiple_funcs(self, arg, _level):
keys.append(col)
except (TypeError, DataError):
pass
except ValueError:
# cannot aggregate
continue
except SpecificationError:
raise

return concat(results, keys=keys, axis=1)
# if we are empty
if not len(results):
raise ValueError("no results")

try:
return concat(results, keys=keys, axis=1)
except TypeError:

# we are concatting non-NDFrame objects,
# e.g. a list of scalars

from pandas.types.cast import is_nested_object
from pandas import Series
result = Series(results, index=keys, name=self.name)
if is_nested_object(result):
raise ValueError("cannot combine transform and "
"aggregation operations")
return result

def _shallow_copy(self, obj=None, obj_type=None, **kwargs):
""" return a new object with the replacement attributes """
Expand Down
Loading

0 comments on commit ec6361f

Please sign in to comment.