Skip to content

Commit

Permalink
DEPR: deprecate relabeling dictionaries in groupby.agg
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Apr 7, 2017
1 parent 0cfc08c commit ec6361f
Show file tree
Hide file tree
Showing 8 changed files with 335 additions and 84 deletions.
73 changes: 73 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,79 @@ Using ``.iloc``. Here we will get the location of the 'A' column, then use *posi
df.iloc[[0, 2], df.columns.get_loc('A')]


.. _whatsnew_0200.api_breaking.deprecate_agg_series:

Deprecate groupby.agg() with a dictionary when renaming
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The ``.groupby(..).agg(..)`` syntax can accept a variety of inputs, including scalars, lists, and a dictionary of column names to scalars or lists.
This provides a useful syntax for constructing multiple (potentially different) aggregations for a groupby.

1) We are deprecating passing a dictionary to a grouped ``Series``. This allowed one to ``rename`` the resulting aggregation, but this had a completely different
meaning than passing a dictionary to a grouped ``DataFrame``, which accepts column-to-aggregations.
2) We are deprecating passing a dict-of-dict to a grouped ``DataFrame`` in a similar manner.

First, create a ``DataFrame`` to use in the following examples:

.. ipython:: python

df = pd.DataFrame({'A': [1, 1, 1, 2, 2],
'B': range(5),
'C':range(5)})
df

Aggregating a DataFrame with column selection.

.. ipython:: python

df.groupby('A').agg({'B': ['sum', 'max'],
'C': ['count', 'min']})


We are deprecating the following, which is a combined aggregation & renaming:

.. code-block:: ipython

In [6]: df.groupby('A').B.agg({'foo': 'count'})
FutureWarning: using a dictionary on a Series for aggregation
is deprecated and will be removed in a future version

Out[6]:
foo
A
1 3
2 2

You can accomplish the same operation, more idiomatically by:

.. ipython:: python

df.groupby('A').B.agg(['count']).rename(columns={'count': 'foo'})


Here's an example of 2), passing a dict-of-dict to a grouped ``DataFrame``:

.. code-block:: python

In [23]: df.groupby('A').agg({'B': {'foo': ['sum', 'max']}, 'C': {'bar': ['count', 'min']}})
FutureWarning: using a dict with renaming
is deprecated and will be removed in a future version

Out[23]:
foo bar
sum max count min
A
1 3 2 3 0
2 7 4 2 3

You can accomplish the same by:

.. ipython:: python

r = df.groupby('A').agg({'B': ['sum', 'max'], 'C': ['count', 'min']})
r.columns = r.columns.set_levels(['foo', 'bar'], level=0)
r

.. _whatsnew.api_breaking.io_compat:

Possible incompat for HDF5 formats for pandas < 0.13.0
Expand Down
136 changes: 118 additions & 18 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Base and utility classes for pandas objects.
"""
import warnings
from pandas import compat
from pandas.compat import builtins
import numpy as np
Expand Down Expand Up @@ -290,7 +291,9 @@ class SelectionMixin(object):
}

@property
def name(self):
def _selection_name(self):
""" return a name for myself; this would ideally be the 'name' property, but
we cannot conflict with the Series.name property which can be set """
if self._selection is None:
return None # 'result'
else:
Expand Down Expand Up @@ -405,6 +408,26 @@ def aggregate(self, func, *args, **kwargs):

agg = aggregate

def _try_aggregate_string_function(self, arg, *args, **kwargs):
    """
    Resolve *arg* (a string) to a callable and invoke it.

    Lookup order:
    1. an attribute of this object itself, called as ``self.<arg>(...)``
    2. a top-level numpy function, called with ``self`` as first argument

    Raises
    ------
    ValueError
        If *arg* names neither of the above.
    """
    assert isinstance(arg, compat.string_types)

    # Search our own namespace first, then fall back to numpy; the
    # positional arguments differ because a numpy function needs the
    # data (``self``) passed in explicitly.
    for namespace, call_args in ((self, args), (np, (self,) + args)):
        func = getattr(namespace, arg, None)
        if func is not None:
            return func(*call_args, **kwargs)

    raise ValueError("{} is an unknown string function".format(arg))

def _aggregate(self, arg, *args, **kwargs):
"""
provide an implementation for the aggregators
Expand All @@ -428,14 +451,19 @@ def _aggregate(self, arg, *args, **kwargs):
is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
is_nested_renamer = False

_axis = kwargs.pop('_axis', None)
if _axis is None:
_axis = getattr(self, 'axis', 0)
_level = kwargs.pop('_level', None)

if isinstance(arg, compat.string_types):
return getattr(self, arg)(*args, **kwargs), None
return self._try_aggregate_string_function(arg, *args,
**kwargs), None

if isinstance(arg, dict):

# aggregate based on the passed dict
if self.axis != 0: # pragma: no cover
if _axis != 0: # pragma: no cover
raise ValueError('Can only pass dict with axis=0')

obj = self._selected_obj
Expand Down Expand Up @@ -505,6 +533,16 @@ def _agg(arg, func):
keys = list(compat.iterkeys(arg))
result = compat.OrderedDict()

# renaming keys
if isinstance(self._selected_obj, ABCDataFrame):
if len(self._selected_obj.columns.intersection(
keys)) != len(keys):
warnings.warn(
("using a dict with renaming"
"is deprecated and will be removed in a future "
"version"),
FutureWarning, stacklevel=3)

# nested renamer
if is_nested_renamer:
result = list(_agg(arg, _agg_1dim).values())
Expand Down Expand Up @@ -534,7 +572,7 @@ def _agg(arg, func):
agg_how: _agg_1dim(self._selection, agg_how))

# we are selecting the same set as we are aggregating
elif not len(sl - set(compat.iterkeys(arg))):
elif not len(sl - set(keys)):

result = _agg(arg, _agg_1dim)

Expand All @@ -555,32 +593,74 @@ def _agg(arg, func):
result = _agg(arg, _agg_2dim)

# combine results

def is_any_series():
    # return a boolean if we have *any* nested series
    # (closure: ``result`` comes from the enclosing _aggregate scope)
    return any([isinstance(r, ABCSeries)
                for r in compat.itervalues(result)])

def is_any_frame():
    # return a boolean if we have *any* nested DataFrame
    # (closure: ``result`` comes from the enclosing _aggregate scope)
    return any([isinstance(r, ABCDataFrame)
                for r in compat.itervalues(result)])

if isinstance(result, list):
result = concat(result, keys=keys, axis=1)
elif isinstance(list(compat.itervalues(result))[0],
ABCDataFrame):
result = concat([result[k] for k in keys], keys=keys, axis=1)
else:
from pandas import DataFrame
return concat(result, keys=keys, axis=1), True

elif is_any_frame():
# we have a dict of DataFrames
# return a MI DataFrame

return concat([result[k] for k in keys],
keys=keys, axis=1), True

elif isinstance(self, ABCSeries) and is_any_series():

# we have a dict of Series
# return a MI Series
try:
result = concat(result)
except TypeError:
# we want to give a nice error here if
# we have non-same sized objects, so
# we don't automatically broadcast

raise ValueError("cannot perform both aggregation "
"and transformation operations "
"simultaneously")

return result, True

# fall thru
from pandas import DataFrame, Series
try:
result = DataFrame(result)
except ValueError:

# we have a dict of scalars
result = Series(result,
name=getattr(self, 'name', None))

return result, True
elif hasattr(arg, '__iter__'):
return self._aggregate_multiple_funcs(arg, _level=_level), None
elif is_list_like(arg) and arg not in compat.string_types:
# we require a list, but not an 'str'
return self._aggregate_multiple_funcs(arg,
_level=_level,
_axis=_axis), None
else:
result = None

cy_func = self._is_cython_func(arg)
if cy_func and not args and not kwargs:
return getattr(self, cy_func)(), None
f = self._is_cython_func(arg)
if f and not args and not kwargs:
return getattr(self, f)(), None

# caller can react
return result, True

def _aggregate_multiple_funcs(self, arg, _level):
def _aggregate_multiple_funcs(self, arg, _level, _axis):
from pandas.tools.concat import concat

if self.axis != 0:
if _axis != 0:
raise NotImplementedError("axis other than 0 is not supported")

if self._selected_obj.ndim == 1:
Expand Down Expand Up @@ -615,10 +695,30 @@ def _aggregate_multiple_funcs(self, arg, _level):
keys.append(col)
except (TypeError, DataError):
pass
except ValueError:
# cannot aggregate
continue
except SpecificationError:
raise

return concat(results, keys=keys, axis=1)
# if we are empty
if not len(results):
raise ValueError("no results")

try:
return concat(results, keys=keys, axis=1)
except TypeError:

# we are concatting non-NDFrame objects,
# e.g. a list of scalars

from pandas.types.cast import is_nested_object
from pandas import Series
result = Series(results, index=keys, name=self.name)
if is_nested_object(result):
raise ValueError("cannot combine transform and "
"aggregation operations")
return result

def _shallow_copy(self, obj=None, obj_type=None, **kwargs):
""" return a new object with the replacement attributes """
Expand Down
Loading

0 comments on commit ec6361f

Please sign in to comment.