diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt index 62cec66cc22b0..43f72ec6a91fd 100755 --- a/doc/source/whatsnew/v0.17.1.txt +++ b/doc/source/whatsnew/v0.17.1.txt @@ -46,6 +46,8 @@ Enhancements pd.Index([1, np.nan, 3]).fillna(2) +- ``pivot_table`` now has a ``margins_name`` argument so you can use something other than the default of 'All' (:issue:`3335`) + .. _whatsnew_0171.api: API changes diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index de7a5f5a73f3d..97bd1f86d01cf 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -1,6 +1,5 @@ # pylint: disable=E1103 -import warnings from pandas import Series, DataFrame from pandas.core.index import MultiIndex, Index @@ -8,13 +7,14 @@ from pandas.tools.merge import concat from pandas.tools.util import cartesian_product from pandas.compat import range, lrange, zip -from pandas.util.decorators import deprecate_kwarg from pandas import compat import pandas.core.common as com import numpy as np + def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', - fill_value=None, margins=False, dropna=True): + fill_value=None, margins=False, dropna=True, + margins_name='All'): """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on @@ -40,6 +40,9 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', Add all row / columns (e.g. for subtotal / grand totals) dropna : boolean, default True Do not include columns whose entries are all NaN + margins_name : string, default 'All' + Name of the row / column that will contain the totals + when margins is True. Examples -------- @@ -127,7 +130,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', m = MultiIndex.from_arrays(cartesian_product(table.columns.levels)) table = table.reindex_axis(m, axis=1) except AttributeError: - pass # it's a single level or a series + pass # it's a single level or a series if isinstance(table, DataFrame): if isinstance(table.columns, MultiIndex): @@ -140,7 +143,8 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', if margins: table = _add_margins(table, data, values, rows=index, - cols=columns, aggfunc=aggfunc) + cols=columns, aggfunc=aggfunc, + margins_name=margins_name) # discard the top level if values_passed and not values_multi: @@ -155,29 +159,49 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', DataFrame.pivot_table = pivot_table -def _add_margins(table, data, values, rows, cols, aggfunc): +def _add_margins(table, data, values, rows, cols, aggfunc, + margins_name='All'): + if not isinstance(margins_name, compat.string_types): + raise ValueError('margins_name argument must be a string') + + exception_msg = 'Conflicting name "{0}" in margins'.format(margins_name) + for level in table.index.names: + if margins_name in table.index.get_level_values(level): + raise ValueError(exception_msg) - grand_margin = _compute_grand_margin(data, values, aggfunc) + grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) + + # could be passed a Series object with no 'columns' + if hasattr(table, 'columns'): + for level in table.columns.names[1:]: + if margins_name in table.columns.get_level_values(level): + raise ValueError(exception_msg) + + if len(rows) > 1: + key = (margins_name,) + ('',) * (len(rows) - 1) + else: + key = margins_name if not values and isinstance(table, Series): # If there are no values and the table is a series, then there is only # one column in the data. Compute grand margin and return it. - row_key = ('All',) + ('',) * (len(rows) - 1) if len(rows) > 1 else 'All' - return table.append(Series({row_key: grand_margin['All']})) + return table.append(Series({key: grand_margin[margins_name]})) if values: - marginal_result_set = _generate_marginal_results(table, data, values, rows, cols, aggfunc, grand_margin) + marginal_result_set = _generate_marginal_results(table, data, values, + rows, cols, aggfunc, + grand_margin, + margins_name) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set else: - marginal_result_set = _generate_marginal_results_without_values(table, data, rows, cols, aggfunc) + marginal_result_set = _generate_marginal_results_without_values( + table, data, rows, cols, aggfunc, margins_name) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set - key = ('All',) + ('',) * (len(rows) - 1) if len(rows) > 1 else 'All' - row_margin = row_margin.reindex(result.columns) # populate grand margin for k in margin_keys: @@ -201,7 +225,8 @@ def _add_margins(table, data, values, rows, cols, aggfunc): return result -def _compute_grand_margin(data, values, aggfunc): +def _compute_grand_margin(data, values, aggfunc, + margins_name='All'): if values: grand_margin = {} @@ -220,18 +245,19 @@ def _compute_grand_margin(data, values, aggfunc): pass return grand_margin else: - return {'All': aggfunc(data.index)} - + return {margins_name: aggfunc(data.index)} -def _generate_marginal_results(table, data, values, rows, cols, aggfunc, grand_margin): +def _generate_marginal_results(table, data, values, rows, cols, aggfunc, + grand_margin, + margins_name='All'): if len(cols) > 0: # need to "interleave" the margins table_pieces = [] margin_keys = [] def _all_key(key): - return (key, 'All') + ('',) * (len(cols) - 1) + return (key, margins_name) + ('',) * (len(cols) - 1) if len(rows) > 0: margin = data[rows + values].groupby(rows).agg(aggfunc) @@ -282,15 +308,17 @@ def _all_key(key): return result, margin_keys, row_margin -def _generate_marginal_results_without_values(table, data, rows, cols, aggfunc): +def _generate_marginal_results_without_values( + table, data, rows, cols, aggfunc, + margins_name='All'): if len(cols) > 0: # need to "interleave" the margins margin_keys = [] def _all_key(): if len(cols) == 1: - return 'All' - return ('All', ) + ('', ) * (len(cols) - 1) + return margins_name + return (margins_name, ) + ('', ) * (len(cols) - 1) if len(rows) > 0: margin = data[rows].groupby(rows).apply(aggfunc) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index f0052774d66a2..cb7e9102b21a0 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -224,32 +224,44 @@ def test_pivot_with_tz(self): tm.assert_frame_equal(pv, expected) def test_margins(self): - def _check_output(res, col, index=['A', 'B'], columns=['C']): - cmarg = res['All'][:-1] - exp = self.data.groupby(index)[col].mean() - tm.assert_series_equal(cmarg, exp, check_names=False) - self.assertEqual(cmarg.name, 'All') - - res = res.sortlevel() - rmarg = res.xs(('All', ''))[:-1] - exp = self.data.groupby(columns)[col].mean() - tm.assert_series_equal(rmarg, exp, check_names=False) - self.assertEqual(rmarg.name, ('All', '')) - - gmarg = res['All']['All', ''] - exp = self.data[col].mean() - self.assertEqual(gmarg, exp) + def _check_output(result, values_col, index=['A', 'B'], + columns=['C'], + margins_col='All'): + col_margins = result.ix[:-1, margins_col] + expected_col_margins = self.data.groupby(index)[values_col].mean() + tm.assert_series_equal(col_margins, expected_col_margins, + check_names=False) + self.assertEqual(col_margins.name, margins_col) + + result = result.sortlevel() + index_margins = result.ix[(margins_col, '')].iloc[:-1] + expected_ix_margins = self.data.groupby(columns)[values_col].mean() + tm.assert_series_equal(index_margins, expected_ix_margins, + check_names=False) + self.assertEqual(index_margins.name, (margins_col, '')) + + grand_total_margins = result.loc[(margins_col, ''), margins_col] + expected_total_margins = self.data[values_col].mean() + self.assertEqual(grand_total_margins, expected_total_margins) # column specified - table = self.data.pivot_table('D', index=['A', 'B'], columns='C', - margins=True, aggfunc=np.mean) - _check_output(table, 'D') + result = self.data.pivot_table(values='D', index=['A', 'B'], + columns='C', + margins=True, aggfunc=np.mean) + _check_output(result, 'D') + + # Set a different margins_name (not 'All') + result = self.data.pivot_table(values='D', index=['A', 'B'], + columns='C', + margins=True, aggfunc=np.mean, + margins_name='Totals') + _check_output(result, 'D', margins_col='Totals') # no column specified table = self.data.pivot_table(index=['A', 'B'], columns='C', margins=True, aggfunc=np.mean) - for valcol in table.columns.levels[0]: - _check_output(table[valcol], valcol) + for value_col in table.columns.levels[0]: + _check_output(table[value_col], value_col) # no col @@ -257,49 +269,61 @@ def _check_output(res, col, index=['A', 'B'], columns=['C']): self.data.columns = [k * 2 for k in self.data.columns] table = self.data.pivot_table(index=['AA', 'BB'], margins=True, aggfunc=np.mean) - for valcol in table.columns: - gmarg = table[valcol]['All', ''] - self.assertEqual(gmarg, self.data[valcol].mean()) - - # this is OK - table = self.data.pivot_table(index=['AA', 'BB'], margins=True, - aggfunc='mean') + for value_col in table.columns: + totals = table.loc[('All', ''), value_col] + self.assertEqual(totals, self.data[value_col].mean()) # no rows rtable = self.data.pivot_table(columns=['AA', 'BB'], margins=True, aggfunc=np.mean) tm.assertIsInstance(rtable, Series) + + table = self.data.pivot_table(index=['AA', 'BB'], margins=True, + aggfunc='mean') for item in ['DD', 'EE', 'FF']: - gmarg = table[item]['All', ''] - self.assertEqual(gmarg, self.data[item].mean()) + totals = table.loc[('All', ''), item] + self.assertEqual(totals, self.data[item].mean()) # issue number #8349: pivot_table with margins and dictionary aggfunc + data = [ + {'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2013, + 'MONTH': 12, 'DAYS': 3, 'SALARY': 17}, + {'JOB': 'Employ', 'NAME': + 'Mary', 'YEAR': 2013, 'MONTH': 12, 'DAYS': 5, 'SALARY': 23}, + {'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014, + 'MONTH': 1, 'DAYS': 10, 'SALARY': 100}, + {'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014, + 'MONTH': 1, 'DAYS': 11, 'SALARY': 110}, + {'JOB': 'Employ', 'NAME': 'Mary', 'YEAR': 2014, + 'MONTH': 1, 'DAYS': 15, 'SALARY': 200}, + {'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014, + 'MONTH': 2, 'DAYS': 8, 'SALARY': 80}, + {'JOB': 'Employ', 'NAME': 'Mary', 'YEAR': 2014, + 'MONTH': 2, 'DAYS': 5, 'SALARY': 190}, + ] - df=DataFrame([ {'JOB':'Worker','NAME':'Bob' ,'YEAR':2013,'MONTH':12,'DAYS': 3,'SALARY': 17}, - {'JOB':'Employ','NAME':'Mary','YEAR':2013,'MONTH':12,'DAYS': 5,'SALARY': 23}, - {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':10,'SALARY':100}, - {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':11,'SALARY':110}, - {'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 1,'DAYS':15,'SALARY':200}, - {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 2,'DAYS': 8,'SALARY': 80}, - {'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 2,'DAYS': 5,'SALARY':190} ]) - - df=df.set_index(['JOB','NAME','YEAR','MONTH'],drop=False,append=False) - - rs=df.pivot_table( index=['JOB','NAME'], - columns=['YEAR','MONTH'], - values=['DAYS','SALARY'], - aggfunc={'DAYS':'mean','SALARY':'sum'}, - margins=True) + df = DataFrame(data) - ex=df.pivot_table(index=['JOB','NAME'],columns=['YEAR','MONTH'],values=['DAYS'],aggfunc='mean',margins=True) + df = df.set_index(['JOB', 'NAME', 'YEAR', 'MONTH'], drop=False, + append=False) - tm.assert_frame_equal(rs['DAYS'], ex['DAYS']) + result = df.pivot_table(index=['JOB', 'NAME'], + columns=['YEAR', 'MONTH'], + values=['DAYS', 'SALARY'], + aggfunc={'DAYS': 'mean', 'SALARY': 'sum'}, + margins=True) - ex=df.pivot_table(index=['JOB','NAME'],columns=['YEAR','MONTH'],values=['SALARY'],aggfunc='sum',margins=True) + expected = df.pivot_table(index=['JOB', 'NAME'], + columns=['YEAR', 'MONTH'], values=['DAYS'], + aggfunc='mean', margins=True) - tm.assert_frame_equal(rs['SALARY'], ex['SALARY']) + tm.assert_frame_equal(result['DAYS'], expected['DAYS']) + expected = df.pivot_table(index=['JOB', 'NAME'], + columns=['YEAR', 'MONTH'], values=['SALARY'], + aggfunc='sum', margins=True) + tm.assert_frame_equal(result['SALARY'], expected['SALARY']) def test_pivot_integer_columns(self): # caused by upstream bug in unstack @@ -402,6 +426,25 @@ def test_margins_no_values_two_row_two_cols(self): result = self.data[['A', 'B', 'C', 'D']].pivot_table(index=['A', 'B'], columns=['C', 'D'], aggfunc=len, margins=True) self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0]) + def test_pivot_table_with_margins_set_margin_name(self): + # GH 3335 + for margin_name in ['foo', 'one', 666, None, ['a', 'b']]: + with self.assertRaises(ValueError): + # multi-index index + pivot_table(self.data, values='D', index=['A', 'B'], + columns=['C'], margins=True, + margins_name=margin_name) + with self.assertRaises(ValueError): + # multi-index column + pivot_table(self.data, values='D', index=['C'], + columns=['A', 'B'], margins=True, + margins_name=margin_name) + with self.assertRaises(ValueError): + # non-multi-index index/column + pivot_table(self.data, values='D', index=['A'], + columns=['B'], margins=True, + margins_name=margin_name) + def test_pivot_timegrouper(self): df = DataFrame({ 'Branch' : 'A A A A A A A B'.split(),