Skip to content

Commit

Permalink
Merge pull request #11581 from lexual/issue_3335_pivot_handle_all_for…
Browse files Browse the repository at this point in the history
…_margins

ENH: #3335 Pivot table support for setting name of margins column.
  • Loading branch information
jreback committed Nov 15, 2015
2 parents 96c1f63 + 1ca006c commit 10fe47e
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 69 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.17.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ Enhancements

pd.Index([1, np.nan, 3]).fillna(2)

- ``pivot_table`` now has a ``margins_name`` argument that sets the name of the margins (totals) row / column, so you can use something other than the default of 'All' (:issue:`3335`)

.. _whatsnew_0171.api:

API changes
Expand Down
70 changes: 49 additions & 21 deletions pandas/tools/pivot.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
# pylint: disable=E1103

import warnings

from pandas import Series, DataFrame
from pandas.core.index import MultiIndex, Index
from pandas.core.groupby import Grouper
from pandas.tools.merge import concat
from pandas.tools.util import cartesian_product
from pandas.compat import range, lrange, zip
from pandas.util.decorators import deprecate_kwarg
from pandas import compat
import pandas.core.common as com
import numpy as np


def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
fill_value=None, margins=False, dropna=True):
fill_value=None, margins=False, dropna=True,
margins_name='All'):
"""
Create a spreadsheet-style pivot table as a DataFrame. The levels in the
pivot table will be stored in MultiIndex objects (hierarchical indexes) on
Expand All @@ -40,6 +40,9 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
Add all row / columns (e.g. for subtotal / grand totals)
dropna : boolean, default True
Do not include columns whose entries are all NaN
margins_name : string, default 'All'
Name of the row / column that will contain the totals
when margins is True.
Examples
--------
Expand Down Expand Up @@ -127,7 +130,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
m = MultiIndex.from_arrays(cartesian_product(table.columns.levels))
table = table.reindex_axis(m, axis=1)
except AttributeError:
pass # it's a single level or a series
pass # it's a single level or a series

if isinstance(table, DataFrame):
if isinstance(table.columns, MultiIndex):
Expand All @@ -140,7 +143,8 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',

if margins:
table = _add_margins(table, data, values, rows=index,
cols=columns, aggfunc=aggfunc)
cols=columns, aggfunc=aggfunc,
margins_name=margins_name)

# discard the top level
if values_passed and not values_multi:
Expand All @@ -155,29 +159,49 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
DataFrame.pivot_table = pivot_table


def _add_margins(table, data, values, rows, cols, aggfunc):
def _add_margins(table, data, values, rows, cols, aggfunc,
margins_name='All'):
if not isinstance(margins_name, compat.string_types):
raise ValueError('margins_name argument must be a string')

exception_msg = 'Conflicting name "{0}" in margins'.format(margins_name)
for level in table.index.names:
if margins_name in table.index.get_level_values(level):
raise ValueError(exception_msg)

grand_margin = _compute_grand_margin(data, values, aggfunc)
grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)

# could be passed a Series object with no 'columns'
if hasattr(table, 'columns'):
for level in table.columns.names[1:]:
if margins_name in table.columns.get_level_values(level):
raise ValueError(exception_msg)

if len(rows) > 1:
key = (margins_name,) + ('',) * (len(rows) - 1)
else:
key = margins_name

if not values and isinstance(table, Series):
# If there are no values and the table is a series, then there is only
# one column in the data. Compute grand margin and return it.
row_key = ('All',) + ('',) * (len(rows) - 1) if len(rows) > 1 else 'All'
return table.append(Series({row_key: grand_margin['All']}))
return table.append(Series({key: grand_margin[margins_name]}))

if values:
marginal_result_set = _generate_marginal_results(table, data, values, rows, cols, aggfunc, grand_margin)
marginal_result_set = _generate_marginal_results(table, data, values,
rows, cols, aggfunc,
grand_margin,
margins_name)
if not isinstance(marginal_result_set, tuple):
return marginal_result_set
result, margin_keys, row_margin = marginal_result_set
else:
marginal_result_set = _generate_marginal_results_without_values(table, data, rows, cols, aggfunc)
marginal_result_set = _generate_marginal_results_without_values(
table, data, rows, cols, aggfunc, margins_name)
if not isinstance(marginal_result_set, tuple):
return marginal_result_set
result, margin_keys, row_margin = marginal_result_set

key = ('All',) + ('',) * (len(rows) - 1) if len(rows) > 1 else 'All'

row_margin = row_margin.reindex(result.columns)
# populate grand margin
for k in margin_keys:
Expand All @@ -201,7 +225,8 @@ def _add_margins(table, data, values, rows, cols, aggfunc):
return result


def _compute_grand_margin(data, values, aggfunc):
def _compute_grand_margin(data, values, aggfunc,
margins_name='All'):

if values:
grand_margin = {}
Expand All @@ -220,18 +245,19 @@ def _compute_grand_margin(data, values, aggfunc):
pass
return grand_margin
else:
return {'All': aggfunc(data.index)}

return {margins_name: aggfunc(data.index)}

def _generate_marginal_results(table, data, values, rows, cols, aggfunc, grand_margin):

def _generate_marginal_results(table, data, values, rows, cols, aggfunc,
grand_margin,
margins_name='All'):
if len(cols) > 0:
# need to "interleave" the margins
table_pieces = []
margin_keys = []

def _all_key(key):
return (key, 'All') + ('',) * (len(cols) - 1)
return (key, margins_name) + ('',) * (len(cols) - 1)

if len(rows) > 0:
margin = data[rows + values].groupby(rows).agg(aggfunc)
Expand Down Expand Up @@ -282,15 +308,17 @@ def _all_key(key):
return result, margin_keys, row_margin


def _generate_marginal_results_without_values(table, data, rows, cols, aggfunc):
def _generate_marginal_results_without_values(
table, data, rows, cols, aggfunc,
margins_name='All'):
if len(cols) > 0:
# need to "interleave" the margins
margin_keys = []

def _all_key():
if len(cols) == 1:
return 'All'
return ('All', ) + ('', ) * (len(cols) - 1)
return margins_name
return (margins_name, ) + ('', ) * (len(cols) - 1)

if len(rows) > 0:
margin = data[rows].groupby(rows).apply(aggfunc)
Expand Down
139 changes: 91 additions & 48 deletions pandas/tools/tests/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,82 +224,106 @@ def test_pivot_with_tz(self):
tm.assert_frame_equal(pv, expected)

def test_margins(self):
def _check_output(res, col, index=['A', 'B'], columns=['C']):
cmarg = res['All'][:-1]
exp = self.data.groupby(index)[col].mean()
tm.assert_series_equal(cmarg, exp, check_names=False)
self.assertEqual(cmarg.name, 'All')

res = res.sortlevel()
rmarg = res.xs(('All', ''))[:-1]
exp = self.data.groupby(columns)[col].mean()
tm.assert_series_equal(rmarg, exp, check_names=False)
self.assertEqual(rmarg.name, ('All', ''))

gmarg = res['All']['All', '']
exp = self.data[col].mean()
self.assertEqual(gmarg, exp)
def _check_output(result, values_col, index=['A', 'B'],
columns=['C'],
margins_col='All'):
col_margins = result.ix[:-1, margins_col]
expected_col_margins = self.data.groupby(index)[values_col].mean()
tm.assert_series_equal(col_margins, expected_col_margins,
check_names=False)
self.assertEqual(col_margins.name, margins_col)

result = result.sortlevel()
index_margins = result.ix[(margins_col, '')].iloc[:-1]
expected_ix_margins = self.data.groupby(columns)[values_col].mean()
tm.assert_series_equal(index_margins, expected_ix_margins,
check_names=False)
self.assertEqual(index_margins.name, (margins_col, ''))

grand_total_margins = result.loc[(margins_col, ''), margins_col]
expected_total_margins = self.data[values_col].mean()
self.assertEqual(grand_total_margins, expected_total_margins)

# column specified
table = self.data.pivot_table('D', index=['A', 'B'], columns='C',
margins=True, aggfunc=np.mean)
_check_output(table, 'D')
result = self.data.pivot_table(values='D', index=['A', 'B'],
columns='C',
margins=True, aggfunc=np.mean)
_check_output(result, 'D')

# Set a different margins_name (not 'All')
result = self.data.pivot_table(values='D', index=['A', 'B'],
columns='C',
margins=True, aggfunc=np.mean,
margins_name='Totals')
_check_output(result, 'D', margins_col='Totals')

# no column specified
table = self.data.pivot_table(index=['A', 'B'], columns='C',
margins=True, aggfunc=np.mean)
for valcol in table.columns.levels[0]:
_check_output(table[valcol], valcol)
for value_col in table.columns.levels[0]:
_check_output(table[value_col], value_col)

# no col

# to help with a buglet
self.data.columns = [k * 2 for k in self.data.columns]
table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
aggfunc=np.mean)
for valcol in table.columns:
gmarg = table[valcol]['All', '']
self.assertEqual(gmarg, self.data[valcol].mean())

# this is OK
table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
aggfunc='mean')
for value_col in table.columns:
totals = table.loc[('All', ''), value_col]
self.assertEqual(totals, self.data[value_col].mean())

# no rows
rtable = self.data.pivot_table(columns=['AA', 'BB'], margins=True,
aggfunc=np.mean)
tm.assertIsInstance(rtable, Series)

table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
aggfunc='mean')
for item in ['DD', 'EE', 'FF']:
gmarg = table[item]['All', '']
self.assertEqual(gmarg, self.data[item].mean())
totals = table.loc[('All', ''), item]
self.assertEqual(totals, self.data[item].mean())

# issue number #8349: pivot_table with margins and dictionary aggfunc
data = [
{'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2013,
'MONTH': 12, 'DAYS': 3, 'SALARY': 17},
{'JOB': 'Employ', 'NAME':
'Mary', 'YEAR': 2013, 'MONTH': 12, 'DAYS': 5, 'SALARY': 23},
{'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014,
'MONTH': 1, 'DAYS': 10, 'SALARY': 100},
{'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014,
'MONTH': 1, 'DAYS': 11, 'SALARY': 110},
{'JOB': 'Employ', 'NAME': 'Mary', 'YEAR': 2014,
'MONTH': 1, 'DAYS': 15, 'SALARY': 200},
{'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014,
'MONTH': 2, 'DAYS': 8, 'SALARY': 80},
{'JOB': 'Employ', 'NAME': 'Mary', 'YEAR': 2014,
'MONTH': 2, 'DAYS': 5, 'SALARY': 190},
]

df=DataFrame([ {'JOB':'Worker','NAME':'Bob' ,'YEAR':2013,'MONTH':12,'DAYS': 3,'SALARY': 17},
{'JOB':'Employ','NAME':'Mary','YEAR':2013,'MONTH':12,'DAYS': 5,'SALARY': 23},
{'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':10,'SALARY':100},
{'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':11,'SALARY':110},
{'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 1,'DAYS':15,'SALARY':200},
{'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 2,'DAYS': 8,'SALARY': 80},
{'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 2,'DAYS': 5,'SALARY':190} ])

df=df.set_index(['JOB','NAME','YEAR','MONTH'],drop=False,append=False)

rs=df.pivot_table( index=['JOB','NAME'],
columns=['YEAR','MONTH'],
values=['DAYS','SALARY'],
aggfunc={'DAYS':'mean','SALARY':'sum'},
margins=True)
df = DataFrame(data)

ex=df.pivot_table(index=['JOB','NAME'],columns=['YEAR','MONTH'],values=['DAYS'],aggfunc='mean',margins=True)
df = df.set_index(['JOB', 'NAME', 'YEAR', 'MONTH'], drop=False,
append=False)

tm.assert_frame_equal(rs['DAYS'], ex['DAYS'])
result = df.pivot_table(index=['JOB', 'NAME'],
columns=['YEAR', 'MONTH'],
values=['DAYS', 'SALARY'],
aggfunc={'DAYS': 'mean', 'SALARY': 'sum'},
margins=True)

ex=df.pivot_table(index=['JOB','NAME'],columns=['YEAR','MONTH'],values=['SALARY'],aggfunc='sum',margins=True)
expected = df.pivot_table(index=['JOB', 'NAME'],
columns=['YEAR', 'MONTH'], values=['DAYS'],
aggfunc='mean', margins=True)

tm.assert_frame_equal(rs['SALARY'], ex['SALARY'])
tm.assert_frame_equal(result['DAYS'], expected['DAYS'])

expected = df.pivot_table(index=['JOB', 'NAME'],
columns=['YEAR', 'MONTH'], values=['SALARY'],
aggfunc='sum', margins=True)

tm.assert_frame_equal(result['SALARY'], expected['SALARY'])

def test_pivot_integer_columns(self):
# caused by upstream bug in unstack
Expand Down Expand Up @@ -402,6 +426,25 @@ def test_margins_no_values_two_row_two_cols(self):
result = self.data[['A', 'B', 'C', 'D']].pivot_table(index=['A', 'B'], columns=['C', 'D'], aggfunc=len, margins=True)
self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0])

def test_pivot_table_with_margins_set_margin_name(self):
    """GH 3335: every unusable ``margins_name`` must raise ``ValueError``.

    Each candidate name is tried against three table layouts so that the
    rejection is exercised for a multi-level index, for multi-level
    columns, and for the flat index/column case.
    """
    # 666, None and the list are not strings; 'foo' and 'one' presumably
    # collide with labels already present in ``self.data`` -- confirm
    # against the fixture, which is not visible here.
    rejected_names = ['foo', 'one', 666, None, ['a', 'b']]

    # (index keys, column keys) for each layout under test
    layouts = [
        (['A', 'B'], ['C']),   # multi-index index
        (['C'], ['A', 'B']),   # multi-index column
        (['A'], ['B']),        # non-multi-index index/column
    ]

    for margin_name in rejected_names:
        for row_keys, col_keys in layouts:
            with self.assertRaises(ValueError):
                # hand over fresh lists on every call, as the literal
                # arguments of the individual calls would be
                pivot_table(self.data, values='D', index=list(row_keys),
                            columns=list(col_keys), margins=True,
                            margins_name=margin_name)

def test_pivot_timegrouper(self):
df = DataFrame({
'Branch' : 'A A A A A A A B'.split(),
Expand Down

0 comments on commit 10fe47e

Please sign in to comment.