# Chapter 10: Data Aggregation and Group Operations

In [2]:
import pandas as pd
import numpy as np
import json

## 10.1 GroupBy Mechanics

- Calling `.groupby()` returns a GroupBy object: a `DataFrameGroupBy` when called on a DataFrame and a `SeriesGroupBy` when called on a Series.
- The GroupBy object has not actually computed anything yet, except for some intermediate data about the group key. The idea is that this object has all of the information needed to then apply some operation to each of the groups.

In [3]:
# Example - Group data and compute the mean value
# Seed NumPy's global RNG so the random columns below (and later np.random
# draws in a top-to-bottom run) are identical on every Restart & Run All.
np.random.seed(12345)
df = pd.DataFrame({'key1': list('aabba'),
                   'key2': 'one two one two one'.split(' '),
                   'data1': np.random.rand(5),
                   'data2': np.random.rand(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.971797,0.558503
1,a,two,0.390162,0.729197
2,b,one,0.770855,0.844706
3,b,two,0.802311,0.915154
4,a,one,0.221763,0.937341


In [4]:
# Group the data1 column by the labels in key1; this returns a lazy
# <SeriesGroupBy> object.
data1Values = df['data1']
key1Labels = df['key1']
groupedData = data1Values.groupby(key1Labels)
groupedData

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001533276B408>

In [5]:
# Mean of data1 within each key1 group
groupedData.agg('mean')

key1
a    0.527907
b    0.786583
Name: data1, dtype: float64

In [6]:
# Example - Group data with multiple keys --> Series with a hierarchical index
# One key Series per index level of the result.
groupKeys = [df['key1'], df['key2']]
means = df['data1'].groupby(groupKeys).mean()
means

key1  key2
a     one     0.596780
      two     0.390162
b     one     0.770855
      two     0.802311
Name: data1, dtype: float64

In [7]:
# Pivot the innermost index level (key2) into columns
means.unstack(level='key2')

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.59678,0.390162
b,0.770855,0.802311


### Iterating Over Groups

- The GroupBy object supports iteration, generating a sequence of 2-tuples containing the group name along with the chunk of data

In [8]:
# Example - Iterate through a GroupBy object
# Each iteration yields the group name and the sub-DataFrame for that group.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one  0.971797  0.558503
1    a  two  0.390162  0.729197
4    a  one  0.221763  0.937341
b
  key1 key2     data1     data2
2    b  one  0.770855  0.844706
3    b  two  0.802311  0.915154


In [9]:
# Example - Iterate through a GroupBy object with multiple keys
# With several keys the group name is a tuple, unpacked here into its parts.
for (key1, key2), group in df.groupby(['key1', 'key2']):
    print((key1, key2), group, sep='\n')

('a', 'one')
  key1 key2     data1     data2
0    a  one  0.971797  0.558503
4    a  one  0.221763  0.937341
('a', 'two')
  key1 key2     data1     data2
1    a  two  0.390162  0.729197
('b', 'one')
  key1 key2     data1     data2
2    b  one  0.770855  0.844706
('b', 'two')
  key1 key2     data1     data2
3    b  two  0.802311  0.915154


### Grouping with Dicts and Series

In [24]:
# Example - Grouping using Dicts & Series
# 5x5 frame of uniform random values: one row per person, columns a-e.
people = pd.DataFrame(
    np.random.rand(5, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
)
people

Unnamed: 0,a,b,c,d,e
Joe,0.77257,0.034328,0.911775,0.049979,0.304239
Steve,0.36444,0.540007,0.883966,0.536263,0.414379
Wes,0.59737,0.826184,0.038518,0.403087,0.292405
Jim,0.561424,0.607781,0.706336,0.124551,0.714658
Travis,0.705123,0.909332,0.440143,0.594523,0.074591


In [22]:
# Column label -> group name. 'f' is not a column of `people`.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}
mapping

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

In [25]:
## Grouping with Dict
# Group the *columns* by the colour each label maps to. `groupby(axis=1)` is
# deprecated since pandas 2.1, so transpose, group the rows, and transpose the
# aggregated result back; this gives the same table on every pandas version.
gbColumn = people.T.groupby(mapping)
gbColumn.sum().T

Unnamed: 0,blue,red
Joe,0.961754,1.111137
Steve,1.42023,1.318826
Wes,0.441605,1.715959
Jim,0.830887,1.883863
Travis,1.034667,1.689046


In [26]:
# Same column -> colour mapping, held as a Series indexed by column label
mapSeries = pd.Series(data=mapping)
mapSeries

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [28]:
## Grouping with Series
# Count the columns in each colour group, per person. `groupby(axis=1)` is
# deprecated since pandas 2.1, so group the rows of the transpose and
# transpose the result back instead.
people.T.groupby(mapSeries).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,2,3
Jim,2,3
Travis,2,3


### Grouping with Functions

In [30]:
# Example - Group by Function
# The function is called once per index label; here rows are grouped by the
# length of each person's name.
people.groupby(by=len).sum()

Unnamed: 0,a,b,c,d,e
3,1.931364,1.468293,1.65663,0.577616,1.311302
5,0.36444,0.540007,0.883966,0.536263,0.414379
6,0.705123,0.909332,0.440143,0.594523,0.074591


In [31]:
# Example - Combine grouping by Function and arrays (can be used with Dicts and Series as well)
# One label per row of `people`, used alongside the name-length key.
keyList = ['one', 'one', 'one', 'two', 'two']
people.groupby(by=[len, keyList]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.59737,0.034328,0.038518,0.049979,0.292405
3,two,0.561424,0.607781,0.706336,0.124551,0.714658
5,one,0.36444,0.540007,0.883966,0.536263,0.414379
6,two,0.705123,0.909332,0.440143,0.594523,0.074591


### Grouping by Index levels

In [46]:
# Example - Group by Index
# Two-level column index: country ('cty') on top, tenor (string labels) below.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        ['1', '3', '4', '1', '3'],
    ],
    names=['cty', 'tenor'],
)
hierDF = pd.DataFrame(np.random.rand(4, 5), columns=columns)
hierDF

cty,US,US,US,JP,JP
tenor,1,3,4,1,3
0,0.858101,0.483317,0.239579,0.006839,0.67787
1,0.878298,0.922941,0.774862,0.909374,0.312285
2,0.639611,0.059769,0.819949,0.116496,0.614482
3,0.813883,0.605324,0.596489,0.484791,0.803022
