In [4]:
import pandas as pd
from pandas import Series,DataFrame
import numpy as np

### 7.1 GroupBy机制

#### 一.  基本概念
1. groupby在R语言中又叫做,"split-apply-combine"  
  1. pandas数据结构根据传进来的key拆分为多个分组
  2. 将一个函数应用到每个分组产生各自的结果
  3. 组合这些结果到新的对象, 作为返回
2. 分组中的缺失值会被自动忽略.   
3. 查看分组情况 :  
 GroupBy.size()返回一个Series, 该Series的多层index表示分组数值. 数据为该分组内有几个数据 
  
#### 二.  Series的分组  
  1. `Series.groupby(otherSeries)` : 按照`otherSeries`的值分组, 然后把分组后的index复用到Series上, 形成分组信息
  2. Series分组后进行聚合操作, 产生的新Series结构为:  
   index: otherSeries分好组后的值.  
   `index.name=otherSeries.index.name`
   新Series的data = 聚合后的数据
  3. `Series.groupby([otherSeries1, otherSeries2, ...])` : 按照多个Series进行分组(多个分组键需放在一个list中传入)  
   按照多个otherSeries分组, 则产生的新Series的索引是多层级的. 第一层为otherSeries1的值, 第二层为otherSeries2的值
   
#### 三. DataFrame.groupby('columnIdx')  
1. DataFrame默认把所有列, 按照`df['columnIdx']`这个Series进行分组   
 若分组后执行的聚合函数为mean这种数值函数, 则mean()会自动忽略非数值列
 
2. DataFrame语法糖  
 `df.groupby('columnidx')['show_columnidx']` 等价于 `df['show_columnidx'].groupby(df['columnidx'])`

In [5]:
# Toy frame: two categorical key columns plus two columns of random floats.
df = DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two'] * 2 + ['one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))
df

Unnamed: 0,data1,data2,key1,key2
0,1.055639,0.295956,a,one
1,1.668246,-0.798594,a,two
2,-0.779818,1.974093,b,one
3,-1.046608,1.161606,b,two
4,-0.527844,2.275711,a,one


In [8]:
# Building the SeriesGroupBy object performs no computation yet:
# the rows are only partitioned by the values of df['key1'], and that
# partition of the index is then reused for df['data1'].
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x10f11d250>

In [9]:
# Mean of data1 within each key1 group; the result is indexed by the key1 values.
grouped.mean()

key1
a    0.732014
b   -0.913213
Name: data1, dtype: float64

In [14]:
# Series.groupby([otherSeries1, otherSeries2]): pass the key Series as a list
# to group on both; the result carries a two-level (key1, key2) index.
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means

key1  key2
a     one     0.263897
      two     1.668246
b     one    -0.779818
      two    -1.046608
Name: data1, dtype: float64

In [15]:
# Pivot the inner index level (key2) into columns, leaving key1 as the row index.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.263897,1.668246
b,-0.779818,-1.046608


In [17]:
# Grouping a DataFrame
# mean() here leaves the non-numeric column 'key2' out of the result.
# NOTE(review): pandas 2.0+ is expected to raise instead of dropping it unless
# numeric_only=True is passed — confirm against the installed version.
df.groupby('key1').mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.732014,0.591024
b,-0.913213,1.56785


In [20]:
# GroupBy.size() reports how many rows fall into each (key1, key2) group.
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

#### 四. 分组迭代
1. 因为分组条件为多个Series的值区分, 所以可以查看每个分组的层级索引, 及组内包含的数据`Series/DataFrame`  
 `for name,Group in df.groupby('columnIdx')`
2. 通常一个有用的操作是:  
 把分组名与分组内容形成字典dict,可使用`dict(list(df.groupby(['columnidx'])))`
 1. 用list(GroupBy), 把GroupBy内的iterator元组构成列表
 2. 再用dict(list) : 把`(组名,分组内容)`这个列表变成`dict`
 
3. 分组通常是把行分组, 也可指定axis=1,使其把列分组

In [24]:
# Iterate over the groups: each step yields the group key (a tuple when
# grouping by several columns) together with the rows belonging to that group.
for name, group in df.groupby(['key1', 'key2']):
    # print() with a single argument behaves the same on Python 2 and Python 3,
    # whereas the bare print statement is a syntax error on Python 3.
    print(name)
    print(group)
    print("-------------------------------")

('a', 'one')
      data1     data2 key1 key2
0  1.055639  0.295956    a  one
4 -0.527844  2.275711    a  one
-------------------------------
('a', 'two')
      data1     data2 key1 key2
1  1.668246 -0.798594    a  two
-------------------------------
('b', 'one')
      data1     data2 key1 key2
2 -0.779818  1.974093    b  one
-------------------------------
('b', 'two')
      data1     data2 key1 key2
3 -1.046608  1.161606    b  two
-------------------------------


In [27]:
# Materialise the groups as a dict: list() collects the (name, group) pairs
# produced by iterating the GroupBy, and dict() maps each name to its sub-frame.
dict(list(df.groupby('key1')))

{'a':       data1     data2 key1 key2
 0  1.055639  0.295956    a  one
 1  1.668246 -0.798594    a  two
 4 -0.527844  2.275711    a  one, 'b':       data1     data2 key1 key2
 2 -0.779818  1.974093    b  one
 3 -1.046608  1.161606    b  two}

In [37]:
# Group the columns (axis=1) instead of the rows, keyed by each column's dtype.
# df.dtypes is a Series indexed by column name, so it can serve as the key.
# print() with a single argument works on both Python 2 and Python 3.
print(df.dtypes)
# NOTE(review): axis=1 grouping is reported as deprecated in recent pandas
# releases — confirm before upgrading.
grouped = df.groupby(df.dtypes,axis=1)

data1    float64
data2    float64
key1      object
key2      object
dtype: object


In [38]:
# Each iteration yields a dtype and the sub-frame of the columns sharing it.
# print() with a single argument works on both Python 2 and Python 3.
for dtype, group in grouped:
    print(dtype)
    print(group)
    print('-------------------------------')

float64
      data1     data2
0  1.055639  0.295956
1  1.668246 -0.798594
2 -0.779818  1.974093
3 -1.046608  1.161606
4 -0.527844  2.275711
-------------------------------
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
-------------------------------


#### 五.  其它分组条件
1. 字典做分组条件  
  1. 按照字典的value分组, 记录下被分到同一个group的key
  2. key对应pandas数据结构的index/columnIdx, 从而对pandas的数据结构进行分组

2. 函数作为分组条件  
  1. 函数会应用在pandas数据结构的index/columnIdx上, 返回值相同的归为同一个分组
  
3. 根据索引级别分组

In [39]:
# 5x5 frame of random floats: one row per person, columns 'a' through 'e'.
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)
people

Unnamed: 0,a,b,c,d,e
Joe,1.268282,0.202598,0.221861,-0.250672,0.927377
Steve,-0.209567,-0.487329,-0.772916,-0.253466,-0.834534
Wes,-0.136008,0.324481,0.996862,1.437169,0.602457
Jim,0.707341,1.161476,-0.337354,-0.728394,-0.717385
Travis,0.888363,0.277433,0.640911,0.596625,-1.780832


In [42]:
# Use a dict as the grouping key: each column label is looked up in the
# mapping and columns that map to the same value are summed together.
# 'f' is not a column of people, and no 'orange' group shows up in the result.
mapping = {'a': 'red', 
           'b': 'red', 
           'c': 'blue',
           'd': 'blue', 
           'e': 'red', 
           'f': 'orange'}
people.groupby(mapping,axis=1).sum()

Unnamed: 0,blue,red
Joe,-0.028811,2.398257
Steve,-1.026383,-1.531429
Wes,2.434031,0.79093
Jim,-1.065748,1.151431
Travis,1.237536,-0.615035


In [43]:
# Use a function as the grouping key: len is applied to every index label,
# so rows whose names have the same length are summed together.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,1.839615,1.688555,0.881369,0.458103,0.812449
5,-0.209567,-0.487329,-0.772916,-0.253466,-0.834534
6,0.888363,0.277433,0.640911,0.596625,-1.780832


In [49]:
# Group by a level of a hierarchical index.
# The column labels form a two-level MultiIndex: country ('cty') over 'tenor'.
# Note: this rebinds df, replacing the frame built earlier in the notebook.
columnIdx = pd.MultiIndex.from_tuples(
    [('US', 1), ('US', 3), ('US', 5), ('JP', 1), ('JP', 3)],
    names=['cty', 'tenor'],
)
df = DataFrame(np.random.randn(4, 5), columns=columnIdx)
df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.148032,-1.586521,-1.028079,-0.780357,-1.516666
1,-0.93689,0.003949,0.472229,-0.011324,-0.4605
2,0.53228,1.047845,1.070381,0.196122,-0.379646
3,-0.225431,1.707557,0.838661,0.808399,1.616353


In [50]:
# Group the columns by the 'cty' level of the column MultiIndex and count
# the values in each group, row by row.
df.groupby(level='cty',axis=1).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
