In [1]:
import pandas as pd
from pandas import Series,DataFrame
import numpy as np

### 10.1 GroupBy机制

#### 一.  基本概念
1. groupby在R语言中又叫做,"split-apply-combine"  
  1. pandas数据结构根据传进来的key拆分为多个分组
  2. 将一个函数应用到每个分组产生各自的结果
  3. 组合这些结果到新的对象, 作为返回
2. 分组中的缺失值会被自动忽略.   
3. 查看分组情况 :  
 GroupBy.size()返回一个Series: 其index(按多个键分组时为多层index)是各分组键的取值, 其数据为该分组内的行数 
  
#### 二.  Series的分组  
  1. `Series.groupby(otherSeries)` : 按照`otherSeries`的值分组, 然后把分组后的index复用到Series上, 形成分组信息
  2. Series分组后进行聚合操作, 产生的新Series结构为:  
   index: otherSeries分好组后的值.  
   `index.name=otherSeries.index.name`
   新Series的data = 聚合后的数据
  3. `Series.groupby([otherSeries1, otherSeries2, ...])` : 按照多个Series进行分组  
   按照多个otherSeries分组(以列表形式传入), 则产生的新Series的索引是多层级的. 第一层为otherSeries1的值, 第二层为otherSeries2的值
   
#### 三. DataFrame.groupby('columnIdx')  
1. DataFrame默认把所有列, 按照`df['columnIdx']`这个Series进行分组   
 若分组后执行的聚合函数为mean这种数值函数, 则旧版pandas的mean()会自动忽略非数值列 (pandas 2.0起不再自动忽略, 需要先选出数值列或显式传入`numeric_only=True`)
 
2. DataFrame语法糖  
 `df.groupby('columnidx')['show_columnidx']` 等价于 `df['show_columnidx'].groupby(df['columnidx'])`

In [2]:
# Build a small demo frame: two label columns used as grouping keys and two
# columns of standard-normal random draws (the values differ on every run).
df = DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
df

Unnamed: 0,data1,data2,key1,key2
0,-0.643532,1.420037,a,one
1,0.216824,0.364788,a,two
2,-1.089275,-0.738775,b,one
3,0.082878,0.451184,b,two
4,-0.51339,-0.715815,a,one


In [3]:
# Creating the SeriesGroupBy object does not compute anything yet: it only
# records how the rows of df['data1'] are split according to the values of
# the df['key1'] Series.
grouped = df['data1'].groupby(by=df['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7fd066fa8090>

In [4]:
# Aggregate each group to its mean; the result is indexed by the key1 values.
grouped.mean()

key1
a   -0.313366
b   -0.503199
Name: data1, dtype: float64

In [5]:
# Group by several key Series at once by passing them as a list; the
# aggregated result carries a two-level (key1, key2) index.
means = df['data1'].groupby(
    by=[df['key1'], df['key2']],
).mean()
means

key1  key2
a     one    -0.578461
      two     0.216824
b     one    -1.089275
      two     0.082878
Name: data1, dtype: float64

In [6]:
# Pivot the inner index level (key2) into columns to get a key1 x key2 table.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.578461,0.216824
b,-1.089275,0.082878


In [7]:
# Group the DataFrame rows by 'key1' and average the numeric columns.
# The numeric columns are selected explicitly: older pandas silently dropped
# the non-numeric 'key2' column in mean(), whereas pandas >= 2.0 raises
# instead of dropping it. The result is the same table either way.
df.groupby('key1')[['data1', 'data2']].mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.313366,0.356337
b,-0.503199,-0.143796


In [8]:
# GroupBy.size() shows how the rows were split: one entry per (key1, key2)
# combination, holding the number of rows in that group.
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

#### 四. 分组迭代
1. 因为分组条件为多个Series的值区分, 所以可以查看每个分组的层级索引, 及组内包含的数据`Series/DataFrame`  
 `for name,Group in df.groupby('columnIdx')`
2. 通常一个有用的操作是:  
 把分组名与分组内容形成字典dict,可使用`dict(list(df.groupby(['columnidx'])))`
 1. 用list(GroupBy), 把GroupBy内的iterator元组构成列表
 2. 再用dict(list) : 把`(组名,分组内容)`这个列表变成`dict`
 
3. 分组通常是把行分组, 也可指定axis=1,使其把列分组

In [9]:
# Iterate over the groups: each step yields the group key (a (key1, key2)
# tuple, because two keys are used) and the sub-DataFrame with that group's rows.
for name, group in df.groupby(['key1', 'key2']):
    # Single-argument print(...) gives the same output on Python 2 and 3,
    # while the bare `print x` statement is a SyntaxError on Python 3.
    print(name)
    print(group)
    print("-------------------------------")

('a', 'one')
      data1     data2 key1 key2
0 -0.643532  1.420037    a  one
4 -0.513390 -0.715815    a  one
-------------------------------
('a', 'two')
      data1     data2 key1 key2
1  0.216824  0.364788    a  two
-------------------------------
('b', 'one')
      data1     data2 key1 key2
2 -1.089275 -0.738775    b  one
-------------------------------
('b', 'two')
      data1     data2 key1 key2
3  0.082878  0.451184    b  two
-------------------------------


In [10]:
# Pack the groups into a dict: list(...) materialises the (name, group)
# pairs produced by iterating the GroupBy, and dict(...) maps each group
# name to its sub-DataFrame.
dict(list(df.groupby('key1')))

{'a':       data1     data2 key1 key2
 0 -0.643532  1.420037    a  one
 1  0.216824  0.364788    a  two
 4 -0.513390 -0.715815    a  one, 'b':       data1     data2 key1 key2
 2 -1.089275 -0.738775    b  one
 3  0.082878  0.451184    b  two}

In [11]:
# Group the columns (axis=1) rather than the rows, using each column's dtype
# as the grouping key.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(df.dtypes)  # df.dtypes is a Series: column name -> dtype
# NOTE: pandas >= 2.1 deprecates `axis=1` in groupby; kept here so the cell
# still produces the column-wise groups shown in the outputs.
grouped = df.groupby(df.dtypes, axis=1)

data1    float64
data2    float64
key1      object
key2      object
dtype: object


In [12]:
# Walk over the column groups built from df.dtypes: each step yields the
# dtype and the sub-DataFrame holding the columns of that dtype.
for dtype, group in grouped:
    # Function-call form of print works on both Python 2 and Python 3.
    print(dtype)
    print(group)
    print('-------------------------------')

float64
      data1     data2
0 -0.643532  1.420037
1  0.216824  0.364788
2 -1.089275 -0.738775
3  0.082878  0.451184
4 -0.513390 -0.715815
-------------------------------
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
-------------------------------


#### 五.  其它分组条件
1. 字典做分组条件  
  1. 按照字典的value分组, 记录下被分到同一个group的key
  2. key对应pandas数据结构的index/columnIdx, 从而对pandas的数据结构进行分组

2. 函数作为分组条件  
  1. 函数会应用在pandas数据结构的index/columnIdx的每个标签上, 返回值相同的标签被分到同一个分组
  
3. 根据索引级别分组

In [13]:
# 5x5 frame of standard-normal draws: columns 'a'..'e', rows labelled by name.
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)
people

Unnamed: 0,a,b,c,d,e
Joe,0.354449,-0.673048,-0.399155,0.905654,-0.263164
Steve,0.97702,-0.073533,1.978242,0.166866,0.252141
Wes,-0.429738,1.919791,-0.098522,0.448823,-1.454787
Jim,0.655896,0.311578,-0.778851,0.567376,-0.140001
Travis,-0.23101,-0.41232,-0.783344,-0.49181,0.1317


In [14]:
# Use a dict as the grouping key: every column label is looked up in the
# dict, and columns mapped to the same colour are summed together.
# 'f' is not a column of `people`, so that entry has no effect on the result.
mapping = dict(a='red', b='red', c='blue', d='blue', e='red', f='orange')
people.groupby(mapping, axis=1).sum()

Unnamed: 0,blue,red
Joe,0.506499,-0.581763
Steve,2.145108,1.155628
Wes,0.350301,0.035267
Jim,-0.211474,0.827474
Travis,-1.275155,-0.511631


In [15]:
# Use a function as the grouping key: len is applied to each index label
# (the person's name), so rows whose names have the same length are summed.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.580607,1.558322,-1.276527,1.921853,-1.857952
5,0.97702,-0.073533,1.978242,0.166866,0.252141
6,-0.23101,-0.41232,-0.783344,-0.49181,0.1317


In [16]:
# Build a frame whose columns form a two-level MultiIndex ('cty', 'tenor'),
# to demonstrate grouping by one level of a hierarchical index.
columnIdx = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        [1, 3, 5, 1, 3],
    ],
    names=['cty', 'tenor'],
)
df = DataFrame(np.random.randn(4, 5), columns=columnIdx)
df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.310956,-0.641511,-1.682796,-0.043149,-0.304086
1,-0.022172,0.42597,-0.705505,0.024931,-0.207422
2,-2.289864,1.187842,1.557849,-0.652476,1.003132
3,0.531849,-2.108305,-0.371139,-0.507844,-2.289831


In [17]:
# Group the columns by the 'cty' level of the column MultiIndex and count
# the non-null values per row within each country.
df.groupby(level='cty',axis=1).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


### 10.2 聚合函数

#### 1. 任何能从数组生成标量的过程, 都叫聚合  
 例如, Series/DataFrame的函数quantile(), 计算Series/DataFrame的分位数. 虽然quantile()没有明确定义在GroupBy对象中, 但是它能从一个Series产生一个标量, 因此它可以用作聚合函数
 
#### 2. 使用aggregate自定义聚合函数
 GroupBy.aggregate(func) 

In [22]:
# Use quantile() as an aggregation: rebuild the demo frame, then take the
# 0.9 quantile of data1 within each key1 group.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
df['data1'].groupby(by=df['key1']).quantile(0.9)

key1
a    0.498376
b    1.835900
Name: data1, dtype: float64

In [23]:
# Custom aggregation function.
def peek2peek(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` must provide ``max()`` and ``min()`` methods (for example a pandas
    Series or a NumPy array). The name is a misspelling of "peak to peak";
    it is kept unchanged so existing references keep working.
    """
    return x.max() - x.min()
# Apply the custom max-minus-min aggregation to the groups formed by 'key1'.
df.groupby('key1').aggregate(peek2peek)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.177589,2.892375
b,1.966989,1.757164


#### 3. GroupBy.agg()面向多列的多聚合函数应用
1. `GroupBy.agg('aggfuncName')` :  agg函数可使用函数名代表聚合函数
2. `GroupBy.agg([func1,func2])` :  在一列同时应用多个聚合函数
3.  `GroupBy.agg([('alias1','func1'),('alias2','func2')])` :   
 在一列应用多个函数时, 给聚合结果的各列起名(使用由`(别名, 函数名)`元组组成的列表)
4. 在多个列上进行多个聚合函数: `GroupBy[['columnIdx1','columnIdx2', ...]].agg(['aggfuncName1','aggfuncName2', ...])`
5. 终极应用, 定义在DataFrame的哪一列使用什么样的聚合函数:  
 `GroupBy.agg({'columnIdx1': ['functions1'], 'columnIdx2': ['functions2']})`

In [28]:
# Load the tipping data set from the examples directory.
df = pd.read_csv('../example/tips.csv')
# Derive each tip as a fraction of the corresponding total bill.
df['tip_pct'] = df['tip'].div(df['total_bill'])
df.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [37]:
# An aggregation can be requested by name, passed to agg() as a string.
grouped = df.groupby(by=['day', 'smoker'])
grouped_pct = grouped['tip_pct']  # SeriesGroupBy over the tip_pct column
grouped_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [33]:
# Apply several aggregations to one column at once: a list of function
# names yields one result column per function.
grouped_pct.agg(['mean','std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [34]:
# Give the result columns custom names by passing (name, function) tuples.
# 'std' is passed by name instead of np.std: pandas currently substitutes its
# own groupby std for np.std, but that substitution is deprecated, so the
# string form keeps the result stable across versions.
grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [35]:
# Apply several aggregations to several columns: the result has
# two-level columns (source column, aggregation name).
functions = ['mean', 'count', 'max']
# Select the columns with a list; indexing a GroupBy with a bare tuple of
# column names (grouped['a', 'b']) is deprecated and fails on newer pandas.
grouped[['total_bill', 'tip_pct']].agg(functions)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,total_bill,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,max,mean,count,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,18.42,4,22.75,0.15165,4,0.187735
Fri,Yes,16.813333,15,40.17,0.174783,15,0.26348
Sat,No,19.661778,45,48.33,0.158048,45,0.29199
Sat,Yes,21.276667,42,50.81,0.147906,42,0.325733
Sun,No,20.506667,57,48.17,0.160113,57,0.252672
Sun,Yes,24.12,19,45.35,0.18725,19,0.710345
Thur,No,17.113111,45,41.19,0.160298,45,0.266312
Thur,Yes,19.190588,17,43.11,0.163863,17,0.241255


In [38]:
# Most general form: a dict maps each column to the aggregation (or list of
# aggregations) to apply to it.
grouped.agg({'tip_pct':['min','max','mean'],'size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.120385,0.187735,0.15165,9
Fri,Yes,0.103555,0.26348,0.174783,31
Sat,No,0.056797,0.29199,0.158048,115
Sat,Yes,0.035638,0.325733,0.147906,104
Sun,No,0.059447,0.252672,0.160113,167
Sun,Yes,0.06566,0.710345,0.18725,49
Thur,No,0.072961,0.266312,0.160298,112
Thur,Yes,0.090014,0.241255,0.163863,40
