# 数据聚合

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [2]:
# Demo frame: two string key columns plus two columns of five random draws each.
# data1 is drawn before data2, so the order of the random calls is unchanged.
df = DataFrame(dict(
    key1=list('aabba'),
    key2='one two one two one'.split(),
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.02454,1.9867
1,a,two,-0.357569,-0.339052
2,b,one,0.87104,0.123964
3,b,two,0.944092,0.587429
4,a,one,-1.150627,1.325387


In [3]:
# Group the rows of df by the values in key1; later cells reuse this groupby object.
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9) # 0.9 quantile of data1 within each group

key1
a   -0.490963
b    0.936787
Name: data1, dtype: float64

In [4]:
def peak_to_peak(arr):
    """Return the range of ``arr``: its maximum minus its minimum.

    ``arr`` must provide ``max()`` and ``min()`` methods whose results
    support subtraction (e.g. a numeric Series or ndarray).
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
# Apply a user-defined aggregation to every group.
# cc: handy -- besides built-ins such as sum, agg also accepts custom functions.
# The numeric columns are selected explicitly because key2 holds strings, which
# peak_to_peak cannot subtract; pandas >= 2.0 raises a TypeError instead of
# silently dropping such columns as older versions did.
grouped[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.793058,2.325752
b,0.073052,0.463465


In [5]:
grouped.describe() # summary statistics computed separately for each group


Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-0.844246,0.426163,-1.150627,-1.087584,-1.02454,-0.691055,-0.357569,3.0,0.991012,1.198389,-0.339052,0.493167,1.325387,1.656043,1.9867
b,2.0,0.907566,0.051655,0.87104,0.889303,0.907566,0.925829,0.944092,2.0,0.355697,0.327719,0.123964,0.23983,0.355697,0.471563,0.587429


优化过的聚合函数：
- `count`：     非NA值的数量
- `sum`：       非NA值的和
- `mean`：      非NA值的平均数
- `median`：    非NA值的中位数
- `std/var`：   无偏（分母为n - 1）的标准差和方差
- `min/max`：   非NA值的最小/最大值
- `prod`：      非NA值的积
- `first/last`：第一个/最后一个非NA值

In [7]:
# Load the tipping data, then append tip_pct: the tip as a fraction of the bill total.
tips = pd.read_csv('tips.csv')
tips = tips.assign(tip_pct=tips['tip'] / tips['total_bill'])
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


## 面向列的多函数应用

In [8]:
# The book's original example groups by sex and smoker; this tips.csv has no sex
# column (the note's author suspects it was removed), so time is used instead.
grouped = tips.groupby(['smoker', 'time']) # group by smoker status and time
grouped_pct = grouped['tip_pct']
# grouped_pct.agg('mean') # equivalent to the line below
grouped_pct.mean()

smoker  time  
No      Dinner    0.134633
Name: tip_pct, dtype: float64

In [9]:
grouped_pct.agg(['mean', 'std', peak_to_peak]) # apply three aggregations at once; each becomes a column


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,Dinner,0.134633,0.043359,0.107141


In [11]:
# Each (name, function) tuple renames the resulting column: foo = mean, bar = std.
# The string 'std' is used instead of np.std: pandas 2.1+ emits a FutureWarning
# when a NumPy reduction is passed to groupby agg, and recommends the string
# alias to keep the groupby implementation of std.
grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])


Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1
No,Dinner,0.134633,0.043359


In [12]:
# Apply the same list of aggregations to two columns of the grouped data; the
# result has hierarchical columns (source column on top, statistic below).
functions = ['count', 'mean', 'max']
# Several columns must be selected with a list: the tuple form
# grouped['tip_pct', 'total_bill'] was deprecated and raises in pandas >= 2.0.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result

  result = grouped['tip_pct', 'total_bill'].agg(functions) # 对group后的两个字段分别作用functions


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
smoker,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
No,Dinner,5,0.134633,0.166587,5,19.322,24.59


In [13]:
# Pick out the tip_pct statistics from the top level of result's hierarchical columns.
result['tip_pct']


Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,Dinner,5,0.134633,0.166587


In [14]:
# (name, function) tuples give the aggregated columns custom labels.
# 'var' replaces np.var: passing NumPy reductions to groupby agg emits a
# FutureWarning in pandas 2.1+; the string alias keeps the groupby variance.
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', 'var')]
# Columns are selected with a list; tuple selection raises in pandas >= 2.0.
grouped[['tip_pct', 'total_bill']].agg(ftuples)

  grouped['tip_pct', 'total_bill'].agg(ftuples)


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
smoker,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
No,Dinner,0.134633,0.00188,19.322,33.92697


In [15]:
# A dict maps each column to its own aggregation: the largest tip and the total
# party size per group. The string 'max' is used instead of np.max because
# pandas 2.1+ emits a FutureWarning when NumPy reductions are passed to agg.
grouped.agg({'tip': 'max', 'size': 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1
No,Dinner,3.61,14


In [16]:
# tip_pct gets four aggregations while size gets one, so the result has hierarchical columns.
grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],
             'size' : 'sum'}) # each column may take a different number of functions

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
smoker,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
No,Dinner,0.059447,0.166587,0.134633,0.043359,14


## 以“无索引”的形式返回聚合数据

In [17]:
# as_index=False keeps the group keys as ordinary columns instead of the index.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer skips
# non-numeric columns (here the string column day) and raises a TypeError.
tips.groupby(['smoker', 'time'], as_index=False).mean(numeric_only=True)


Unnamed: 0,smoker,time,total_bill,tip,size,tip_pct
0,No,Dinner,19.322,2.618,2.8,0.134633


In [None]:
# cc：也可以用reset_index()

In [18]:
# Alternative to as_index=False: aggregate with the keys in the index, then move
# them back into columns with reset_index().
# numeric_only=True is required on pandas >= 2.0, where mean() no longer skips
# non-numeric columns (here the string column day) and raises a TypeError.
tips.groupby(['smoker', 'time']).mean(numeric_only=True).reset_index()

Unnamed: 0,smoker,time,total_bill,tip,size,tip_pct
0,No,Dinner,19.322,2.618,2.8,0.134633
