In [11]:
# 数据聚合
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [12]:
# Small demo frame: two label columns to group on and two columns of standard-normal noise.
# NOTE(review): the global NumPy RNG is not seeded here, so data1/data2 (and every
# number derived from them further down) change on each fresh run.
df = DataFrame(dict(
    key1=['a', 'a', 'b', 'b', 'a'],
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,2.634123,-1.7127
1,a,two,0.896584,1.219689
2,b,one,1.364799,-0.993468
3,b,two,-1.833026,1.384741
4,a,one,-0.242178,0.459315


In [13]:
# Split df into one group per distinct key1 value; later cells reuse this GroupBy object.
grouped = df.groupby(by='key1')
# 0.9 quantile of data1, computed separately inside each group.
grouped['data1'].quantile(q=0.9)

key1
a    2.286616
b    1.045017
Name: data1, dtype: float64

In [14]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` can be any object that provides ``max()`` and ``min()`` methods
    whose results support subtraction (it is used below as a groupby
    aggregation function).
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
# Apply the user-defined aggregation to every group.
# The numeric columns are selected explicitly: key2 holds strings, for which
# max() - min() is undefined. Older pandas silently dropped such columns, but
# pandas >= 2.0 raises a TypeError instead.
grouped[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.876301,2.932389
b,3.197825,2.37821


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) computed per key1 group.
group_summary = grouped.describe()
group_summary

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,1.096177,1.448501,-0.242178,0.327203,0.896584,1.765354,2.634123,3.0,-0.011232,1.521771,-1.7127,-0.626692,0.459315,0.839502,1.219689
b,2.0,-0.234113,2.261204,-1.833026,-1.03357,-0.234113,0.565343,1.364799,2.0,0.195636,1.681648,-0.993468,-0.398916,0.195636,0.790189,1.384741


In [16]:
# 优化过的聚合函数：
# count：     非NA值的数量
# sum：       非NA值的和
# mean：      非NA值的平均数
# median：    非NA值的中位数
# std/var：   无偏（分母为n - 1）的标准差和方差
# min/max：   非NA值的最小/最大值
# prod：      非NA值的积
# first/last：第一个/最后一个非NA值

In [17]:
# Load the tipping data set and append tip_pct: the tip as a fraction of the total bill.
tips = pd.read_csv('tips.csv').assign(
    tip_pct=lambda frame: frame['tip'] / frame['total_bill']
)
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [18]:
# 面向列的多函数应用

In [19]:
# The book's original example groups by sex and smoker. The sex column is absent from
# this copy of tips.csv, so the groups here are keyed on (smoker, time) instead.
grouped = tips.groupby(by=['smoker', 'time'])
grouped_pct = grouped['tip_pct']
# Mean tip_pct per group; grouped_pct.mean() is an equivalent spelling.
grouped_pct.agg('mean')

smoker  time  
No      Dinner    0.158653
        Lunch     0.160920
Yes     Dinner    0.160828
        Lunch     0.170404
Name: tip_pct, dtype: float64

In [20]:
# Run three aggregations in one call: two built-ins referenced by name plus the
# user-defined peak_to_peak. Each produces one result column named after the function.
grouped_pct.aggregate(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,Dinner,0.158653,0.040458,0.235193
No,Lunch,0.16092,0.038989,0.19335
Yes,Dinner,0.160828,0.095153,0.674707
Yes,Lunch,0.170404,0.04277,0.1693


In [21]:
 # Each (label, function) tuple names its result column: 'foo' holds the mean and
 # 'bar' the standard deviation. The aggregation is given by name ('std') rather than
 # as np.std: passing the NumPy callable is deprecated in recent pandas, and only
 # worked because pandas substituted its own sample (n - 1) standard deviation.
 grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1
No,Dinner,0.158653,0.040458
No,Lunch,0.16092,0.038989
Yes,Dinner,0.160828,0.095153
Yes,Lunch,0.170404,0.04277


In [22]:
# Apply the same list of aggregations to two columns at once; the result has
# two-level columns: (source column, aggregation name).
functions = ['count', 'mean', 'max']
# Several columns must be selected with a list. The bare-tuple form
# grouped['tip_pct', 'total_bill'] is deprecated and was removed in pandas 2.0.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result

  result = grouped['tip_pct', 'total_bill'].agg(functions) # 对group后的两个字段分别作用functions


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
smoker,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
No,Dinner,106,0.158653,0.29199,106,20.09566,48.33
No,Lunch,45,0.16092,0.266312,45,17.050889,41.19
Yes,Dinner,70,0.160828,0.710345,70,21.859429,50.81
Yes,Lunch,23,0.170404,0.259314,23,17.39913,43.11


In [23]:
# Pick the tip_pct branch of the two-level columns, leaving only the
# per-aggregation columns (count, mean, max) for that source column.
result.xs('tip_pct', axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,Dinner,106,0.158653,0.29199
No,Lunch,45,0.16092,0.266312
Yes,Dinner,70,0.160828,0.710345
Yes,Lunch,23,0.170404,0.259314


In [24]:
# (label, function) pairs: every aggregation is applied to each selected column and
# the label becomes the inner column name. The labels are German: "Durchschnitt" means
# average and "Abweichung" means deviation (that column actually holds the variance).
# The variance is requested by name ('var') instead of np.var, because passing NumPy
# callables to agg is deprecated in recent pandas.
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', 'var')]
# List selection replaces the tuple form grouped['a', 'b'], which was removed in pandas 2.0.
grouped[['tip_pct', 'total_bill']].agg(ftuples)

  grouped['tip_pct', 'total_bill'].agg(ftuples)


Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
smoker,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
No,Dinner,0.158653,0.001637,20.09566,69.604821
No,Lunch,0.16092,0.00152,17.050889,59.587154
Yes,Dinner,0.160828,0.009054,21.859429,104.148753
Yes,Lunch,0.170404,0.001829,17.39913,61.958436


In [25]:
# A dict maps each column to its own aggregation: the maximum of tip and the sum of
# size per group. Both are given by name; passing np.max is deprecated in recent pandas.
grouped.agg({'tip': 'max', 'size': 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
smoker,time,Unnamed: 2_level_1,Unnamed: 3_level_1
No,Dinner,9.0,290
No,Lunch,6.7,113
Yes,Dinner,10.0,173
Yes,Lunch,5.0,51


In [26]:
# Columns may receive different numbers of aggregations: tip_pct gets four statistics
# while size gets only its sum, so the result has two-level columns.
grouped.aggregate(dict(
    tip_pct=['min', 'max', 'mean', 'std'],
    size='sum',
))

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,sum
smoker,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
No,Dinner,0.056797,0.29199,0.158653,0.040458,290
No,Lunch,0.072961,0.266312,0.16092,0.038989,113
Yes,Dinner,0.035638,0.710345,0.160828,0.095153,173
Yes,Lunch,0.090014,0.259314,0.170404,0.04277,51


In [27]:
# 以“无索引”的形式返回聚合数据

In [28]:
# as_index=False keeps the group keys (smoker, time) as ordinary columns instead of
# moving them into the index. numeric_only=True restricts the mean to numeric columns:
# the string column day cannot be averaged, and pandas >= 2.0 raises on it rather than
# silently dropping it.
tips.groupby(['smoker', 'time'], as_index=False).mean(numeric_only=True)

Unnamed: 0,smoker,time,total_bill,tip,size,tip_pct
0,No,Dinner,20.09566,3.126887,2.735849,0.158653
1,No,Lunch,17.050889,2.673778,2.511111,0.16092
2,Yes,Dinner,21.859429,3.066,2.471429,0.160828
3,Yes,Lunch,17.39913,2.834348,2.217391,0.170404
