In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Seeded generator so the random columns come out identical on every
# Restart Kernel -> Run All instead of changing with each execution.
rng = np.random.default_rng(0)

# Toy frame: two categorical key columns plus two columns of standard-normal draws.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': rng.standard_normal(5),
    'data2': rng.standard_normal(5),
})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.57548,1.330722
1,a,two,-0.168377,0.352496
2,b,one,-0.253739,0.418349
3,b,two,-0.028151,0.113045
4,a,one,0.59481,-1.513047


In [3]:
# Split the rows of df by their key1 value.
grouped = df.groupby(by='key1')
# quantile is the sample-quantile method of Series and DataFrame; applied to the
# grouped data1 column it returns the 0.9 quantile within each key1 group.
grouped['data1'].quantile(q=0.9)

key1
a    0.442173
b   -0.050710
Name: data1, dtype: float64

In [4]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` only needs ``max()`` and ``min()`` methods whose results
    support subtraction.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

# To use a custom aggregation function, pass it to agg (or its long form, aggregate).
# Only the numeric columns are selected: peak_to_peak subtracts values, which is
# undefined for the string column key2, and pandas >= 2.0 raises TypeError instead
# of silently dropping such columns from the result.
grouped[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.17029,2.843769
b,0.225587,0.305305


In [5]:
grouped.describe()  # strictly speaking describe is not an aggregation: it yields several summary statistics per group

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-0.049682,0.594105,-0.57548,-0.371928,-0.168377,0.213217,0.59481,3.0,0.056724,1.444772,-1.513047,-0.580275,0.352496,0.841609,1.330722
b,2.0,-0.140945,0.159514,-0.253739,-0.197342,-0.140945,-0.084548,-0.028151,2.0,0.265697,0.215883,0.113045,0.189371,0.265697,0.342023,0.418349


### 面向列的多函数应用

In [6]:
# Load the tipping data and append tip_pct: the tip as a fraction of the total bill.
tips = pd.read_csv('../data/examples/tips.csv').assign(
    tip_pct=lambda frame: frame['tip'] / frame['total_bill']
)
tips.head(6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


In [7]:
# Group the tips rows on the (day, smoker) pair.
grouped = tips.groupby(by=['day', 'smoker'])

# Naming a built-in aggregation by string is equivalent to calling the method
# directly, i.e. grouped['tip_pct'].mean().
grouped['tip_pct'].agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [8]:
def peak_to_peak(arr):
    """Range of ``arr``, computed as ``arr.max() - arr.min()``."""
    upper, lower = arr.max(), arr.min()
    return upper - lower

# Passing a list of functions (string names and/or callables) produces one
# output column per function, labelled with the function's name.
stat_funcs = ['mean', 'std', peak_to_peak]
grouped['tip_pct'].agg(stat_funcs)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124
