In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# 一.过滤 (filter)

可以按标签(行索引或列名)筛选出需要的行或列

In [2]:
# Small demo frame: two labelled rows ('mouse', 'rabbit') and three
# labelled columns ('one', 'two', 'three').
df = DataFrame(
    data=[[1, 2, 3], [4, 5, 6]],
    index=['mouse', 'rabbit'],
    columns=['one', 'two', 'three'],
)
df

Unnamed: 0,one,two,three
mouse,1,2,3
rabbit,4,5,6


In [3]:
# Filter columns by exact label: keep the columns named 'one' and 'three'.
df.filter(items=['one', 'three'])

Unnamed: 0,one,three
mouse,1,3
rabbit,4,6


In [4]:
# Filter rows (axis=0): keep the rows whose label contains the substring 'bbi'.
df.filter(like= 'bbi', axis=0)

Unnamed: 0,one,two,three
rabbit,4,5,6


In [5]:
# Filter by regular expression: 'e$' is anchored at the end of the label, so
# this keeps the columns whose name ENDS with 'e' (not merely contains 'e').
df.filter(regex= 'e$')

Unnamed: 0,one,three
mouse,1,3
rabbit,4,6


除agg外, pandas还提供了其他操作应用到分组运算中, 比如transform, apply等能够执行更多其他的分组运算

# 二.transform

In [6]:
# Five numeric columns plus a grouping label in 'key':
# rows 0-1 belong to group 'A', rows 2-4 to group 'B'.
df = DataFrame(
    data={
        'a': [0, 1, 5, 8, 6],
        'b': [5, 10, 9, 1, 6],
        'c': [3, 3, 8, 6, 4],
        'd': [6, 4, 9, 3, 4],
        'e': [9, 6, 3, 0, 0],
        'key': ['A', 'A', 'B', 'B', 'B'],
    },
)
df

Unnamed: 0,a,b,c,d,e,key
0,0,5,3,6,9,A
1,1,10,3,4,6,A
2,5,9,8,9,3,B
3,8,1,6,3,0,B
4,6,6,4,4,0,B


In [7]:
# Group the rows of df by the value in the 'key' column; the following cells
# inspect and aggregate this GroupBy object.
df_obj = df.groupby('key')

In [8]:
# Show the rows that fall into group 'A'.
df_obj.get_group('A')

Unnamed: 0,a,b,c,d,e,key
0,0,5,3,6,9,A
1,1,10,3,4,6,A


In [9]:
# Show the rows that fall into group 'B'.
df_obj.get_group('B')

Unnamed: 0,a,b,c,d,e,key
2,5,9,8,9,3,B
3,8,1,6,3,0,B
4,6,6,4,4,0,B


In [10]:
# Aggregate: one row per group holding the mean of each column.
df_obj.agg('mean') # same as calling .mean()

Unnamed: 0_level_0,a,b,c,d,e
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,0.5,7.5,3.0,5.0,7.5
B,6.333333,5.333333,6.0,5.333333,1.0


In [11]:
# Transform: each row receives the mean of its own group, so the result has
# one row per original row instead of one row per group.
df_obj.transform('mean') # keeps the same shape as the original data

Unnamed: 0,a,b,c,d,e
0,0.5,7.5,3,5.0,7.5
1,0.5,7.5,3,5.0,7.5
2,6.333333,5.333333,6,5.333333,1.0
3,6.333333,5.333333,6,5.333333,1.0
4,6.333333,5.333333,6,5.333333,1.0


## 练习

In [12]:
# Tiny two-column frame used by the agg / transform exercises below.
df = DataFrame(data={'A': [1, 2, 3], 'B': [10, 20, 30]})
df

Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30


In [13]:
def plus_10(x):
    """Return ``x`` increased by 10.

    ``x`` may be anything that supports ``x + 10`` (a plain number, or a
    pandas object, in which case the addition is applied element-wise).
    """
    offset = 10
    return x + offset

In [14]:
# Pass plus_10 through agg; as the output shows, the result keeps the shape
# of df with every value increased by 10.
df.agg(plus_10)

Unnamed: 0,A,B
0,11,20
1,12,30
2,13,40


In [15]:
# Apply the function named 'sqrt' to df; the output holds the element-wise
# square roots.
df.transform('sqrt')

Unnamed: 0,A,B
0,1.0,3.162278
1,1.414214,4.472136
2,1.732051,5.477226


In [16]:
# A list of functions yields one sub-column per (column, function) pair.
df.transform([np.sqrt, np.exp]) # apply several functions to the same column

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,sqrt,exp,sqrt,exp
0,1.0,2.718282,3.162278,22026.47
1,1.414214,7.389056,4.472136,485165200.0
2,1.732051,20.085537,5.477226,10686470000000.0


In [17]:
# A {column: function} mapping picks the function per column.
df.transform({'A': np.sqrt, 'B': np.exp}) # apply a different function to each column

Unnamed: 0,A,B
0,1.0,22026.47
1,1.414214,485165200.0
2,1.732051,10686470000000.0


In [18]:
# Handle missing values at the group level.
# 'name' is the group label; 'value' contains NaN in rows 1, 2 and 6.
df = DataFrame(
    data={
        'name': ['A', 'A', 'B', 'B', 'B', 'C', 'C', 'C'],
        'value': [1, np.nan, np.nan, 8, 2, 5, np.nan, 1],
    },
)
df

Unnamed: 0,name,value
0,A,1.0
1,A,
2,B,
3,B,8.0
4,B,2.0
5,C,5.0
6,C,
7,C,1.0


In [19]:
# Naive approach: replace every missing value with the constant 10.
# The result is only displayed; df itself is not reassigned here.
df.fillna(10)

Unnamed: 0,name,value
0,A,1.0
1,A,10.0
2,B,10.0
3,B,8.0
4,B,2.0
5,C,5.0
6,C,10.0
7,C,1.0


In [20]:
# Group the rows by 'name', then show group 'A' (it contains one NaN).
df_obj = df.groupby('name')
df_obj.get_group('A')

Unnamed: 0,name,value
0,A,1.0
1,A,


In [21]:
# Per-group mean of 'value'.
df_obj.mean() # NaN values are excluded from the calculation

Unnamed: 0_level_0,value
name,Unnamed: 1_level_1
A,1.0
B,5.0
C,3.0


In [22]:
def func(x):
    """Fill the missing entries of ``x`` with the mean of ``x``.

    ``x`` must provide ``.mean()`` and ``.fillna()`` (e.g. a pandas Series).
    The mean is computed first, from the values that are present, and then
    used as the fill value; a new object is returned.
    """
    mean_of_x = x.mean()
    return x.fillna(mean_of_x)

In [23]:
# Run func on each group separately: every NaN is replaced by the mean of its
# own group, and the result is indexed like the original rows.
df_obj.transform(func)

Unnamed: 0,value
0,1.0
1,1.0
2,5.0
3,8.0
4,2.0
5,5.0
6,3.0
7,1.0


In [24]:
# Overwrite the 'value' column with its group-wise mean-filled version.
# Selecting ['value'] on the GroupBy makes transform return a Series, so the
# assignment does not depend on the transform result happening to be a
# one-column DataFrame (which is only true while 'value' is the sole non-key
# column of df).
df['value'] = df_obj['value'].transform(func)
df

Unnamed: 0,name,value
0,A,1.0
1,A,1.0
2,B,5.0
3,B,8.0
4,B,2.0
5,C,5.0
6,C,3.0
7,C,1.0


# 三.apply

In [25]:
# Same sample data as in the transform section: five numeric columns and a
# grouping label in 'key' (rows 0-1 are 'A', rows 2-4 are 'B').
df = DataFrame(
    data={
        'a': [0, 1, 5, 8, 6],
        'b': [5, 10, 9, 1, 6],
        'c': [3, 3, 8, 6, 4],
        'd': [6, 4, 9, 3, 4],
        'e': [9, 6, 3, 0, 0],
        'key': ['A', 'A', 'B', 'B', 'B'],
    },
)
df

Unnamed: 0,a,b,c,d,e,key
0,0,5,3,6,9,A
1,1,10,3,4,6,A
2,5,9,8,9,3,B
3,8,1,6,3,0,B
4,6,6,4,4,0,B


In [26]:
# Group the rows of df by the 'key' column for the apply examples.
df_obj = df.groupby('key')

In [27]:
# A list of aggregations produces one (column, aggregation) pair per result column.
df_obj.agg(['max', 'mean']) # maximum and mean of each group

Unnamed: 0_level_0,a,a,b,b,c,c,d,d,e,e
Unnamed: 0_level_1,max,mean,max,mean,max,mean,max,mean,max,mean
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
A,1,0.5,10,7.5,3,3,6,5.0,9,7.5
B,8,6.333333,9,5.333333,8,6,9,5.333333,3,1.0


In [28]:
# Take column 'b' of group 'A' and apply 'mean' to it; the result is a single
# number (7.5, the mean of 5 and 10).
df_obj.get_group('A')['b'].apply('mean')

7.5

## 练习

In [29]:
# Student roster for the exercise: grade (年级), name (姓名), age (年龄),
# height (身高) and weight (体重), one list entry per student.
# The mapping is bound to `student_info` rather than `dict`, so the built-in
# `dict` type is not shadowed for the rest of the kernel session.
student_info = {
    '年级':['大一','大二','大三', '大四','大二','大三', '大一','大三','大四'],
    '姓名':['李宏卓','李思真','张振海', '赵鸿飞','白蓉','马腾飞', '张晓凡','金紫萱','金烨'],
    '年龄':[18,19,20,21,19,20,18,20,21],
    '身高':[175,165,178,175,160,180,167,170,185],
    '体重':[65,60,70,76,55,70,52,53,73]
}
df = DataFrame(student_info)
df

Unnamed: 0,年级,姓名,年龄,身高,体重
0,大一,李宏卓,18,175,65
1,大二,李思真,19,165,60
2,大三,张振海,20,178,70
3,大四,赵鸿飞,21,175,76
4,大二,白蓉,19,160,55
5,大三,马腾飞,20,180,70
6,大一,张晓凡,18,167,52
7,大三,金紫萱,20,170,53
8,大四,金烨,21,185,73


In [30]:
# Show the records of the first-year ('大一') students: group by the
# '年级' (grade) column, then pick that group.
df_obj = df.groupby('年级')
df_obj.get_group('大一')

Unnamed: 0,年级,姓名,年龄,身高,体重
0,大一,李宏卓,18,175,65
6,大一,张晓凡,18,167,52
