In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# 一.数据聚合

## 1.使用内置统计方法聚合数据

In [2]:
# Sample records: two grouping keys (key1, key2) and two numeric columns.
# The mapping is called `data` rather than `dict` so the builtin `dict`
# stays usable in every later cell.
data = {
    'key1': ['A', 'A', 'B', 'B', 'A'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': [2, 3, 4, 6, 8],
    'data2': [3, 5, 6, 3, 7]
}
df = DataFrame(data)
df

Unnamed: 0,key1,key2,data1,data2
0,A,one,2,3
1,A,two,3,5
2,B,one,4,6
3,B,two,6,3
4,A,one,8,7


In [3]:
# Mean of the numeric columns within each key1 group.
# numeric_only=True leaves out the string column key2; since pandas 2.0 the
# default no longer drops such columns silently and the call raises TypeError.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
A,4.333333,5.0
B,5.0,4.5


In [4]:
# One grouping column (Key) and one numeric column (Data).
# Named `data` so the builtin `dict` is not shadowed.
data = {
    "Key": ['C', 'B', 'C', 'A', 'B', 'B', 'A', 'C', 'A'],
    "Data": [2, 4, 6, 8, 10, 1, 14, 16, 18]
}
df = DataFrame(data)
df

Unnamed: 0,Key,Data
0,C,2
1,B,4
2,C,6
3,A,8
4,B,10
5,B,1
6,A,14
7,C,16
8,A,18


In [5]:
# Largest value of every remaining column within each Key group.
df.groupby('Key').max()

Unnamed: 0_level_0,Data
Key,Unnamed: 1_level_1
A,18
B,10
C,16


## 2.面向列的聚合方法

### 对每一列数据应用同一个函数

In [6]:
# 6x6 frame holding the integers 0..35 row by row, columns labelled a-f.
df = DataFrame(
    data=np.arange(36).reshape((6, 6)),
    columns=['a', 'b', 'c', 'd', 'e', 'f'],
)
df

Unnamed: 0,a,b,c,d,e,f
0,0,1,2,3,4,5
1,6,7,8,9,10,11
2,12,13,14,15,16,17
3,18,19,20,21,22,23
4,24,25,26,27,28,29
5,30,31,32,33,34,35


In [7]:
# Add a column by assigning to a new column label: the first three rows get
# 'a', the last three get 'b'.
df['key'] = Series(['a', 'a', 'a', 'b', 'b', 'b'])
df

Unnamed: 0,a,b,c,d,e,f,key
0,0,1,2,3,4,5,a
1,6,7,8,9,10,11,a
2,12,13,14,15,16,17,a
3,18,19,20,21,22,23,b
4,24,25,26,27,28,29,b
5,30,31,32,33,34,35,b


In [8]:
# Group the rows by the 'key' column; the GroupBy object is kept in df_obj
# because the following cells aggregate it. Then show the rows of group 'a'.
df_obj = df.groupby(by='key')
df_obj.get_group('a')

Unnamed: 0,a,b,c,d,e,f,key
0,0,1,2,3,4,5,a
1,6,7,8,9,10,11,a
2,12,13,14,15,16,17,a


In [9]:
# Show the rows that belong to group 'b'.
df_obj.get_group('b')

Unnamed: 0,a,b,c,d,e,f,key
3,18,19,20,21,22,23,b
4,24,25,26,27,28,29,b
5,30,31,32,33,34,35,b


In [10]:
# Built-in GroupBy aggregation: column sums within each group.
df_obj.sum()

Unnamed: 0_level_0,a,b,c,d,e,f
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,18,21,24,27,30,33
b,72,75,78,81,84,87


In [11]:
# Aggregate through agg(). The aggregation is given by name: passing the
# Python builtin `sum` is deprecated in recent pandas (FutureWarning) and
# would also break if the name `sum` were ever rebound in the notebook.
df_obj.agg('sum')

Unnamed: 0_level_0,a,b,c,d,e,f
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,18,21,24,27,30,33
b,72,75,78,81,84,87


In [12]:
def func(x):
    """Return the range of ``x``: its maximum minus its minimum.

    ``x`` must provide ``max()`` and ``min()`` methods.
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest

In [13]:
# agg() also accepts a user-defined function; here func is applied to every
# column of each group.
df_obj.agg(func)

Unnamed: 0_level_0,a,b,c,d,e,f
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,12,12,12,12,12,12
b,12,12,12,12,12,12


### 对某列数据应用不同的函数

In [14]:
# Several aggregations at once: put them in a list. Each column then gets one
# result sub-column per aggregation. 'sum' is passed by name because handing
# the builtin `sum` to agg() is deprecated in recent pandas; the resulting
# column label is still 'sum'.
df_obj.agg([func, 'sum'])

Unnamed: 0_level_0,a,a,b,b,c,c,d,d,e,e,f,f
Unnamed: 0_level_1,func,sum,func,sum,func,sum,func,sum,func,sum,func,sum
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
a,12,18,12,21,12,24,12,27,12,30,12,33
b,12,72,12,75,12,78,12,81,12,84,12,87


In [15]:
# (label, aggregation) pairs give each result sub-column a readable name.
# 'sum' is passed by name because handing the builtin `sum` to agg() is
# deprecated in recent pandas.
df_obj.agg([('极差', func), ('累计和', 'sum')])

Unnamed: 0_level_0,a,a,b,b,c,c,d,d,e,e,f,f
Unnamed: 0_level_1,极差,累计和,极差,累计和,极差,累计和,极差,累计和,极差,累计和,极差,累计和
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
a,12,18,12,21,12,24,12,27,12,30,12,33
b,12,72,12,75,12,78,12,81,12,84,12,87


### 对不同的数据应用不同的函数

In [16]:
# Per-column aggregation: map each column label to its own aggregation.
# Built-in aggregations are referred to by name (a string); a user-defined
# function is passed as the function object, without quotes.
df_obj.agg({
    'a': 'sum',
    'b': 'mean',
    'c': func,
})

Unnamed: 0_level_0,a,b,c
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,18,7.0,12
b,72,25.0,12


## 练习

In [17]:
# Exercise data: four rows, four columns.
# Named `rows` rather than `list`: rebinding the builtin `list` would make
# any later or re-run call such as list('abcdef') fail.
rows = [
    [1, 2, 3, 4],
    [11, 22, 33, 44],
    [111, 222, 333, 444],
    [1111, 2222, 3333, 4444]
]
df = DataFrame(rows, columns=['col1', 'col2', 'col3', 'col4'])
df

Unnamed: 0,col1,col2,col3,col4
0,1,2,3,4
1,11,22,33,44
2,111,222,333,444
3,1111,2222,3333,4444


- 使用内置函数sum求和
- 对某列数据应用sum, mean, std
- 对col1列用sum, min 对col2列用max, min col3列用sum, min
- 按行求极差
- 按列求极差

### 使用内置函数sum求和

In [18]:
# Column-wise sum via agg(). The aggregation is named as a string because
# passing the builtin `sum` is deprecated in recent pandas (FutureWarning).
df.agg('sum')

col1    1234
col2    2468
col3    3702
col4    4936
dtype: int64

### 对某列数据应用sum, mean, std

In [19]:
# Apply three built-in aggregations to every column; each aggregation becomes
# one row of the result.
df.agg(['sum', 'mean', 'std'])

Unnamed: 0,col1,col2,col3,col4
sum,1234.0,2468.0,3702.0,4936.0
mean,308.5,617.0,925.5,1234.0
std,537.300351,1074.600701,1611.901052,2149.201402


### 对col1列用sum, min 对col2列用max, min col3列用sum, min

In [20]:
# Each column gets its own list of aggregations. Where a column was not asked
# for a given aggregation, the corresponding cell of the result is NaN.
df.agg({
    'col1': ['sum', 'min'],
    'col2': ['max', 'min'],
    'col3': ['sum', 'min'],
})

Unnamed: 0,col1,col2,col3
sum,1234.0,,3702.0
min,1.0,2.0,3.0
max,,2222.0,


### 按行求极差

In [21]:
# axis=1 applies func across each row, giving one range per row.
df.agg(func, axis=1)

0       3
1      33
2     333
3    3333
dtype: int64

### 按列求极差

In [22]:
# Default axis: func is applied to each column, giving one range per column.
df.agg(func)

col1    1110
col2    2220
col3    3330
col4    4440
dtype: int64

In [23]:
# Team results: one row per (Team, Year) with the Rank and Points achieved.
# Named `data` so the builtin `dict` is not shadowed.
# NOTE(review): 'kings' (lower-case) is a different label from 'Kings' and
# therefore ends up as its own group below - confirm this is intended.
data = {
    'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings', 'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
    'Rank': [1, 2, 2, 3, 3, 4, 1, 1, 2, 4, 1, 2],
    'Year': [2014, 2015, 2014, 2015, 2014, 2015, 2016, 2017, 2016, 2014, 2015, 2017],
    'Points': [876, 789, 863, 673, 741, 812, 756, 788, 694, 701, 804, 690]
}
df = DataFrame(data)
df

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
1,Riders,2,2015,789
2,Devils,2,2014,863
3,Devils,3,2015,673
4,Kings,3,2014,741
5,kings,4,2015,812
6,Kings,1,2016,756
7,Kings,1,2017,788
8,Riders,2,2016,694
9,Royals,4,2014,701


In [24]:
# Group by team; the GroupBy object is kept in `team` for the next cells.
# Then show the per-team mean of every remaining column.
team = df.groupby(by='Team')
team.mean()

Unnamed: 0_level_0,Rank,Year,Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devils,2.5,2014.5,768.0
Kings,1.666667,2015.666667,761.666667
Riders,1.75,2015.5,762.25
Royals,2.5,2014.5,752.5
kings,4.0,2015.0,812.0


In [25]:
# Select a single column from the GroupBy object, then take its per-team mean.
team['Points'].mean()

Team
Devils    768.000000
Kings     761.666667
Riders    762.250000
Royals    752.500000
kings     812.000000
Name: Points, dtype: float64

In [26]:
# Sum, mean and standard deviation of Points per team.
# The aggregations are named as strings: passing np.sum / np.mean / np.std to
# agg() is deprecated in recent pandas (FutureWarning). A team with a single
# row has no defined standard deviation, so its std is NaN.
team['Points'].agg(['sum', 'mean', 'std'])

Unnamed: 0_level_0,sum,mean,std
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devils,1536,768.0,134.350288
Kings,2285,761.666667,24.006943
Riders,3049,762.25,88.567771
Royals,1505,752.5,72.831998
kings,812,812.0,
