In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# 一.通过groupby方法拆分数据

## 1.通过列名进行分组

In [2]:
# Toy frame for the first demo: a categorical "Key" column to group on
# and an integer "Data" column as payload.
dict1 = dict(
    Key=list('CBCABBACA'),
    Data=[2, 4, 6, 8, 10, 1, 14, 16, 18],
)
df = DataFrame(data=dict1)
df  # last expression of the cell -> rendered as a table

Unnamed: 0,Key,Data
0,C,2
1,B,4
2,C,6
3,A,8
4,B,10
5,B,1
6,A,14
7,C,16
8,A,18


In [3]:
# Split the rows of `df` into groups that share the same value in column 'Key'.
groupby_obj = df.groupby('Key')

In [4]:
# Iterating a groupby object yields (group key, sub-frame) tuples;
# print each tuple as-is.
for key_and_rows in groupby_obj:
    print(key_and_rows)

('A',   Key  Data
3   A     8
6   A    14
8   A    18)
('B',   Key  Data
1   B     4
4   B    10
5   B     1)
('C',   Key  Data
0   C     2
2   C     6
7   C    16)


In [5]:
# Fetch the sub-frame of a single group by its key.
groupby_obj.get_group('B') # any existing group key can be selected this way

Unnamed: 0,Key,Data
1,B,4
4,B,10
5,B,1


In [6]:
# Alternative to get_group: materialise the (key, sub-frame) pairs produced by
# iteration into a dict, then look a group up by its key.
dict(iter(groupby_obj))['B']

Unnamed: 0,Key,Data
1,B,4
4,B,10
5,B,1


## 2.通过Series对象进行分组

In [7]:
# Frame with two key-like columns and two numeric columns, used to show
# grouping by an external Series.
dict1 = dict(
    key1=['A', 'A', 'B', 'B', 'A'],
    key2=['one', 'two'] * 2 + ['one'],
    data1=[2, 3, 4, 6, 8],
    data2=[3, 5, 6, 3, 7],
)
df = DataFrame(data=dict1)
df

Unnamed: 0,key1,key2,data1,data2
0,A,one,2,3
1,A,two,3,5
2,B,one,4,6
3,B,two,6,3
4,A,one,8,7


In [8]:
# One label per row of `df` (default 0..4 index); used below as the grouping key.
s = pd.Series(list('ababc'))
s

0    a
1    b
2    a
3    b
4    c
dtype: object

In [9]:
# Group the rows of `df` by the labels held in the Series `s`
# (each row takes the label found at the same index in `s`).
groupby_obj = df.groupby(s)

In [10]:
# Show every (label, sub-frame) tuple produced by the Series-based grouping.
for label_and_rows in groupby_obj:
    print(label_and_rows)

('a',   key1 key2  data1  data2
0    A  one      2      3
2    B  one      4      6)
('b',   key1 key2  data1  data2
1    A  two      3      5
3    B  two      6      3)
('c',   key1 key2  data1  data2
4    A  one      8      7)


In [11]:
# Same grouping, but the key Series is one element short: the row at index 4
# has no label in `s`, and (as the printed result shows) it ends up in no group.
s = pd.Series(list('abab'))
groupby_obj = df.groupby(s)
for label_and_rows in groupby_obj:
    print(label_and_rows)

('a',   key1 key2  data1  data2
0    A  one      2      3
2    B  one      4      6)
('b',   key1 key2  data1  data2
1    A  two      3      5
3    B  two      6      3)


## 3.按字典进行分组

In [12]:
# Five numeric columns a..e; three ascending runs and two descending runs.
dict1 = {
    'a': list(range(1, 6)),
    'b': list(range(6, 11)),
    'c': list(range(11, 16)),
    'd': list(range(5, 0, -1)),
    'e': list(range(10, 5, -1)),
}
df = DataFrame(data=dict1)
df

Unnamed: 0,a,b,c,d,e
0,1,6,11,5,10
1,2,7,12,4,9
2,3,8,13,3,8
3,4,9,14,2,7
4,5,10,15,1,6


In [13]:
# Grouping rule: maps each column label of `df` to the name of the group it belongs to.
mapping = {'a': '第一组', 'b': '第二组', 'c': '第一组', 'd': '第二组', 'e': '第三组', } # define the grouping rule
# axis=1 applies the mapping to the column labels, so every group is a
# sub-frame made of whole columns (all rows are kept).
# NOTE(review): the `axis` argument of DataFrame.groupby is deprecated in
# newer pandas releases (2.1+); there, the equivalent is to group the
# transposed frame (`df.T.groupby(mapping)`) and transpose each group back.
# Confirm against the pandas version this notebook targets.
groupby_obj = df.groupby(mapping, axis=1)
# Print each (group name, sub-frame) tuple.
for i in groupby_obj:
    print(i)

('第一组',    a   c
0  1  11
1  2  12
2  3  13
3  4  14
4  5  15)
('第三组',     e
0  10
1   9
2   8
3   7
4   6)
('第二组',     b  d
0   6  5
1   7  4
2   8  3
3   9  2
4  10  1)


In [14]:
# Select the column group named '第二组' (columns 'b' and 'd' per `mapping`).
groupby_obj.get_group('第二组')

Unnamed: 0,b,d
0,6,5
1,7,4
2,8,3
3,9,2
4,10,1


## 4.按函数进行分组

In [15]:
# Same numeric data as before, but the rows are labelled with names so that a
# function of the index label can be used as the grouping key.
dict1 = dict(
    a=[1, 2, 3, 4, 5],
    b=[6, 7, 8, 9, 10],
    c=[11, 12, 13, 14, 15],
    d=[5, 4, 3, 2, 1],
    e=[10, 9, 8, 7, 6],
)
df = DataFrame(data=dict1, index='Tom Jack Alice Helen Bob'.split())
df

Unnamed: 0,a,b,c,d,e
Tom,1,6,11,5,10
Jack,2,7,12,4,9
Alice,3,8,13,3,8
Helen,4,9,14,2,7
Bob,5,10,15,1,6


In [16]:
# When a function is passed, it is called on every row-index label and its
# return value becomes the group key: here the name length, so names of equal
# length land in the same group.
groupby_obj = df.groupby(by=len)
for length_and_rows in groupby_obj:
    print(length_and_rows)

(3,      a   b   c  d   e
Tom  1   6  11  5  10
Bob  5  10  15  1   6)
(4,       a  b   c  d  e
Jack  2  7  12  4  9)
(5,        a  b   c  d  e
Alice  3  8  13  3  8
Helen  4  9  14  2  7)


In [17]:
# Rows whose index label has length 3 (the group key is the value returned by len).
groupby_obj.get_group(3)

Unnamed: 0,a,b,c,d,e
Tom,1,6,11,5,10
Bob,5,10,15,1,6


In [18]:
# `.groups` maps each group key to the index labels of the rows in that group.
groupby_obj.groups # inspect how the rows were grouped

{3: Index(['Tom', 'Bob'], dtype='object'),
 4: Index(['Jack'], dtype='object'),
 5: Index(['Alice', 'Helen'], dtype='object')}

# 练习

In [19]:
# Exercise data set: one row per (team, season) with its rank and points.
dict1 = dict(
    Team=['Riders'] * 2 + ['Devils'] * 2 + ['Kings'] * 4 + ['Riders', 'Royals', 'Royals', 'Riders'],
    Rank=[1, 2, 2, 3, 3, 4, 1, 1, 2, 4, 1, 2],
    Year=[2014, 2015, 2014, 2015, 2014, 2015, 2016, 2017, 2016, 2014, 2015, 2017],
    Points=[876, 789, 863, 673, 741, 812, 756, 788, 694, 701, 804, 690],
)
df = DataFrame(data=dict1)
df

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
1,Riders,2,2015,789
2,Devils,2,2014,863
3,Devils,3,2015,673
4,Kings,3,2014,741
5,Kings,4,2015,812
6,Kings,1,2016,756
7,Kings,1,2017,788
8,Riders,2,2016,694
9,Royals,4,2014,701


## 1.按队名进行分组

In [20]:
# Exercise 1: split the rows by team and print each (team, sub-frame) tuple.
groupby_obj = df.groupby('Team')
for team_and_rows in groupby_obj:
    print(team_and_rows)

('Devils',      Team  Rank  Year  Points
2  Devils     2  2014     863
3  Devils     3  2015     673)
('Kings',     Team  Rank  Year  Points
4  Kings     3  2014     741
5  Kings     4  2015     812
6  Kings     1  2016     756
7  Kings     1  2017     788)
('Riders',       Team  Rank  Year  Points
0   Riders     1  2014     876
1   Riders     2  2015     789
8   Riders     2  2016     694
11  Riders     2  2017     690)
('Royals',       Team  Rank  Year  Points
9   Royals     4  2014     701
10  Royals     1  2015     804)


## 2.按年份进行分组

In [21]:
# Exercise 2: split the rows by season and print each (year, sub-frame) tuple.
groupby_obj = df.groupby('Year')
for year_and_rows in groupby_obj:
    print(year_and_rows)

(2014,      Team  Rank  Year  Points
0  Riders     1  2014     876
2  Devils     2  2014     863
4   Kings     3  2014     741
9  Royals     4  2014     701)
(2015,       Team  Rank  Year  Points
1   Riders     2  2015     789
3   Devils     3  2015     673
5    Kings     4  2015     812
10  Royals     1  2015     804)
(2016,      Team  Rank  Year  Points
6   Kings     1  2016     756
8  Riders     2  2016     694)
(2017,       Team  Rank  Year  Points
7    Kings     1  2017     788
11  Riders     2  2017     690)


# 二.数据的聚合

In [22]:
# Group by team once and reuse the grouped object for the aggregations below.
team_obj = df.groupby('Team')
# Mapping of team name -> row labels that belong to that team.
team_obj.groups

{'Devils': Int64Index([2, 3], dtype='int64'),
 'Kings': Int64Index([4, 5, 6, 7], dtype='int64'),
 'Riders': Int64Index([0, 1, 8, 11], dtype='int64'),
 'Royals': Int64Index([9, 10], dtype='int64')}

In [23]:
# Aggregation: one row per team, holding the mean of each remaining column.
team_obj.mean() # per-team average of every column

Unnamed: 0_level_0,Rank,Year,Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devils,2.5,2014.5,768.0
Kings,2.25,2015.5,774.25
Riders,1.75,2015.5,762.25
Royals,2.5,2014.5,752.5


In [24]:
# Aggregation: one row per team, holding the maximum of each remaining column.
team_obj.max() # per-team maximum of every column

Unnamed: 0_level_0,Rank,Year,Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devils,3,2015,863
Kings,4,2017,812
Riders,2,2017,876
Royals,4,2015,804


In [25]:
# NOTE(review): rank() is not an aggregation. It returns one row per input row,
# holding the rank of each value *within its own Team group*; tied values share
# the averaged rank (hence the 1.5 entries in the result). It therefore does
# NOT tell which team finished first most often. Counting first places per team
# would need something like `(df['Rank'] == 1).groupby(df['Team']).sum()`.
team_obj.rank() # rank every value within its Team group

Unnamed: 0,Rank,Year,Points
0,1.0,1.0,4.0
1,3.0,2.0,3.0
2,1.0,1.0,2.0
3,2.0,2.0,1.0
4,3.0,1.0,1.0
5,4.0,2.0,4.0
6,1.5,3.0,2.0
7,1.5,4.0,3.0
8,3.0,3.0,2.0
9,2.0,1.0,1.0
