In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### GroupBy机制

In [2]:
# Small demo frame: two string key columns plus two columns of random floats.
# NOTE: np.random is not seeded in this cell, so data1/data2 (and every output
# derived from them) change from run to run.
df = pd.DataFrame(
    {
        "key1": ["a", "a", "b", "b", "a"],
        "key2": ["one", "two", "one", "two", "one"],
        "data1": np.random.randn(5),
        "data2": np.random.randn(5),
    }
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.01227,-1.189934
1,a,two,0.394421,0.463824
2,b,one,2.582684,-2.003238
3,b,two,1.292,-0.044583
4,a,one,-0.504897,1.210113


In [3]:
grouped = df['data1'].groupby(df['key1'])  # group one Series using another Series as the key
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f359c9b37f0>

In [4]:
# Mean of data1 within each key1 group.
grouped.mean()

key1
a   -0.032735
b    1.937342
Name: data1, dtype: float64

In [5]:
means = df['data1'].groupby([df['key1'], df['key2']]).mean()  # group by several Series at once; the result is indexed by (key1, key2)
means

key1  key2
a     one    -0.246313
      two     0.394421
b     one     2.582684
      two     1.292000
Name: data1, dtype: float64

In [6]:
# Pivot the inner index level (key2) into columns, leaving key1 as the row index.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.246313,0.394421
b,2.582684,1.292


In [7]:
# Group keys do not have to live in the DataFrame: any arrays of the right
# length work.
states = np.array(["Ohio", "California", "California", "Ohio", "Ohio"])
years = np.array([2005, 2005, 2006, 2005, 2006])

external_keys = [states, years]
df["data1"].groupby(external_keys).mean()

California  2005    0.394421
            2006    2.582684
Ohio        2005    0.652135
            2006   -0.504897
Name: data1, dtype: float64

In [8]:
# Average the numeric columns within each key1 group. key2 holds strings, so it
# has to be excluded explicitly: since pandas 2.0 non-numeric columns are no
# longer dropped by default and a bare .mean() raises TypeError here.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.032735,0.161335
b,1.937342,-1.02391


In [9]:
# Group by two columns; the result is indexed by (key1, key2) pairs.
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.246313,0.01009
a,two,0.394421,0.463824
b,one,2.582684,-2.003238
b,two,1.292,-0.044583


In [10]:
# Number of rows in each (key1, key2) group.
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

### 遍历各个分组

In [11]:
# Iterating over a GroupBy yields (group key, sub-frame) pairs.
for name, group in df.groupby("key1"):
    print(name, group, sep="\n")

a
  key1 key2     data1     data2
0    a  one  0.012270 -1.189934
1    a  two  0.394421  0.463824
4    a  one -0.504897  1.210113
b
  key1 key2     data1     data2
2    b  one  2.582684 -2.003238
3    b  two  1.292000 -0.044583


In [12]:
# With several group keys, the first element of each pair is a tuple of key values.
for (k1, k2), group in df.groupby(["key1", "key2"]):
    print((k1, k2), group, sep="\n")

('a', 'one')
  key1 key2     data1     data2
0    a  one  0.012270 -1.189934
4    a  one -0.504897  1.210113
('a', 'two')
  key1 key2     data1     data2
1    a  two  0.394421  0.463824
('b', 'one')
  key1 key2     data1     data2
2    b  one  2.582684 -2.003238
('b', 'two')
  key1 key2  data1     data2
3    b  two  1.292 -0.044583


In [13]:
# Materialise the groups as a {key1 value: sub-frame} dictionary.
pieces = {key: frame for key, frame in df.groupby("key1")}
pieces["b"]

Unnamed: 0,key1,key2,data1,data2
2,b,one,2.582684,-2.003238
3,b,two,1.292,-0.044583


In [14]:
# Column dtypes of df, as displayed when this notebook was run:
#   key1      object
#   key2      object
#   data1    float64
#   data2    float64
#   dtype: object
df.dtypes

# Group along the other axis: bucket the *columns* by dtype.
# groupby(..., axis=1) is deprecated since pandas 2.1 and removed in 3.0, so
# the transposed frame is grouped instead; `grouped` is therefore a GroupBy
# over df.T (one row per original column).
grouped = df.T.groupby(df.dtypes)
for dtype, transposed in grouped:
    # transposed.index holds the column names of this dtype; selecting them
    # from the original df keeps their real dtypes (df.T is all-object).
    group = df[transposed.index]
    print(dtype)
    print(group)

float64
      data1     data2
0  0.012270 -1.189934
1  0.394421  0.463824
2  2.582684 -2.003238
3  1.292000 -0.044583
4 -0.504897  1.210113
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


### 选择一列或所有列的子集

- df.groupby('key1')['data1']   <==> df['data1'].groupby(df['key1'])
- df.groupby('key1')[['data1']] <==> df[['data1']].groupby(df['key1'])

In [15]:
# Selecting a single column with ['data2'] gives a Series result
# (pandas.core.series.Series).
df.groupby(['key1', 'key2'])['data2'].mean()

key1  key2
a     one     0.010090
      two     0.463824
b     one    -2.003238
      two    -0.044583
Name: data2, dtype: float64

In [16]:
# Selecting with a list ([['data2']]) keeps the result a DataFrame.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.01009
a,two,0.463824
b,one,-2.003238
b,two,-0.044583


### 使用字典和Series分组

In [17]:
# 5x5 frame of random floats, one row per person and columns 'a'..'e'.
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=["Joe", "Steve", "Wes", "Jim", "Travis"],
    columns=["a", "b", "c", "d", "e"],
)
# Blank out columns b and c of the third row (Wes) to show NaN handling.
people.iloc[[2], [1, 2]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-1.708451,0.906216,-0.115569,0.584877,-0.654428
Steve,-0.649426,-1.836031,0.582417,-1.064618,1.84431
Wes,-1.364374,,,-0.457209,-0.045077
Jim,0.967665,-1.024199,1.200244,-0.645502,0.436052
Travis,0.689163,-0.096575,0.102592,-0.46053,-1.1829


In [18]:
# Column label -> colour group. 'f' matches no column of `people`; the result
# only contains the groups that actually occur (blue, red).
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f' : 'orange'}

# Group the columns through the dict. groupby(..., axis=1) is deprecated since
# pandas 2.1 and removed in 3.0, so the transposed frame is grouped and the
# aggregate is transposed back. `by_column` is thus a GroupBy over people.T.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,0.469308,-1.456663
Steve,-0.482201,-0.641147
Wes,-0.457209,-1.409451
Jim,0.554742,0.379517
Travis,-0.357938,-0.590312


In [19]:
map_series = pd.Series(mapping)

# Same column grouping, driven by a Series instead of a dict. The transposed
# frame is grouped (groupby(axis=1) is deprecated since pandas 2.1 and removed
# in 3.0) and the non-null counts are transposed back to one row per person.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


### 使用函数分组

In [20]:
people.groupby(len).sum()  # group rows by the length of their index label (the name string)

Unnamed: 0,a,b,c,d,e
3,-2.10516,-0.117983,1.084674,-0.517834,-0.263454
5,-0.649426,-1.836031,0.582417,-1.064618,1.84431
6,0.689163,-0.096575,0.102592,-0.46053,-1.1829


In [21]:
# Functions, arrays, dicts and Series can be mixed as group keys: here the
# label length (function) is combined with an explicit list of keys.
key_list = ["one"] * 3 + ["two"] * 2
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-1.708451,0.906216,-0.115569,-0.457209,-0.654428
3,two,0.967665,-1.024199,1.200244,-0.645502,0.436052
5,one,-0.649426,-1.836031,0.582417,-1.064618,1.84431
6,two,0.689163,-0.096575,0.102592,-0.46053,-1.1829


### 使用索引层级分组

In [22]:
# Two-level column index: country ('cty') on top, 'tenor' underneath.
columns = pd.MultiIndex.from_arrays(
    [["US"] * 3 + ["JP"] * 2, [1, 3, 5, 1, 3]],
    names=["cty", "tenor"],
)
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
# Blank out the (US, 3) and (US, 5) cells of row 2 to show NaN handling.
hier_df.iloc[[2], [1, 2]] = np.nan
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-1.04224,-0.58123,-0.662139,-0.780306,0.334916
1,0.127372,1.255648,-0.303024,-2.607757,0.025535
2,1.254893,,,-0.79253,2.575918
3,0.595812,-0.242956,-1.134204,-0.927,0.826497


In [23]:
# Count non-null values per country, grouping on the 'cty' level of the column
# MultiIndex. groupby(..., axis=1) is deprecated since pandas 2.1 and removed
# in 3.0, so the transposed frame is grouped by level and transposed back.
hier_df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,1
3,2,3
