In [1]:
# GroupBy技术
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [2]:
# Demo frame: two label columns (key1, key2) and two columns of random floats.
df = DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
print(df)
# Split the data1 column into groups using the key1 value of each row.
grouped = df['data1'].groupby(by=df['key1'])
grouped.mean()  # mean of data1 within each key1 group

  key1 key2     data1     data2
0    a  one -0.896707  0.650517
1    a  two  0.929827 -1.361159
2    b  one -1.228694 -0.810506
3    b  two -1.139256  1.341678
4    a  one -0.118287  0.231004


key1
a   -0.028389
b   -1.183975
Name: data1, dtype: float64

In [3]:
# Group data1 by the (key1, key2) pair; the aggregated Series is indexed
# by a two-level (key1, key2) index.
means = (
    df['data1']
    .groupby(by=[df['key1'], df['key2']])
    .mean()
)
means

key1  key2
a     one    -0.507497
      two     0.929827
b     one    -1.228694
      two    -1.139256
Name: data1, dtype: float64

In [4]:
# Pivot the innermost index level (key2) into columns.
means.unstack(level=-1)

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.507497,0.929827
b,-1.228694,-1.139256


In [5]:
# The group keys here are standalone NumPy arrays rather than columns of df.
# They are paired with df's rows by position, i.e. conceptually:
#   row  states      years
#   0    Ohio        2005
#   1    California  2005
#   2    California  2006
#   3    Ohio        2005
#   4    Ohio        2006
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby(by=[states, years]).mean()

California  2005    0.929827
            2006   -1.228694
Ohio        2005   -1.017982
            2006   -0.118287
Name: data1, dtype: float64

In [6]:
# Per-key1 mean of the numeric columns (data1, data2).
# key2 holds strings, so ask for numeric_only=True explicitly: pandas >= 2.0
# no longer drops non-numeric columns silently and raises a TypeError instead.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.028389,-0.159879
b,-1.183975,0.265586


In [7]:
# Mean of every remaining column for each (key1, key2) combination.
df.groupby(by=['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.507497,0.440761
a,two,0.929827,-1.361159
b,one,-1.228694,-0.810506
b,two,-1.139256,1.341678


In [8]:
# Number of rows in each (key1, key2) group -- comparable to
# SQL's GROUP BY followed by COUNT(*).
df.groupby(by=['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [9]:
# 对分组进行迭代

In [10]:
# Iterating a GroupBy built on one column yields (key value, sub-frame) pairs.
for key1_value, key1_rows in df.groupby('key1'):
    print(key1_value, '---', key1_rows, '***', sep='\n')

a
---
  key1 key2     data1     data2
0    a  one -0.896707  0.650517
1    a  two  0.929827 -1.361159
4    a  one -0.118287  0.231004
***
b
---
  key1 key2     data1     data2
2    b  one -1.228694 -0.810506
3    b  two -1.139256  1.341678
***


In [11]:
# With several grouping columns, each key is a tuple (key1 value, key2 value).
for key_pair, pair_rows in df.groupby(['key1', 'key2']):
    print(*key_pair)
    print('---', pair_rows, '***', sep='\n')

a one
---
  key1 key2     data1     data2
0    a  one -0.896707  0.650517
4    a  one -0.118287  0.231004
***
a two
---
  key1 key2     data1     data2
1    a  two  0.929827 -1.361159
***
b one
---
  key1 key2     data1     data2
2    b  one -1.228694 -0.810506
***
b two
---
  key1 key2     data1     data2
3    b  two -1.139256  1.341678
***


In [12]:
# Materialise the groups as a dict: key1 value -> sub-frame of matching rows.
pieces = {key1_value: rows for key1_value, rows in df.groupby('key1')}
for label in pieces:
    print(label, '---', pieces[label], '***', sep='\n')

a
---
  key1 key2     data1     data2
0    a  one -0.896707  0.650517
1    a  two  0.929827 -1.361159
4    a  one -0.118287  0.231004
***
b
---
  key1 key2     data1     data2
2    b  one -1.228694 -0.810506
3    b  two -1.139256  1.341678
***


In [13]:
# Split df's *columns* (not its rows) into groups that share a dtype.
# DataFrame.groupby(..., axis=1) is deprecated since pandas 2.1, so group the
# column labels instead: df.dtypes is a Series indexed by column label, and
# grouping it by its own values gives, per dtype, the labels of that dtype.
grouped = df.dtypes.groupby(df.dtypes)
for dtype, members in grouped:
    print(dtype)
    print('---')
    print(df[members.index])  # the columns of df that have this dtype
    print('***')

float64
---
      data1     data2
0 -0.896707  0.650517
1  0.929827 -1.361159
2 -1.228694 -0.810506
3 -1.139256  1.341678
4 -0.118287  0.231004
***
object
---
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
***


In [14]:
# 选取一个或一组列

In [15]:
# Indexing a GroupBy with a single label gives a SeriesGroupBy;
# same grouping as df['data1'].groupby(df['key1']).
print(df.groupby(by='key1')['data1'])
# Indexing with a list of labels gives a DataFrameGroupBy;
# same grouping as df[['data2']].groupby(df['key1']).
print(df.groupby(by='key1')[['data2']])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fed09275910>
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fed09275520>


In [16]:
# Aggregate only data2; the list selector [['data2']] keeps the result a DataFrame.
df.groupby(by=['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.440761
a,two,-1.361159
b,one,-0.810506
b,two,1.341678


In [17]:
# Selecting with the bare label 'data2' (rather than the list ['data2'])
# produces a SeriesGroupBy, so the aggregate below is a Series.
s_grouped = df.groupby(by=['key1', 'key2'])['data2']
s_grouped.mean()

key1  key2
a     one     0.440761
      two    -1.361159
b     one    -0.810506
      two     1.341678
Name: data2, dtype: float64

In [18]:
# 通过字典或Series进行分组

In [19]:
# 5x5 frame of random floats, rows labelled by person name.
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Inject missing values into columns b and c of the third row (Wes).
# The row is picked by position, so .iloc is the right indexer: an integer
# slice passed to .loc on this string index only worked through a deprecated
# positional fallback and raises TypeError in pandas >= 2.0.
people.iloc[2:3, [1, 2]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.653053,-1.022527,0.388512,-0.219646,-0.21045
Steve,-0.552606,-0.601066,0.447423,0.893489,1.008685
Wes,0.027579,,,-0.722771,-0.568714
Jim,1.963424,-0.948901,-0.838,0.476383,0.166596
Travis,0.943912,0.088848,0.264405,-0.493485,-0.569666


In [20]:
# Column label -> colour group used to bucket the columns of `people`.
# The 'f' entry has no matching column in `people` (its columns are a-e).
mapping = dict(a='red', b='red', c='blue', d='blue', e='red', f='orange')

In [21]:
# For every person (row), sum the columns that map to the same colour.
# groupby(..., axis=1) is deprecated since pandas 2.1; the equivalent is to
# transpose so the column labels become the index, group those labels through
# `mapping`, aggregate, and transpose the result back.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,0.168867,-0.579925
Steve,1.340912,-0.144987
Wes,-0.722771,-0.541135
Jim,-0.361616,1.181119
Travis,-0.229081,0.463094


In [22]:
# Same column grouping, but with the mapping supplied as a Series.
# count() reports the number of non-missing values per colour for each person.
# The transpose/groupby/transpose form replaces groupby(..., axis=1),
# which is deprecated since pandas 2.1.
map_series = Series(mapping)
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [23]:
# 通过函数进行分组

In [24]:
# A function used as the key is called on each index label; here rows are
# grouped by the length of the person's name and then summed.
people.groupby(by=len).sum()

Unnamed: 0,a,b,c,d,e
3,2.644057,-1.971429,-0.449487,-0.466034,-0.612568
5,-0.552606,-0.601066,0.447423,0.893489,1.008685
6,0.943912,0.088848,0.264405,-0.493485,-0.569666


In [25]:
# A function key can be combined with a list key. key_list acts like an extra
# per-row label column, paired by position with len(index label):
#   Joe    -> (3, 'one')
#   Steve  -> (5, 'one')
#   Wes    -> (3, 'one')
#   Jim    -> (3, 'two')
#   Travis -> (6, 'two')
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby(by=[len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.027579,-1.022527,0.388512,-0.722771,-0.568714
3,two,1.963424,-0.948901,-0.838,0.476383,0.166596
5,one,-0.552606,-0.601066,0.447423,0.893489,1.008685
6,two,0.943912,0.088848,0.264405,-0.493485,-0.569666


In [26]:
# 根据索引级别分组

In [27]:
# Frame of random floats whose columns carry a two-level index:
# outer level 'cty' (US / JP), inner level 'tenor' (1, 3, 5).
columns = pd.MultiIndex.from_arrays(
    [['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'],
)
hier_df = DataFrame(data=np.random.randn(4, 5), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.571821,1.917337,0.77274,-0.538001,2.569666
1,-1.294475,0.959807,0.214003,-2.256099,0.115672
2,0.137939,-0.130021,-0.12392,0.673968,-0.30632
3,0.772454,-0.919702,-1.31445,0.167962,1.440125


In [28]:
# Count the non-missing values per row within each 'cty' column group.
# groupby(..., axis=1) is deprecated since pandas 2.1, so transpose to make the
# column MultiIndex the row index, group on its 'cty' level, then transpose back.
hier_df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
