# GroupBy技术

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [2]:
# Demo frame: two label columns to group on, plus two columns of
# standard-normal noise to aggregate.
df = DataFrame({
    'key1': list('aabba'),
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
# Group the data1 values by the key1 label found on the same row.
key1_labels = df['key1']
grouped = df['data1'].groupby(key1_labels)
print(df)
grouped.mean()  # average of data1 within each key1 group

  key1 key2     data1     data2
0    a  one -0.493010  0.171321
1    a  two -0.356854  0.431356
2    b  one  0.129551 -0.832662
3    b  two  0.869675 -1.651974
4    a  one  0.662207 -2.536450


key1
a   -0.062552
b    0.499613
Name: data1, dtype: float64

In [3]:
# Grouping on two key Series at once partitions the rows by (key1, key2)
# pairs; the aggregated result carries a two-level index.
key_pair = [df['key1'], df['key2']]
means = df['data1'].groupby(key_pair).mean()
means

key1  key2
a     one     0.084598
      two    -0.356854
b     one     0.129551
      two     0.869675
Name: data1, dtype: float64

In [4]:
# Pivot the inner index level (key2) into columns.
means.unstack(level='key2')


key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.084598,-0.356854
b,0.129551,0.869675


In [6]:
# Display the full demo frame again for reference.
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.49301,0.171321
1,a,two,-0.356854,0.431356
2,b,one,0.129551,-0.832662
3,b,two,0.869675,-1.651974
4,a,one,0.662207,-2.53645


In [5]:
# Group keys do not have to be columns of df: here two standalone arrays,
# one entry per row, are used as the keys.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
external_keys = [states, years]
df['data1'].groupby(external_keys).mean()
# After grouping, the original data can be thought of as having this shape
# (the numbers are illustrative only):
#   data1      data2      key1  key2  states      years
# 0 -0.127927   0.026962  a     one   Ohio        2005
# 1 -1.424594  -0.800712  a     two   California  2005
# 2  1.619073  -0.165311  b     one   California  2006
# 3 -0.996192  -0.367086  b     two   Ohio        2005
# 4  0.020317  -1.238209  a     one   Ohio        2006

# cc: interesting!

California  2005   -0.356854
            2006    0.129551
Ohio        2005    0.188332
            2006    0.662207
Name: data1, dtype: float64

In [7]:
# Average every numeric column within each key1 group.
# key2 holds strings, so numeric_only=True is required: pandas >= 2.0 no longer
# drops non-numeric columns silently and raises TypeError when asked for their
# mean. Older versions accept the flag too, so the result is the same there.
df.groupby('key1').mean(numeric_only=True)


Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.062552,-0.644591
b,0.499613,-1.242318


In [8]:
# Both string columns serve as group keys here, so only the numeric data
# columns are left to average.
group_keys = ['key1', 'key2']
df.groupby(group_keys).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.084598,-1.182565
a,two,-0.356854,0.431356
b,one,0.129551,-0.832662
b,two,0.869675,-1.651974


In [10]:
# size() reports the number of rows in each group -- like a SQL GROUP BY
# followed by a count.
# cc: this comes up all the time; it is also handy for duplicate checks.
pair_groups = df.groupby(['key1', 'key2'])
pair_groups.size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

## 对分组进行迭代

In [11]:
# Iterating over a single-column groupby yields (group label, sub-frame) pairs.
for name, group in df.groupby('key1'):
    # One print call, one item per line: label, separator, rows, terminator.
    print(name, '---', group, '***', sep='\n')

a
---
  key1 key2     data1     data2
0    a  one -0.493010  0.171321
1    a  two -0.356854  0.431356
4    a  one  0.662207 -2.536450
***
b
---
  key1 key2     data1     data2
2    b  one  0.129551 -0.832662
3    b  two  0.869675 -1.651974
***


In [12]:
# With several group columns, each group label is a tuple of key values.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print(f'{k1} {k2}')
    print('---', group, '***', sep='\n')

a one
---
  key1 key2     data1     data2
0    a  one -0.493010  0.171321
4    a  one  0.662207 -2.536450
***
a two
---
  key1 key2     data1     data2
1    a  two -0.356854  0.431356
***
b one
---
  key1 key2     data1     data2
2    b  one  0.129551 -0.832662
***
b two
---
  key1 key2     data1     data2
3    b  two  0.869675 -1.651974
***


In [13]:
# Materialise the groups as a plain dict: group label -> sub-frame.
pieces = {label: frame for label, frame in df.groupby('key1')}
for k, v in pieces.items():
    print(k, '---', v, '***', sep='\n')

a
---
  key1 key2     data1     data2
0    a  one -0.493010  0.171321
1    a  two -0.356854  0.431356
4    a  one  0.662207 -2.536450
***
b
---
  key1 key2     data1     data2
2    b  one  0.129551 -0.832662
3    b  two  0.869675 -1.651974
***


In [14]:
# cc: this approach is quite handy -- convert the groups straight into a dict.
# With two group columns the dict keys are (key1, key2) tuples.
pieces = {label: frame for label, frame in df.groupby(['key1', 'key2'])}
for k, v in pieces.items():
    print(k, '---', v, '***', sep='\n')

('a', 'one')
---
  key1 key2     data1     data2
0    a  one -0.493010  0.171321
4    a  one  0.662207 -2.536450
***
('a', 'two')
---
  key1 key2     data1     data2
1    a  two -0.356854  0.431356
***
('b', 'one')
---
  key1 key2     data1     data2
2    b  one  0.129551 -0.832662
***
('b', 'two')
---
  key1 key2     data1     data2
3    b  two  0.869675 -1.651974
***


In [15]:
# Split df's columns into groups that share a dtype.
# DataFrame.groupby(..., axis=1) is deprecated since pandas 2.1, so group the
# column labels by their dtype and select each group's columns from df.
# Selecting (rather than transposing df) keeps every column's original dtype.
column_labels = df.columns.to_series()
grouped = {dtype: df[list(labels)]
           for dtype, labels in column_labels.groupby(df.dtypes)}
for k, v in grouped.items():
    print(k)
    print('---')
    print(v)
    print('***')

# cc: useful during EDA for a quick look at which columns have which dtype.

float64
---
      data1     data2
0 -0.493010  0.171321
1 -0.356854  0.431356
2  0.129551 -0.832662
3  0.869675 -1.651974
4  0.662207 -2.536450
***
object
---
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
***


## 选取一个或一组列

In [16]:
# Indexing a groupby object selects the column(s) to aggregate.
by_key1 = df.groupby('key1')
# A bare label is equivalent to df['data1'].groupby(df['key1']).
print(by_key1['data1'])
# A list of labels is equivalent to df[['data2']].groupby(df['key1']).
print(by_key1[['data2']])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fe814c37580>
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe814c37370>


In [17]:
# Selecting with a list (['data2']) keeps the aggregated result a DataFrame.
pair_groups = df.groupby(['key1', 'key2'])
pair_groups[['data2']].mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-1.182565
a,two,0.431356
b,one,-0.832662
b,two,-1.651974


In [18]:
# Selecting with the bare label 'data2' (not the list ['data2']) makes the
# aggregated result a Series instead of a DataFrame.
pair_groups = df.groupby(['key1', 'key2'])
s_grouped = pair_groups['data2']
s_grouped.mean()

key1  key2
a     one    -1.182565
      two     0.431356
b     one    -0.832662
      two    -1.651974
Name: data2, dtype: float64

## 通过字典或Series进行分组

In [19]:
# 5x5 frame of standard-normal noise, rows labelled by person name.
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Inject missing values into columns b and c of the 'Wes' row.
# The row is addressed by label: a positional slice such as 2:3 inside .loc on
# a string-labelled index triggers a deprecation warning and is slated to
# raise TypeError, while .loc['Wes', ...] touches exactly the same row.
people.loc['Wes', ['b', 'c']] = np.nan
people

  people.loc[2:3, ['b', 'c']] = np.nan # 添加空值


Unnamed: 0,a,b,c,d,e
Joe,-0.247591,0.027931,-1.7676,-0.525491,-0.800097
Steve,0.357923,1.582008,0.458303,-0.690434,0.126138
Wes,2.640612,,,-0.066969,-0.210371
Jim,0.385138,-0.264671,-1.272215,0.747196,-0.076634
Travis,0.286672,-0.80299,-0.942094,0.041461,-0.314216


In [20]:
# Column label -> colour group. 'f' has no matching column in people and is
# therefore simply unused.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}
# groupby(..., axis=1) is deprecated since pandas 2.1. Transpose so the columns
# become rows, group those rows by colour, and transpose the aggregate back.
# people is entirely float, so transposing does not change any dtype.
by_column = people.T.groupby(mapping)
by_column.sum().T  # per person: sum of the columns belonging to each colour

Unnamed: 0,blue,red
Joe,-2.293091,-1.019757
Steve,-0.232131,2.066069
Wes,-0.066969,2.430241
Jim,-0.525019,0.043833
Travis,-0.900633,-0.830534


In [21]:
# The same column -> colour mapping, expressed as a Series instead of a dict.
map_series = Series(mapping)
# groupby(..., axis=1) is deprecated since pandas 2.1, so group the transposed
# frame and transpose the counts back. count() tallies the non-null values per
# person in each colour group.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


## 通过函数进行分组

In [23]:
# Display the people frame again for reference before grouping it by function.
people

Unnamed: 0,a,b,c,d,e
Joe,-0.247591,0.027931,-1.7676,-0.525491,-0.800097
Steve,0.357923,1.582008,0.458303,-0.690434,0.126138
Wes,2.640612,,,-0.066969,-0.210371
Jim,0.385138,-0.264671,-1.272215,0.747196,-0.076634
Travis,0.286672,-0.80299,-0.942094,0.041461,-0.314216


In [24]:
# A function passed as the key is applied to each index label; here rows are
# grouped by the length of the person's name and then summed.
people.groupby(by=len).sum()


Unnamed: 0,a,b,c,d,e
3,2.778159,-0.236739,-3.039815,0.154735,-1.087102
5,0.357923,1.582008,0.458303,-0.690434,0.126138
6,0.286672,-0.80299,-0.942094,0.041461,-0.314216


In [25]:
# Functions and plain lists can be mixed as group keys.
key_list = ['one'] * 3 + ['two'] * 2
mixed_keys = [len, key_list]
people.groupby(mixed_keys).min()
# key_list behaves like one extra column; combined with len, the data can be
# pictured as below (numbers are illustrative only) --- cc: this is quite neat.
#                  a         b         c         d         e
# Joe(3)     one,  0.254889 -0.812035  2.765460  1.113513  0.646795
# Steve(5)   one,  1.507490  0.463545 -1.396887  0.728163  1.078788
# Wes(3)     one, -2.099479       NaN       NaN  0.438572  0.134136
# Jim(3)     two,  0.167685  1.772127  1.372546 -0.758560 -1.241066
# Travis(6)  two, -0.834662 -0.933228  1.026441 -0.074524 -0.830303

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.247591,0.027931,-1.7676,-0.525491,-0.800097
3,two,0.385138,-0.264671,-1.272215,0.747196,-0.076634
5,one,0.357923,1.582008,0.458303,-0.690434,0.126138
6,two,0.286672,-0.80299,-0.942094,0.041461,-0.314216


## 根据索引级别分组

In [26]:
# Column labels given as (country, tenor) pairs, forming a two-level MultiIndex.
columns = pd.MultiIndex.from_tuples(
    [('US', 1), ('US', 3), ('US', 5), ('JP', 1), ('JP', 3)],
    names=['cty', 'tenor'],
)
# Four rows of standard-normal noise under those hierarchical columns.
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,1.865956,-0.656825,0.144248,0.253493,0.168063
1,-0.143796,-1.09926,0.041759,-0.742731,-2.051739
2,-1.557372,1.286113,1.259506,1.347115,0.265384
3,-0.060767,-0.058229,-0.140133,-0.061002,0.647741


In [32]:
# Count the columns under each country, per row, by grouping on the 'cty'
# level of the column MultiIndex.
# groupby(..., axis=1) is deprecated since pandas 2.1, so group the transposed
# frame on that level and transpose the counts back.
hier_df.T.groupby(level='cty').count().T


cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
