In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### GroupBy机制

In [2]:
# Seed NumPy's legacy global RNG so the two random columns come out the same on
# every Restart & Run All; without it each run produces different numbers.
# (Outputs saved before the seed was added will differ until the notebook is re-run.)
np.random.seed(12345)

# Small demo frame used by the groupby examples below: two label columns
# (key1, key2) and two columns of standard-normal samples (data1, data2).
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.182476,-1.96343
1,a,two,-0.392948,0.084866
2,b,one,-0.530832,1.575083
3,b,two,0.243163,-0.540113
4,a,one,-0.253188,0.296963


In [3]:
# Build a GroupBy object; nothing is computed until an aggregation is called on it.
grouped = df['data1'].groupby(df['key1'])  # group data1 using the key1 Series as the key
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7ff92854d860>

In [4]:
# Mean of data1 within each key1 group; the result is indexed by the unique keys.
grouped.mean()

key1
a   -0.609537
b   -0.143835
Name: data1, dtype: float64

In [5]:
# Passing a list of Series groups by each observed (key1, key2) pair, so the
# resulting Series carries a two-level hierarchical index.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()  # group by several Series at once
means

key1  key2
a     one    -0.717832
      two    -0.392948
b     one    -0.530832
      two     0.243163
Name: data1, dtype: float64

In [6]:
# Pivot the inner index level (key2) into columns: key1 rows by key2 columns.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.717832,-0.392948
b,-0.530832,0.243163


In [7]:
# Group keys do not have to come from the frame: these two arrays line up
# positionally with the five rows of df.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

df['data1'].groupby([states, years]).mean()  # keys may be any arrays of the right length, even ones that are not part of the DataFrame

California  2005   -0.392948
            2006   -0.530832
Ohio        2005   -0.469657
            2006   -0.253188
Name: data1, dtype: float64

In [8]:
# key2 is a non-numeric column. Older pandas silently dropped such columns from
# the result; since pandas 2.0 the default is numeric_only=False and the bare
# call raises TypeError, so ask for the numeric columns explicitly.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.609537,-0.527201
b,-0.143835,0.517485


In [9]:
# Group by two column names; data1 and data2 are averaged per (key1, key2) pair.
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.717832,-0.833234
a,two,-0.392948,0.084866
b,one,-0.530832,1.575083
b,two,0.243163,-0.540113


In [10]:
# size() reports the number of rows in each (key1, key2) group.
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

### 遍历各个分组

In [13]:
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs.
for name, group in df.groupby('key1'):
    print(name)
    print(group)

a
  key1 key2     data1     data2
0    a  one -1.182476 -1.963430
1    a  two -0.392948  0.084866
4    a  one -0.253188  0.296963
b
  key1 key2     data1     data2
2    b  one -0.530832  1.575083
3    b  two  0.243163 -0.540113


In [14]:
# With several keys, the first element of each pair is a tuple of key values,
# unpacked here into k1 and k2.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

('a', 'one')
  key1 key2     data1     data2
0    a  one -1.182476 -1.963430
4    a  one -0.253188  0.296963
('a', 'two')
  key1 key2     data1     data2
1    a  two -0.392948  0.084866
('b', 'one')
  key1 key2     data1     data2
2    b  one -0.530832  1.575083
('b', 'two')
  key1 key2     data1     data2
3    b  two  0.243163 -0.540113


In [15]:
# Materialise the groups as a plain {key1 value: sub-DataFrame} mapping so a
# single group can be looked up by its key.
pieces = {key: frame for key, frame in df.groupby('key1')}
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-0.530832,1.575083
3,b,two,0.243163,-0.540113


In [21]:
# Column dtypes of df, which serve as the grouping key below:
#   key1      object
#   key2      object
#   data1    float64
#   data2    float64

# Group the *columns* by dtype. groupby(..., axis=1) is deprecated since
# pandas 2.1; the documented replacement is to transpose the frame and group
# its rows (the former columns), then transpose each piece back.
grouped = df.T.groupby(df.dtypes)
for dtype, group in grouped:
    # Transposing a mixed-dtype frame yields object columns, so flip the piece
    # back to its original orientation and re-infer the column dtypes.
    group = group.T.infer_objects()
    print(dtype)
    print(group)

float64
      data1     data2
0 -1.182476 -1.963430
1 -0.392948  0.084866
2 -0.530832  1.575083
3  0.243163 -0.540113
4 -0.253188  0.296963
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


### 选择一列或所有列的子集

- `df.groupby('key1')['data1']` 等价于 `df['data1'].groupby(df['key1'])`（得到 SeriesGroupBy）
- `df.groupby('key1')[['data1']]` 等价于 `df[['data1']].groupby(df['key1'])`（得到 DataFrameGroupBy）

In [26]:
# Indexing the GroupBy with a single column name aggregates only that column.
df.groupby(['key1', 'key2'])['data2'].mean()

# The result is a pandas.core.series.Series.

key1  key2
a     one    -0.833234
      two     0.084866
b     one     1.575083
      two    -0.540113
Name: data2, dtype: float64

In [23]:
# Indexing with a list of names (double brackets) keeps the result a DataFrame.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.833234
a,two,0.084866
b,one,1.575083
b,two,-0.540113
