In [1]:
# GroupBy techniques
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
import matplotlib.pyplot as plt
from numpy.random import randn
import os
from datetime import datetime

In [2]:
# Demo frame: two string key columns plus two columns of random normal values.
df = DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    }
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.153258,-1.704409
1,a,two,0.231531,-1.039485
2,b,one,2.039811,-1.767591
3,b,two,1.154342,0.985161
4,a,one,-0.106685,0.149286


In [3]:
# Take the data1 column and group it by the values found in key1.
grouped = df['data1'].groupby(by=df['key1'])
# Nothing is aggregated at this point; the object only records the grouping.
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000022AB9929208>

In [4]:
grouped.mean()          # aggregate the grouped Series by its group key; the result is a new Series indexed by key1

key1
a   -0.009471
b    1.597077
Name: data1, dtype: float64

In [5]:
# Passing several arrays to groupby at once changes the result:
# grouping on two keys yields a hierarchically indexed Series whose
# index is made up of the unique key pairs.
means = df['data1'].groupby(by=[df['key1'], df['key2']]).mean()
means

key1  key2
a     one    -0.129971
      two     0.231531
b     one     2.039811
      two     1.154342
Name: data1, dtype: float64

In [6]:
# Pivot the inner index level (key2) into columns, leaving key1 as the row index.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.129971,0.231531
b,2.039811,1.154342


In [7]:
# In fact, the group keys may be any arrays of the appropriate length;
# they do not have to be columns of the frame.
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby(by=[states, years]).mean()

California  2005    0.231531
            2006    2.039811
Ohio        2005    0.500542
            2006   -0.106685
Name: data1, dtype: float64

In [9]:
# A column name can also be used as the group key.
# key2 holds strings and cannot be averaged. Older pandas silently dropped such
# "nuisance columns"; newer releases raise instead, so restrict the aggregation
# to the numeric columns explicitly.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.009471,-0.86487
b,1.597077,-0.391215


In [8]:
# Group by both key columns and average the remaining (numeric) columns.
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.129971,-0.777562
a,two,0.231531,-1.039485
b,one,2.039811,-1.767591
b,two,1.154342,0.985161


In [10]:
# GroupBy.size() returns a Series holding the number of rows in each group
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [11]:
# A GroupBy object can be iterated; each step yields a 2-tuple of
# (group name, chunk of data belonging to that group).
for name, group in df.groupby(by='key1'):
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one -0.153258 -1.704409
1    a  two  0.231531 -1.039485
4    a  one -0.106685  0.149286
b
  key1 key2     data1     data2
2    b  one  2.039811 -1.767591
3    b  two  1.154342  0.985161


In [12]:
# With multiple keys, the first element of each yielded tuple is itself
# a tuple of the key values.
for (k1, k2), group in df.groupby(by=['key1', 'key2']):
    print(f'{k1} {k2}', group, sep='\n')

a one
  key1 key2     data1     data2
0    a  one -0.153258 -1.704409
4    a  one -0.106685  0.149286
a two
  key1 key2     data1     data2
1    a  two  0.231531 -1.039485
b one
  key1 key2     data1     data2
2    b  one  2.039811 -1.767591
b two
  key1 key2     data1     data2
3    b  two  1.154342  0.985161


In [13]:
# The chunks produced by iteration can also be collected into a dict
# keyed by group name.
pieces = {label: chunk for label, chunk in df.groupby('key1')}
pieces

{'a':   key1 key2     data1     data2
 0    a  one -0.153258 -1.704409
 1    a  two  0.231531 -1.039485
 4    a  one -0.106685  0.149286, 'b':   key1 key2     data1     data2
 2    b  one  2.039811 -1.767591
 3    b  two  1.154342  0.985161}

In [15]:
# By default groupby groups along axis=0; it can also be told to group along any other axis

In [14]:
# Show the dtype of every column of df.
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [16]:
# Group the columns (axis=1) by their dtype, then collect the pieces into a
# dict keyed by dtype. Note that this rebinds the name `grouped`.
# NOTE(review): the axis=1 argument of groupby is deprecated in recent pandas
# releases — confirm the pandas version this notebook targets.
grouped = df.groupby(df.dtypes,axis=1)
dict(list(grouped))

{dtype('float64'):       data1     data2
 0 -0.153258 -1.704409
 1  0.231531 -1.039485
 2  2.039811 -1.767591
 3  1.154342  0.985161
 4 -0.106685  0.149286, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [17]:
# Select a single column, or a list of columns, from the grouped frame before aggregating.
# Only the last expression of the cell is displayed.
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000022AB99CBCC0>

In [18]:
# Indexing a GroupBy object by column is syntactic sugar for the following
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000022AB99DC390>

In [19]:
# On a large data set we often need to aggregate only some of the columns.
# Here: the mean of data2 only, with the result returned as a DataFrame.
a = df.groupby(by=['key1', 'key2'])[['data2']].mean()  # indexing with a list yields a grouped DataFrame
print(a, type(a), sep='\n')

data2
key1 key2          
a    one  -0.777562
     two  -1.039485
b    one  -1.767591
     two   0.985161
<class 'pandas.core.frame.DataFrame'>


In [20]:
# Indexing with a single scalar column name yields a grouped Series instead.
b = df.groupby(by=['key1', 'key2'])['data2'].mean()
print(b, type(b), sep='\n')

key1  key2
a     one    -0.777562
      two    -1.039485
b     one    -1.767591
      two     0.985161
Name: data2, dtype: float64
<class 'pandas.core.series.Series'>


In [21]:
# Keep the grouped data2 column (a SeriesGroupBy) under its own name.
s_grouped = df.groupby(by=['key1', 'key2'])['data2']
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000022AA8F6BC50>

In [22]:
# Mean of data2 for every (key1, key2) group.
s_grouped.mean()

key1  key2
a     one    -0.777562
      two    -1.039485
b     one    -1.767591
      two     0.985161
Name: data2, dtype: float64

In [23]:
# Next come three ways of grouping:
# 1. grouping with a dict or a Series
# 2. grouping with a function
# 3. grouping by index level

In [24]:
# Grouping with a dict or a Series
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Insert a few NA values. The deprecated `.ix` indexer has been removed from
# pandas; its mixed semantics here (positional row slice 2:3, label-based
# columns) are spelled out explicitly with .loc on the sliced index.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.988715,0.088757,0.357103,0.465662,-1.560522
Steve,1.470586,-0.662865,0.268969,-0.51198,0.803723
Wes,1.146165,,,-0.126868,-2.130307
Jim,-0.571089,-0.527224,0.255759,0.058619,0.661666
Travis,-2.355058,-0.262931,1.782651,0.691793,0.899694


In [32]:
# Group the columns according to a dict mapping column name -> group label.
# The 'f' entry has no matching column in people.
# NOTE(review): the axis=1 argument of groupby is deprecated in recent pandas
# releases — confirm the pandas version this notebook targets.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}
by_column = people.groupby(mapping, axis=1)
by_column.sum()

Unnamed: 0,blue,red
Joe,0.822765,-0.48305
Steve,-0.243011,1.611444
Wes,-0.126868,-0.984142
Jim,0.314378,-0.436647
Travis,2.474444,-1.718295


In [25]:
# A Series can serve as the grouping map in the same way as the dict.
map_series = pd.Series(mapping)
map_series
# Count the non-NA values per row within each colour group. The frame is
# transposed, grouped on its index and transposed back, because newer pandas
# releases deprecate groupby(axis=1); the result is the same.
people.T.groupby(map_series).count().T

NameError: name 'mapping' is not defined

In [None]:
# 2. Grouping with a function
# Group by the length of each index label (here, the person's name):
people.groupby(len).sum()

In [None]:
# A function can be combined with a list of labels as group keys
# (key_list has five labels, one for each row of people).
key_list = ['one','one','one','two','two']
people.groupby([len,key_list]).sum()

In [None]:
# Group by key_list alone — presumably treated as one label per row, since
# none of its values is a column name of people; confirm by running the cell.
people.groupby(key_list).sum()

In [None]:
# Grouping by index level: with a hierarchically indexed data set the most
# convenient approach is to aggregate by level, passing the level number or
# name through the `level` keyword.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        [1, 3, 5, 1, 3],
    ],
    names=['city', 'tenor'],
)
hier_df = pd.DataFrame(data=np.random.randn(4, 5), columns=columns)
hier_df

In [None]:
# Count the non-NA values per row within each 'city' level of the column
# MultiIndex. The frame is transposed, grouped on the index level and
# transposed back, because newer pandas releases deprecate groupby(axis=1);
# the result is the same.
hier_df.T.groupby(level='city').count().T