In [1]:
# group by 技术
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
import matplotlib.pyplot as plt
from numpy.random import randn
import os
from datetime import datetime

In [2]:
# Build a small demo frame: two string key columns and two numeric data columns.
# The numeric columns are drawn from a dedicated, seeded RandomState so that
# "Restart Kernel -> Run All" reproduces the same values on every run, without
# touching NumPy's global random state.
rng = np.random.RandomState(42)
df = DataFrame({'key1':['a','a','b','b','a'],
                'key2':['one','two','one','two','one'],
                'data1':rng.randn(5),
                'data2':rng.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.028115,-0.853598
1,a,two,0.210138,0.820495
2,b,one,2.108009,0.883269
3,b,two,-0.425044,0.710237
4,a,one,0.076562,-1.147938


In [3]:
# Split the 'data1' values into groups using the labels found in 'key1'.
key1_labels = df['key1']
grouped = df['data1'].groupby(key1_labels)
# Nothing is aggregated at this point: the result is only a GroupBy object
# that records how the values were partitioned.
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000022F74E73E48>

In [5]:
# Aggregate the grouped Series: one mean per group key, returned as a new
# Series indexed by the unique key values.
data1_mean_by_key1 = grouped.mean()
data1_mean_by_key1

key1
a    0.086195
b    0.841483
Name: data1, dtype: float64

In [6]:
# Passing several key arrays at once groups by every unique key pair, so the
# aggregated result carries a hierarchical (two-level) index.
key_pair = [df['key1'], df['key2']]
means = df['data1'].groupby(key_pair).mean()
means

key1  key2
a     one     0.024223
      two     0.210138
b     one     2.108009
      two    -0.425044
Name: data1, dtype: float64

In [8]:
# Pivot the innermost index level ('key2') into columns, turning the
# two-level Series into a key1 x key2 table.
means.unstack(level='key2')

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.024223,0.210138
b,2.108009,-0.425044


In [7]:
# Grouping keys do not have to come from the frame itself: any arrays whose
# length matches the data can be used.
states = np.array(
    ['Ohio', 'California', 'California', 'Ohio', 'Ohio']
)
years = np.array(
    [2005, 2005, 2006, 2005, 2006]
)
external_keys = [states, years]
df['data1'].groupby(external_keys).mean()

California  2005    0.210138
            2006    2.108009
Ohio        2005   -0.226580
            2006    0.076562
Name: data1, dtype: float64

In [8]:
# The grouping key can also be given as a column name.
# 'key2' holds strings and cannot be averaged (a so-called "nuisance column").
# Older pandas dropped such columns silently, whereas pandas >= 2.0 raises a
# TypeError; numeric_only=True excludes them explicitly so this cell produces
# the same table on both.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.086195,-0.39368
b,0.841483,0.796753


In [9]:
# Group by two columns at once: both keys become levels of the row index and
# the mean is taken over the remaining data columns.
key_columns = ['key1', 'key2']
df.groupby(key_columns).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.024223,-1.000768
a,two,0.210138,0.820495
b,one,2.108009,0.883269
b,two,-0.425044,0.710237


In [10]:
# GroupBy.size() reports how many rows fall into each group, returned as a
# Series indexed by the group keys.
rows_per_group = df.groupby(['key1', 'key2']).size()
rows_per_group

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [12]:
# A GroupBy object is iterable: each step yields a 2-tuple made of the group
# name and the chunk of data belonging to that group.
by_key1 = df.groupby('key1')
for name, group in by_key1:
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one -0.028115 -0.853598
1    a  two  0.210138  0.820495
4    a  one  0.076562 -1.147938
b
  key1 key2     data1     data2
2    b  one  2.108009  0.883269
3    b  two -0.425044  0.710237


In [15]:
# With several grouping keys, the first element of each yielded pair is
# itself a tuple holding one value per key.
for keys, group in df.groupby(['key1', 'key2']):
    print(*keys)
    print(group)

a one
  key1 key2     data1     data2
0    a  one -0.028115 -0.853598
4    a  one  0.076562 -1.147938
a two
  key1 key2     data1     data2
1    a  two  0.210138  0.820495
b one
  key1 key2     data1     data2
2    b  one  2.108009  0.883269
b two
  key1 key2     data1     data2
3    b  two -0.425044  0.710237


In [20]:
# The (name, chunk) pairs produced by iteration can be collected into a dict
# keyed by group name.
pieces = {name: group for name, group in df.groupby('key1')}
pieces

{'a':   key1 key2     data1     data2
 0    a  one -0.028115 -0.853598
 1    a  two  0.210138  0.820495
 4    a  one  0.076562 -1.147938, 'b':   key1 key2     data1     data2
 2    b  one  2.108009  0.883269
 3    b  two -0.425044  0.710237}

In [None]:
# groupby splits along axis=0 (the rows) by default; it can also be told to group along any other axis

In [21]:
# Inspect the dtype of every column; these dtypes serve as the grouping keys
# in the next cell.
column_dtypes = df.dtypes
column_dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [26]:
# Group the columns (axis=1) by their dtype, then collect each dtype's block
# of columns into a dict keyed by that dtype.
# NOTE(review): the `axis` argument of groupby appears to be deprecated in
# newer pandas releases — confirm before upgrading pandas.
column_dtypes = df.dtypes
grouped = df.groupby(column_dtypes, axis=1)
{dtype: block for dtype, block in grouped}

{dtype('float64'):       data1     data2
 0 -0.028115 -0.853598
 1  0.210138  0.820495
 2  2.108009  0.883269
 3 -0.425044  0.710237
 4  0.076562 -1.147938, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [None]:
# 选取一个或一组列
