In [2]:
# 分组级运算和转换
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
import matplotlib.pyplot as plt
from numpy.random import randn
import os
from datetime import datetime

In [3]:
df = DataFrame({'key1':['a','a','b','b','a'],
                'key2':['one','two','one','two','one'],
                'data1':np.random.randn(5),
                'data2':np.random.randn(5)})
# 假设我们想要为一个DataFrame添加一个用于存放各索引分组平均值的列
k1_means = df.groupby('key1').mean().add_prefix('mean_')
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.645575,0.384956
b,-0.345913,-0.750981


In [4]:
pd.merge(df, k1_means ,left_on ='key1',right_index=True) 

Unnamed: 0,key1,key2,data1,data2,mean_data1,mean_data2
0,a,one,0.201208,0.757576,0.645575,0.384956
1,a,two,1.322967,0.458102,0.645575,0.384956
4,a,one,0.412549,-0.06081,0.645575,0.384956
2,b,one,-0.931383,-0.315149,-0.345913,-0.750981
3,b,two,0.239557,-1.186813,-0.345913,-0.750981


In [5]:
# 另外一种实现上述目的的方式是：利用np.mean函数对两个数据列进行转换。
people = DataFrame(np.random.randn(5,5),columns=['a','b','c','d','e'],index=['Joe','Steve','Wes','Jim','Travis'])
people.ix[2:3,['b','c']] = np.nan   # 添加几个NA值
people

Unnamed: 0,a,b,c,d,e
Joe,0.965725,-0.14262,-1.758814,-0.184406,0.707202
Steve,-1.045786,-0.412665,-0.50864,0.708017,0.155282
Wes,0.516514,,,0.963912,0.51436
Jim,-1.349568,-0.032591,-0.951477,-1.173634,-0.707805
Travis,1.757991,-2.039455,0.131181,0.216283,-1.173614


In [6]:
key = ['one','two','one','two','one']
people.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,1.080077,-1.091037,-0.813817,0.33193,0.015983
two,-1.197677,-0.222628,-0.730058,-0.232808,-0.276262


In [7]:
people.groupby(key).transform(np.mean)   # transform 会将一个函数应用到各个分组，然后将结果放置到合适的位置上。

Unnamed: 0,a,b,c,d,e
Joe,1.080077,-1.091037,-0.813817,0.33193,0.015983
Steve,-1.197677,-0.222628,-0.730058,-0.232808,-0.276262
Wes,1.080077,-1.091037,-0.813817,0.33193,0.015983
Jim,-1.197677,-0.222628,-0.730058,-0.232808,-0.276262
Travis,1.080077,-1.091037,-0.813817,0.33193,0.015983


In [8]:
def demean(arr):                    # 创建一个距平化函数，然后将其传给transferm
    return arr-arr.mean()

demeaned = people.groupby(key).transform(demean) 
demeaned

Unnamed: 0,a,b,c,d,e
Joe,-0.114352,0.948417,-0.944997,-0.516335,0.691219
Steve,0.151891,-0.190037,0.221418,0.940825,0.431543
Wes,-0.563562,,,0.631982,0.498378
Jim,-0.151891,0.190037,-0.221418,-0.940825,-0.431543
Travis,0.677914,-0.948417,0.944997,-0.115647,-1.189597


In [9]:
demeaned.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,1.110223e-16,0.0,0.0,-2.775558e-17,0.0
two,1.110223e-16,-1.387779e-17,0.0,0.0,0.0


In [10]:
# 接下来进入apply的学习，一般性的拆分-应用-合并操作
def top(df, n=5, column='tip_pct'):  # 在指定的列找出最大值，然后把这个值所在的行选取出来
    return df.sort_index(by=column)[-n:]


In [11]:
path = '../My_python_for_data_analyst\\data\\tips.csv'
tips = pd.read_csv(path)
# 添加消费占总额比的一列
tips['tip_pct'] = tips['tip']/tips['total_bill'] 
tips[:6]   # 数据前五行

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


In [12]:
top(tips)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [13]:
# 如果对smoker分组并调用apply，就会得到：
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [14]:
tips.groupby(['smoker','day']).apply(top, n=1, column='total_bill') # 分组键会和原始对象的索引共同构成结果对象中的层次化索引

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


In [15]:
result = tips.groupby('smoker')['tip_pct'].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [16]:
result.unstack('smoker')

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [17]:
# 在groupby中调用describe之类的方法时，实际上只是应用了下面两条代码的快捷方式
f = lambda x:x.describe()
tips.groupby('smoker')['tip_pct'].apply(f)

smoker       
No      count    151.000000
        mean       0.159328
        std        0.039910
        min        0.056797
        25%        0.136906
        50%        0.155625
        75%        0.185014
        max        0.291990
Yes     count     93.000000
        mean       0.163196
        std        0.085119
        min        0.035638
        25%        0.106771
        50%        0.153846
        75%        0.195059
        max        0.710345
Name: tip_pct, dtype: float64

In [18]:
# 将group_keys = False传入groupby中可以禁止分组键和原始索引组成层次化索引
tip_1 = tips.groupby('smoker',group_keys=False).apply(top)
tip_2 = tips.groupby('smoker').apply(top)
print(tip_1, tip_2)

total_bill   tip smoker   day    time  size   tip_pct
88        24.71  5.85     No  Thur   Lunch     2  0.236746
185       20.69  5.00     No   Sun  Dinner     5  0.241663
51        10.29  2.60     No   Sun  Dinner     2  0.252672
149        7.51  2.00     No  Thur   Lunch     2  0.266312
232       11.61  3.39     No   Sat  Dinner     2  0.291990
109       14.31  4.00    Yes   Sat  Dinner     2  0.279525
183       23.17  6.50    Yes   Sun  Dinner     4  0.280535
67         3.07  1.00    Yes   Sat  Dinner     1  0.325733
178        9.60  4.00    Yes   Sun  Dinner     2  0.416667
172        7.25  5.15    Yes   Sun  Dinner     2  0.710345             total_bill   tip smoker   day    time  size   tip_pct
smoker                                                           
No     88        24.71  5.85     No  Thur   Lunch     2  0.236746
       185       20.69  5.00     No   Sun  Dinner     5  0.241663
       51        10.29  2.60     No   Sun  Dinner     2  0.252672
       149        7.51  2.

In [19]:
# 对数据集进行分位数和桶分析
frame = pd.DataFrame({'data1':np.random.randn(1000),
                      'data2':np.random.randn(1000)})
factor = pd.cut(frame.data1, 4)    # cut得到的是长度相等的切分
factor[:10]     

0    (-1.42, 0.115]
1    (0.115, 1.651]
2    (-1.42, 0.115]
3    (0.115, 1.651]
4    (0.115, 1.651]
5    (0.115, 1.651]
6    (0.115, 1.651]
7    (0.115, 1.651]
8    (-1.42, 0.115]
9    (-1.42, 0.115]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.961, -1.42] < (-1.42, 0.115] < (0.115, 1.651] < (1.651, 3.186]]

In [20]:
#　由cut返回的fator对象可直接用于groupby
#  
def get_stats(group):
    return {'min':group.min(),'max':group.max(),'count':group.count(),                          'mean':group.mean()}

grouped = frame.data2.groupby(factor)
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000282CBB09E80>

In [21]:
print(grouped.apply(get_stats))
print(grouped.apply(get_stats).unstack())   # 层次化索引展开

data1                 
(-2.961, -1.42]  count     70.000000
                 max        1.959877
                 mean      -0.079326
                 min       -2.420920
(-1.42, 0.115]   count    479.000000
                 max        2.604815
                 mean      -0.022976
                 min       -2.987015
(0.115, 1.651]   count    403.000000
                 max        3.058415
                 mean      -0.011243
                 min       -3.402030
(1.651, 3.186]   count     48.000000
                 max        2.161785
                 mean      -0.095574
                 min       -3.938884
Name: data2, dtype: float64
                 count       max      mean       min
data1                                               
(-2.961, -1.42]   70.0  1.959877 -0.079326 -2.420920
(-1.42, 0.115]   479.0  2.604815 -0.022976 -2.987015
(0.115, 1.651]   403.0  3.058415 -0.011243 -3.402030
(1.651, 3.186]    48.0  2.161785 -0.095574 -3.938884


In [22]:
# cut得到的是长度相等的切分；qcut得到的是数量相等的切分
grouping = pd.qcut(frame.data1, 10,labels=False)
grouped = frame.data1.groupby(grouping)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,-1.196914,-1.707925,-2.955077
1,100.0,-0.830326,-1.016659,-1.194502
2,100.0,-0.502875,-0.6682,-0.827739
3,100.0,-0.193043,-0.337839,-0.502669
4,100.0,-0.002467,-0.103267,-0.187245
5,100.0,0.2238,0.111393,-0.001629
6,100.0,0.500469,0.360435,0.228444
7,100.0,0.82439,0.656609,0.505829
8,100.0,1.284531,1.03545,0.825672
9,100.0,3.185709,1.77505,1.284797


In [23]:
# 示例：用特定分组的值填充缺失值
# 之前对na值的处理是直接drop，现在有了新的方案是按照分组fillna
s = Series(np.random.randn(6))
s[::2] = np.nan
s

0         NaN
1    0.996653
2         NaN
3    1.350534
4         NaN
5   -0.368054
dtype: float64

In [24]:
s.fillna(s.mean())

0    0.659711
1    0.996653
2    0.659711
3    1.350534
4    0.659711
5   -0.368054
dtype: float64

In [27]:
states = ['Ohio','New Yourk','Vermont', 'Florida', 'Oregon', 'Nevada','California','Idaho']
group_key = ['East'] * 4 + ['west'] * 4
data = Series(np.random.randn(8), index = states)
data[['Vermont','Nevada','Idaho']] = np.nan
data

Ohio          1.207297
New Yourk    -0.732012
Vermont            NaN
Florida      -0.522221
Oregon        0.702607
Nevada             NaN
California   -0.135902
Idaho              NaN
dtype: float64

In [29]:
fill_mean = lambda g:g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)   # 按照均值填充 此处还有个函数是lambda函数的应用

Ohio          1.207297
New Yourk    -0.732012
Vermont      -0.015645
Florida      -0.522221
Oregon        0.702607
Nevada        0.283352
California   -0.135902
Idaho         0.283352
dtype: float64

In [30]:
# 也可以在代码中预定义各组的填充值
fill_values = {'East':0.5,'west':-1}
fill_func = lambda g: g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)

Ohio          1.207297
New Yourk    -0.732012
Vermont       0.500000
Florida      -0.522221
Oregon        0.702607
Nevada       -1.000000
California   -0.135902
Idaho        -1.000000
dtype: float64

In [40]:
# 示例随机采样和排列
# 下面是构造一副英语型扑克牌的方式：
# 红桃（Hearts）、黑桃（Spades）、梅花（Clubs）、方片（Diamonds）
suits = ['H','S','C','D']
card_val = (list(range(1,11)) + [10] * 3 ) *4   #　这块和书中不一样，需要显式指定
base_names = ['A']+ list(range(2,11)) +['J','K','Q']
cards = []
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)
deck = Series(card_val, index = cards)
deck          # 生成的扑克牌和对应的点数对应关系

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
AS      1
2S      2
3S      3
4S      4
5S      5
6S      6
7S      7
8S      8
9S      9
10S    10
JS     10
KS     10
QS     10
AC      1
2C      2
3C      3
4C      4
5C      5
6C      6
7C      7
8C      8
9C      9
10C    10
JC     10
KC     10
QC     10
AD      1
2D      2
3D      3
4D      4
5D      5
6D      6
7D      7
8D      8
9D      9
10D    10
JD     10
KD     10
QD     10
dtype: int64

In [41]:
def draw(deck, n=5):
    return deck.take(np.random.permutation(len(deck))[:n])   # 随机按照蒙特卡罗模拟抽五张牌
draw(deck)

8H      8
7C      7
2D      2
5S      5
10H    10
dtype: int64

In [42]:
get_suit = lambda card:card[-1]
deck.groupby(get_suit).apply(draw, n=2)  # 先按照分组排列，然后抽两张牌

C  JC    10
   7C     7
D  JD    10
   7D     7
H  9H     9
   4H     4
S  9S     9
   KS    10
dtype: int64

In [43]:
# 另外一种类似的办法是：
deck.groupby(get_suit, group_keys = False).apply(draw, n=2)

KC    10
AC     1
AD     1
3D     3
3H     3
7H     7
8S     8
6S     6
dtype: int64