In [1]:
# 分组级运算和转换
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
import matplotlib.pyplot as plt
from numpy.random import randn
import os
from datetime import datetime

In [2]:
# Small demo frame: two string key columns and two random numeric columns.
df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                'key2': ['one', 'two', 'one', 'two', 'one'],
                'data1': np.random.randn(5),
                'data2': np.random.randn(5)})
# Suppose we want to add columns to the DataFrame holding the per-group means.
# Select the numeric columns explicitly: the string column 'key2' cannot be
# averaged, and recent pandas raises instead of silently dropping it.
k1_means = df.groupby('key1')[['data1', 'data2']].mean().add_prefix('mean_')
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.468598,-0.921386
b,0.677545,0.764841


In [3]:
# Attach the per-key1 means to every row: match df['key1'] against the index of k1_means.
df.merge(k1_means, left_on='key1', right_index=True)

Unnamed: 0,key1,key2,data1,data2,mean_data1,mean_data2
0,a,one,0.811799,-2.118995,0.468598,-0.921386
1,a,two,1.034439,0.135667,0.468598,-0.921386
4,a,one,-0.440445,-0.780831,0.468598,-0.921386
2,b,one,1.761861,1.888262,0.677545,0.764841
3,b,two,-0.406771,-0.358581,0.677545,0.764841


In [4]:
# Another way to achieve the same goal: a group-wise transform with the mean.
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Insert a few NA values. `.ix` no longer exists in pandas, so pick the third
# row by position (via the index slice) and the columns by label with `.loc`.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-0.462057,-0.070218,-0.155935,0.406953,-1.277281
Steve,0.411274,-0.544938,0.135432,-1.035441,-0.647206
Wes,-0.396716,,,0.797704,0.916997
Jim,1.409027,-0.697698,1.04992,0.026381,0.0505
Travis,-0.102511,-0.634433,-0.576203,1.220831,2.074399


In [5]:
# One group label per row of `people`, in row order.
key = 'one two one two one'.split()
# Column means within each label group.
people.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,-0.320428,-0.352326,-0.366069,0.808496,0.571372
two,0.910151,-0.621318,0.592676,-0.50453,-0.298353


In [6]:
# transform applies a function to each group and places the results back at the
# matching positions of the original index. The string alias 'mean' is used
# because passing the NumPy callable np.mean is deprecated in recent pandas.
people.groupby(key).transform('mean')

Unnamed: 0,a,b,c,d,e
Joe,-0.320428,-0.352326,-0.366069,0.808496,0.571372
Steve,0.910151,-0.621318,0.592676,-0.50453,-0.298353
Wes,-0.320428,-0.352326,-0.366069,0.808496,0.571372
Jim,0.910151,-0.621318,0.592676,-0.50453,-0.298353
Travis,-0.320428,-0.352326,-0.366069,0.808496,0.571372


In [7]:
def demean(arr):
    """Return `arr` with its own mean subtracted (intended to be passed to transform)."""
    centre = arr.mean()
    return arr - centre

# Subtract each group's mean from the rows belonging to that group.
people_by_key = people.groupby(key)
demeaned = people_by_key.transform(demean)
demeaned

Unnamed: 0,a,b,c,d,e
Joe,-0.141629,0.282107,0.210134,-0.401543,-1.848653
Steve,-0.498876,0.07638,-0.457244,-0.530911,-0.348853
Wes,-0.076289,,,-0.010792,0.345625
Jim,0.498876,-0.07638,0.457244,0.530911,0.348853
Travis,0.217917,-0.282107,-0.210134,0.412335,1.503028


In [8]:
# Sanity check: the group means of the demeaned data should be zero up to floating-point error.
demeaned.groupby(key).agg('mean')

Unnamed: 0,a,b,c,d,e
one,-1.850372e-17,0.0,-1.387779e-17,5.5511150000000004e-17,0.0
two,2.775558e-17,-5.5511150000000004e-17,-5.5511150000000004e-17,-5.5511150000000004e-17,-2.775558e-17


In [9]:
# 接下来进入apply的学习，一般性的拆分-应用-合并操作
def top(df, n=5, column='tip_pct'):
    """Return the `n` rows of `df` with the largest values in `column`.

    Rows come back in ascending order of `column`, so the largest value is last.
    `sort_values` replaces the removed `sort_index(by=...)` spelling, and
    `tail(n)` returns an empty frame for n=0 where `[-n:]` returned every row.
    """
    return df.sort_values(by=column).tail(n)


In [10]:
# Build the path from its components so it resolves on every OS; the previous
# literal mixed '/' with Windows-only backslashes.
path = os.path.join('..', 'My_python_for_data_analyst', 'data', 'tips.csv')
tips = pd.read_csv(path)
# Add a column with the tip as a fraction of the total bill.
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:6]   # first six rows

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


In [11]:
# The five rows with the highest tip_pct over the whole dataset (defaults spelled out).
top(tips, n=5, column='tip_pct')

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [12]:
# Grouping by smoker and calling apply with top gives the top rows of each
# smoker group, stacked into one result:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [13]:
# Extra arguments after the function (n, column) are passed on to top.
# The group keys together with the original index form the hierarchical index of the result.
tips.groupby(['smoker','day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


In [14]:
# Summary statistics of tip_pct within each smoker group.
tip_pct_by_smoker = tips.groupby('smoker')['tip_pct']
result = tip_pct_by_smoker.describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [15]:
# Pivot the 'smoker' index level so each statistic is listed per smoker value.
result.unstack(level='smoker')

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [16]:
# Calling a method such as describe on a GroupBy is really just a shortcut for
# the following two steps.
def f(x):
    return x.describe()

tips.groupby('smoker')['tip_pct'].apply(f)

smoker       
No      count    151.000000
        mean       0.159328
        std        0.039910
        min        0.056797
        25%        0.136906
        50%        0.155625
        75%        0.185014
        max        0.291990
Yes     count     93.000000
        mean       0.163196
        std        0.085119
        min        0.035638
        25%        0.106771
        50%        0.153846
        75%        0.195059
        max        0.710345
Name: tip_pct, dtype: float64

In [17]:
# Passing group_keys=False to groupby stops the group keys from being combined
# with the original index into a hierarchical index.
tip_2 = tips.groupby('smoker').apply(top)                     # default: keys in the index
tip_1 = tips.groupby('smoker', group_keys=False).apply(top)   # keys left out of the index
print(tip_1, tip_2)

total_bill   tip smoker   day    time  size   tip_pct
88        24.71  5.85     No  Thur   Lunch     2  0.236746
185       20.69  5.00     No   Sun  Dinner     5  0.241663
51        10.29  2.60     No   Sun  Dinner     2  0.252672
149        7.51  2.00     No  Thur   Lunch     2  0.266312
232       11.61  3.39     No   Sat  Dinner     2  0.291990
109       14.31  4.00    Yes   Sat  Dinner     2  0.279525
183       23.17  6.50    Yes   Sun  Dinner     4  0.280535
67         3.07  1.00    Yes   Sat  Dinner     1  0.325733
178        9.60  4.00    Yes   Sun  Dinner     2  0.416667
172        7.25  5.15    Yes   Sun  Dinner     2  0.710345             total_bill   tip smoker   day    time  size   tip_pct
smoker                                                           
No     88        24.71  5.85     No  Thur   Lunch     2  0.236746
       185       20.69  5.00     No   Sun  Dinner     5  0.241663
       51        10.29  2.60     No   Sun  Dinner     2  0.252672
       149        7.51  2.

In [22]:
# Quantile and bucket analysis of a dataset.
frame = pd.DataFrame({
    'data1': np.random.randn(1000),
    'data2': np.random.randn(1000),
})
# cut splits the value range of data1 into bins of equal width.
factor = pd.cut(frame['data1'], bins=4)
factor.head(10)

0    (-0.138, 1.655]
1    (-1.93, -0.138]
2     (-3.73, -1.93]
3     (-3.73, -1.93]
4    (-0.138, 1.655]
5    (-1.93, -0.138]
6    (-0.138, 1.655]
7    (-1.93, -0.138]
8    (-1.93, -0.138]
9    (-1.93, -0.138]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.73, -1.93] < (-1.93, -0.138] < (-0.138, 1.655] < (1.655, 3.447]]

In [24]:
# 由cut返回的factor对象可直接用于groupby
def get_stats(group):
    """Return a dict with the min, max, count and mean of `group`, in that key order."""
    stat_names = ('min', 'max', 'count', 'mean')
    # Each name is looked up as a method on the group and called without arguments.
    return {name: getattr(group, name)() for name in stat_names}

# Group data2 by the bins computed from data1. observed=False is stated
# explicitly: it keeps every bin of the categorical in the result, and recent
# pandas warns when this is left implicit for categorical group keys.
grouped = frame.data2.groupby(factor, observed=False)
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000278EC1046D8>

In [26]:
# Compute the per-bin statistics once and reuse them for both views.
group_stats = grouped.apply(get_stats)
print(group_stats)
print(group_stats.unstack())   # spread the inner index level (statistic name) into columns

data1                 
(-3.73, -1.93]   count     29.000000
                 max        2.358781
                 mean       0.283241
                 min       -2.142477
(-1.93, -0.138]  count    400.000000
                 max        2.770157
                 mean       0.002558
                 min       -2.855511
(-0.138, 1.655]  count    528.000000
                 max        2.838561
                 mean       0.033148
                 min       -3.163147
(1.655, 3.447]   count     43.000000
                 max        1.638431
                 mean      -0.132355
                 min       -1.795309
Name: data2, dtype: float64
                 count       max      mean       min
data1                                               
(-3.73, -1.93]    29.0  2.358781  0.283241 -2.142477
(-1.93, -0.138]  400.0  2.770157  0.002558 -2.855511
(-0.138, 1.655]  528.0  2.838561  0.033148 -3.163147
(1.655, 3.447]    43.0  1.638431 -0.132355 -1.795309


In [33]:
# cut yields bins of equal width; qcut yields bins holding equal numbers of observations.
# labels=False returns the integer bin number instead of interval labels.
grouping = pd.qcut(frame['data1'], q=10, labels=False)
grouped = frame['data1'].groupby(grouping)
decile_stats = grouped.apply(get_stats)
decile_stats.unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,-1.178678,-1.726122,-3.722773
1,100.0,-0.766461,-0.943136,-1.177516
2,100.0,-0.465062,-0.616191,-0.764217
3,100.0,-0.217281,-0.334249,-0.463204
4,100.0,0.03112,-0.094461,-0.213453
5,100.0,0.288146,0.150729,0.031171
6,100.0,0.551781,0.420318,0.288156
7,100.0,0.826206,0.700933,0.554092
8,100.0,1.263453,1.009045,0.831153
9,100.0,3.447491,1.696372,1.264516
