In [1]:
import pandas as pd
from pandas import Series,DataFrame
import numpy as np

### 10.1 GroupBy机制

#### 一.  基本概念
1. groupby在R语言中又叫做,"split-apply-combine"  
  1. pandas数据结构根据传进来的key拆分为多个分组
  2. 将一个函数应用到每个分组产生各自的结果
  3. 组合这些结果到新的对象, 作为返回
2. 分组键中的缺失值会被自动排除, 不会出现在分组结果中.   
3. 查看分组情况 :  
 `GroupBy.size()`返回一个Series, 其index(多个分组键时为多层index)为各分组的键值, 数据为该分组内的记录条数 
  
#### 二.  Series的分组  
  1. `Series.groupby(otherSeries)` : 按照`otherSeries`的值分组, 然后把分组后的index复用到Series上, 形成分组信息
  2. Series分组后进行聚合操作, 产生的新Series结构为:  
   index: otherSeries分好组后的值.  
   `index.name=otherSeries.index.name`
   新Series的data = 聚合后的数据
  3. `Series.groupby([otherSeries1, otherSeries2, ..])` : 按照多个Series进行分组  
   按照多个otherSeries分组, 则产生的新Series的索引是多层级的. 第一层为otherSeries1的值, 第二层为otherSeries2的值
   
#### 三. DataFrame.groupby('columnIdx')  
1. DataFrame默认把所有列, 按照`df['columnIdx']`这个Series进行分组   
 若分组后执行的聚合函数为mean这种数值函数, 则mean()会自动忽略非数值列
 
2. DataFrame语法糖  
 `df.groupby('columnidx')['show_columnidx']` 等价于 `df['show_columnidx'].groupby(df['columnidx'])`

In [2]:
# Toy frame: two label columns (key1, key2) plus two columns of random floats.
df = DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two'] * 2 + ['one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))
df

Unnamed: 0,data1,data2,key1,key2
0,1.557559,-0.612829,a,one
1,-0.7927,0.249046,a,two
2,-0.025781,0.188782,b,one
3,-0.133812,1.637359,b,two
4,-1.983949,-0.596026,a,one


In [3]:
# The SeriesGroupBy object has not computed anything yet;
# it only records how df['data1'] is split according to the values of the Series df['key1'].
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x1116e4dd0>

In [4]:
# Aggregate: mean of data1 within each key1 group.
grouped.mean()

key1
a   -0.406363
b   -0.079796
Name: data1, dtype: float64

In [5]:
# Group by several key Series at once by passing them as a list:
# Series.groupby([otherSeries1, otherSeries2]); the result has a two-level index.
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means

key1  key2
a     one    -0.213195
      two    -0.792700
b     one    -0.025781
      two    -0.133812
Name: data1, dtype: float64

In [6]:
# Move the inner index level (key2) into the columns.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.213195,-0.7927
b,-0.025781,-0.133812


In [7]:
# Group the whole DataFrame by the 'key1' column.
# numeric_only=True states explicitly that the non-numeric column 'key2' is left out
# of the mean; recent pandas versions raise instead of dropping such columns silently.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.406363,-0.319936
b,-0.079796,0.91307


In [8]:
# GroupBy.size() shows how many rows fall into each (key1, key2) group.
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

#### 四. 分组迭代
1. 因为分组条件为多个Series的值区分, 所以可以查看每个分组的层级索引, 及组内包含的数据`Series/DataFrame`  
 `for name,Group in df.groupby('columnIdx')`
2. 通常一个有用的操作是:  
 把分组名与分组内容形成字典dict,可使用`dict(list(df.groupby(['columnidx'])))`
 1. 用list(GroupBy), 把GroupBy内的iterator元组构成列表
 2. 再用dict(list) : 把`(组名,分组内容)`这个列表变成`dict`
 
3. 分组通常是把行分组, 也可指定axis=1,使其把列分组

In [9]:
# Iterate over the groups: each step yields the group key (a tuple when grouping by
# several keys) and the rows that belong to that group.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
for name, group in df.groupby(['key1', 'key2']):
    print(name)
    print(group)
    print("-------------------------------")

('a', 'one')
      data1     data2 key1 key2
0  1.557559 -0.612829    a  one
4 -1.983949 -0.596026    a  one
-------------------------------
('a', 'two')
    data1     data2 key1 key2
1 -0.7927  0.249046    a  two
-------------------------------
('b', 'one')
      data1     data2 key1 key2
2 -0.025781  0.188782    b  one
-------------------------------
('b', 'two')
      data1     data2 key1 key2
3 -0.133812  1.637359    b  two
-------------------------------


In [10]:
# Collect the groups into a dict: list() materialises the (key, rows) pairs, dict() maps key -> rows.
dict(list(df.groupby('key1')))

{'a':       data1     data2 key1 key2
 0  1.557559 -0.612829    a  one
 1 -0.792700  0.249046    a  two
 4 -1.983949 -0.596026    a  one, 'b':       data1     data2 key1 key2
 2 -0.025781  0.188782    b  one
 3 -0.133812  1.637359    b  two}

In [11]:
# Group the columns (axis=1) instead of the rows, using each column's dtype as the key.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(df.dtypes)  # df.dtypes is a Series: column name -> dtype
grouped = df.groupby(df.dtypes, axis=1)

data1    float64
data2    float64
key1      object
key2      object
dtype: object


In [12]:
# Show every dtype together with the sub-frame of columns that have that dtype.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
for dtype, group in grouped:
    print(dtype)
    print(group)
    print('-------------------------------')

float64
      data1     data2
0  1.557559 -0.612829
1 -0.792700  0.249046
2 -0.025781  0.188782
3 -0.133812  1.637359
4 -1.983949 -0.596026
-------------------------------
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
-------------------------------


#### 五.  其它分组条件
1. 字典做分组条件  
  1. 按照字典的value分组, 记录下被分到同一个group的key
  2. key对应pandas数据结构的index/columnIdx, 从而对pandas的数据结构进行分组

2. 函数作为分组条件  
  1. 函数会应用在pandas数据结构的index/columnIdx上, 返回同一个值的作为一个分组
  
3. 根据索引级别分组

In [13]:
# Random 5x5 frame: one row per person, columns 'a' to 'e'.
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)
people

Unnamed: 0,a,b,c,d,e
Joe,0.033863,-1.715167,2.053917,-0.885003,1.172416
Steve,1.179518,-0.725034,-2.587221,-0.777278,0.879079
Wes,-0.984972,0.187762,1.066811,1.005461,-0.397249
Jim,-0.303225,-1.29457,0.468318,-0.054268,-0.808964
Travis,-0.990036,1.249763,-1.857503,1.280221,-0.139408


In [14]:
# A dict as the grouping key: column label -> group name.
# 'f' matches no column of `people`; it is part of the mapping all the same.
mapping = dict(zip('abcdef',
                   ['red', 'red', 'blue', 'blue', 'red', 'orange']))
people.groupby(mapping, axis=1).sum()

Unnamed: 0,blue,red
Joe,1.168914,-0.508887
Steve,-3.364499,1.333563
Wes,2.072273,-1.194459
Jim,0.41405,-2.406759
Travis,-0.577281,0.120319


In [15]:
# A function as the grouping key: len is applied to every index label (the names),
# so rows are grouped by name length.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-1.254333,-2.821975,3.589046,0.066191,-0.033797
5,1.179518,-0.725034,-2.587221,-0.777278,0.879079
6,-0.990036,1.249763,-1.857503,1.280221,-0.139408


In [16]:
# Group by one level of a hierarchical (two-level) column index.
columnIdx = pd.MultiIndex.from_arrays(
    [['US'] * 3 + ['JP'] * 2,
     [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'],
)
df = DataFrame(np.random.randn(4, 5), columns=columnIdx)
df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-1.936234,0.081074,-0.25795,0.719789,-0.15816
1,-1.619354,-0.420456,-0.163847,0.909962,-0.13644
2,-0.849784,1.578561,0.697229,0.006685,-0.082318
3,-2.09089,-1.096405,0.683609,0.947007,-0.527211


In [17]:
# Group the columns by their 'cty' level and count the entries per row in each group.
df.groupby(level='cty',axis=1).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


### 10.2 聚合函数

#### 1. 任何能从数组生成标量的过程, 都叫聚合  
 例如, Series/DataFrame的函数quantile(), 计算Series/DataFrame的分位数. 虽然quantile()没有明确定义在GroupBy对象中, 但是它能从一个Series产生一个标量, 因此它可以用作聚合函数
 
#### 2. 使用aggregate自定义聚合函数
 GroupBy.aggregate(func) 

In [18]:
# quantile() used as an aggregation: 90th percentile of data1 within each key1 group.
df = pd.DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two'] * 2 + ['one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))
df['data1'].groupby(df['key1']).quantile(0.9)

key1
a    0.995973
b    1.364612
Name: data1, dtype: float64

In [19]:
# Custom aggregation: spread (max - min) of each group.
peek2peek = lambda x: x.max() - x.min()
# Aggregate only the numeric columns: max - min is undefined for the strings in 'key2',
# and recent pandas versions raise instead of silently dropping such columns.
df.groupby('key1')[['data1', 'data2']].aggregate(peek2peek)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.456266,1.42094
b,0.490655,1.881052


#### 3. GroupBy.agg()面向多列的多聚合函数应用
1. `GroupBy.agg('aggfuncName')` :  agg函数可使用函数名代表聚合函数
2. `GroupBy.agg([func1,func2])` :  在一列同时应用多个聚合函数
3.  `GroupBy.agg([('alias1','func1'),('alias2','func2')])` :   
 在一列应用多个函数时, 给聚合结果的各列起名(使用元组组成的列表, 每个元组为(别名, 函数名))
4. 在多个列上进行多个聚合函数: `GroupBy[['columnIdx1','columnIdx2', ..]].agg(['aggfuncName1','aggfuncName2'])`
5. 终级应用, 定义在DataFrame的哪一列使用什么样的聚合函数:  
 `GroupBy.agg({'columnIdx1': ['functions1'], 'columnIdx2': ['functions2']})`

In [20]:
# Load the tips data set and append tip_pct: the tip as a fraction of the total bill.
df = pd.read_csv('../example/tips.csv').assign(
    tip_pct=lambda tips: tips['tip'] / tips['total_bill']
)
df.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [21]:
# A built-in aggregation can be requested by its name as a string.
grouped = df.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [22]:
# Apply several aggregations to one column: each function becomes a result column.
grouped_pct.agg(['mean','std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [23]:
# (alias, function) tuples set the names of the result columns ('foo', 'bar').
grouped_pct.agg([('foo','mean'),('bar',np.std)])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [24]:
# Apply several aggregations to several columns.
functions = ['mean', 'count', 'max']
# Select the columns with a list (double brackets): indexing a GroupBy with a bare
# tuple of column names is rejected by recent pandas versions.
grouped[['total_bill', 'tip_pct']].agg(functions)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,total_bill,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,max,mean,count,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,18.42,4,22.75,0.15165,4,0.187735
Fri,Yes,16.813333,15,40.17,0.174783,15,0.26348
Sat,No,19.661778,45,48.33,0.158048,45,0.29199
Sat,Yes,21.276667,42,50.81,0.147906,42,0.325733
Sun,No,20.506667,57,48.17,0.160113,57,0.252672
Sun,Yes,24.12,19,45.35,0.18725,19,0.710345
Thur,No,17.113111,45,41.19,0.160298,45,0.266312
Thur,Yes,19.190588,17,43.11,0.163863,17,0.241255


In [25]:
# Most general form: a dict maps each column to the aggregation(s) applied to it.
grouped.agg({'tip_pct':['min','max','mean'],'size':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,sum
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.120385,0.187735,0.15165,9
Fri,Yes,0.103555,0.26348,0.174783,31
Sat,No,0.056797,0.29199,0.158048,115
Sat,Yes,0.035638,0.325733,0.147906,104
Sun,No,0.059447,0.252672,0.160113,167
Sun,Yes,0.06566,0.710345,0.18725,49
Thur,No,0.072961,0.266312,0.160298,112
Thur,Yes,0.090014,0.241255,0.163863,40


### 10.3 分组最通用的方法apply

#### 一. GroupBy.apply的思路
1. `GroupBy.apply`方法的思路 :    
  1. 把函数(可以是非聚合函数)应用在分好组的`DataFrame`片段上  
  2. 再把这些片段上生成的结果   
  3. 通过`pd.concat`组装到一起.  
2. 若应用于`apply`中的自定义函数还含有参数, 则使用`apply(func,func_param)`的形式传参


In [45]:
# Use apply to pick the top-n rows of every group.
def top_n(df, n, columnIdx):
    """Return the ``n`` rows of ``df`` with the largest values in ``columnIdx``.

    Rows come back in ascending order of ``columnIdx`` (largest last).
    ``tail`` is used rather than ``iloc[-n:]`` so that ``n == 0`` yields an
    empty frame; ``iloc[-0:]`` would return every row.
    """
    return df.sort_values(by=columnIdx).tail(n)
    
# Extra positional arguments given to apply are passed on to top_n: n=3, columnIdx='total_bill'.
df.groupby('day').apply(top_n,3,'total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Fri,96,27.28,4.0,Yes,Fri,Dinner,2,0.146628
Fri,90,28.97,3.0,Yes,Fri,Dinner,2,0.103555
Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Sat,59,48.27,6.73,No,Sat,Dinner,4,0.139424
Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Sun,184,40.55,3.0,Yes,Sun,Dinner,2,0.073983
Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
Thur,85,34.83,5.17,No,Thur,Lunch,4,0.148435


#### 二. 通过分位数分组 (均分)
1. groupby与cut面元连用 :     
 由于pd.cut把Series切分到n个面元, 从而生成一个Categorical类型的Series.   
 Categorical分组信息可作为groupby的条件  
2. pd.qcut(Series,n) : 按样本分位数把Series切分成n个数据量大致相等的组


In [80]:
# 1000 rows with two columns of random floats.
df = pd.DataFrame(dict(data1=np.random.randn(1000),
                       data2=np.random.randn(1000)))
df.head()

Unnamed: 0,data1,data2
0,-0.708793,0.814724
1,0.497586,1.105503
2,-0.436543,-0.94543
3,-0.574237,-0.420498
4,-0.416187,-0.554051


In [81]:
# cut bins + groupby
def get_status(group):
    """Summarise ``group`` as a dict holding its min, max, count and mean."""
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}
# Cut data1 into 4 bins and use the bin of each row as the grouping key for data2.
quartiles = pd.cut(df['data1'],4)
grouped = df['data2'].groupby(quartiles)
# Each group yields a dict of statistics; unstack() spreads the statistic names into columns.
obj = grouped.apply(get_status)
obj.unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-2.838, -1.279]",107.0,3.255839,0.109909,-2.180182
"(-1.279, 0.273]",485.0,3.468034,0.019587,-3.163693
"(0.273, 1.826]",374.0,3.471177,0.025501,-2.789016
"(1.826, 3.379]",34.0,2.767393,0.174577,-1.70045


In [78]:
# qcut: bin by sample quantiles into 10 bins; labels=False returns the bin numbers.
grouping = pd.qcut(df['data2'],10,labels=False)
df['data2'].groupby(grouping).apply(get_status).unstack()

Unnamed: 0_level_0,count,max,mean,min
data2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,-1.35727,-1.814709,-2.885619
1,100.0,-0.864421,-1.085386,-1.351681
2,100.0,-0.513518,-0.675455,-0.862023
3,100.0,-0.246844,-0.3903,-0.51333
4,100.0,-0.006711,-0.125361,-0.244483
5,100.0,0.288082,0.148783,-0.00053
6,100.0,0.554489,0.426151,0.289599
7,100.0,0.814672,0.686113,0.555566
8,100.0,1.237484,1.016201,0.818938
9,100.0,3.220821,1.788441,1.242679


#### 三. 用特定于分组的值填充缺失值
1. 步骤 :  
  1. 使用groupby将数据分组
  2. 使用apply函数, 把'填充策略函数'应用在分好组的每个DataFrame上 
  
2. 自定义各分组的填充值  
 由于apply传入的每个分组片段都有属性name, 其值为该组的分组键. 因此可定义dict, {'分组键': 自定义填充值}, 再用`fill_values[g.name]`取出对应的填充值
  

In [91]:
# Fill missing values with the mean of their group.
obj = pd.Series(np.random.randn(8),
                index=['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon', 'Nevada', 'California', 'Idaho'])
obj[['Vermont', 'Nevada', 'Idaho']] = np.nan  # inject missing values
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(obj)
print('=====================================')
group_key = ['East'] * 4 + ['West'] * 4
# group_keys=False keeps the original state labels as the result index on every pandas
# version; recent versions would otherwise prepend the group key as an extra index level.
obj.groupby(group_key, group_keys=False).apply(lambda x: x.fillna(x.mean()))

Ohio         -0.314515
New York     -2.912437
Vermont            NaN
Florida      -1.494522
Oregon       -0.716492
Nevada             NaN
California   -0.782278
Idaho              NaN
dtype: float64


Ohio         -0.314515
New York     -2.912437
Vermont      -1.573824
Florida      -1.494522
Oregon       -0.716492
Nevada       -0.749385
California   -0.782278
Idaho        -0.749385
dtype: float64

In [95]:
# Fill missing values with a constant that is predefined per group.
obj = pd.Series(np.random.randn(8),
                index=['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon', 'Nevada', 'California', 'Idaho'])
obj[['Vermont', 'Nevada', 'Idaho']] = np.nan  # inject missing values
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(obj)
print('=====================================')
# Fill value for each group key.
fill_values = {'East': 0.5, 'West': -1}
group_key = ['East'] * 4 + ['West'] * 4
# Inside apply every group piece carries its group key as .name, which selects the fill value.
# group_keys=False keeps the original state labels as the result index on every pandas
# version; recent versions would otherwise prepend the group key as an extra index level.
obj.groupby(group_key, group_keys=False).apply(lambda g: g.fillna(fill_values[g.name]))

Ohio          0.000267
New York      1.714961
Vermont            NaN
Florida      -0.289177
Oregon        0.624699
Nevada             NaN
California    0.128856
Idaho              NaN
dtype: float64


Ohio          0.000267
New York      1.714961
Vermont       0.500000
Florida      -0.289177
Oregon        0.624699
Nevada       -1.000000
California    0.128856
Idaho        -1.000000
dtype: float64

### 10.4 透视表和交叉表

#### 一. DataFrame.pivot_table
1. pivot_table:    
  创建一个电子表格, 使用DataFrame数据结构进行表示. 这个DataFrame包括分层索引和多个column   
 根据一个或多个键对数据进行聚合(作为结果的层级索引),并根据行和列上的分组将数据分配到每个矩形区域  
 除此之外, 还能通过参数margins=True对行和列进行小计

2. 参数列表  
  1. values : 需要被聚合的columnIdx.  作为结果DataFrame的最高层columnIdx
  2. index : 需要被作为结果DataFrame分层索引的columnIdx
  3. columns : 被聚合的values内的分组条件(columnIdx). 作为结果DataFrame的第二层columnIdx
  4. aggfunc : 默认为np.mean. 对分组后的column进行聚合的函数
  5. margins : 是否在结果DataFrame上增加名为All的行/列, 给出按aggfunc计算的小计(不一定是求和)

3. stack,pivot,pivot_table的区别
  1. stack : 将数据结构,在最内层index上顺时针旋转.
  2. pivot : 将长表转换为宽表. 因此, 要定义宽表的index, columnIdx, column下的value  
  3. pivot_table : 生成一张带聚合且带小计的电子表. 可自定义index, values=被聚合的字段, columns=聚合字段内的分组条件  
  即 : stack最贴近原生形式,顺时针旋转.  
   pivot:生成一张宽表.  
   pivot_table:生成一张分组聚合后的表

In [97]:
# Reload the tips data set and append tip_pct: the tip as a fraction of the total bill.
df = pd.read_csv('../example/tips.csv').assign(
    tip_pct=lambda tips: tips['tip'] / tips['total_bill']
)
df.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [99]:
# Average (the default aggregation) every numeric column for each [day, smoker] pair.
# The value columns are listed explicitly so that the text column 'time' is never
# averaged; recent pandas versions raise instead of silently dropping such columns.
df.pivot_table(values=['size', 'tip', 'tip_pct', 'total_bill'], index=['day', 'smoker'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,2.25,2.8125,0.15165,18.42
Fri,Yes,2.066667,2.714,0.174783,16.813333
Sat,No,2.555556,3.102889,0.158048,19.661778
Sat,Yes,2.47619,2.875476,0.147906,21.276667
Sun,No,2.929825,3.167895,0.160113,20.506667
Sun,Yes,2.578947,3.516842,0.18725,24.12
Thur,No,2.488889,2.673778,0.160298,17.113111
Thur,Yes,2.352941,3.03,0.163863,19.190588


In [106]:
# Aggregate ['tip_pct', 'size'] with rows indexed by ['time', 'day'] and columns split by smoker;
# margins=True adds the 'All' row and columns holding the subtotals.
df.pivot_table(['tip_pct', 'size'],index=['time', 'day'],columns='smoker',margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,2.0,2.222222,2.166667,0.139622,0.165347,0.158916
Dinner,Sat,2.555556,2.47619,2.517241,0.158048,0.147906,0.153152
Dinner,Sun,2.929825,2.578947,2.842105,0.160113,0.18725,0.166897
Dinner,Thur,2.0,,2.0,0.159744,,0.159744
Lunch,Fri,3.0,1.833333,2.0,0.187735,0.188937,0.188765
Lunch,Thur,2.5,2.352941,2.459016,0.160311,0.163863,0.161301
All,,2.668874,2.408602,2.569672,0.159328,0.163196,0.160803


#### 二. 交叉表
1. crosstab是一种特殊的pivot_table  
 pd.crosstab为参数aggfunc=count的pivot_table  
2. 参数  
  1. index : 用于行上的groupby (Series)
  2. columns : 列上的groupby (Series)
  3. values与aggfunc都与pivot_table中意义一样, 但在crosstab中是可选参数
  
3. pivot_table进行频率统计时, 先要增加1个全1列, 指定参数values=这个全1列, aggfunc='count', 再把columns指定成分组条件

In [107]:
# Ten people with their nationality and handedness.
df = pd.DataFrame(dict(
    Nationality=['USA', 'Japan', 'USA', 'Japan', 'Japan',
                 'Japan', 'USA', 'USA', 'Japan', 'USA'],
    Handedness=['Right-handed', 'Left-handed', 'Right-handed', 'Right-handed', 'Left-handed',
                'Right-handed', 'Right-handed', 'Left-handed', 'Right-handed', 'Right-handed'],
))
df

Unnamed: 0,Handedness,Nationality
0,Right-handed,USA
1,Left-handed,Japan
2,Right-handed,USA
3,Right-handed,Japan
4,Left-handed,Japan
5,Right-handed,Japan
6,Right-handed,USA
7,Left-handed,USA
8,Right-handed,Japan
9,Right-handed,USA


In [122]:
# Frequency table via pivot_table: add a helper column of ones and count it
# for every (Nationality, Handedness) combination; margins=True adds the 'All' totals.
df['number'] = 1
df.pivot_table('number', index='Nationality', columns='Handedness', aggfunc='count', margins=True)

Handedness,Left-handed,Right-handed,All
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,2.0,3.0,5.0
USA,1.0,4.0,5.0
All,3.0,7.0,10.0


In [129]:
# pd.crosstab builds the same frequency table directly from the two Series.
pd.crosstab(df['Nationality'],columns=df['Handedness'],margins=True)

Handedness,Left-handed,Right-handed,All
Nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Japan,2,3,5
USA,1,4,5
All,3,7,10
