### 基本功能 排序和排名
- 
    • 对行或列索引进行排序
    • 对于DataFrame，根据任意一个轴上的索引进行排序
    • 可以指定升序降序
    • 按值排序
    • 对于DataFrame，可以指定按值排序的列
    • rank函数

In [2]:
import numpy as np
from pandas import Series, DataFrame

print ('根据索引排序，对于DataFrame可以指定轴。')
obj = Series(range(4), index = ['d', 'a', 'b', 'c'])
print (obj.sort_index())
frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index = ['three', 'one'],
                  columns = list('dabc'))
# 按行排序
print (frame.sort_index())
# 按列排序
print( frame.sort_index(axis = 1))
# ascending = False 降序
print (frame.sort_index(axis = 1, ascending = False)) # 降序
print()

print ('根据值排序')
obj = Series([4, 7, -3, 2])
print (obj.sort_values()) # order已淘汰
print()

print ('DataFrame指定列排序')
frame = DataFrame({'b':[4, 7, -3, 2], 'a':[0, 1, 0, 1]})
print (frame)
print (frame.sort_values(by = 'b')) # sort_index(by = ...)已淘汰
print (frame.sort_values(by = ['a', 'b']))
print()

print ('rank，求排名的平均位置(从1开始)')
obj = Series([7, -5, 7, 4, 2, 0, 4])
# 对应排名：-5(1), 0(2), 2(3), 4(4), 4(5), 7(6), 7(7)
print (obj.rank())
print (obj.rank(method = 'first'))  # 去第一次出现，不求平均值。
print (obj.rank(ascending = False, method = 'max')) # 逆序，并取最大值。所以-5的rank是7.
frame = DataFrame({'b':[4.3, 7, -3, 2],
                  'a':[0, 1, 0, 1],
                  'c':[-2, 5, 8, -2.5]})
print (frame)
print (frame.rank(axis = 1))


根据索引排序，对于DataFrame可以指定轴。
a    1
b    2
c    3
d    0
dtype: int64
       d  a  b  c
one    4  5  6  7
three  0  1  2  3
       a  b  c  d
three  1  2  3  0
one    5  6  7  4
       d  c  b  a
three  0  3  2  1
one    4  7  6  5

根据值排序
2   -3
3    2
0    4
1    7
dtype: int64

DataFrame指定列排序
   b  a
0  4  0
1  7  1
2 -3  0
3  2  1
   b  a
2 -3  0
3  2  1
0  4  0
1  7  1
   b  a
2 -3  0
0  4  0
3  2  1
1  7  1

rank，求排名的平均位置(从1开始)
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64
     b  a    c
0  4.3  0 -2.0
1  7.0  1  5.0
2 -3.0  0  8.0
3  2.0  1 -2.5
     b    a    c
0  3.0  2.0  1.0
1  3.0  1.0  2.0
2  1.0  2.0  3.0
3  3.0  2.0  1.0


In [8]:
# -*- coding: utf-8 -*- 

import numpy as np
from pandas import Series, DataFrame

print ('重复的索引')
obj = Series(range(5), index = ['a', 'a', 'b', 'b', 'c'])
print (obj.index.is_unique) # 判断是非有重复索引
# print obj['a'][0], obj.a[1]
df = DataFrame(np.random.randn(4, 3), index = ['a', 'a', 'b', 'b'])
print (df)
print (df.ix['b'].ix[0])
print (df.ix['b'].ix[1])

重复的索引
False
          0         1         2
a  0.269140  0.137496 -1.738879
a  0.468879 -0.087178  2.252480
b  0.560772  0.851843  0.166579
b  1.055077 -1.065766  1.315886
0    0.560772
1    0.851843
2    0.166579
Name: b, dtype: float64
0    1.055077
1   -1.065766
2    1.315886
Name: b, dtype: float64


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if sys.path[0] == '':
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


In [9]:
# -*- coding: utf-8 -*- 

import numpy as np
from pandas import Series, DataFrame

print ('求和')
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
              index = ['a', 'b', 'c', 'd'],
              columns = ['one', 'two'])
print (df)
print (df.sum())  # 按列求和
print (df.sum(axis = 1) ) # 按行求和
print()

print ('平均数')
print (df.mean(axis = 1, skipna = False))
print (df.mean(axis = 1))
print()

print ('其它')
print (df.idxmax())
print (df.cumsum())
print (df.describe())
obj = Series(['a', 'a', 'b', 'c'] * 4)
print( obj.describe())


求和
    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
one    9.25
two   -5.80
dtype: float64
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

平均数
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64
a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

其它
one    b
two    d
dtype: object
    one  two
a  1.40  NaN
b  8.50 -4.5
c   NaN  NaN
d  9.25 -5.8
            one       two
count  3.000000  2.000000
mean   3.083333 -2.900000
std    3.493685  2.262742
min    0.750000 -4.500000
25%    1.075000 -3.700000
50%    1.400000 -2.900000
75%    4.250000 -2.100000
max    7.100000 -1.300000
count     16
unique     3
top        a
freq       8
dtype: object


### 层次化索引
- 
    • 使你能在一个轴上拥有多个（两个以上）索引级别。抽象的说，它使你能以低
    纬度形式处理高维度数据。
    
    
    • 通过stack与unstack变换DataFrame

In [10]:
# -*- coding: utf-8 -*- 

import numpy as np
from pandas import Series, DataFrame, MultiIndex

print ('Series的层次索引')
data = Series(np.random.randn(10),
              index = [['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                       [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
print (data)
print (data.index)
print (data.b)
print (data['b':'c'])
print (data[:2])
print (data.unstack())
print (data.unstack().stack())
print()

print ('DataFrame的层次索引')
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
print( frame)
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
print (frame)
print (frame.ix['a', 1])
print (frame.ix['a', 2]['Colorado'])
print (frame.ix['a', 2]['Ohio']['Red'])
print()

print ('直接用MultiIndex创建层次索引结构')
print (MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Gree', 'Red', 'Green']],
                             names = ['state', 'color'])
)

Series的层次索引
a  1   -0.098461
   2    1.787009
   3   -0.268434
b  1   -0.406123
   2    0.208323
   3    0.282483
c  1   -0.262988
   2   -0.083770
d  2    1.249196
   3    0.718496
dtype: float64
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
1   -0.406123
2    0.208323
3    0.282483
dtype: float64
b  1   -0.406123
   2    0.208323
   3    0.282483
c  1   -0.262988
   2   -0.083770
dtype: float64
a  1   -0.098461
   2    1.787009
dtype: float64
          1         2         3
a -0.098461  1.787009 -0.268434
b -0.406123  0.208323  0.282483
c -0.262988 -0.083770       NaN
d       NaN  1.249196  0.718496
a  1   -0.098461
   2    1.787009
   3   -0.268434
b  1   -0.406123
   2    0.208323
   3    0.282483
c  1   -0.262988
   2   -0.083770
d  2    1.249196
   3    0.718496
dtype: float64

DataFrame的层次索引
     Ohio     Colorado
    Green Red    Green
a 1     0   1        2
  2     3   4        5
b 1    

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


## 其它话题 面板(Pannel)数据

    • 通过三维ndarray创建pannel对象
    • 通过ix[...]选取需要的数据
    • 访问顺序：item -> major -> minor
    • 通过stack展现面板数据

In [12]:
# -*- coding: utf-8 -*- 

import numpy as np
import pandas as pd
#import pandas.io.formats as web
from pandas import Series, DataFrame, Index, Panel

pdata = Panel(dict((stk, web.get_data_yahoo(stk, '1/1/2016', '1/15/2016')) for stk in ['AAPL', 'GOOG', 'BIDU', 'MSFT']))
print (pdata)
pdata = pdata.swapaxes('items', 'minor')
print (pdata)
print()

print ("访问顺序：# Item -> Major -> Minor")
print (pdata['Adj Close'])
print (pdata[:, '1/5/2016', :])
print (pdata['Adj Close', '1/6/2016', :])
print()

print ('Panel与DataFrame相互转换')
stacked = pdata.ix[:, '1/7/2016':, :].to_frame()
print (stacked)
print (stacked.to_panel())


NameError: name 'web' is not defined