In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


Table of Contents

1.[introduction to panda data structure](#intro)

   [series](#series)
   
   [DataFrame & python data structure](#DataFrame)
   
   [Index Object](#IndexObject)
   
 2.[Essential Functions](#Functions)
   
   [reindexing](#reindexing)
   
   [indexing, selection, filtering](#indexing)
   
   [ix](#ix)
   
   [sorting and ranking](#sorting)
   
   

## intro
## series
A Series is a one-dimensional array-like object containing an array of data (of any NumPy data type) and an associated array of data labels, called its index

一个前面有index的数组。

In [2]:
from pandas import Series, DataFrame
obj = Series([1,2,3,4])
obj

0    1
1    2
2    3
3    4
dtype: int64

custom index!!
===

In [3]:
obj2 = Series([2,4,6,7],index = ['a','b','c','d'])
obj2
obj2['a']

2

使用boolean方法筛选

In [4]:
obj2[obj2 >4]

c    6
d    7
dtype: int64

In [5]:
np.exp(obj2)

a       7.389056
b      54.598150
c     403.428793
d    1096.633158
dtype: float64

某种程度上series can be considered as a fixed-length, ordered dict。

In [6]:
'b' in obj2

True

In [7]:
'r' in obj2

False

所以这样一看，我们可以通过字典来构建series!!
==

In [8]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

如下，还可以传入index，但是如果没有对应的value，就会是nan

In [9]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

index可以修改：
==

In [10]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      1
Steve    2
Jeff     3
Ryan     4
dtype: int64

pd.isnull
===
去检查是不是有null。

In [11]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [12]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [13]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

一个非常重要的特点是，series之间的加法能够加对应的值！
===
A critical Series feature for many applications is that it automatically aligns differently- indexed data in arithmetic operations:
====

In [14]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

series的对象和index都有一个name的attribute,就像行的名字，列的名字，表格名字一样的感觉。可以和pandas的一些功能一起使用

In [15]:
obj4.name = 'population'

In [16]:
obj4.index.name = 'state'

In [17]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

## DataFrame
想一个dict of series which use the same index

In [18]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002],'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


## 1 输入dict

In [19]:
data = { 'state': ['Ohino', 'Ohino', 'Ohino', 'Nevada', 'Nevada'],
         'year': [2000, 2001, 2002, 2001, 2002],
         'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}

In [20]:
data

{'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
 'state': ['Ohino', 'Ohino', 'Ohino', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002]}

In [21]:
# dict to sereis
#不指定index，则data的key作为index
s = pd.Series(data)
s

state    [Ohino, Ohino, Ohino, Nevada, Nevada]
year            [2000, 2001, 2002, 2001, 2002]
pop                  [1.5, 1.7, 3.6, 2.4, 2.9]
dtype: object

In [22]:
#dict to dataframe
#不指定column,key作为column
df = pd.DataFrame(data)
df

Unnamed: 0,state,year,pop
0,Ohino,2000,1.5
1,Ohino,2001,1.7
2,Ohino,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


## 2 输入list

In [23]:
data = [[2000, 'Ohino', 1.5],
        [2001, 'Ohino', 1.7],
        [2002, 'Ohino', 3.6],
        [2001, 'Nevada', 2.4],
        [2002, 'Nevada', 2.9]]
data

[[2000, 'Ohino', 1.5],
 [2001, 'Ohino', 1.7],
 [2002, 'Ohino', 3.6],
 [2001, 'Nevada', 2.4],
 [2002, 'Nevada', 2.9]]

In [24]:
#list to series
l = pd.Series(data)
print(l)
l = pd.Series(data, index = ['one','two','tree','four','five'])
print(l)

0     [2000, Ohino, 1.5]
1     [2001, Ohino, 1.7]
2     [2002, Ohino, 3.6]
3    [2001, Nevada, 2.4]
4    [2002, Nevada, 2.9]
dtype: object
one      [2000, Ohino, 1.5]
two      [2001, Ohino, 1.7]
tree     [2002, Ohino, 3.6]
four    [2001, Nevada, 2.4]
five    [2002, Nevada, 2.9]
dtype: object


In [25]:
#list to dataframe
df = pd.DataFrame(data)
print(df)
df = pd.DataFrame(data, columns = ['year','state','population'])
print(df)

      0       1    2
0  2000   Ohino  1.5
1  2001   Ohino  1.7
2  2002   Ohino  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9
   year   state  population
0  2000   Ohino         1.5
1  2001   Ohino         1.7
2  2002   Ohino         3.6
3  2001  Nevada         2.4
4  2002  Nevada         2.9


## 3 输入array

In [26]:
data = [[2000, 'Ohino', 1.5],
        [2001, 'Ohino', 1.7],
        [2002, 'Ohino', 3.6],
        [2001, 'Nevada', 2.4],
        [2002, 'Nevada', 2.9]]
data = np.array(data)
data

array([['2000', 'Ohino', '1.5'],
       ['2001', 'Ohino', '1.7'],
       ['2002', 'Ohino', '3.6'],
       ['2001', 'Nevada', '2.4'],
       ['2002', 'Nevada', '2.9']], dtype='<U21')

In [27]:
df = pd.DataFrame(data,columns = ['year','state','population'])
df

Unnamed: 0,year,state,population
0,2000,Ohino,1.5
1,2001,Ohino,1.7
2,2002,Ohino,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


## 4 输入series

In [28]:
s

state    [Ohino, Ohino, Ohino, Nevada, Nevada]
year            [2000, 2001, 2002, 2001, 2002]
pop                  [1.5, 1.7, 3.6, 2.4, 2.9]
dtype: object

In [29]:
#sereis to dict
dic = s.to_dict()

In [30]:
dic

{'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
 'state': ['Ohino', 'Ohino', 'Ohino', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002]}

In [31]:
## sereis to array
foo = s.as_matrix()
foo

  


array([list(['Ohino', 'Ohino', 'Ohino', 'Nevada', 'Nevada']),
       list([2000, 2001, 2002, 2001, 2002]),
       list([1.5, 1.7, 3.6, 2.4, 2.9])], dtype=object)

In [32]:
foo[0]

['Ohino', 'Ohino', 'Ohino', 'Nevada', 'Nevada']

## 5 dataframe

In [33]:
df

Unnamed: 0,year,state,population
0,2000,Ohino,1.5
1,2001,Ohino,1.7
2,2002,Ohino,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [34]:
## df to array
foo1 = DataFrame.as_matrix(df)
foo2 = df.as_matrix()
foo3 = df.values
foo4 = np.array(df)
print(foo1,'\n\n',foo2,'\n\n',foo3,'\n\n',foo4)

[['2000' 'Ohino' '1.5']
 ['2001' 'Ohino' '1.7']
 ['2002' 'Ohino' '3.6']
 ['2001' 'Nevada' '2.4']
 ['2002' 'Nevada' '2.9']] 

 [['2000' 'Ohino' '1.5']
 ['2001' 'Ohino' '1.7']
 ['2002' 'Ohino' '3.6']
 ['2001' 'Nevada' '2.4']
 ['2002' 'Nevada' '2.9']] 

 [['2000' 'Ohino' '1.5']
 ['2001' 'Ohino' '1.7']
 ['2002' 'Ohino' '3.6']
 ['2001' 'Nevada' '2.4']
 ['2002' 'Nevada' '2.9']] 

 [['2000' 'Ohino' '1.5']
 ['2001' 'Ohino' '1.7']
 ['2002' 'Ohino' '3.6']
 ['2001' 'Nevada' '2.4']
 ['2002' 'Nevada' '2.9']]


  
  This is separate from the ipykernel package so we can avoid doing imports until


In [35]:
## df to dictionary
df1 = df.to_dict(orient='dict')
print(df1,'\n')
print(df1['year'])

{'year': {0: '2000', 1: '2001', 2: '2002', 3: '2001', 4: '2002'}, 'state': {0: 'Ohino', 1: 'Ohino', 2: 'Ohino', 3: 'Nevada', 4: 'Nevada'}, 'population': {0: '1.5', 1: '1.7', 2: '3.6', 3: '2.4', 4: '2.9'}} 

{0: '2000', 1: '2001', 2: '2002', 3: '2001', 4: '2002'}


In [36]:
df2 = df.to_dict(orient = "list")
print(df2,'\n')

print(df2['year'])


{'year': ['2000', '2001', '2002', '2001', '2002'], 'state': ['Ohino', 'Ohino', 'Ohino', 'Nevada', 'Nevada'], 'population': ['1.5', '1.7', '3.6', '2.4', '2.9']} 

['2000', '2001', '2002', '2001', '2002']


In [37]:
df3 = df.to_dict(orient = "series")
print(df3,'\n')
print(df3['year'])

{'year': 0    2000
1    2001
2    2002
3    2001
4    2002
Name: year, dtype: object, 'state': 0     Ohino
1     Ohino
2     Ohino
3    Nevada
4    Nevada
Name: state, dtype: object, 'population': 0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
Name: population, dtype: object} 

0    2000
1    2001
2    2002
3    2001
4    2002
Name: year, dtype: object


In [38]:
df4 = df.to_dict(orient='records')
print(df4,'\n')
print(df4[0])

[{'year': '2000', 'state': 'Ohino', 'population': '1.5'}, {'year': '2001', 'state': 'Ohino', 'population': '1.7'}, {'year': '2002', 'state': 'Ohino', 'population': '3.6'}, {'year': '2001', 'state': 'Nevada', 'population': '2.4'}, {'year': '2002', 'state': 'Nevada', 'population': '2.9'}] 

{'year': '2000', 'state': 'Ohino', 'population': '1.5'}


Pass column names
===

In [39]:
frame1 = DataFrame(data, columns = ['year','state','pop'])
frame1

Unnamed: 0,year,state,pop
0,2000,Ohino,1.5
1,2001,Ohino,1.7
2,2002,Ohino,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


pass row names -- index
==

In [40]:
frame2 = DataFrame(data, columns = ['year','state','pop'], index = ['one', 'two', 'three', 'four', 'five'])
frame2


Unnamed: 0,year,state,pop
one,2000,Ohino,1.5
two,2001,Ohino,1.7
three,2002,Ohino,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9


In [41]:
frame2.columns

Index(['year', 'state', 'pop'], dtype='object')

访问每一列元素，既可以用 “.”，也可以用[]
==

In [42]:
frame2['state']

one       Ohino
two       Ohino
three     Ohino
four     Nevada
five     Nevada
Name: state, dtype: object

In [43]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: object

## frame.ix
可以用ix去访问特定index的行。
==

In [44]:
frame2.ix['four']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


year       2001
state    Nevada
pop         2.4
Name: four, dtype: object

增加列
===
[]法
==

In [45]:
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohino,1.5,16.5
two,2001,Ohino,1.7,16.5
three,2002,Ohino,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [46]:
# 重新赋值
frame2.debt = np.arange(5)

In [47]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohino,1.5,0
two,2001,Ohino,1.7,1
three,2002,Ohino,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


直接赋值series。但series要注意，长度要和dataframe一样，否则会有nan。
===

In [48]:
val = Series([-1.2, -1.5, -1.7], index = ['two','three','five'])
frame2.debt = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohino,1.5,
two,2001,Ohino,1.7,-1.2
three,2002,Ohino,3.6,-1.5
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,-1.7


根据已有数据赋值
===

In [49]:
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohino,1.5,,False
two,2001,Ohino,1.7,-1.2,False
three,2002,Ohino,3.6,-1.5,False
four,2001,Nevada,2.4,,False
five,2002,Nevada,2.9,-1.7,False


删除列
===
del
===

In [50]:
del frame2['eastern']

In [51]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohino,1.5,
two,2001,Ohino,1.7,-1.2
three,2002,Ohino,3.6,-1.5
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,-1.7


还可以传入一个nested dict的形式去生成dataframe
==
外面的key看做是columns, inner key as row indices
==

In [52]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [53]:
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


.T随时可以transpose
===

In [54]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


dataframe的列和行的名字都可以添加及修改！
==

In [55]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [56]:
frame3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

## IndexObject
index object
===
虽然我们不需要很了解index作为一个对象，但是作者是想说这个也是panda的一个重要组成部分。他有一个特点是immutable，所以分享给其他数据用作index，会很安全。这个对象既像一个list，又想一个set，因为每一个数是独一无二。

In [57]:
obj = Series(np.random.randn(3),index = np.arange(3))

In [58]:
obj

0    0.925612
1   -0.391786
2    0.947284
dtype: float64

In [59]:
index = obj.index

In [60]:
index

Int64Index([0, 1, 2], dtype='int64')

In [61]:
index[1:]

Int64Index([1, 2], dtype='int64')

In [62]:
2 in obj.index

True

## Functions
## reindexing
就是给value重新加不一样的index

In [63]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [64]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

发现有nan，于是在reindex的时候要小心这个！如下操作即可！
===

In [65]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value = 0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

或者，在时间线分析中，需要interpolation插入，fill of values when reindexing, 于是就有一个method叫ffill(fill values forward)，如下
==

In [66]:
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method = 'ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [67]:
#bfill --- fill backward values
obj3 = Series(['blue', 'purple', 'yellow'], index=[1, 3, 5])
obj3.reindex(range(6), method = 'bfill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

默认情况下修改的是行的index，但是列的index也可以改哦！只要传入参数columns即可
==

In [68]:
frame = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [69]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns = states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [70]:
# reindex row and columns in one shot
frame.reindex(index=['a', 'b', 'c', 'd'])

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


使用ix可以行和列一起修改了！
==

In [71]:
frame.ix[['a', 'b', 'c', 'd'], states]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


## drop
entries from an axis
删除！
==

In [72]:
obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

通过index删除
==
批量删除，传入list。而且，有了index在手，想删哪里就删哪里！！！！只要说明是那个axis就好！默认不写的话是行index。axis = 1,代表列。

In [73]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [74]:
obj.drop(['c','d'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [75]:
data = DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [76]:
data.drop(['one','four'],axis = 1)

Unnamed: 0,two,three
Ohio,1,2
Colorado,5,6
Utah,9,10
New York,13,14


## indexing
selection, filtering
==
如果用label去切片，那么最后的那个label是包含的，这和正常的slicing不一样！！！
==

In [77]:
obj = Series(range(4), index = ['a','b','c','d'])
obj

a    0
b    1
c    2
d    3
dtype: int64

In [78]:
obj['b':'d']

b    1
c    2
d    3
dtype: int64

In [79]:
obj[1:3]

b    1
c    2
dtype: int64

In [80]:
data = DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


选取行
==

In [81]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


boolean方法选数据，特定范围的值
==

In [82]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [83]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [84]:
data[data < 5] = 0

In [85]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


## ix
data[列的label]选择列
===
data.ix[行的label/数字]选择行
===
data.ix[: , 列的label/数字]切片选行，label选列
==
data.ix[行的label/数字,列的label/数字]选择行和列
===

In [86]:
data.ix['Colorado', ['two', 'three']]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


two      5
three    6
Name: Colorado, dtype: int64

In [87]:
data.ix[['Colorado', 'Utah'], [3, 0, 1]]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [88]:
#默认是行的index
data.ix[2]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [89]:
data.ix[:'Utah', 'two']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [90]:
#也可以用boolean！！
data.ix[data.three > 5, :3]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


选择列！！
===

In [91]:
data[['one','three']]

Unnamed: 0,one,three
Ohio,0,0
Colorado,0,6
Utah,8,10
New York,12,14


不需要使用如下方法其实！
==

In [92]:
data.ix[:,['one','three']]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,one,three
Ohio,0,0
Colorado,0,6
Utah,8,10
New York,12,14


In [93]:
data.ix[:,1]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Ohio         0
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [94]:
data.ix[0:2,1:3]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,two,three
Ohio,0,0
Colorado,5,6


## Arithmetic 
and data alignment
====

In [95]:
df1 =  DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 =  DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))

In [96]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


如果想避免出现nan,我们需要加上fill_value，让没有这个value的那个变成一个规定的值。
==

In [97]:
df1.add(df2, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


broadcasting
==

In [98]:
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
series = frame.ix[0]
print(series)

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


In [99]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [100]:
frame + series

Unnamed: 0,b,d,e
Utah,0.0,2.0,4.0
Ohio,3.0,5.0,7.0
Texas,6.0,8.0,10.0
Oregon,9.0,11.0,13.0


In [101]:
series2 = Series(range(3), index=['b', 'e', 'f'])
series2

b    0
e    1
f    2
dtype: int64

如果对应的加数没有value，那么就会是nan。这个是上面的两个dataframe相加的例子是一样的。
==

In [102]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


Function application and mapping
==
很多numpy的函数elementwise的那种，也可以作用在dataframe上。
==
dataframe也可以使用.apply函数。
==

In [103]:
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-0.656465,0.440515,-1.139722
Ohio,-0.264573,-1.634157,-0.255566
Texas,0.668726,0.474205,-0.136463
Oregon,-1.404178,-0.660561,0.448461


In [104]:
f = lambda x: x.max() - x.min()

In [105]:
frame.apply(f)

b    2.072904
d    2.108362
e    1.588183
dtype: float64

In [106]:
frame.apply(f,axis = 1)

Utah      1.580237
Ohio      1.378591
Texas     0.805189
Oregon    1.852639
dtype: float64

下面的函数，把function的结果作为series传了出来，很厉害！
==

In [107]:
def f(x):
    return Series([x.min(), x.max()], index = ['min', 'max'])

In [108]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.404178,-1.634157,-1.139722
max,0.668726,0.474205,0.448461


下面的例子是展示element-wise python functions作用在df上。这里使用的是.appymap函数。每一个数都保留两位小数。
==

In [109]:
format = lambda x : '%.2f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-0.66,0.44,-1.14
Ohio,-0.26,-1.63,-0.26
Texas,0.67,0.47,-0.14
Oregon,-1.4,-0.66,0.45


The reason for the name applymap is that Series has a map method for applying an ele- ment-wise function:
==

In [110]:
frame['e'].map(format)

Utah      -1.14
Ohio      -0.26
Texas     -0.14
Oregon     0.45
Name: e, dtype: object

## sorting
and ranking

Sort index
==
## sort_index()

In [111]:
obj = Series(range(4), index = ['d','a','b','c'])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [112]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [113]:
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


Sort index可以传入axis参数，这样就知道sort哪个index了！加上ascending参数，可以控制升序/降序
==

In [114]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [115]:
frame.sort_index(axis = 1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [116]:
frame.sort_index(axis = 1, ascending = False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


Sort by value
==
## sort_values()

In [117]:
obj = Series([4,7,-3,2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

Sort by column
==
用sort_values()最好！
==

In [118]:
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [119]:
frame.sort_index(by = 'b')

  """Entry point for launching an IPython kernel.


Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [120]:
frame.sort_index(by = ['a','b'])

  """Entry point for launching an IPython kernel.


Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [121]:
frame.sort_values(by = ['a','b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


ranking
==
如果啥都不改，默认出来的结果，如果一样大，则会出现0.5这样。想要有序，可以加上method = 'first'或者'max'或'min'，效果略有不同。升降序依旧是修改ascending

In [122]:
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [123]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

降序！
==

In [124]:
obj.rank(ascending = False, method = 'max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [125]:
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


还可以根据axis给rank。
==

In [126]:
frame.rank(axis = 1)

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


Axis indexes with duplicate values
==
index.is_unique检测是否index是unique
==

In [127]:
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj.index.is_unique

False