In [2]:
# Essential functionality
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series

In [3]:
# Reindexing

In [4]:
# A float Series whose labels are not in alphabetical order.
obj = Series(data=[4.5, 7.2, -5.3, 3.6], index=list('dbac'))
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [5]:
# Rearrange the values to follow the new label order; the label 'e' does not
# exist in obj, so its slot is filled with NaN.
obj2 = obj.reindex(index=list('abcde'))
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [6]:
obj3 = Series(data=['blue', 'purple', 'yellow'], index=[0, 2, 4])
# Forward-fill: every label missing from obj3 takes the value of the closest preceding label.
obj3.reindex(index=range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [7]:
# 3x3 integer frame; the rows are labelled a, c, d (there is no 'b' row).
frame = DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [8]:
# Reindex the rows; a label with no match in frame (here 'b') becomes an all-NaN row.
frame2 = frame.reindex(index=list('abcd'))
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [9]:
states = ['Texas', 'Utah', 'California']
# Reindex along the column axis instead of the rows; 'Utah' is not a column of frame, so it is all NaN.
frame.reindex(states, axis='columns')

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [10]:
# Arguments of reindex
# index:      New sequence to use as the index. May be an Index instance or any other sequence-like Python data structure. An Index is used exactly as is, without any copying.
# method:     Interpolation (fill) method.
# fill_value: Substitute value to use when reindexing introduces missing data.
# limit:      Maximum number of elements to fill when forward- or backward-filling.
# level:      Match a simple Index on the given level of a MultiIndex; otherwise select a subset of it.
# copy:       Defaults to True, which always copies; if False, the data is not copied when the new index equals the old one.

In [11]:
# Dropping entries from an axis

In [12]:
obj = Series(np.arange(5, dtype=float), index=list('abcde'))
# drop returns a new Series without the given label; obj itself keeps all five entries.
new_obj = obj.drop(labels='c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [13]:
data = DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
# axis=0 is the default of drop, so the first call removes rows ...
data_row = data.drop(['Colorado', 'Ohio'], axis=0)
# ... while the second call removes columns.
data_col = data.drop(['two', 'four'], axis='columns')
print(data_row)
print(data_col)

          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15
          one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14


In [14]:
# --------------------
# Indexing, selection, and filtering
# --------------------

In [15]:
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj['b'])  # label-based lookup
# Positional lookup goes through .iloc: a bare integer key in obj[...] on a
# label-indexed Series is deprecated (pandas >= 2.1) and is slated to be
# treated as a label instead of a position.
print(obj.iloc[1])

1.0
1.0


In [16]:
# Positional slice: the entries at positions 2 and 3 (the end position is excluded).
obj.iloc[2:4]

c    2.0
d    3.0
dtype: float64

In [17]:
# Select several labels at once; the result follows the order of the requested labels.
obj.loc[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [18]:
# Select by integer position. Positional keys passed to obj[...] on a
# label-indexed Series are deprecated (pandas >= 2.1), so use .iloc explicitly.
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [19]:
# Boolean filtering: keep only the entries whose value is below 2.
obj.loc[obj < 2]

a    0.0
b    1.0
dtype: float64

In [20]:
# Bulk update through a label slice; unlike positional slices, both endpoints
# ('b' and 'c') are included. This modifies obj in place.
obj.loc['b':'c'] = 5
print(obj)

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64


In [21]:
# Fresh 4x4 integer frame used by the DataFrame indexing examples.
data = DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [22]:
# The first two rows, selected by position.
data.iloc[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [23]:
# Keep the rows whose value in column 'three' is greater than 5.
data.loc[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [24]:
# Element-wise comparison; the result is a boolean frame with the same labels.
data.lt(5)

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [25]:
# Assign through a boolean frame: every cell below 5 is overwritten with 0.
# This modifies data in place, and the following cells display the modified frame.
data[data.lt(5)] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [26]:
data.loc['Colorado', ['two', 'three']] # The first argument selects rows, the second selects columns.

two      5
three    6
Name: Colorado, dtype: int64

In [27]:
# Old spelling, from when .ix was available: data.ix[['Colorado', 'Utah'], [3, 0, 1]]
# Equivalent selection in one .loc call: rows by label, columns looked up by position first.
data.loc[['Colorado', 'Utah'], data.columns[[3, 0, 1]]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [28]:
data.iloc[2] # The row at integer position 2 (counting from 0), i.e. the third row

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [29]:
# Old spelling, from when .ix was available: data.ix[data.three > 5, :3]
# Equivalent selection in one .loc call: a boolean row filter plus the first three columns.
data.loc[data['three'] > 5, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [30]:
# Arithmetic and data alignment

In [1]:
s1 = Series(data=[7.3, -2.5, 3.4, 1.5], index=list('acde'))
s2 = Series(data=[-2.1, 3.6, -1.5, 4, 3.1], index=list('acefg'))
# Labels are aligned before adding; a label present in only one operand yields NaN.
s1.add(s2)

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [32]:
df1 = DataFrame(
    np.arange(9, dtype=float).reshape(3, 3),
    columns=list('bcd'),
    index=['Ohio', 'Texas', 'Colorado'],
)
df2 = DataFrame(
    np.arange(12, dtype=float).reshape(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
# Row and column labels are both aligned; a cell missing from either frame is NaN.
df1.add(df2)

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [33]:
df1.add(df2, fill_value=0) # Use 0 in place of a value missing from one frame; a cell missing from both frames stays NaN

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


In [34]:
# Reindex df1 to df2's columns; a column that df1 lacks ('e') is filled with 0 instead of NaN.
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,b,d,e
Ohio,0.0,2.0,0
Texas,3.0,5.0,0
Colorado,6.0,8.0,0


In [35]:
# Arithmetic methods
# add: addition
# sub: subtraction
# div: division
# mul: multiplication

In [36]:
# Operations between DataFrame and Series

In [37]:
arr = np.arange(12, dtype=float).reshape(3, 4)
# Broadcasting: the first row is subtracted from every row.
np.subtract(arr, arr[0])

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [38]:
frame = DataFrame(
    np.arange(12, dtype=float).reshape(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
# The first row of the frame, as a Series labelled by the column names.
series = frame.iloc[0]
print(frame)
print(series)

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64


In [39]:
# The Series labels are matched against the frame's columns and the subtraction is applied to every row.
frame.sub(series, axis='columns')

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [40]:
series2 = Series(range(3), index=list('bef'))
# Labels found in only one operand ('d' in frame, 'f' in series2) become all-NaN columns.
frame.add(series2)

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [41]:
series3 = frame['d']
# Match on the row labels instead: each row has its own 'd' value subtracted from it.
frame.sub(series3, axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [42]:
# Function application and mapping

In [43]:
# NOTE(review): no random seed is set, so these values change on every run.
frame = DataFrame(np.random.randn(4, 3),
                  columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
np.abs(frame) # NumPy ufuncs (element-wise array functions) also work on pandas objects

Unnamed: 0,b,d,e
Utah,0.926417,0.176078,0.100153
Ohio,0.337543,1.155308,0.757115
Texas,0.031684,0.960605,0.6173
Oregon,0.664964,0.438543,2.846081


In [44]:
def f(x):
    """Return the spread of ``x``: its largest value minus its smallest value."""
    return x.max() - x.min()

# Why does the default axis=0 hand whole columns to f? axis=0 means "move
# along the row axis", i.e. from top to bottom, so each x that f receives is
# one column.
print(frame)
print(frame.apply(f))  # default axis=0: max - min of every column
print(frame.apply(f, axis='columns'))  # max - min of every row

               b         d         e
Utah   -0.926417  0.176078 -0.100153
Ohio   -0.337543 -1.155308 -0.757115
Texas   0.031684  0.960605  0.617300
Oregon  0.664964  0.438543  2.846081
b    1.591381
d    2.115912
e    3.603196
dtype: float64
Utah      1.102495
Ohio      0.817764
Texas     0.928921
Oregon    2.407538
dtype: float64


In [45]:
def f(x):
    """Return the smallest and largest value of ``x`` as a Series labelled 'min' and 'max'."""
    lowest, highest = x.min(), x.max()
    return Series([lowest, highest], index=['min', 'max'])

# The function returns a Series, so apply yields one result row per returned label.
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.926417,-1.155308,-0.757115
max,0.664964,0.960605,2.846081


In [46]:
def format(x):
    """Render a number as a string with two decimal places.

    The name shadows the builtin ``format``; it is kept because the following
    cell passes ``format`` to ``Series.map``.
    """
    return '%.2f' % x


# Element-wise application over the whole frame. DataFrame.applymap is
# deprecated since pandas 2.1 in favour of DataFrame.map, so use the new name
# when this pandas version provides it and fall back to applymap otherwise.
elementwise = frame.map if hasattr(frame, 'map') else frame.applymap
elementwise(format)

Unnamed: 0,b,d,e
Utah,-0.93,0.18,-0.1
Ohio,-0.34,-1.16,-0.76
Texas,0.03,0.96,0.62
Oregon,0.66,0.44,2.85


In [47]:
frame['e'].map(format) # On a Series, map applies the function element by element.

Utah      -0.10
Ohio      -0.76
Texas      0.62
Oregon     2.85
Name: e, dtype: object

In [48]:
# Sorting and ranking

In [49]:
obj = Series(range(4), index=list('dabc'))
obj.sort_index(ascending=True)  # order the entries by their index labels

a    1
b    2
c    3
d    0
dtype: int64

In [50]:
frame = DataFrame(
    np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=list('dabc'),
)
print(frame.sort_index(axis='index'))  # order the rows by their labels
print(frame.sort_index(axis='columns'))  # order the columns by their labels

       d  a  b  c
one    4  5  6  7
three  0  1  2  3
       a  b  c  d
three  1  2  3  0
one    5  6  7  4


In [51]:
# Order the columns by label in descending order.
frame.sort_index(axis='columns', ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [52]:
obj = Series(data=[4, 7, -3, 2])
# Sort by value (the old Series.order method no longer exists).
obj.sort_values(ascending=True)

2   -3
3    2
0    4
1    7
dtype: int64

In [53]:
obj = Series(data=[4, np.nan, 7, np.nan, -3, 2])
obj.sort_values(na_position='last')  # NaN entries go to the end; this is the default, spelled out here

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [54]:
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
# Order the rows by the values of one column; sort_index should no longer be used for this.
frame.sort_values('b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [55]:
frame.sort_values(by=['a', 'b']) # Sort by several columns; ties in an earlier column are broken by the next one.

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [56]:
# rank() works on the sorted values: [-5, 0, 2, 4, 4, 7, 7].
# The value 7 occupies positions 6 and 7 (counting from 1), so its rank is their mean, 6.5.
# The value -5 occupies position 1 only, so its rank is 1.
# ...
# Each output entry is the average sorted position of the corresponding input value.
obj = Series(data=[7, -5, 7, 4, 2, 0, 4])
obj.rank(method='average')

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [57]:
obj.rank(method='first') # No averaging: tied values get consecutive ranks in the order they appear in the data.

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [58]:
# Rank in descending order; every member of a tied group gets the largest rank of
# the group, so both 7s (which occupy positions 1 and 2) are ranked 2.
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [59]:
# Three numeric columns used by the row-wise ranking example.
frame = DataFrame({
    'b': [4.3, 7, -3, 2],
    'a': [0, 1, 0, 1],
    'c': [-2, 5, 8, -2.5],
})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [60]:
# Rank within each row, i.e. across the columns (axis=1).
# Take row 0 as an example: its values sorted ascending are [-2.0, 0, 4.3],
# so for the columns (b, a, c) rank outputs [3, 2, 1].
frame.rank(axis=1)

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [61]:
# The method option of rank
# average: Default; assign the average rank to every member of a tied group.
# min/max: Use the smallest/largest rank of the tied group.
# first:   Assign ranks in the order the values appear in the original data.

In [62]:
# Axis indexes with duplicate labels

In [63]:
obj = Series(range(5), index=list('aabbc'))
print(obj)
# is_unique tells whether every index label occurs exactly once.
print(obj.index.is_unique)

a    0
a    1
b    2
b    3
c    4
dtype: int64
False


In [64]:
# Selecting a label that occurs more than once yields a Series rather than a scalar.
print(obj.loc['a'])
print(obj.loc['b'])

a    0
a    1
dtype: int64
b    2
b    3
dtype: int64


In [65]:
# Random 4x3 frame whose row labels repeat ('a' twice, 'b' twice).
# NOTE(review): no random seed is set, so the values change on every run.
df = DataFrame(np.random.randn(4, 3), index=list('aabb'))
df

Unnamed: 0,0,1,2
a,1.792788,-0.744451,-0.46829
a,0.191357,0.139785,1.810245
b,-2.171865,-1.287709,-0.828183
b,0.845363,-0.351035,0.102817


In [66]:
df.loc['b'] # A duplicated row label returns a DataFrame

Unnamed: 0,0,1,2
b,-2.171865,-1.287709,-0.828183
b,0.845363,-0.351035,0.102817
