In [3]:
import pandas as pd
import numpy as np

In [4]:
from pandas import Series, DataFrame

# Series使用

In [16]:
# A Series built from a plain list gets a default 0..n-1 integer index.
test = pd.Series(data=[1, 3, 2, 1])
test

0    1
1    3
2    2
3    1
dtype: int64

In [17]:
test.values

array([1, 3, 2, 1])

In [10]:
test.index

RangeIndex(start=0, stop=4, step=1)

In [18]:
# Same kind of Series, but with explicit string labels as the index.
test2 = pd.Series(data=[1, 4, 2, 3], index=list('abcd'))
test2

a    1
b    4
c    2
d    3
dtype: int64

In [19]:
test2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [20]:
test2.values

array([1, 4, 2, 3])

In [21]:
test2[test2 > 1]

b    4
c    2
d    3
dtype: int64

## numpy操作

In [22]:
np.square(test2)

a     1
b    16
c     4
d     9
dtype: int64

In [23]:
'c' in test2

True

In [24]:
'q' in test2

False

## 字典操作

In [31]:
# Letters of 'world' mapped to 1..5; the dict keys become the Series index.
dict0 = dict(zip('world', range(1, 6)))
test3 = pd.Series(data=dict0)
test3

w    1
o    2
r    3
l    4
d    5
dtype: int64

## NaN标记缺失值

In [32]:
# Labels to look up in dict0; 'h' and 'e' are not keys of dict0, so they
# come out as NaN in the resulting Series.
keys = list('heworld')
test4 = pd.Series(dict0, index=keys)
test4

h    NaN
e    NaN
w    1.0
o    2.0
r    3.0
l    4.0
d    5.0
dtype: float64

## .isnull  .notnull检查缺失值

In [34]:
pd.isnull(test4)


h     True
e     True
w    False
o    False
r    False
l    False
d    False
dtype: bool

In [35]:
pd.notnull(test4)

h    False
e    False
w     True
o     True
r     True
l     True
d     True
dtype: bool

## name 属性

In [36]:
# Name the Series itself and its index; both names show up in the repr.
test4.index.name = 'key'
test4.name = 'primary'
test4

key
h    NaN
e    NaN
w    1.0
o    2.0
r    3.0
l    4.0
d    5.0
Name: primary, dtype: float64

## 改变索引

In [37]:
test

0    1
1    3
2    2
3    1
dtype: int64

In [38]:
# Replace the index in place by assigning a new list of labels.
test.index = list('abcd')

In [39]:
test

a    1
b    3
c    2
d    1
dtype: int64

# DataFrame

In [40]:
# Dict of equal-length lists: each key becomes one column of the frame.
data = {
    'key': list('hellow'),
    'num': list(range(1, 7)),
    '?': list('!@#$%^'),
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,key,num,?
0,h,1,!
1,e,2,@
2,l,3,#
3,l,4,$
4,o,5,%
5,w,6,^


## head方法选出前五行

In [43]:
frame.head()

Unnamed: 0,key,num,?
0,h,1,!
1,e,2,@
2,l,3,#
3,l,4,$
4,o,5,%


## 按照指定顺序排列column

In [44]:
pd.DataFrame(data, columns=['num', 'key', '?'])

Unnamed: 0,num,key,?
0,1,h,!
1,2,e,@
2,3,l,#
3,4,l,$
4,5,o,%
5,6,w,^


## 缺失值

In [55]:
# 'will_be_NaN' is not a key of `data`, so that column is created empty (NaN).
frame2 = pd.DataFrame(
    data,
    index=['row' + str(n) for n in range(1, 7)],
    columns=['num', '?', 'key', 'will_be_NaN'],
)
frame2

Unnamed: 0,num,?,key,will_be_NaN
row1,1,!,h,
row2,2,@,e,
row3,3,#,l,
row4,4,$,l,
row5,5,%,o,
row6,6,^,w,


## 检索

In [51]:
frame2['num']

row1    1
row2    2
row3    3
row4    4
row5    5
row6    6
Name: num, dtype: int64

In [52]:
frame2.key

row1    h
row2    e
row3    l
row4    l
row5    o
row6    w
Name: key, dtype: object

## 修改列

In [59]:
# Assign through [] rather than attribute access: `frame2.will_be_NaN = ...` only
# updates a column because one with that name already exists; for a new name it
# would set a plain Python attribute instead of creating a column.
frame2['will_be_NaN'] = np.arange(6)
frame2

Unnamed: 0,num,?,key,will_be_NaN
row1,1,!,h,0
row2,2,@,e,1
row3,3,#,l,2
row4,4,$,l,3
row5,5,%,o,4
row6,6,^,w,5


In [60]:
# Overwrite the whole column with the integers 4..9 (one value per row).
frame2['will_be_NaN'] = np.arange(4, 10, 1)
frame2

Unnamed: 0,num,?,key,will_be_NaN
row1,1,!,h,4
row2,2,@,e,5
row3,3,#,l,6
row4,4,$,l,7
row5,5,%,o,8
row6,6,^,w,9


## 增加列与使用 del 语句删除列

In [63]:
# Assigning to a column name that does not exist yet creates the column.
# The comparison is a string comparison of each key against 'l'.
frame2['boolean'] = frame2['key'].ge('l')
frame2

Unnamed: 0,num,?,key,will_be_NaN,boolean
row1,1,!,h,4,False
row2,2,@,e,5,False
row3,3,#,l,6,True
row4,4,$,l,7,True
row5,5,%,o,8,True
row6,6,^,w,9,True


In [65]:
# `del` removes the column from frame2 in place.
del frame2['?']
frame2

Unnamed: 0,num,key,will_be_NaN,boolean
row1,1,h,4,False
row2,2,e,5,False
row3,3,l,6,True
row4,4,l,7,True
row5,5,o,8,True
row6,6,w,9,True


## 字典操作

In [69]:
# Nested dict: outer keys become the columns, inner keys become the row index.
dict02 = {
    'hello': {2017: 0.8, 2018: 12, 2019: 0},
    'world': {2017: 1, 2018: 2, 2019: 3},
}
frame3 = pd.DataFrame(dict02)
frame3

Unnamed: 0,hello,world
2017,0.8,1
2018,12.0,2
2019,0.0,3


## Numpy语法

In [70]:
frame3.T

Unnamed: 0,2017,2018,2019
hello,0.8,12.0,0.0
world,1.0,2.0,3.0


## name属性

In [73]:
# Label both axes; the names appear in the frame's header when displayed.
frame3.columns.name = 'states'
frame3.index.name = 'salaries'
frame3

states,hello,world
salaries,Unnamed: 1_level_1,Unnamed: 2_level_1
2017,0.8,1
2018,12.0,2
2019,0.0,3


## .values属性返回ndarray形式

In [75]:
frame3.values

array([[ 0.8,  1. ],
       [12. ,  2. ],
       [ 0. ,  3. ]])

## 索引对象

In [77]:
# Keep a reference to the Series' Index object so it can be inspected on its own.
obj = pd.Series([0, 1, 2], index=list('qwe'))
index = obj.index
index

Index(['q', 'w', 'e'], dtype='object')

In [78]:
index[1:]

Index(['w', 'e'], dtype='object')

In [80]:
# Build a standalone Index from the integers 0, 1, 2.
labels = pd.Index(np.arange(0, 3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [81]:
# Pass the existing Index object itself as the index of the new Series.
obj2 = pd.Series(data=[1.2, -3.6, 7.2], index=labels)
obj2

0    1.2
1   -3.6
2    7.2
dtype: float64

In [84]:
obj2.index is labels

True

In [85]:
frame3

states,hello,world
salaries,Unnamed: 1_level_1,Unnamed: 2_level_1
2017,0.8,1
2018,12.0,2
2019,0.0,3


In [86]:
frame3.columns


Index(['hello', 'world'], dtype='object', name='states')

In [87]:
frame3.index

Int64Index([2017, 2018, 2019], dtype='int64', name='salaries')

In [92]:
'hello' in frame3

True

In [94]:
'hello' in frame3.columns

True

In [95]:
'2017' in frame3

False

In [96]:
2017 in frame3.index

True

## 重建索引reindex

In [97]:
# Float Series labelled d, u, s, t (in that order).
obj = pd.Series({'d': 9.3, 'u': 4.5, 's': 2.1, 't': -3.6})
obj

d    9.3
u    4.5
s    2.1
t   -3.6
dtype: float64

In [100]:
# reindex returns a new Series in the requested label order; the labels
# 'g' and 'l' are not in obj, so they are filled with NaN.
obj2 = obj.reindex(index=list('dusglt'))
obj2

d    9.3
u    4.5
s    2.1
g    NaN
l    NaN
t   -3.6
dtype: float64

## 插值

In [108]:
# String values on a sparse integer index (1, 4, 6 and 7 are deliberately absent).
obj3 = pd.Series({0: 'zero', 2: 'two', 3: 'three', 5: 'five', 8: 'eight'})
obj3

0     zero
2      two
3    three
5     five
8    eight
dtype: object

In [111]:
# 'bfill' (backward fill): each missing label takes the value of the next
# label that exists in obj3.
obj3.reindex(index=range(9), method='bfill')


0     zero
1      two
2      two
3    three
4     five
5     five
6    eight
7    eight
8    eight
dtype: object

## loc标签索引

In [112]:
# 3x3 integer frame (values 0..8) with string row and column labels.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    columns=['one', 'two', 'three'],
    index=list('qwe'),
)
frame

Unnamed: 0,one,two,three
q,0,1,2
w,3,4,5
e,6,7,8


In [113]:
frame.loc[['w', 'e']]

Unnamed: 0,one,two,three
w,3,4,5
e,6,7,8


In [114]:
frame.loc['w']

one      3
two      4
three    5
Name: w, dtype: int64

## 轴向上删除drop

In [117]:
# Float Series 0.0..4.0 labelled q, w, e, r, t.
obj = pd.Series(np.arange(5, dtype=float), index=list('qwert'))
obj

q    0.0
w    1.0
e    2.0
r    3.0
t    4.0
dtype: float64

In [118]:
# drop returns a new Series without the given label; obj itself is unchanged.
new_obj = obj.drop(labels='e')
new_obj

q    0.0
w    1.0
r    3.0
t    4.0
dtype: float64

In [120]:
# 4x4 integer frame (values 0..15); rows are named in words, columns in Roman numerals.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    columns=['I', 'II', 'III', 'IV'],
    index=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,I,II,III,IV
one,0,1,2,3
two,4,5,6,7
three,8,9,10,11
four,12,13,14,15


In [125]:
# Without an axis argument, drop removes rows; index= spells that out.
data.drop(index='one')

Unnamed: 0,I,II,III,IV
two,4,5,6,7
three,8,9,10,11
four,12,13,14,15


In [126]:
# To drop a column instead, name it via columns= (the keyword form of axis=1).
data.drop(columns='II')

Unnamed: 0,I,III,IV
one,0,2,3
two,4,6,7
three,8,10,11
four,12,14,15


## 索引、选择、过滤

In [127]:
# Float Series 0.0..3.0 labelled q, w, e, r.
obj = pd.Series(np.arange(4, dtype=float), index=list('qwer'))
obj

q    0.0
w    1.0
e    2.0
r    3.0
dtype: float64

In [157]:
# Slicing with labels differs from ordinary Python slicing: the end label is included.
obj['w':'r']
# The result includes the end label 'r' (and does not include 'q').

w    1.0
e    2.0
r    3.0
dtype: float64

In [158]:
obj[1:3]
# Integer slice: the end position 3 is not included.

w    1.0
e    2.0
dtype: float64

In [146]:
# NumPy slicing example: a plain array of the integers 0..19.
a = np.arange(0, 20)
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [147]:
a[2:4]

array([2, 3])

## 设值修改

In [148]:
obj

q    0.0
w    1.0
e    2.0
r    3.0
dtype: float64

In [150]:
# Assign through a label slice: every element from 'q' through 'e' is set to 5.
obj.loc['q':'e'] = 5
obj

q    5.0
w    5.0
e    5.0
r    3.0
dtype: float64

## 根据布尔值选择

In [151]:
# Rebuild a fresh 4x4 integer frame (values 0..15) for the boolean-selection examples.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['one', 'two', 'three', 'four'],
    columns=['I', 'II', 'III', 'IV'],
)
data

Unnamed: 0,I,II,III,IV
one,0,1,2,3
two,4,5,6,7
three,8,9,10,11
four,12,13,14,15


In [152]:
data[data['I'] > 4]

Unnamed: 0,I,II,III,IV
three,8,9,10,11
four,12,13,14,15


In [153]:
data < 5

Unnamed: 0,I,II,III,IV
one,True,True,True,True
two,True,False,False,False
three,False,False,False,False
four,False,False,False,False


In [154]:
# Writing the string marker '<5' into integer columns needs object dtype. Cast
# explicitly first instead of relying on a silent upcast during the assignment.
data = data.astype(object)
# Boolean-frame assignment: every cell whose value is below 5 is replaced.
data[data < 5] = '<5'
data

Unnamed: 0,I,II,III,IV
one,<5,<5,<5,<5
two,<5,5,6,7
three,8,9,10,11
four,12,13,14,15


## 使用 loc（轴标签）和 iloc（整数索引）进行选取

In [174]:
data.loc['two', ['II', 'IV']]

II    5
IV    7
Name: two, dtype: object

In [175]:
data.iloc[1, [1, 3]]

II    5
IV    7
Name: two, dtype: object

In [176]:
data.iloc[2, [0, 2]]

I       8
III    10
Name: three, dtype: object

In [177]:
# Replace every cell that holds the '<5' marker with the number 888.
data[data.eq('<5')] = 888
data

Unnamed: 0,I,II,III,IV
one,888,888,888,888
two,888,5,6,7
three,8,9,10,11
four,12,13,14,15


In [182]:
# Rows where column 'III' is >= 10, restricted to the columns from 'III' onward.
# A single .loc call selects rows and columns together, instead of chaining a
# boolean filter onto the result of an .iloc column slice.
data.loc[data['III'] >= 10, 'III':]

Unnamed: 0,III,IV
one,888,888
three,10,11
four,14,15


## 算术和数据对齐

pandas 最重要的一个功能是，它可以对不同索引的对象进行算术运算。
在将对象相加时，如果存在不同的索引对，则结果的索引就是该索引对的并集。
对于有数据库经验的用户，这就像在索引标签上进行自动外连接。

In [183]:
# Two Series whose indexes only partly overlap (shared labels: q, e, r).
s1 = pd.Series({'q': 2.1, 'w': 4.1, 'e': -5.2, 'r': 2.8})
s2 = pd.Series({'q': 4.3, 'e': 6.4, 'a': 7.3, 'r': -8.2, 'o': 6.1})
# Addition aligns on labels; labels present in only one operand yield NaN.
s1.add(s2)

a    NaN
e    1.2
o    NaN
q    6.4
r   -5.4
w    NaN
dtype: float64

In [188]:
# Two small integer frames that share only column 'B'.
df1 = pd.DataFrame([[3, 1], [2, 1]], columns=['A', 'B'])
df2 = pd.DataFrame([[6, -1], [9, -1]], columns=['B', 'C'])
# Columns found in only one frame come out as NaN; 'B' is subtracted element-wise.
df1.sub(df2)

Unnamed: 0,A,B,C
0,,-5,
1,,-8,


## 填充值
在对不同索引的对象进行算术运算时，你可能希望当一个对象中某个轴标签
在另一个对象中找不到时，填充一个特殊值（比如 0）。

In [190]:
# Two float frames of different shape: df2 has one extra row and an extra column 't'.
df1 = pd.DataFrame(
    np.arange(12, dtype=float).reshape(3, 4),
    columns=['q', 'w', 'e', 'r'],
)
df2 = pd.DataFrame(
    np.arange(20, dtype=float).reshape(4, 5),
    columns=['q', 'w', 'e', 'r', 't'],
)

# Blank out a single cell so that df2 itself contains a missing value.
df2.at[1, 'w'] = np.nan
df2

Unnamed: 0,q,w,e,r,t
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [192]:
df1

Unnamed: 0,q,w,e,r
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [193]:
df1 + df2

Unnamed: 0,e,q,r,t,w
0,4.0,0.0,6.0,,2.0
1,13.0,9.0,15.0,,
2,22.0,18.0,24.0,,20.0
3,,,,,


## add方法，fill_value作为参数

In [195]:
df1.add(df2, fill_value=888)

Unnamed: 0,e,q,r,t,w
0,4.0,0.0,6.0,892.0,2.0
1,13.0,9.0,15.0,897.0,893.0
2,22.0,18.0,24.0,902.0,20.0
3,905.0,903.0,906.0,907.0,904.0


## DataFrame和Series之间操作

In [200]:
# 3x4 float frame (values 0.0..11.0) with rows 'one'..'three' and columns q, w, e, r.
frame = pd.DataFrame(
    np.arange(12, dtype=float).reshape(3, 4),
    index=['one', 'two', 'three'],
    columns=['q', 'w', 'e', 'r'],
)
# The first row, as a Series indexed by the column labels.
series = frame.iloc[0]
# Print both objects with one blank line between them.
print(frame, series, sep='\n\n')

         q    w     e     r
one    0.0  1.0   2.0   3.0
two    4.0  5.0   6.0   7.0
three  8.0  9.0  10.0  11.0

q    0.0
w    1.0
e    2.0
r    3.0
Name: one, dtype: float64


In [203]:
# Subtracting the first row (`series`, taken from frame.iloc[0]) from `frame` applies
# the subtraction to every row. This is called broadcasting.


frame - series
# By default, arithmetic between a DataFrame and a Series matches the Series index
# against the DataFrame's columns and broadcasts down the rows.


Unnamed: 0,q,w,e,r
one,0.0,0.0,0.0,0.0
two,4.0,4.0,4.0,4.0
three,8.0,8.0,8.0,8.0


In [206]:
# If a label is missing from either the DataFrame's columns or the Series index,
# both operands are reindexed to the union of the labels before the operation.
series2 = pd.Series([0, 1, 2], index=list('wrf'))
series2

w    0
r    1
f    2
dtype: int64

In [207]:
series2 + frame

Unnamed: 0,e,f,q,r,w
one,,,,4.0,1.0
two,,,,8.0,5.0
three,,,,12.0,9.0


In [213]:
# To match on the rows and broadcast across the columns instead, use one of the
# arithmetic methods with an explicit axis.
series3 = frame['w']
# Put the blank line in `sep`: passing '\n\n' as a positional argument makes print()
# add its default single-space separator on both sides of it, which leaves a
# trailing space after the frame and a leading space before the Series.
print(frame, series3, sep='\n\n')

         q    w     e     r
one    0.0  1.0   2.0   3.0
two    4.0  5.0   6.0   7.0
three  8.0  9.0  10.0  11.0 

 one      1.0
two      5.0
three    9.0
Name: w, dtype: float64


In [215]:
frame.sub(series3, axis='index')

Unnamed: 0,q,w,e,r
one,-1.0,0.0,1.0,2.0
two,-1.0,0.0,1.0,2.0
three,-1.0,0.0,1.0,2.0
