In [1]:
# 处理缺失数据
import numpy as np
import pandas as pd
from numpy import nan as NA
from pandas import DataFrame
from pandas import Series

In [2]:
# Build a small string Series that contains one missing entry (NaN).
raw_values = ['aardvark', 'artichoke', np.nan, 'avocado']
string_data = pd.Series(raw_values)
print(string_data)
# isnull() yields a boolean Series that is True where a value is missing.
missing_mask = string_data.isnull()
print(missing_mask)

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
0    False
1    False
2     True
3    False
dtype: bool


In [3]:
# None is treated as a missing value, just like NaN.
string_data[0] = None
missing_mask = string_data.isnull()
print(missing_mask)

0     True
1    False
2     True
3    False
dtype: bool


In [4]:
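# Methods for handling missing data (NA)
# dropna: filters axis labels based on whether the values for each label contain missing data; a threshold can be used to adjust how much missing data is tolerated.
# fillna: fills missing data with a specified value or with a fill method (ffill or bfill).
# isnull: returns an object of boolean values indicating which values are missing/NaN; the result is the same kind of object as the source (e.g. a Series for a Series).
# notnull: the negation of isnull.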

In [5]:
# --------------------
# 滤除缺失数据
# --------------------

In [6]:
# A float Series with two missing entries (positions 1 and 3).
values = [1, np.nan, 3.5, np.nan, 7]
data = pd.Series(values)
# dropna() returns a new Series holding only the non-missing values.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
 data[data.notnull()] # Equivalent to data.dropna(): boolean-mask selection of the non-missing values

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
# 4x3 frame: one complete row, two partially missing rows, one all-NA row.
rows = [
    [1.0, 6.5, 3.0],
    [1.0, np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 6.5, 3.0],
]
data = pd.DataFrame(rows)
# With default arguments, dropna() discards every row that has at least one NA.
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [9]:
# Drop a row only when every value in that row is NA.
data.dropna(axis='index', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [10]:
# Add a new column, labelled 4, whose values are all NA.
data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [11]:
# Drop a column only when every value in that column is NA.
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
# Use a locally seeded generator so the random values (and therefore the
# displayed output) are identical on every Restart & Run All, without
# touching NumPy's global random state.
rng = np.random.RandomState(12345)
df = DataFrame(rng.randn(7, 3))
df.iloc[:3, 1] = NA  # first 3 rows of the column at position 1 (second column)
df.iloc[:2, 2] = NA  # first 2 rows of the column at position 2 (third column)
df

Unnamed: 0,0,1,2
0,1.461675,,
1,1.894771,,
2,1.016794,,-1.777335
3,0.622431,-0.039849,0.845624
4,-0.581082,-2.400173,0.254417
5,-0.724201,-0.500151,0.26001
6,-0.211176,-1.208431,0.597146


In [13]:
# thresh is the minimum number of non-NA values a row needs in order to be kept.
min_non_na = 2
df.dropna(thresh=min_non_na)

Unnamed: 0,0,1,2
2,1.016794,,-1.777335
3,0.622431,-0.039849,0.845624
4,-0.581082,-2.400173,0.254417
5,-0.724201,-0.500151,0.26001
6,-0.211176,-1.208431,0.597146


In [14]:
# 填充缺失数据

In [15]:
# Replace every NA with the constant 0; the result is a new frame and df is unchanged.
df.fillna(value=0)

Unnamed: 0,0,1,2
0,1.461675,0.0,0.0
1,1.894771,0.0,0.0
2,1.016794,0.0,-1.777335
3,0.622431,-0.039849,0.845624
4,-0.581082,-2.400173,0.254417
5,-0.724201,-0.500151,0.26001
6,-0.211176,-1.208431,0.597146


In [16]:
# Dict keys are column labels, so each column gets its own fill value:
# NA in column 1 becomes 0.5 and NA in column 2 becomes -1.
per_column_fill = {1: 0.5, 2: -1}
df.fillna(per_column_fill)

Unnamed: 0,0,1,2
0,1.461675,0.5,-1.0
1,1.894771,0.5,-1.0
2,1.016794,0.5,-1.777335
3,0.622431,-0.039849,0.845624
4,-0.581082,-2.400173,0.254417
5,-0.724201,-0.500151,0.26001
6,-0.211176,-1.208431,0.597146


In [17]:
# inplace=True modifies df itself instead of returning a filled copy.
df.fillna(value=66.66, inplace=True)
df

Unnamed: 0,0,1,2
0,1.461675,66.66,66.66
1,1.894771,66.66,66.66
2,1.016794,66.66,-1.777335
3,0.622431,-0.039849,0.845624
4,-0.581082,-2.400173,0.254417
5,-0.724201,-0.500151,0.26001
6,-0.211176,-1.208431,0.597146


In [18]:
# Use a locally seeded generator so the random values (and therefore the
# displayed output) are identical on every Restart & Run All, without
# touching NumPy's global random state.
rng = np.random.RandomState(54321)
df = DataFrame(rng.randn(6, 3))
df.iloc[2:, 1] = NA  # rows at position 2 and later, column at position 1
df.iloc[4:, 2] = NA  # rows at position 4 and later, column at position 2
df

Unnamed: 0,0,1,2
0,0.288089,-1.146596,-0.466263
1,-1.57901,-0.407622,1.601867
2,1.20685,,-0.364749
3,-0.176566,,0.204815
4,0.428873,,
5,0.914837,,


In [19]:
# Forward fill: each NA takes the most recent non-NA value above it in the
# same column. ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,0.288089,-1.146596,-0.466263
1,-1.57901,-0.407622,1.601867
2,1.20685,-0.407622,-0.364749
3,-0.176566,-0.407622,0.204815
4,0.428873,-0.407622,0.204815
5,0.914837,-0.407622,0.204815


In [20]:
# Forward fill, but fill at most 2 consecutive NA values in each gap; any
# further NA values in that gap are left as NA. ffill(limit=...) is used
# instead of fillna(method='ffill', limit=...) because the `method` argument
# of fillna is deprecated since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.288089,-1.146596,-0.466263
1,-1.57901,-0.407622,1.601867
2,1.20685,-0.407622,-0.364749
3,-0.176566,-0.407622,0.204815
4,0.428873,,0.204815
5,0.914837,,0.204815


In [21]:
data = pd.Series([1.0, np.nan, 3.5, np.nan, 7])
# Replace each missing entry with the mean of the non-missing values.
mean_value = data.mean()
data.fillna(mean_value)

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [22]:
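# Parameters of the fillna function
# value: scalar value or dict-like object used to fill missing values
# method: fill method ('ffill' or 'bfill'); defaults to None, so it must be given explicitly when no value is passed (deprecated since pandas 2.1 in favour of ffill()/bfill()).
# axis: axis along which to fill; when omitted, filling is done along axis 0.
# inplace: modify the calling object instead of producing a copy
# limit: maximum number of consecutive missing values to fill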