In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# NaN = "Not a Number"; NA = "not available".
# The series mixes None and np.nan on purpose: both are missing-value markers.
string_data = pd.Series(
    [None, 'artichoke', np.nan, 'avocado'],
)
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
# Boolean mask of the missing entries (isna is the same method as isnull).
string_data.isna()

0     True
1    False
2     True
3    False
dtype: bool

In [4]:
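# Methods for handling NA values
# Method   Description
# dropna   Filter axis labels based on whether the values for each label contain missing data; a threshold controls how much missing data is tolerated
# fillna   Fill in missing data with a specified value or an interpolation method (such as ffill or bfill)
# isnull   Return an object of boolean values indicating which values are missing/NA; the object has the same type as the source
# notnull  Negation of isnull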

In [5]:
# Short alias for NaN, reused by later cells. Defined from the already-imported
# numpy module instead of adding another import in the middle of the notebook.
NA = np.nan
data = pd.Series([1, NA, 3.5, NA, 7])
# Filter out the missing values; equivalent to data[data.notnull()].
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [6]:
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
# By default DataFrame.dropna() removes every row that contains any NA value.
data.dropna(how='any')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [7]:
# how='all' removes only the rows in which every value is NA.
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [8]:
# Add a column (label 4) consisting solely of NA values.
data[4] = NA
# Work column-wise: remove the columns in which every value is NA.
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [9]:
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = np.nan  # rows 0-3 lose their value in column 1
df.iloc[:2, 2] = np.nan  # rows 0-1 additionally lose their value in column 2
# thresh=2 keeps only the rows that have at least 2 non-NA values, counted
# across all columns; it does not restrict the check to a particular column.
# Rows 0 and 1 have a single non-NA value each, so they are dropped.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,1.048074,,-1.604828
3,-0.626995,,-2.638073
4,-0.066457,-0.313715,0.263844
5,-0.889869,0.162188,-0.252377
6,-0.198165,0.8454,0.305323


In [10]:
# Replace every missing value with 0 (returns a new object).
df.fillna(value=0)

Unnamed: 0,0,1,2
0,-1.483726,0.0,0.0
1,-0.815884,0.0,0.0
2,1.048074,0.0,-1.604828
3,-0.626995,0.0,-2.638073
4,-0.066457,-0.313715,0.263844
5,-0.889869,0.162188,-0.252377
6,-0.198165,0.8454,0.305323


In [11]:
# A dict maps column label -> fill value, so each column gets its own filler.
df.fillna(value={1: 100, 2: 200})

Unnamed: 0,0,1,2
0,-1.483726,100.0,200.0
1,-0.815884,100.0,200.0
2,1.048074,100.0,-1.604828
3,-0.626995,100.0,-2.638073
4,-0.066457,-0.313715,0.263844
5,-0.889869,0.162188,-0.252377
6,-0.198165,0.8454,0.305323


In [12]:
# fillna() normally returns a new object; with inplace=True it modifies df
# itself instead. The return value is not useful in that case (None per the
# pandas docs), and binding it to `_` would shadow IPython's last-output
# variable, so the call is made as a plain statement.
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,-1.483726,0.0,0.0
1,-0.815884,0.0,0.0
2,1.048074,0.0,-1.604828
3,-0.626995,0.0,-2.638073
4,-0.066457,-0.313715,0.263844
5,-0.889869,0.162188,-0.252377
6,-0.198165,0.8454,0.305323


In [13]:
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = np.nan  # column 1 is missing from row 2 onwards
df.iloc[4:, 2] = np.nan  # column 2 is missing from row 4 onwards
# Forward fill: each missing value takes the last valid value above it.
# ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,-1.938353,-1.689334,0.425757
1,-0.598094,-0.176695,0.210936
2,-0.600905,-0.176695,-0.978383
3,-0.370658,-0.176695,-0.222551
4,-0.722491,-0.176695,-0.222551
5,1.30153,-0.176695,-0.222551


In [14]:
# limit=2 forward-fills at most 2 consecutive missing values per gap.
# ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-1.938353,-1.689334,0.425757
1,-0.598094,-0.176695,0.210936
2,-0.600905,-0.176695,-0.978383
3,-0.370658,-0.176695,-0.222551
4,-0.722491,,-0.222551
5,1.30153,,-0.222551


In [15]:
data = pd.Series([1., NA, 3.5, NA, 7])
# Fill the gaps with the series mean (mean() skips NA values by default).
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [16]:
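# Parameters of fillna()
# Parameter  Description
# value      Scalar value or dict-like object used to fill missing values
# method     Fill method ('ffill' or 'bfill'); defaults to None, and is deprecated since pandas 2.1 in favor of .ffill()/.bfill()
# axis       Axis to fill along; defaults to axis=0
# inplace    Modify the calling object instead of producing a copy
# limit      (For forward and backward filling) maximum number of consecutive values to fill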