<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#dropna()" data-toc-modified-id="dropna()-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>dropna()</a></span></li><li><span><a href="#fillna()" data-toc-modified-id="fillna()-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>fillna()</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Small string Series with one missing entry (NaN) at position 2.
string_data = Series(
    data=['aardvark', 'artichoke', np.nan, 'avocado'],
)

In [3]:
string_data  # display the Series; the entry at index 2 is NaN

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
pd.isnull(string_data)  # element-wise mask: True where the value is missing

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
string_data[0] = None  # overwrite the first entry with Python's None

In [6]:
pd.isnull(string_data)  # the None at index 0 is reported as missing, just like NaN

0     True
1    False
2     True
3    False
dtype: bool

## dropna()
参数 | 说明
---|---
axis | 默认axis=0，删除存在缺失值的行；axis=1，删除存在缺失值的列
how='all' | 删除全为NA的行或列
thresh=2 | 保留至少有2个非NA值的行

In [7]:
from numpy import nan as NA

In [8]:
# Numeric Series with missing values at positions 1 and 3.
data = Series(
    data=[1, NA, 3.5, NA, 7],
)

In [9]:
data  # display the Series; positions 1 and 3 are NaN

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [10]:
data.dropna() # drop the missing values

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data.loc[data.notnull()]  # boolean mask keeps only the non-missing entries

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data.loc[data.isnull()]  # boolean mask selects only the missing entries

1   NaN
3   NaN
dtype: float64

In [13]:
data[~data.isnull()]  # ~ inverts the boolean mask, keeping the non-missing entries

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
# 4x3 frame: row 0 is complete, rows 1 and 3 are partly NA, row 2 is all NA.
data = DataFrame(data=[
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])

In [15]:
data  # display the frame with its NaN entries

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
# Defaults spelled out: drop every row that contains at least one missing value.
cleaned = data.dropna(axis=0, how='any')

In [17]:
cleaned  # only row 0 has no missing values, so it is the only row left

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [18]:
data.dropna(axis=0, how='all')  # drop only the rows in which every value is NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [19]:
# Add a new column (label 4) whose values are all NA
data[4] = NA

In [20]:
data  # column 4 is now present and entirely NaN

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [21]:
# Drop the columns whose values are all NA.
data.dropna(how='all',       # only when every value is NA
            axis='columns')  # operate on columns (same as axis=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [22]:
# 7x3 frame of standard-normal values. A locally seeded generator keeps the
# values identical on every Restart & Run All without touching NumPy's
# global random state.
df = DataFrame(np.random.RandomState(0).randn(7, 3))

In [23]:
df.iloc[:4, 1] = NA # set the first 4 rows of the 2nd column (position 1) to NA

In [24]:
df.iloc[:2, 2] = NA # set the first 2 rows of the 3rd column (position 2) to NA

In [25]:
df  # column 1 is NaN in rows 0-3, column 2 is NaN in rows 0-1

Unnamed: 0,0,1,2
0,-1.07867,,
1,-1.315517,,
2,0.591444,,-0.018877
3,0.18984,,1.860634
4,-0.701338,-0.254066,-0.323945
5,-0.520631,-0.496564,-1.577097
6,-0.727681,0.64429,0.20089


In [26]:
df.dropna() # drop the rows that contain any missing value

Unnamed: 0,0,1,2
4,-0.701338,-0.254066,-0.323945
5,-0.520631,-0.496564,-1.577097
6,-0.727681,0.64429,0.20089


In [27]:
df.dropna(thresh=2) # keep the rows that have at least 2 non-NA values

Unnamed: 0,0,1,2
2,0.591444,,-0.018877
3,0.18984,,1.860634
4,-0.701338,-0.254066,-0.323945
5,-0.520631,-0.496564,-1.577097
6,-0.727681,0.64429,0.20089


## fillna()
参数 | 说明
---|---
value | 填充缺失值的值
method | 填充方式，默认None；'ffill'用前面的值来填充后面的缺失值，'bfill'用后面的值来填充前面的缺失值（pandas 2.1起该参数已弃用，建议改用ffill()/bfill()）
axis | 填充所沿的轴：默认axis=0，行；axis=1，列
inplace | 原地修改
limit | 用于前向或后向填充时最大的填充范围

In [28]:
df.fillna(value=0)  # replace every missing value with 0

Unnamed: 0,0,1,2
0,-1.07867,0.0,0.0
1,-1.315517,0.0,0.0
2,0.591444,0.0,-0.018877
3,0.18984,0.0,1.860634
4,-0.701338,-0.254066,-0.323945
5,-0.520631,-0.496564,-1.577097
6,-0.727681,0.64429,0.20089


In [29]:
# Per-column fill values keyed by column label: 0.5 for column 1, 0 for column 2.
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-1.07867,0.5,0.0
1,-1.315517,0.5,0.0
2,0.591444,0.5,-0.018877
3,0.18984,0.5,1.860634
4,-0.701338,-0.254066,-0.323945
5,-0.520631,-0.496564,-1.577097
6,-0.727681,0.64429,0.20089


In [30]:
# inplace=True modifies df itself instead of producing a new DataFrame
df.fillna(0, inplace=True)

In [31]:
df  # df itself now holds the filled values

Unnamed: 0,0,1,2
0,-1.07867,0.0,0.0
1,-1.315517,0.0,0.0
2,0.591444,0.0,-0.018877
3,0.18984,0.0,1.860634
4,-0.701338,-0.254066,-0.323945
5,-0.520631,-0.496564,-1.577097
6,-0.727681,0.64429,0.20089


In [32]:
# 6x3 frame of standard-normal values. A locally seeded generator keeps the
# values identical on every Restart & Run All without touching NumPy's
# global random state.
df = DataFrame(np.random.RandomState(1).randn(6, 3))

In [33]:
df.iloc[2:, 1] = NA  # from row 2 onward, set the 2nd column (position 1) to NA

In [34]:
df.iloc[4:, 2] = NA  # from row 4 onward, set the 3rd column (position 2) to NA

In [35]:
df  # column 1 is NaN from row 2 on, column 2 is NaN from row 4 on

Unnamed: 0,0,1,2
0,0.319185,1.606035,1.404441
1,0.680935,-3.177887,-0.140141
2,-0.42004,,0.740604
3,-0.820164,,0.543204
4,0.657036,,
5,0.561043,,


In [36]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,0.319185,1.606035,1.404441
1,0.680935,-3.177887,-0.140141
2,-0.42004,-3.177887,0.740604
3,-0.820164,-3.177887,0.543204
4,0.657036,-3.177887,0.543204
5,0.561043,-3.177887,0.543204


In [37]:
# Forward fill, but fill at most 2 consecutive NaNs in each column.
# ffill(limit=...) replaces fillna(method='ffill', limit=...), which is
# deprecated since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.319185,1.606035,1.404441
1,0.680935,-3.177887,-0.140141
2,-0.42004,-3.177887,0.740604
3,-0.820164,-3.177887,0.543204
4,0.657036,,0.543204
5,0.561043,,0.543204


In [38]:
# Float Series with missing values at positions 1 and 3.
data = Series(
    data=[1., NA, 3.5, NA, 7],
)

In [39]:
data  # display the Series; positions 1 and 3 are NaN

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [40]:
data.fillna(value=data.mean())  # fill the missing values with the mean of the non-missing ones

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64