In [1]:
import numpy as np
import pandas as pd
from numpy import nan as NA

In [2]:
# Series of strings with a single missing entry (NaN) at position 2.
string_data = pd.Series(
    ["aardvark", "artichoke", np.nan, "avocado"],
)
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
# Boolean mask: True wherever the entry is missing.
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

### The built-in Python None value is also treated as NA in object arrays:

In [4]:
# Overwrite the first entry with None, then recompute the missing-value mask.
string_data[0] = None
string_data.isna()

0     True
1    False
2     True
3    False
dtype: bool

### Filtering Out Missing Data

__SERIES__

In [5]:
# NA is numpy's nan; it is imported together with the other imports in the
# notebook's first cell rather than in the middle of the narrative.
data = pd.Series([1, NA, 3.5, NA, 7])
# dropna() returns a new Series holding only the non-missing entries.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
# Boolean-mask form of the dropna() call above: keep only the non-null entries.
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

__DATAFRAME__

In [8]:
# 4x3 frame mixing a complete row, partially missing rows and an all-NA row.
data = pd.DataFrame([
    [1.0, 6.5, 3.0],
    [1.0, NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.0],
])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [9]:
# With its defaults spelled out, dropna removes every row (axis=0) that
# contains at least one missing value (how='any'):
cleaned = data.dropna(axis=0, how="any")
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [10]:
# how='all' is stricter about what gets dropped: only rows in which every
# value is NA are removed.
data.dropna(axis=0, how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [11]:
# Add a new column labelled 4 whose values are all NA (modifies `data` in place).
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [12]:
# The same rule applied column-wise: axis='columns' (equivalently axis=1)
# drops every column that consists solely of NA values.
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


A related way to filter out DataFrame rows tends to concern time series data. Suppose
you want to keep only rows containing at least a certain number of non-missing
observations. You can indicate this with the `thresh` argument:

In [13]:
# Draw the sample from a seeded local generator so that Restart & Run All
# rebuilds exactly the same 7x3 frame (the unseeded np.random.randn call
# produced different values on every run).
# NOTE: the tables displayed in this section were rendered before the seed was
# added; they will be replaced by the seeded values on the next execution.
rng = np.random.default_rng(12345)  # arbitrary fixed seed
df = pd.DataFrame(rng.standard_normal((7, 3)))
df

Unnamed: 0,0,1,2
0,0.730074,0.0176,-0.734177
1,1.243056,0.434537,-0.432813
2,1.15909,0.468565,0.007352
3,0.096538,2.541599,-1.120454
4,0.18886,-1.089404,1.135245
5,-0.33954,-0.162134,0.721704
6,1.947455,0.688426,1.773547


In [15]:
# Blank out the first four rows of column 1 and the first two rows of column 2
# (positional indexing, so the stop bound is exclusive).
df.iloc[0:4, 1] = NA
df.iloc[0:2, 2] = NA
df

Unnamed: 0,0,1,2
0,0.730074,,
1,1.243056,,
2,1.15909,,0.007352
3,0.096538,,-1.120454
4,0.18886,-1.089404,1.135245
5,-0.33954,-0.162134,0.721704
6,1.947455,0.688426,1.773547


In [16]:
# Defaults spelled out: drop every row that has at least one missing value.
df.dropna(axis=0, how="any")

Unnamed: 0,0,1,2
4,0.18886,-1.089404,1.135245
5,-0.33954,-0.162134,0.721704
6,1.947455,0.688426,1.773547


In [17]:
# Keep only the rows that have at least two non-missing values.
df.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2
2,1.15909,,0.007352
3,0.096538,,-1.120454
4,0.18886,-1.089404,1.135245
5,-0.33954,-0.162134,0.721704
6,1.947455,0.688426,1.773547


### Filling In Missing Data

Rather than filtering out missing data (and potentially discarding other data along
with it), you may want to fill in the “holes” in any number of ways. For most
purposes, the `fillna` method is the workhorse function to use. Calling `fillna` with a
constant replaces missing values with that value:

In [20]:
# Returns a new frame in which every missing value is replaced by 0.
df.fillna(value=0)

Unnamed: 0,0,1,2
0,0.730074,0.0,0.0
1,1.243056,0.0,0.0
2,1.15909,0.0,0.007352
3,0.096538,0.0,-1.120454
4,0.18886,-1.089404,1.135245
5,-0.33954,-0.162134,0.721704
6,1.947455,0.688426,1.773547


In [21]:
# A dict maps column label -> fill value, so each column can get its own
# constant (0.5 for column 1, 0 for column 2):
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.730074,0.5,0.0
1,1.243056,0.5,0.0
2,1.15909,0.5,0.007352
3,0.096538,0.5,-1.120454
4,0.18886,-1.089404,1.135245
5,-0.33954,-0.162134,0.721704
6,1.947455,0.688426,1.773547


In [23]:
# fillna returns a new object, but you can modify the existing object in-place.
# The call is not the cell's last expression, so nothing needs to be captured
# to suppress output; binding the result to `_` would only shadow IPython's
# last-output variable.
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.730074,0.0,0.0
1,1.243056,0.0,0.0
2,1.15909,0.0,0.007352
3,0.096538,0.0,-1.120454
4,0.18886,-1.089404,1.135245
5,-0.33954,-0.162134,0.721704
6,1.947455,0.688426,1.773547


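__The same interpolation methods available for reindexing, such as forward fill, can be used to fill missing data (note: `fillna(method=...)` is deprecated since pandas 2.1 in favor of the dedicated `ffill`/`bfill` methods):__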

In [25]:
# Seeded local generator so the frame is identical on every Restart & Run All
# (the unseeded np.random.randn call changed the values on each run).
# NOTE: the tables displayed below were rendered before the seed was added;
# they will be replaced by the seeded values on the next execution.
rng = np.random.default_rng(54321)  # arbitrary fixed seed
df = pd.DataFrame(rng.standard_normal((6, 3)))
# Column 1 is missing from row 2 onward, column 2 from row 4 onward.
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,1.160938,-0.277262,0.447775
1,1.419639,0.687655,0.189035
2,-0.212163,,-1.808403
3,-0.713295,,0.689244
4,0.450059,,
5,0.219115,,


In [26]:
# Forward fill: propagate the last valid observation down each column.
# Uses the dedicated ffill() method because fillna(method='ffill') is
# deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,1.160938,-0.277262,0.447775
1,1.419639,0.687655,0.189035
2,-0.212163,0.687655,-1.808403
3,-0.713295,0.687655,0.689244
4,0.450059,0.687655,0.689244
5,0.219115,0.687655,0.689244


In [27]:
# Forward fill, but fill at most two consecutive missing values per gap;
# anything beyond that stays NA. ffill(limit=...) replaces the deprecated
# fillna(method='ffill', limit=...) spelling.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,1.160938,-0.277262,0.447775
1,1.419639,0.687655,0.189035
2,-0.212163,0.687655,-1.808403
3,-0.713295,0.687655,0.689244
4,0.450059,,0.689244
5,0.219115,,0.689244


__With `fillna` you can also pass a computed statistic, such as the mean or median value of a Series:__

In [28]:
# Fill every gap with the mean of the observed values (mean() skips the NaNs).
data = pd.Series([1.0, NA, 3.5, NA, 7.0])
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64