# Chapter 7

# Working with Missing Data

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Series mixing real strings with both missing-value sentinels
# (np.nan and None) so the two can be compared side by side.
string_data = pd.Series(
    data=["aardvark", np.nan, None, "avocado"],
)

In [6]:
# Display the Series: np.nan is shown as NaN while None is kept as None.
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [7]:
# Boolean mask of missing entries; both np.nan and None are flagged True.
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [8]:
# Float Series whose last entry is given as None; the explicit float64
# dtype forces a numeric representation of the missing value.
float_data = pd.Series(
    data=[1, 2, None],
    dtype="float64",
)

In [9]:
# Display the Series: the integers appear as floats and None appears as NaN.
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [10]:
# Boolean mask of missing entries; only the last position (originally None) is True.
float_data.isna()

0    False
1    False
2     True
dtype: bool

## Filter Out Missing Data

In [11]:
# Numeric Series with missing values at positions 1 and 3.
data = pd.Series(
    data=[1, np.nan, 3.5, np.nan, 7],
)

In [12]:
# Drop the missing entries; the surviving values keep their original index labels.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
# Same result as dropna(), written as an explicit boolean-mask selection.
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
# 4x3 frame with one complete row, one partially missing row, one entirely
# missing row, and one row missing only its first value.
# Note: this rebinds `data`, which held a Series in the earlier cells.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)

In [15]:
# Display the frame; empty cells in the output are the NaN entries.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
# Default dropna(): removes every row that contains at least one NaN,
# so only the fully populated row 0 remains.
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [17]:
# how="all": remove only rows in which every value is NaN (row 2 here).
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


The calls above return new objects and leave `data` unchanged. To drop columns instead of rows, pass `axis="columns"`.

In [18]:
# Add a new column labelled 4 filled entirely with NaN (existing labels are 0, 1, 2).
# This mutates `data` in place, so re-running earlier display cells will show the extra column.
data[4] = np.nan

In [19]:
# axis="columns" applies the rule to columns instead of rows; with how="all"
# only the all-NaN column 4 is removed.
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Note: this is a contrived example. It adds a column (labelled 4) that contains only NaN values and then immediately drops it again.

In [20]:
# 7x3 frame of standard-normal draws.
# NOTE(review): no seed is set in the visible cells, so these values (and the
# outputs shown below) will differ on every fresh run.
df = pd.DataFrame(data=np.random.standard_normal(size=(7, 3)))

In [21]:
# Blank out the first four rows (positions 0-3) of column 1; mutates `df` in place.
df.iloc[:4, 1] = np.nan

In [22]:
# Blank out the first two rows (positions 0-1) of column 2; mutates `df` in place.
df.iloc[:2, 2] = np.nan

In [23]:
# Display the frame: column 1 is NaN in rows 0-3 and column 2 is NaN in rows 0-1.
df

Unnamed: 0,0,1,2
0,-0.704829,,
1,-0.869488,,
2,-1.012902,,0.793551
3,-0.911104,,-0.710205
4,-0.037348,-1.505042,-0.480043
5,-0.225778,-0.720147,0.477771
6,-0.065548,1.448754,2.107412


In [24]:
# Drop every row containing any NaN; only the complete rows 4-6 remain.
df.dropna()

Unnamed: 0,0,1,2
4,-0.037348,-1.505042,-0.480043
5,-0.225778,-0.720147,0.477771
6,-0.065548,1.448754,2.107412


In [25]:
# thresh=2: keep rows with at least two non-NaN values, so rows 0 and 1
# (one observed value each) are dropped while rows 2 and 3 survive.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.012902,,0.793551
3,-0.911104,,-0.710205
4,-0.037348,-1.505042,-0.480043
5,-0.225778,-0.720147,0.477771
6,-0.065548,1.448754,2.107412


In [26]:
# Replace every NaN with 0. The result is a new frame; `df` itself still holds the NaNs.
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.704829,0.0,0.0
1,-0.869488,0.0,0.0
2,-1.012902,0.0,0.793551
3,-0.911104,0.0,-0.710205
4,-0.037348,-1.505042,-0.480043
5,-0.225778,-0.720147,0.477771
6,-0.065548,1.448754,2.107412


In [27]:
# Per-column fill values via a dict keyed by column label:
# NaNs in column 1 become 0.5, NaNs in column 2 become 0.
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.704829,0.5,0.0
1,-0.869488,0.5,0.0
2,-1.012902,0.5,0.793551
3,-0.911104,0.5,-0.710205
4,-0.037348,-1.505042,-0.480043
5,-0.225778,-0.720147,0.477771
6,-0.065548,1.448754,2.107412
