7.1 Handling Missing Data

In [4]:
import pandas as pd
import numpy as np


# Build a small object Series containing one NaN entry and flag its missing slots.
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
missing_before = string_data.isnull()
print(string_data)
print('\n')
print(missing_before)

# Python's None is reported as missing too: overwrite the first entry and re-check.
string_data[0] = None
missing_after = string_data.isnull()
print('\n')
print(missing_after)

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object


0    False
1    False
2     True
3    False
dtype: bool


0     True
1    False
2     True
3    False
dtype: bool


In [6]:
# Filtering Out Missing Data

from numpy import nan as NA


# A float Series with missing entries at positions 1 and 3.
data = pd.Series([1, NA, 3.5, NA, 7])

# Two equivalent ways to keep only the non-missing values:
# the dropna() method, and boolean indexing with the notnull() mask.
present_mask = data.notnull()
print(data.dropna())
print('\n')
print(data[present_mask])



0    1.0
2    3.5
4    7.0
dtype: float64


0    1.0
2    3.5
4    7.0
dtype: float64


In [9]:
# Four rows: complete, partially missing, entirely missing, partially missing.
rows = [
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
]
data = pd.DataFrame(rows)

# Default dropna() discards every row that holds at least one NaN.
cleaned = data.dropna()
# how='all' only discards rows in which every value is NaN.
rows_not_all_missing = data.dropna(how='all')

print(data)
print('\n')
print(cleaned)
print('\n')
print(rows_not_all_missing)

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


     0    1    2
0  1.0  6.5  3.0


     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0


In [11]:
# Add a column labelled 4 whose values are all missing.
data[4] = NA
print('\n')
print(data)
print('\n')

# Dropping along the column axis with how='all' removes only fully-missing columns.
non_empty_columns = data.dropna(axis='columns', how='all')
print(non_empty_columns)



     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN


     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


In [14]:
# 7x3 frame of standard-normal draws from a locally seeded generator, so every
# run of this cell builds the same values and the global NumPy RNG is untouched.
df = pd.DataFrame(np.random.default_rng(12345).standard_normal((7, 3)))
# Blank out the first four rows of column 1 and the first two rows of column 2.
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
print(df)
print('\n')
# Default dropna(): only rows with no NaN at all survive.
print(df.dropna())
print('\n')
# thresh=2: keep rows that have at least two non-missing values.
print(df.dropna(thresh=2))

          0         1         2
0 -2.028763       NaN       NaN
1 -1.519579       NaN       NaN
2  1.335718       NaN  0.422142
3 -1.774236       NaN  1.076331
4  1.733116  0.957610  0.016238
5 -1.023956 -0.991103  1.137911
6  0.793807  0.261012  0.587224


          0         1         2
4  1.733116  0.957610  0.016238
5 -1.023956 -0.991103  1.137911
6  0.793807  0.261012  0.587224


          0         1         2
2  1.335718       NaN  0.422142
3 -1.774236       NaN  1.076331
4  1.733116  0.957610  0.016238
5 -1.023956 -0.991103  1.137911
6  0.793807  0.261012  0.587224


In [17]:
# Filling In Missing Data

# A scalar replaces every NaN with the same value.
print(df.fillna(0))
print('\n')
# A dict maps column label -> fill value, so each column can use its own.
print(df.fillna({1: 0.5, 2: 0}))
print('\n')

# fillna returns a new frame; bind it to its own name rather than mutating df
# with inplace=True (which returns None). This keeps the cell re-runnable with
# the same output and avoids assigning to `_`, IPython's last-output variable.
df_filled = df.fillna(0)

print(df_filled)

          0         1         2
0 -2.028763  0.000000  0.000000
1 -1.519579  0.000000  0.000000
2  1.335718  0.000000  0.422142
3 -1.774236  0.000000  1.076331
4  1.733116  0.957610  0.016238
5 -1.023956 -0.991103  1.137911
6  0.793807  0.261012  0.587224


          0         1         2
0 -2.028763  0.500000  0.000000
1 -1.519579  0.500000  0.000000
2  1.335718  0.500000  0.422142
3 -1.774236  0.500000  1.076331
4  1.733116  0.957610  0.016238
5 -1.023956 -0.991103  1.137911
6  0.793807  0.261012  0.587224


          0         1         2
0 -2.028763  0.000000  0.000000
1 -1.519579  0.000000  0.000000
2  1.335718  0.000000  0.422142
3 -1.774236  0.000000  1.076331
4  1.733116  0.957610  0.016238
5 -1.023956 -0.991103  1.137911
6  0.793807  0.261012  0.587224


In [23]:
# 6x3 frame of standard-normal draws from a locally seeded generator, so every
# run of this cell builds the same values and the global NumPy RNG is untouched.
df = pd.DataFrame(np.random.default_rng(54321).standard_normal((6, 3)))
# Leave trailing gaps: column 1 is missing from row 2 on, column 2 from row 4 on.
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
print(df)
print('\n')
# Forward fill: each gap takes the last valid value above it in the same column.
print(df.ffill())
print('\n')
# limit=2 caps the fill at two consecutive NaNs per gap; the rest stay missing.
print(df.ffill(limit=2))


          0         1         2
0 -0.567748  0.384288  0.453660
1  0.232388 -0.070936  0.407820
2  0.622049       NaN  1.659861
3  1.029840       NaN -0.219562
4  0.012250       NaN       NaN
5  0.053815       NaN       NaN


          0         1         2
0 -0.567748  0.384288  0.453660
1  0.232388 -0.070936  0.407820
2  0.622049 -0.070936  1.659861
3  1.029840 -0.070936 -0.219562
4  0.012250 -0.070936 -0.219562
5  0.053815 -0.070936 -0.219562


          0         1         2
0 -0.567748  0.384288  0.453660
1  0.232388 -0.070936  0.407820
2  0.622049 -0.070936  1.659861
3  1.029840 -0.070936 -0.219562
4  0.012250       NaN -0.219562
5  0.053815       NaN -0.219562


In [24]:
# Replace each missing entry with the mean of the observed values
# (Series.mean() skips NaN by default).
data = pd.Series([1., NA, 3.5, NA, 7])
observed_mean = data.mean()
print(data.fillna(observed_mean))

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64
