In [48]:
# References:
# pandas docs (free, online):  https://pandas.pydata.org/pandas-docs/stable/
#                              https://pandas.pydata.org/pandas-docs/stable/missing_data.html
# Python Data Science Handbook, older edition (free to read): https://www.safaribooksonline.com/library/view/python-data-science/9781491912126/
# Python Data Science Handbook, newer edition (paid):         https://smile.amazon.com/Python-Data-Science-Handbook-Essential/dp/1491912057/

In [49]:
# pandas uses NaN (np.nan) to denote missing values (NaT for datetime dtypes)

In [3]:
import numpy as np
import pandas as pd

In [4]:
##### missing data comparison #####

In [5]:
# Eight labelled values; the None and the np.nan entries both end up as NaN
ser = pd.Series(
    [1, 2, 3, None, 5, 6, np.nan, 8],
    index=list('abcdefgh'),
)
ser

a    1.0
b    2.0
c    3.0
d    NaN
e    5.0
f    6.0
g    NaN
h    8.0
dtype: float64

In [17]:
# note: NaN never compares equal to anything, np.nan itself included,
# so == and != don't work for finding missing values (every element below is False)
ser == np.nan

a    False
b    False
c    False
d    False
e    False
f    False
g    False
h    False
dtype: bool

In [18]:
# != is no help either: every element, the NaN ones included, comes back True
ser != np.nan

a    True
b    True
c    True
d    True
e    True
f    True
g    True
h    True
dtype: bool

In [20]:
# instead use pd.isna and pd.notna, which return a boolean mask (True marks a missing value here)
pd.isna(ser) # pd.isnull(ser) is an equivalent spelling

a    False
b    False
c    False
d     True
e    False
f    False
g     True
h    False
dtype: bool

In [21]:
# pd.notna is the complement of pd.isna: True wherever a value is present
pd.notna(ser)

a     True
b     True
c     True
d    False
e     True
f     True
g    False
h     True
dtype: bool

In [25]:
##### missing data propagation #####
# np.nan propagates through arithmetic operations
# (it is like a virus: it infects anything it comes in contact with)

In [27]:
# ser1 has gaps at 'd' and 'g'; ser2 is complete (10, 20, ..., 80)
ser1 = pd.Series(
    [1, 2, 3, np.nan, 5, 6, np.nan, 8],
    index=list('abcdefgh'),
)
ser2 = pd.Series(
    [10 * step for step in range(1, 9)],
    index=list('abcdefgh'),
)
# element-wise sum: any position that is NaN in either operand is NaN in the result
ser1 + ser2

a    11.0
b    22.0
c    33.0
d     NaN
e    55.0
f    66.0
g     NaN
h    88.0
dtype: float64

In [32]:
# Two 2x5 frames with identical labels; df1 has one gap per row, df2 has none
df1 = pd.DataFrame.from_dict(
    {
        'X': [1, 2, 3, np.nan, 5],
        'Y': [6, np.nan, 8, 9, 10],
    },
    orient='index',
    columns=list('abcde'),
)
df2 = pd.DataFrame.from_dict(
    {
        'X': [10, 20, 30, 40, 50],
        'Y': [60, 70, 80, 90, 100],
    },
    orient='index',
    columns=list('abcde'),
)
print(df1)
print(df2)
# the sum is NaN wherever df1 is NaN
df1 + df2

   a    b  c    d   e
X  1  2.0  3  NaN   5
Y  6  NaN  8  9.0  10
    a   b   c   d    e
X  10  20  30  40   50
Y  60  70  80  90  100


Unnamed: 0,a,b,c,d,e
X,11,22.0,33,,55
Y,66,,88,99.0,110


In [33]:
##### dropping missing data #####
# df.dropna(axis=0/1, how='any'/'all', thresh=None/number, subset=None/list, inplace=False)
# note: how and thresh are mutually exclusive - pass one or the other, not both

In [45]:
# Sample frame for dropna: X misses one value, Y two, Z everything, W nothing
df = pd.DataFrame.from_dict(
    {
        'X': [1, 2, 3, np.nan, 5],
        'Y': [6, np.nan, 8, np.nan, 10],
        'Z': [np.nan] * 5,
        'W': [60, 70, 80, 90, 100],
    },
    orient='index',
    columns=list('abcde'),
)
df

Unnamed: 0,a,b,c,d,e
X,1.0,2.0,3.0,,5.0
Y,6.0,,8.0,,10.0
Z,,,,,
W,60.0,70.0,80.0,90.0,100.0


In [48]:
# drop only the rows in which every value is missing
df.dropna(axis='index', how='all')

Unnamed: 0,a,b,c,d,e
X,1.0,2.0,3.0,,5.0
Y,6.0,,8.0,,10.0
W,60.0,70.0,80.0,90.0,100.0


In [49]:
# drop every row that contains at least one missing value
df.dropna(axis='index', how='any')

Unnamed: 0,a,b,c,d,e
W,60.0,70.0,80.0,90.0,100.0


In [50]:
# keep rows that have at least 4 non-NA values
# (thresh is passed on its own: how and thresh are mutually exclusive,
# and pandas 2.x raises TypeError when both are supplied)
df.dropna(axis=0, thresh=4)

Unnamed: 0,a,b,c,d,e
X,1.0,2.0,3.0,,5.0
W,60.0,70.0,80.0,90.0,100.0


In [53]:
# keep columns that have at least 3 non-NA values
# (thresh is passed on its own: how and thresh are mutually exclusive,
# and pandas 2.x raises TypeError when both are supplied)
df.dropna(axis=1, thresh=3)

Unnamed: 0,a,c,e
X,1.0,3.0,5.0
Y,6.0,8.0,10.0
Z,,,
W,60.0,80.0,100.0


In [54]:
# df itself is still intact: dropna returns a new frame unless inplace=True is passed
df

Unnamed: 0,a,b,c,d,e
X,1.0,2.0,3.0,,5.0
Y,6.0,,8.0,,10.0
Z,,,,,
W,60.0,70.0,80.0,90.0,100.0


In [55]:
# keep columns that have at least 3 non-NA values - and do this in place
# (this mutates df and the call returns None, so the cell shows no output;
# how is left out because how and thresh are mutually exclusive and
# pandas 2.x raises TypeError when both are supplied)
df.dropna(axis=1, thresh=3, inplace=True)

In [56]:
# after the inplace dropna, df itself has lost columns b and d
df

Unnamed: 0,a,c,e
X,1.0,3.0,5.0
Y,6.0,8.0,10.0
Z,,,
W,60.0,80.0,100.0


In [65]:
##### filling missing data #####
# df.fillna(value=None, axis=None, inplace=False,...)

In [66]:
# Sample frame for fillna: column a is complete, every other column has gaps
df = pd.DataFrame.from_dict(
    {
        'X': [1, 2, 3, np.nan, 5],
        'Y': [6, np.nan, 8, np.nan, 10],
        'Z': [10] + [np.nan] * 4,
        'W': [60, 70, 80, 90, 100],
    },
    orient='index',
    columns=list('abcde'),
)
df

Unnamed: 0,a,b,c,d,e
X,1,2.0,3.0,,5.0
Y,6,,8.0,,10.0
Z,10,,,,
W,60,70.0,80.0,90.0,100.0


In [82]:
# replace every missing value with a sentinel string
df.fillna('MISSING')

Unnamed: 0,a,b,c,d,e
X,1,2,3,MISSING,5
Y,6,MISSING,8,MISSING,10
Z,10,MISSING,MISSING,MISSING,MISSING
W,60,70,80,90,100


In [83]:
# replace every missing value with zero
df.fillna(0)

Unnamed: 0,a,b,c,d,e
X,1,2.0,3.0,0.0,5.0
Y,6,0.0,8.0,0.0,10.0
Z,10,0.0,0.0,0.0,0.0
W,60,70.0,80.0,90.0,100.0


In [84]:
# fill the gaps in each column with that column's own mean
# (df.mean() is a Series keyed by column label, and fillna matches it up per column)
df.fillna(df.mean())

Unnamed: 0,a,b,c,d,e
X,1,2.0,3.0,90.0,5.0
Y,6,36.0,8.0,90.0,10.0
Z,10,36.0,30.333333,90.0,38.333333
W,60,70.0,80.0,90.0,100.0


In [85]:
# fill only columns b and c with their own means; the full dataframe is returned
# and the gaps in the other columns are left as they are
df.fillna(df[['b', 'c']].mean())

Unnamed: 0,a,b,c,d,e
X,1,2.0,3.0,,5.0
Y,6,36.0,8.0,,10.0
Z,10,36.0,30.333333,,
W,60,70.0,80.0,90.0,100.0


In [86]:
# work on column b alone: fill its gaps with its mean and get just that column back as a Series
df['b'].fillna(df['b'].mean())

X     2.0
Y    36.0
Z    36.0
W    70.0
Name: b, dtype: float64