Missing Data
    - An empty value, the number 0, or an invalid value (such as a string where a number is expected) may all represent missing data. Python treats empty and zero values as "falsy".

In [6]:
import numpy as np
import pandas as pd 

In [1]:
# Python objects that evaluate to False in a boolean context ("falsy"):

falsy_values = (0, False, None, "", [], {})

In [2]:
any(falsy_values)

# any() returns True if at least one element is truthy; every element here is falsy, so the result is False

False

In [8]:
# numpy has a special "nullable" value for numbers which is np.nan
# NaN: "Not a number"

np.nan

nan

In [10]:
# everything np.nan touches becomes np.nan:

3 + np.nan

nan

In [11]:
a = np.array([1, 2, 3, np.nan, np.nan, 4])

In [12]:
a.sum() 

# will be nan because everything np.nan touches becomes np.nan

nan

In [13]:
a.mean()

# same applies

nan

In [None]:
# Arithmetic with None raises TypeError (int + NoneType is unsupported),
# so np.nan is a better marker for missing numeric data.
# The error is caught and shown so the notebook still survives Restart & Run All.
try:
    3 + None
except TypeError as exc:
    none_add_error = f"{type(exc).__name__}: {exc}"
    print(none_add_error)

In [16]:
# With a float dtype, NumPy converts None to np.nan while building the array:

a = np.array([1, 2, 3, np.nan, None, 4], dtype=np.float64)

In [17]:
a

array([ 1.,  2.,  3., nan, nan,  4.])

In [18]:
#  "Infinite" type which also behaves as a virus

np.inf

inf

In [19]:
3 + np.inf

inf

In [20]:
np.inf / 3


inf

In [22]:
b = np.array([1, 2, 3, np.inf, np.nan, 4], dtype=float)

In [23]:
b.sum()

# will be nan: nan propagates through every operation (even inf + nan is nan); without the nan the sum would be inf


nan

Checking for NaN or inf
    - 2 functions that help:
      - np.isnan
      - np.isinf 
      - both also take arrays as inputs, and return boolean arrays as result

In [24]:
np.isnan(np.nan)

# returns true if nan

True

In [25]:
np.isinf(np.inf)

# returns true if inf

True

In [26]:
# np.isfinite checks both at once: it returns False for nan and for +/-inf

np.isfinite(np.nan), np.isfinite(np.inf)


(False, False)

In [27]:
np.isnan(np.array([1, 2, 3, np.nan, np.inf, 4]))

# ex of isnan taking array as input

array([False, False, False,  True, False, False])

In [28]:
np.isinf(np.array([1, 2, 3, np.nan, np.inf, 4]))

array([False, False, False, False,  True, False])

In [29]:
np.isfinite(np.array([1, 2, 3, np.nan, np.inf, 4]))


# note: it's more common to find nan values than inf

array([ True,  True,  True, False, False,  True])

Filtering out missing values before proceeding 
    - Avoid nan propagation/virus 
    - Use combo of np.isnan and bool arrays for filtering 

In [30]:
a = np.array([1, 2, 3, np.nan, np.nan, 4])

In [31]:
a[~np.isnan(a)]     # filter out nan 

# equivalent to a[np.isfinite(a)] here only because a contains no inf; np.isfinite would also drop +/-inf

array([1., 2., 3., 4.])

In [34]:
a[np.isfinite(a)]

array([1., 2., 3., 4.])

In [32]:
# now all operations can be performed
a[np.isfinite(a)].sum()

10.0

In [33]:
a[np.isfinite(a)].mean()

2.5