In [1]:
import pandas as pd
import numpy as np

# Representing Missing Values
Language | Method | Pros | Why pandas doesn't use it
--- | --- | --- | ---
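R | Reserves a bit pattern in each data type as the missing-value sentinel | Workable because R has only 4 basic data types | pandas supports at least 14 data types; reserving a sentinel bit pattern in every one of them would sacrifice too much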
SciDB | Storage of masked array | no need to sacrifice | storage overhead
Numpy | Masked array | no need to sacrifice | storage overhead

1. Pandas uses   
    Python's `None` object
    > - Cannot use pandas' fast vectorized computation, because the whole column is cast to `object` dtype
    > - Arithmetic operations on `None` are undefined (e.g. `1 + None` raises a `TypeError`)
    
    IEEE floating point representation `NaN`
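    > - `NaN` is always a floating-point value, so an integer column that contains it is upcast to a floating-point dtype (`float64`)
    > - Ordinary arithmetic and aggregations propagate `NaN` (the result is usually `np.nan`); use the NaN-aware variants such as `np.nansum()`, `np.nanmin()`, `np.nanmax()` to ignore missing values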
     
2. Useful operations
- `isnull()` or `notnull()`
- `dropna()`: by default drops every row (`axis=0`) that contains at least one null value
- `fillna()`

In [5]:
# Demo frame for the missing-value examples: 3 rows x 3 columns.
# Columns 0 and 1 contain NaN; column 2 is fully populated.
df = pd.DataFrame(
    data=[
        [1, np.nan, 2],
        [2, np.nan, 5],
        [np.nan, 4, 6],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,,5
2,,4.0,6


In [7]:
# `how` ('any' or 'all') and `thresh` are alternative drop criteria and must not be
# passed together: pandas >= 2.0 raises a TypeError when both are set (older versions
# let `thresh` take precedence, so `how` had no effect here anyway).
# `thresh=2` keeps only the columns (axis=1) holding at least 2 non-null values,
# so column 1 (a single non-null value) is dropped.
df.dropna(axis=1, thresh=2)

Unnamed: 0,0,2
0,1.0,2
1,2.0,5
2,,6


In [8]:
# Labelled series with two kinds of missing marker (np.nan and None);
# pandas stores both as NaN and the series ends up with a float dtype.
data = pd.Series(
    data=[1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [9]:
# Replace every missing entry with a constant; returns a new Series, `data` is unchanged.
data.fillna(value=0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [10]:
# Forward-fill: each missing value takes the last valid value that precedes it.
# `fillna(method=...)` is deprecated since pandas 2.1; `ffill()` is the direct equivalent.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [12]:
# Back-fill along each row (axis=1): a missing value takes the next valid value to its right.
# `fillna(method=...)` is deprecated since pandas 2.1; `bfill()` is the direct equivalent.
df.bfill(axis=1)

Unnamed: 0,0,1,2
0,1.0,2.0,2.0
1,2.0,5.0,5.0
2,4.0,4.0,6.0
