In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
# --- Handling missing data ---
# A None entry cannot be stored in a numeric array, so the array holds
# generic Python objects.
d = np.array([1, 2, 3, None], dtype=object)

In [4]:
# Summing the object array fails: Python cannot evaluate `int + None`.
# The error is the point of this cell, so catch and print it; an uncaught
# exception would stop "Restart & Run All" at this cell.
try:
    np.sum(d)
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [5]:
# Same idea with NaN as the missing-value marker: the array stays numeric.
d = np.array([1.0, np.nan, 2.0])

In [6]:
np.sum(d)  # NaN propagates: one NaN in the array makes the whole sum NaN

nan

In [7]:
np.nansum(d)  # sum of the values with NaN entries ignored

3.0

In [8]:
# Five-element Series with one missing value in the middle.
data = pd.Series([1.0, 2.0, np.nan, 4.0, 5.0])

In [9]:
data  # display the Series; the missing entry is shown as NaN

0    1.0
1    2.0
2    NaN
3    4.0
4    5.0
dtype: float64

In [10]:
# Boolean mask: True where the value is missing.
data.isna()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [11]:
# Boolean mask: True where a value is present.
data.notna()

0     True
1     True
2    False
3     True
4     True
dtype: bool

In [12]:
data.dropna()  # the Series without its NaN entry; remaining index labels are kept (0, 1, 3, 4)

0    1.0
1    2.0
3    4.0
4    5.0
dtype: float64

In [13]:
# 3x3 frame with one NaN in column 0 and one in column 2; column 1 is complete.
data = pd.DataFrame(
    [
        [1, 2, np.nan],
        [np.nan, 3, 1],
        [5, 6, 7],
    ]
)

In [14]:
data  # display the frame with its two missing cells

Unnamed: 0,0,1,2
0,1.0,2,
1,,3,1.0
2,5.0,6,7.0


In [15]:
data.dropna()  # default: drop every row that contains at least one NaN

Unnamed: 0,0,1,2
2,5.0,6,7.0


In [16]:
data.dropna(axis=1)  # drop every column that contains at least one NaN

Unnamed: 0,1
0,2
1,3
2,6


In [17]:
data.dropna(axis = 0)  # axis=0 drops rows; same result as the call without arguments

Unnamed: 0,0,1,2
2,5.0,6,7.0


In [18]:
data.dropna(axis='columns')  # same as axis=1: drops the columns that contain NaN

Unnamed: 0,1
0,2
1,3
2,6


In [20]:
# Same as axis=0: drops the rows that contain NaN.
# 'index' is the documented name for axis 0; 'rows' is only an undocumented alias.
data.dropna(axis='index')

Unnamed: 0,0,1,2
2,5.0,6,7.0


In [21]:
data  # the dropna() calls above left the frame itself unchanged

Unnamed: 0,0,1,2
0,1.0,2,
1,,3,1.0
2,5.0,6,7.0


In [22]:
data[3] = np.nan  # adds a new column labelled 3 whose values are all NaN (modifies data in place)

In [23]:
data  # column 3 now exists and is entirely NaN

Unnamed: 0,0,1,2,3
0,1.0,2,,
1,,3,1.0,
2,5.0,6,7.0,


In [24]:
data.dropna(thresh=3)  # thresh=3: keep only the rows that have at least 3 non-null values

# Rows 0 and 1 each have just 2 non-null values, so only row 2 is kept.

Unnamed: 0,0,1,2,3
2,5.0,6,7.0,


In [26]:
data.dropna(thresh=2)  # thresh=2: keep rows with at least 2 non-null values; every row qualifies here

Unnamed: 0,0,1,2,3
0,1.0,2,,
1,,3,1.0,
2,5.0,6,7.0,


In [27]:
# Series that uses both np.nan and None as missing-value markers.
data = pd.Series([1,2,3,np.nan, 4,None])

In [28]:
data  # both the np.nan and the None entry are shown as NaN

0    1.0
1    2.0
2    3.0
3    NaN
4    4.0
5    NaN
dtype: float64

In [29]:
d = data.fillna(0)  # store the result of replacing every missing value with 0

In [30]:
d  # the former NaN positions (3 and 5) now hold 0.0

0    1.0
1    2.0
2    3.0
3    0.0
4    4.0
5    0.0
dtype: float64

In [31]:
np.sum(d)  # d no longer contains NaN, so the total is an ordinary number

10.0

In [32]:
# Rebuild the 3x3 frame (NaN at row 0 / column 2 and row 1 / column 0) for the fill examples.
data = pd.DataFrame(
    [
        [1, 2, np.nan],
        [np.nan, 3, 1],
        [5, 6, 7],
    ]
)

In [33]:
data.fillna(0)  # replace every NaN in the frame with 0

Unnamed: 0,0,1,2
0,1.0,2,0.0
1,0.0,3,1.0
2,5.0,6,7.0


In [34]:
data.fillna(99)  # same idea with 99 as the fill value

Unnamed: 0,0,1,2
0,1.0,2,99.0
1,99.0,3,1.0
2,5.0,6,7.0


In [35]:
# Forward fill: each NaN takes the last valid value above it in the same
# column (axis=0 is the default). A NaN in the first row stays NaN because
# there is no earlier row to copy from.
# .ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated.
data.ffill()

Unnamed: 0,0,1,2
0,1.0,2,
1,1.0,3,1.0
2,5.0,6,7.0


In [37]:
# Forward fill along each row (axis=1): a NaN takes the value from the
# nearest valid column to its left; a NaN in the first column stays NaN.
# .ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated.
data.ffill(axis=1)

Unnamed: 0,0,1,2
0,1.0,2.0,2.0
1,,3.0,1.0
2,5.0,6.0,7.0


In [38]:
data  # the fill calls above did not modify the frame itself

Unnamed: 0,0,1,2
0,1.0,2,
1,,3,1.0
2,5.0,6,7.0


In [41]:
# Backward fill along each row (axis=1): a NaN takes the value from the
# nearest valid column to its right; a NaN in the last column stays NaN.
# .bfill() replaces fillna(method='bfill'), whose `method` keyword is deprecated.
data.bfill(axis=1)

Unnamed: 0,0,1,2
0,1.0,2.0,
1,3.0,3.0,1.0
2,5.0,6.0,7.0


In [42]:
# Series that ends with two consecutive missing values.
d = pd.Series([1.0, 2.0, np.nan, np.nan])

In [43]:
d  # display the Series; the last two entries are NaN

0    1.0
1    2.0
2    NaN
3    NaN
dtype: float64

In [44]:
# Forward fill on a Series: every NaN in a consecutive run takes the last
# valid value before it (2.0 here).
# .ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated.
d.ffill()

0    1.0
1    2.0
2    2.0
3    2.0
dtype: float64