# Handling missing data with pandas

In [2]:
import numpy as np
import pandas as pd

In [4]:
# NaN is reported as a missing value.
pd.isnull(np.nan)

True

In [5]:
# None is reported as missing as well.
pd.isnull(None)

True

In [6]:
# pd.isna gives the same answer for NaN as pd.isnull did.
pd.isna(np.nan)

True

In [7]:
# ...and the same answer for None.
pd.isna(None)

True

In [8]:
# notnull is the opposite check: a missing value yields False.
pd.notnull(None)

False

In [9]:
# NaN is missing, so notnull is False here too.
pd.notnull(np.nan)

False

In [10]:
# An ordinary number is not missing.
pd.notnull(3)

True

In [11]:
# On a Series, isnull returns an element-wise boolean mask.
pd.isnull(pd.Series([1, np.nan, 7]))

0    False
1     True
2    False
dtype: bool

In [12]:
# On a DataFrame, the mask has the same rows and columns as the input.
pd.isnull(pd.DataFrame({'A':[1, np.nan, 7],
                        'B':[np.nan, 2, 3],
                        'C':[np.nan, 5, np.nan]}))

Unnamed: 0,A,B,C
0,False,True,True
1,True,False,False
2,False,False,True


In [13]:
# Example series: six entries, with NaN at positions 3 and 4.
s = pd.Series([1, 2, 3, np.nan, np.nan, 5])

In [14]:
# Boolean mask: True wherever s holds an actual value.
pd.notnull(s)

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [15]:
# Summing the mask counts the True entries, i.e. the non-missing values.
pd.notnull(s).sum()

4

In [16]:
# Boolean indexing keeps only the entries where the mask is True.
s[pd.notnull(s)]

0    1.0
1    2.0
2    3.0
5    5.0
dtype: float64

## Dropping null values

In [17]:
# dropna() gives the same result as the boolean filter above in one call.
s.dropna()

0    1.0
1    2.0
2    3.0
5    5.0
dtype: float64

In [18]:
# s still contains its NaNs: dropna() returned a new Series instead of changing s.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    5.0
dtype: float64

In [22]:
# Example frame: four columns over four rows; only row 3 has no missing value.
df = pd.DataFrame(
    {
        'A': [1, np.nan, 7, 34],
        'B': [np.nan, 2, 3, 28],
        'C': [np.nan, 5, np.nan, 10],
        'D': [5, 8, np.nan, 110],
    }
)

In [23]:
# Display the frame; missing entries appear as empty/NaN cells.
df

Unnamed: 0,A,B,C,D
0,1.0,,,5.0
1,,2.0,5.0,8.0
2,7.0,3.0,,
3,34.0,28.0,10.0,110.0


In [24]:
# The "Non-Null Count" column shows how many values each column actually holds.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       3 non-null      float64
 1   B       3 non-null      float64
 2   C       2 non-null      float64
 3   D       3 non-null      float64
dtypes: float64(4)
memory usage: 256.0 bytes


In [25]:
# Method form of the same check: a boolean mask of the missing cells.
df.isnull()

Unnamed: 0,A,B,C,D
0,False,True,True,False
1,True,False,False,False
2,False,False,True,True
3,False,False,False,False


In [31]:
# (rows, columns)
df.shape

(4, 4)

In [32]:
# Number of missing values in each column.
df.isnull().sum()

A    1
B    1
C    2
D    1
dtype: int64

In [35]:
# With no arguments, every row containing a missing value is dropped; only row 3 survives.
df.dropna()

Unnamed: 0,A,B,C,D
3,34.0,28.0,10.0,110.0


In [36]:
# axis=1 drops columns instead of rows; every column here has a NaN, so none remain.
df.dropna(axis=1)

0
1
2
3


In [37]:
# Second example frame: row 1 is missing in every column.
df2 = pd.DataFrame(
    {
        'A': [1, np.nan, 2],
        'B': [2, np.nan, 31],
        'C': [np.nan, np.nan, 100],
    }
)

In [38]:
# Display df2; note that row 1 has no values at all.
df2

Unnamed: 0,A,B,C
0,1.0,2.0,
1,,,
2,2.0,31.0,100.0


In [39]:
# how='all' drops a row only when every value in it is missing.
# NOTE(review): df has no such row, so nothing is dropped here. df2 (built just
# above) does have one -- df2.dropna(how='all') was probably intended; confirm.
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,,,5.0
1,,2.0,5.0,8.0
2,7.0,3.0,,
3,34.0,28.0,10.0,110.0


In [43]:
# how='any' drops a row as soon as one value is missing (same result as plain dropna()).
df.dropna(how='any')

Unnamed: 0,A,B,C,D
3,34.0,28.0,10.0,110.0


In [44]:
# thresh=3 keeps only the rows that have at least 3 non-missing values.
df.dropna(thresh=3)

Unnamed: 0,A,B,C,D
1,,2.0,5.0,8.0
3,34.0,28.0,10.0,110.0


In [45]:
# Same threshold applied per column: C has only 2 values, so it is dropped.
df.dropna(thresh=3, axis='columns')

Unnamed: 0,A,B,D
0,1.0,,5.0
1,,2.0,8.0
2,7.0,3.0,
3,34.0,28.0,110.0


In [46]:
# Back to the series with two missing entries, this time to fill them rather than drop them.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    5.0
dtype: float64

In [47]:
# Replace every missing value with a constant.
s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    5.0
dtype: float64

In [48]:
# Fill with the mean of the values that are present (2.75 here).
s.fillna(s.mean())

0    1.00
1    2.00
2    3.00
3    2.75
4    2.75
5    5.00
dtype: float64

In [49]:
# fillna() did not modify s: the NaNs are still there.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    5.0
dtype: float64

In [50]:
# Forward fill: each NaN takes the last valid value that precedes it.
# .ffill() is the supported spelling; fillna(method='ffill') is deprecated in recent pandas.
s.ffill()

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    5.0
dtype: float64

In [51]:
# Backward fill: each NaN takes the next valid value that follows it.
# .bfill() is the supported spelling; fillna(method='bfill') is deprecated in recent pandas.
s.bfill()

0    1.0
1    2.0
2    3.0
3    5.0
4    5.0
5    5.0
dtype: float64

In [55]:
# Forward fill cannot fix a leading NaN: there is no earlier value to copy.
# .ffill() replaces the deprecated fillna(method='ffill').
pd.Series([np.nan, 3, np.nan, 9]).ffill()

0    NaN
1    3.0
2    3.0
3    9.0
dtype: float64

In [56]:
# Backward fill does fix the leading NaN, because a later value exists to copy.
# .bfill() replaces the deprecated fillna(method='bfill').
pd.Series([np.nan, 3, np.nan, 9]).bfill()

0    3.0
1    3.0
2    9.0
3    9.0
dtype: float64

In [57]:
# The original frame again, before filling its missing cells.
df

Unnamed: 0,A,B,C,D
0,1.0,,,5.0
1,,2.0,5.0,8.0
2,7.0,3.0,,
3,34.0,28.0,10.0,110.0


In [58]:
# A dict gives each column its own fill value; D is not listed, so its NaN is left as is.
df.fillna({'A':0, 'B':99, 'C': df['C'].mean()})

Unnamed: 0,A,B,C,D
0,1.0,99.0,7.5,5.0
1,0.0,2.0,5.0,8.0
2,7.0,3.0,7.5,
3,34.0,28.0,10.0,110.0


In [59]:
# Forward fill down each column (axis=0); a NaN in the first row has nothing above it to copy.
# .ffill() replaces the deprecated fillna(method='ffill').
df.ffill(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,,,5.0
1,1.0,2.0,5.0,8.0
2,7.0,3.0,5.0,8.0
3,34.0,28.0,10.0,110.0


In [60]:
 # Forward fill along each row (axis=1); a NaN in the first column has nothing to its left to copy.
 # .ffill() replaces the deprecated fillna(method='ffill').
 df.ffill(axis=1)

Unnamed: 0,A,B,C,D
0,1.0,1.0,1.0,5.0
1,,2.0,5.0,8.0
2,7.0,3.0,3.0,3.0
3,34.0,28.0,10.0,110.0


In [61]:
# len() counts every element, missing or not.
len(s)

6

In [62]:
# count() counts only the non-missing values.
s.count()

4

In [63]:
# count() skips NaN while len() does not, so a mismatch means s has missing values.
missing_values = len(s) != s.count()

In [64]:
# True here: s has at least one missing value.
missing_values

True

In [65]:
# any() is True as soon as one element is True.
pd.Series([True, False, False]).any()

True

In [66]:
# all() is False as soon as one element is False.
pd.Series([True, False, False]).all()

False

In [67]:
# all() is True only when every element is True.
pd.Series([True, True, True]).all()

True

In [68]:
# Boolean mask of the missing entries in s.
s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [69]:
# isnull().any() answers "is at least one value missing?" -- yes here.
pd.Series([1, np.nan]).isnull().any()

True

In [70]:
# No missing values, so the answer is False.
pd.Series([1, 2]).isnull().any()

False

In [72]:
# Same conclusion as comparing count() with len(), expressed in a single chained call.
s.isnull().any()

True