Handling Missing Data with Pandas

In [2]:
import numpy as np
import pandas as pd 

In [3]:
# pandas utility functions
# pandas ships top-level helpers to identify and detect null values.

pd.isnull(np.nan)  # True: np.nan is a null value

True

In [4]:
pd.isnull(None)  # None also counts as a null value

True

In [6]:
pd.isna(np.nan)   # pd.isna gives the same result as pd.isnull: True for NaN

True

In [7]:
pd.isna(None)  # None is reported as missing as well

True

In [8]:
pd.notnull(None)  # inverse check: False because None is null

False

In [9]:
pd.notnull(np.nan)  # NaN is null, so the "not null" check is False

False

In [11]:
pd.notna(np.nan)  # pd.notna agrees with pd.notnull: False for NaN

False

This also works with Series and DataFrames

In [12]:
# Applied to a Series, pd.isnull checks element by element and returns a
# boolean Series (True where the element is null).
pd.isnull(pd.Series([1, np.nan, 7]))

0    False
1     True
2    False
dtype: bool

In [14]:
# pd.notnull is the element-wise inverse: True for the Series elements
# that are not null.
pd.notnull(pd.Series([1, np.nan, 7]))

0     True
1    False
2     True
dtype: bool

In [15]:
# On a DataFrame, pd.isnull checks every cell and returns a boolean DataFrame
# with the same rows and columns (True where the cell is null).
pd.isnull(pd.DataFrame({
    'Column A': [1, np.nan, 7],
    'Column B': [np.nan, 2, 3],
    'Column C': [np.nan, 2, np.nan]
}))

Unnamed: 0,Column A,Column B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


Pandas Operations with Missing Values
    - pandas manages missing values more gracefully than NumPy
    - NaNs no longer behave as "viruses" that turn every result into NaN
    - aggregations such as count, sum and mean simply skip them

In [16]:
# count() returns the number of non-null values, so the NaN is excluded
# and the result is 2 rather than 3.
pd.Series([1, 2, np.nan]).count()

2

In [17]:
pd.Series([1, 2, np.nan]).sum()  # the NaN is skipped: 1 + 2

3.0

In [18]:
pd.Series([2, 2, np.nan]).mean()  # mean of the two non-null values only

2.0

Filtering Missing Data
    - combine boolean selection with pd.notnull (the inverse of pd.isnull) to filter out NaN and null values

In [19]:
# Sample Series used below: four numbers with two consecutive NaNs.
s = pd.Series(
    data=[1, 2, 3, np.nan, np.nan, 4],
)

In [20]:
pd.notnull(s)  # boolean mask: True where s holds a value

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [21]:
pd.isnull(s)  # boolean mask: True where s is null

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [22]:
pd.notnull(s).sum()  # True counts as 1, so this is the number of non-null values

4

In [23]:
pd.isnull(s).sum()  # number of null values in s

2

In [24]:
s[pd.notnull(s)]  # boolean selection: keep only the entries where the mask is True

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [25]:
# notnull and isnull are also available as methods of Series and DataFrames:

s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [26]:
s.notnull()  # method form of pd.notnull(s)

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [27]:
s[s.notnull()]  # same filter as s[pd.notnull(s)], written with the method

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

Dropping Null Values
    - using dropna method

In [28]:
s  # the filtering above did not modify s: both NaNs are still there

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [29]:
# dropna() removes the null entries; the surviving entries keep their
# original index labels (0, 1, 2, 5).
s.dropna()

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

Dropping null values on DataFrames 
    - Can't drop single values: dropna removes entire rows or columns

In [30]:
# Sample frame: Columns A-C contain NaNs, Column D is complete.
df = pd.DataFrame(
    data={
        'Column A': [1, np.nan, 30, np.nan],
        'Column B': [2, 8, 31, np.nan],
        'Column C': [np.nan, 9, 32, 100],
        'Column D': [5, 8, 34, 110],
    }
)

In [31]:
df  # display the frame; empty cells in the output are NaN

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [32]:
# shape is (rows, columns): 4 rows by 4 columns
df.shape


(4, 4)

In [33]:
df.info()  # prints the non-null count and dtype of every column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column A  2 non-null      float64
 1   Column B  3 non-null      float64
 2   Column C  3 non-null      float64
 3   Column D  4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 260.0 bytes


In [34]:
df.isnull()  # cell-by-cell boolean mask: True where the value is null

Unnamed: 0,Column A,Column B,Column C,Column D
0,False,False,True,False
1,True,False,False,False
2,False,False,False,False
3,True,True,False,False


In [35]:
# Summing the boolean mask column by column gives the null count per column:
# Column A has 2 nulls, Column B has 1, Column C has 1, Column D has none.
df.isnull().sum()

Column A    2
Column B    1
Column C    1
Column D    0
dtype: int64

In [36]:
# Default dropna behaviour: drop every row that contains at least one null.
# Row 2 is the only row without nulls, so it is the only one kept.

df.dropna()


Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [37]:
# dropna works on rows by default. Pass the axis parameter to drop the columns
# that contain null values instead; only Column D has none.

df.dropna(axis=1)  # axis='columns' also works

Unnamed: 0,Column D
0,5
1,8
2,34
3,110


In [38]:
# By default, any row or column holding at least one null value is dropped.
# The how parameter controls this behaviour; it accepts 'any' or 'all'.

df2 = pd.DataFrame(
    data={
        'Column A': [1, np.nan, 30],
        'Column B': [2, np.nan, 31],
        'Column C': [np.nan, np.nan, 100],
    }
)

In [39]:
df2  # row 1 is null in every column

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,
1,,,
2,30.0,31.0,100.0


In [40]:
# NOTE(review): this runs on df, which has no row that is null in every column
# (Column D is complete), so how='all' drops nothing. df2 has such a row
# (row 1); df2.dropna(how='all') would be the call that shows the effect.
df.dropna(how='all')

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [41]:
df.dropna(how='any')  # default behavior: a single null is enough to drop the row

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [42]:
df  # the dropna calls above did not modify df: it still has all 4 rows

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [43]:
# thresh sets the minimum number of non-null values a row/column needs to be kept.
# Row 3 has only 2 non-null values, so with thresh=3 the whole row is dropped.

df.dropna(thresh=3)


Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34


In [44]:
# Same threshold applied to columns: Column A has only 2 non-null values, so it is dropped.
df.dropna(thresh=3, axis='columns')

Unnamed: 0,Column B,Column C,Column D
0,2.0,,5
1,8.0,9.0,8
2,31.0,32.0,34
3,,100.0,110


Filling Null Values
    - Instead of dropping nulls, may want to fill them depending on context and dataset
    - sometimes nan can be replaced with 0 or mean of sample etc
      - all depends on context

In [45]:
s  # the Series with two NaNs that the fill examples start from

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [46]:
# Fill nulls with an arbitrary value (here 0)

s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [47]:
s.fillna(s.mean())  # fill with the mean of the non-null values (2.5)

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [48]:
s  # the fillna calls above did not modify s: the NaNs are still there

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [49]:
# Filling nulls with contiguous (neighbouring) values

# Forward fill: each null takes the last valid value that precedes it.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling,
# which emits a FutureWarning on pandas 2.1+.

s.ffill()


0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [51]:
# Backward fill: each null takes the next valid value that follows it.
# Series.bfill() replaces the deprecated fillna(method='bfill') spelling,
# which emits a FutureWarning on pandas 2.1+.

s.bfill()


0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

In [52]:
# Forward/backward filling can still leave null values at the extremes of the
# Series/DataFrame: a leading NaN has no earlier value to copy from.
# (ffill() is the non-deprecated spelling of fillna(method='ffill').)

pd.Series([np.nan, 3, np.nan, 9]).ffill()


0    NaN
1    3.0
2    3.0
3    9.0
dtype: float64

In [53]:
# Trailing NaNs stay null under a backward fill: no later value exists to copy.
# (bfill() is the non-deprecated spelling of fillna(method='bfill').)
pd.Series([1, np.nan, 3, np.nan, np.nan]).bfill()


0    1.0
1    3.0
2    3.0
3    NaN
4    NaN
dtype: float64

Filling Null Values on DataFrames
    - fillna method also works on DataFrames
    - main difference: 
      - can specify axis (rows & columns) to use to fill
      - more control on values passed

In [54]:
df  # the frame with nulls in Columns A-C that the fill examples start from

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [55]:
# A dict gives one fill value per column: 0 for Column A, 99 for Column B and
# the mean of Column C's own values for Column C. Column D is not listed.
df.fillna(
    value={
        'Column A': 0,
        'Column B': 99,
        'Column C': df['Column C'].mean(),
    }
)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,47.0,5
1,0.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,0.0,99.0,100.0,110


In [56]:
# Forward fill down each column (axis=0): a null takes the value from the row
# above. Row 0 of Column C stays null because nothing precedes it.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill(axis=0)


Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,110


In [57]:
# Forward fill along each row (axis=1): a null takes the value from the column
# to its left. Nulls in Column A stay null because nothing lies to their left.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill(axis=1)


Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,2.0,5.0
1,,8.0,9.0,8.0
2,30.0,31.0,32.0,34.0
3,,,100.0,110.0


Checking for NAs
    - Does the Series or DataFrame contain any missing value?
      - The answer should be a single True or False
      - Multiple ways to verify this:
          - s.dropna()
          - count method
          - any 
          - isnull()
          - etc...

In [58]:
# Example 1: Checking the length
# If there are missing values, s.dropna() will have fewer elements than s.

s.dropna().count()

4

In [59]:
# A length mismatch before/after dropna means at least one value was null.
missing_values = len(s) != len(s.dropna())
missing_values

True

In [60]:
# Example 2: the count method
# len(s) includes the NaNs, whereas count() (next cell) excludes them.

len(s)

6

In [63]:
s.count()  # only the non-null values are counted

4

In [64]:
# count() skips nulls, so it differs from len(s) exactly when nulls exist.

missing_values = len(s) != s.count()
missing_values

True

In [65]:
# A more Pythonic solution: any
# Series.any() is True if at least one value is True; Series.all() is True only
# if every value is True, mirroring Python's built-in any() and all().

pd.Series([True, False, False]).any()

True

In [66]:
pd.Series([True, False, False]).all()  # False: not every value is True

False

In [67]:
pd.Series([True, True, True]).all()  # True: every value is True

True

In [68]:
# The isnull() method returns a boolean Series with True wherever there was a NaN.

s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [69]:
# So chain any() onto the boolean Series returned by isnull().

pd.Series([1, np.nan]).isnull().any()

True

In [70]:
pd.Series([1, 2]).isnull().any()  # False: no element is null

False

In [71]:
s.isnull().any()  # True: s contains at least one null

True

In [72]:
# Alternative: take .values to work on the underlying NumPy boolean array
# instead of the Series.

s.isnull().values

array([False, False, False,  True,  True, False])

In [73]:
s.isnull().values.any()  # any() on the boolean array: True if at least one value is null

True