# Missing Value handling with Pandas

In pandas, missing values are represented by NaN (`np.nan`). We do not always have to drop missing values; sometimes it is better to approximate and impute them.


In [19]:
# importing Numpy and Pandas
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

In [20]:
# Build a small Series in which one entry is missing (NaN marks the gap)
missing = np.nan
Series_Object = Series(data=['row1', 'row2', missing, 'row4', 'row5'])
Series_Object

0    row1
1    row2
2     NaN
3    row4
4    row5
dtype: object

In [21]:
# Flag each entry: True where the value is missing, False otherwise
Series_Object.isna()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [22]:
# Section: filling in missing values.
# Start from a reproducible 6x6 DataFrame of uniform random numbers.
np.random.seed(2)
DF_Object = DataFrame(np.random.rand(6, 6))
DF_Object

Unnamed: 0,0,1,2,3,4,5
0,0.435995,0.025926,0.549662,0.435322,0.420368,0.330335
1,0.204649,0.619271,0.299655,0.266827,0.621134,0.529142
2,0.13458,0.513578,0.18444,0.785335,0.853975,0.494237
3,0.846561,0.079645,0.505246,0.065287,0.428122,0.096531
4,0.12716,0.596745,0.226012,0.106946,0.220306,0.349826
5,0.467787,0.201743,0.640407,0.48307,0.505237,0.386893


In [23]:
# Introduce NaNs by position: rows 3-4 of column 0 and rows 1-3 of column 5
DF_Object.iloc[[3, 4], 0] = missing
DF_Object.iloc[[1, 2, 3], 5] = missing
DF_Object

Unnamed: 0,0,1,2,3,4,5
0,0.435995,0.025926,0.549662,0.435322,0.420368,0.330335
1,0.204649,0.619271,0.299655,0.266827,0.621134,
2,0.13458,0.513578,0.18444,0.785335,0.853975,
3,,0.079645,0.505246,0.065287,0.428122,
4,,0.596745,0.226012,0.106946,0.220306,0.349826
5,0.467787,0.201743,0.640407,0.48307,0.505237,0.386893


In [24]:
# Replace every NaN with one constant value (here 0); DF_Object itself is not modified
FilledDataFrame = DF_Object.fillna(value=0)
FilledDataFrame

Unnamed: 0,0,1,2,3,4,5
0,0.435995,0.025926,0.549662,0.435322,0.420368,0.330335
1,0.204649,0.619271,0.299655,0.266827,0.621134,0.0
2,0.13458,0.513578,0.18444,0.785335,0.853975,0.0
3,0.0,0.079645,0.505246,0.065287,0.428122,0.0
4,0.0,0.596745,0.226012,0.106946,0.220306,0.349826
5,0.467787,0.201743,0.640407,0.48307,0.505237,0.386893


In [25]:
# Per-column fill values can be given as a dict keyed by column label:
# NaNs in column 0 become 44 and NaNs in column 5 become 99
ReFilledDataFrame = DF_Object.fillna(value={0: 44, 5: 99})
ReFilledDataFrame

Unnamed: 0,0,1,2,3,4,5
0,0.435995,0.025926,0.549662,0.435322,0.420368,0.330335
1,0.204649,0.619271,0.299655,0.266827,0.621134,99.0
2,0.13458,0.513578,0.18444,0.785335,0.853975,99.0
3,44.0,0.079645,0.505246,0.065287,0.428122,99.0
4,44.0,0.596745,0.226012,0.106946,0.220306,0.349826
5,0.467787,0.201743,0.640407,0.48307,0.505237,0.386893


In [26]:
# Forward fill: replace each NaN with the last non-null value above it in the
# same column. A NaN with no earlier value in its column stays NaN.
# DataFrame.ffill() is used because the `method=` argument of fillna() is
# deprecated in recent pandas releases.
ReFillDataFrame = DF_Object.ffill()
ReFillDataFrame

Unnamed: 0,0,1,2,3,4,5
0,0.435995,0.025926,0.549662,0.435322,0.420368,0.330335
1,0.204649,0.619271,0.299655,0.266827,0.621134,0.330335
2,0.13458,0.513578,0.18444,0.785335,0.853975,0.330335
3,0.13458,0.079645,0.505246,0.065287,0.428122,0.330335
4,0.13458,0.596745,0.226012,0.106946,0.220306,0.349826
5,0.467787,0.201743,0.640407,0.48307,0.505237,0.386893


In [27]:
# Section: counting and dropping missing values.
# Rebuild the same seeded 6x6 DataFrame so the examples below start from a
# known state regardless of what earlier cells did to DF_Object.
np.random.seed(2)
DF_Object = DataFrame(np.random.rand(36).reshape(6,6))
# Creating missing values in Data Frame: rows 3-4 of column 0 and
# rows 1-3 of column 5 (the end of an iloc slice is exclusive)
DF_Object.iloc[3:5, 0] = np.nan
DF_Object.iloc[1:4, 5] = np.nan
DF_Object

Unnamed: 0,0,1,2,3,4,5
0,0.435995,0.025926,0.549662,0.435322,0.420368,0.330335
1,0.204649,0.619271,0.299655,0.266827,0.621134,
2,0.13458,0.513578,0.18444,0.785335,0.853975,
3,,0.079645,0.505246,0.065287,0.428122,
4,,0.596745,0.226012,0.106946,0.220306,0.349826
5,0.467787,0.201743,0.640407,0.48307,0.505237,0.386893


In [28]:
DF_Object.isna().sum(axis=0)  # number of missing values in each column

0    2
1    0
2    0
3    0
4    0
5    3
dtype: int64

In [29]:
# Drop every row that contains at least one NaN
DF_Object_NoNaN = DF_Object.dropna(axis=0)
DF_Object_NoNaN


Unnamed: 0,0,1,2,3,4,5
0,0.435995,0.025926,0.549662,0.435322,0.420368,0.330335
5,0.467787,0.201743,0.640407,0.48307,0.505237,0.386893


In [30]:
# Drop every column that contains at least one NaN
DF_Object_NoNaN = DF_Object.dropna(axis='columns')
DF_Object_NoNaN

Unnamed: 0,1,2,3,4
0,0.025926,0.549662,0.435322,0.420368
1,0.619271,0.299655,0.266827,0.621134
2,0.513578,0.18444,0.785335,0.853975
3,0.079645,0.505246,0.065287,0.428122
4,0.596745,0.226012,0.106946,0.220306
5,0.201743,0.640407,0.48307,0.505237


In [31]:
# With how='all', a row is dropped only if every value in it is NaN
DF_Object_NoNaN = DF_Object.dropna(axis=0, how='all')
DF_Object_NoNaN

Unnamed: 0,0,1,2,3,4,5
0,0.435995,0.025926,0.549662,0.435322,0.420368,0.330335
1,0.204649,0.619271,0.299655,0.266827,0.621134,
2,0.13458,0.513578,0.18444,0.785335,0.853975,
3,,0.079645,0.505246,0.065287,0.428122,
4,,0.596745,0.226012,0.106946,0.220306,0.349826
5,0.467787,0.201743,0.640407,0.48307,0.505237,0.386893
