# Pandas - Missing Data

In [1]:
# import library
import pandas as pd
import numpy as np

In [4]:
# Start from a 5x3 frame of standard-normal draws whose row labels skip
# some letters, so the reindex below has gaps to expose.
values = np.random.randn(5, 3)
df = pd.DataFrame(values, index=list('acefh'), columns=['one', 'two', 'three'])
print(df, '\n')

# Reindex onto the contiguous labels a..h: labels that were not in the
# original index (b, d, g) come back as rows filled with NaN.
df = df.reindex(list('abcdefgh'))
print(df)

        one       two     three
a  1.487281  1.320122 -1.268090
c -0.984506  0.111753  0.786059
e -0.724411 -0.741908  0.558101
f -0.644961 -0.279207 -0.425655
h  0.495393 -1.938737 -1.169035 

        one       two     three
a  1.487281  1.320122 -1.268090
b       NaN       NaN       NaN
c -0.984506  0.111753  0.786059
d       NaN       NaN       NaN
e -0.724411 -0.741908  0.558101
f -0.644961 -0.279207 -0.425655
g       NaN       NaN       NaN
h  0.495393 -1.938737 -1.169035


In [7]:
# Detecting missing values in a single column.
col_one = df['one']

# isna() is the same method as isnull(): True where the value is missing.
print(col_one.isna(), '\n')

# notna() is the same method as notnull(): True where a value is present.
print(col_one.notna())

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool 

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool


In [9]:
# Cleaning & filling missing values with fillna():
# every NaN is replaced by the constant 0. fillna returns a new frame,
# so df itself still holds the NaN rows afterwards.
zero_filled = df.fillna(value=0)
print(zero_filled)

        one       two     three
a  1.487281  1.320122 -1.268090
b  0.000000  0.000000  0.000000
c -0.984506  0.111753  0.786059
d  0.000000  0.000000  0.000000
e -0.724411 -0.741908  0.558101
f -0.644961 -0.279207 -0.425655
g  0.000000  0.000000  0.000000
h  0.495393 -1.938737 -1.169035


In [12]:
# Rebuild the sample frame: 5 rows of standard-normal draws, then reindex to
# a..h so that rows b, d and g are entirely NaN.
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'],
                  columns=['one', 'two', 'three'])

df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print(df, '\n')

# pad/ffill - fill forward: each NaN takes the last valid value above it.
# DataFrame.ffill() replaces fillna(method='pad'), whose `method` keyword
# is deprecated (pandas >= 2.1 emits a FutureWarning for it).
# The result is kept in its own name so df still shows the gaps.
df_ffilled = df.ffill()
print(df_ffilled)

        one       two     three
a  0.625262 -1.058353  0.133325
b       NaN       NaN       NaN
c -0.853231  1.712917 -0.237105
d       NaN       NaN       NaN
e -0.134870  0.295034 -0.569179
f  0.455246  0.462551 -0.222280
g       NaN       NaN       NaN
h -0.991103 -1.415018  1.962065 

        one       two     three
a  0.625262 -1.058353  0.133325
b  0.625262 -1.058353  0.133325
c -0.853231  1.712917 -0.237105
d -0.853231  1.712917 -0.237105
e -0.134870  0.295034 -0.569179
f  0.455246  0.462551 -0.222280
g  0.455246  0.462551 -0.222280
h -0.991103 -1.415018  1.962065


In [14]:
# Rebuild the sample frame: 5 rows of standard-normal draws, then reindex to
# a..h so that rows b, d and g are entirely NaN.
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'],
                  columns=['one', 'two', 'three'])

df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print(df, '\n')

# bfill/backfill - fill backward: each NaN takes the next valid value below it.
# DataFrame.bfill() replaces fillna(method='backfill'), whose `method`
# keyword is deprecated (pandas >= 2.1 emits a FutureWarning for it).
# The result is kept in its own name so df still shows the gaps.
df_bfilled = df.bfill()
print(df_bfilled)

        one       two     three
a  1.282108  1.110758 -1.127954
b       NaN       NaN       NaN
c -0.832396  0.135907 -0.810012
d       NaN       NaN       NaN
e -1.569941 -0.548883  0.478183
f -1.477382 -0.043388  0.332701
g       NaN       NaN       NaN
h  1.094731  0.609795  0.872627 

        one       two     three
a  1.282108  1.110758 -1.127954
b -0.832396  0.135907 -0.810012
c -0.832396  0.135907 -0.810012
d -1.569941 -0.548883  0.478183
e -1.569941 -0.548883  0.478183
f -1.477382 -0.043388  0.332701
g  1.094731  0.609795  0.872627
h  1.094731  0.609795  0.872627


In [20]:
# Dropping missing values.
# Fresh sample frame: random values on labels a, c, e, f, h, reindexed to
# a..h so that rows b, d and g contain only NaN.
sample = np.random.randn(5, 3)
df = pd.DataFrame(sample, index=list('acefh'), columns=['one', 'two', 'three'])
df = df.reindex(list('abcdefgh'))
print(df, '\n')

# dropna() works row-wise by default: any row holding a NaN is removed.
rows_kept = df.dropna(axis='index')
print(rows_kept, '\n')

# axis=1 works column-wise: any column holding a NaN is removed. Every
# column here has NaN in rows b, d and g, so no column survives.
cols_kept = df.dropna(axis='columns')
print(cols_kept)

        one       two     three
a  2.367007  0.431272  1.204336
b       NaN       NaN       NaN
c  1.276044  0.608188 -0.135379
d       NaN       NaN       NaN
e -2.750822 -0.184327 -0.763708
f -0.929413 -0.346281 -0.003286
g       NaN       NaN       NaN
h -0.721280 -0.789014 -0.356296 

        one       two     three
a  2.367007  0.431272  1.204336
c  1.276044  0.608188 -0.135379
e -2.750822 -0.184327 -0.763708
f -0.929413 -0.346281 -0.003286
h -0.721280 -0.789014 -0.356296 

Empty DataFrame
Columns: []
Index: [a, b, c, d, e, f, g, h]


In [24]:
# Show the frame as it currently stands, NaN rows included.
print(df, '\n')

# Replace every NaN with 1000 using replace(). The call returns a new
# frame (shown as the cell's output); df is not reassigned here.
df.replace(to_replace=np.nan, value=1000)

        one       two     three
a  2.367007  0.431272  1.204336
b       NaN       NaN       NaN
c  1.276044  0.608188 -0.135379
d       NaN       NaN       NaN
e -2.750822 -0.184327 -0.763708
f -0.929413 -0.346281 -0.003286
g       NaN       NaN       NaN
h -0.721280 -0.789014 -0.356296 



Unnamed: 0,one,two,three
a,2.367007,0.431272,1.204336
b,1000.0,1000.0,1000.0
c,1.276044,0.608188,-0.135379
d,1000.0,1000.0,1000.0
e,-2.750822,-0.184327,-0.763708
f,-0.929413,-0.346281,-0.003286
g,1000.0,1000.0,1000.0
h,-0.72128,-0.789014,-0.356296
