## Dealing with Missing Data

In [1]:
import numpy as np
import pandas as pd
SEED = 42  # fixed seed so the random frame built below is identical on every run
np.random.seed(SEED)

In [2]:
# Demo frame of standard-normal draws: one row per index label, one column
# per column label. The index labels are intentionally non-consecutive letters.
row_labels = ['a', 'c', 'd', 'f', 'g']
col_labels = ['one', 'two', 'three']
random_values = np.random.randn(len(row_labels), len(col_labels))
df = pd.DataFrame(random_values, index=row_labels, columns=col_labels)

In [3]:
# Assigning a scalar to a new column broadcasts it: every row of 'four' gets 'blah'.
df['four'] = 'blah'

In [4]:
# Boolean flag column: True where the value in 'one' is strictly positive.
df['five'] = df['one'].gt(0)

## Creating Missing Data

In [5]:
# Punch holes into the frame. Cells are addressed by label rather than by bare
# integer position so it is obvious which row/column each NaN lands in.
df.loc['d', 'three'] = np.nan

# A bool column cannot hold NaN. Older pandas silently upcast the whole column
# when NaN was assigned into it; newer pandas (2.1+) warns about that implicit
# upcast and plans to reject it. Converting explicitly keeps the lesson
# (True becomes 1.0 once the column is float) and works on every version.
df['five'] = df['five'].astype(float)
df.loc['f', 'five'] = np.nan # Notice what happens here!

df.loc['f', 'four'] = np.nan
df.loc['g', 'five'] = np.nan
df.loc['c', 'two'] = np.nan

In [13]:
# Display the frame after the NaN assignments above.
df

Unnamed: 0,one,two,three,four,five
a,0.496714,-0.138264,0.647689,blah,1.0
c,1.52303,,-0.234137,blah,1.0
d,1.579213,0.767435,,blah,1.0
f,0.54256,-0.463418,-0.46573,,
g,0.241962,-1.91328,-1.724918,blah,


## Drop Missing Data

In [7]:
# Row-wise drop (the defaults, spelled out): discard any row with at least one NaN.
df.dropna(axis=0, how='any')

Unnamed: 0,one,two,three,four,five
a,0.496714,-0.138264,0.647689,blah,1.0


In [68]:
# Column-wise drop: discard any column that contains at least one NaN.
df.dropna(axis='columns')

Unnamed: 0,one
a,0.496714
c,1.52303
d,1.579213
f,0.54256
g,0.241962


## Keep rows at Threshold

In [8]:
# A row survives only if it has at least this many non-NaN values.
min_non_na = 4
df.dropna(axis='index', thresh=min_non_na)

Unnamed: 0,one,two,three,four,five
a,0.496714,-0.138264,0.647689,blah,1.0
c,1.52303,,-0.234137,blah,1.0
d,1.579213,0.767435,,blah,1.0
g,0.241962,-1.91328,-1.724918,blah,


## Fill Missing Values

In [14]:
# Any placeholder works as a fill value; every NaN in the frame is replaced by it.
placeholder = "PINK FLUFFY UNICORN"
df.fillna(placeholder)

Unnamed: 0,one,two,three,four,five
a,0.496714,-0.138264,0.647689,blah,1
c,1.52303,PINK FLUFFY UNICORN,-0.234137,blah,1
d,1.579213,0.767435,PINK FLUFFY UNICORN,blah,1
f,0.54256,-0.463418,-0.46573,PINK FLUFFY UNICORN,PINK FLUFFY UNICORN
g,0.241962,-1.91328,-1.72492,blah,PINK FLUFFY UNICORN


In [85]:
# Impute the gaps in 'two' with the mean of its observed values
# (Series.mean() skips NaN by default).
two_mean = df['two'].mean()
df['two'].fillna(two_mean)

a   -0.138264
c   -0.436882
d    0.767435
f   -0.463418
g   -1.913280
Name: two, dtype: float64

# DONE!