# Missing Data

Here are a few convenient methods for dealing with missing data in pandas:

In [1]:
import numpy as np
import pandas as pd

In [6]:
# Small demo frame: A is missing one value, B is missing two, C is complete.
df = pd.DataFrame(
    dict(
        A=[1, 2, np.nan],
        B=[5, np.nan, np.nan],
        C=[1, 2, 3],
    )
)
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [7]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isnull()

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,True,False


In [16]:
# A and B (the columns containing NaN) are float64; C has no missing values and is int64.
df.dtypes

A    float64
B    float64
C      int64
dtype: object

In [7]:
# Per-column non-null counts and dtypes: a quick way to spot which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
A    2 non-null float64
B    1 non-null float64
C    3 non-null int64
dtypes: float64(2), int64(1)
memory usage: 152.0 bytes


In [6]:
# Summary statistics, one row per column; `count` only counts the non-NaN values.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0
B,1.0,5.0,,5.0,5.0,5.0,5.0,5.0
C,3.0,2.0,1.0,1.0,1.5,2.0,2.5,3.0


In [8]:
# isna() yields the same True/False mask as the isnull() call shown earlier.
df.isna()

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,True,False


In [7]:
# Drop every ROW that contains at least one NaN (the defaults, spelled out).
df.dropna(axis='index', how='any')

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [9]:
# Drop every COLUMN that contains at least one NaN; only C survives here.
df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


In [10]:
# The dropna() calls above returned new frames; df itself still has its NaNs.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [20]:
# thresh=2: keep only the rows that have at least two non-NaN values.
df.dropna(axis='index', thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [21]:
# Still the original frame: dropna(thresh=...) did not modify df.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


'hello'

In [None]:
# Boolean-mask assignment: every cell equal to 100 is overwritten with 1000.
# This demo frame contains no 100, so the statement leaves it unchanged.
df[df.eq(100)] = 1000

In [11]:
# Replace every NaN with a fixed placeholder; fillna returns a new frame
# and leaves df as it was.
fill_text = 'hello world'.upper()
df.fillna(value=fill_text)

Unnamed: 0,A,B,C
0,1,5,1
1,2,HELLO WORLD,2
2,HELLO WORLD,HELLO WORLD,3


In [24]:
# df is unchanged: the fillna() call above was not assigned back.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [12]:
# Fill the missing value in column A with the column mean.
# The result is assigned back rather than calling fillna(inplace=True) on
# df['A']: that form is chained assignment on a column selection, which warns
# in pandas 2.x and does not update df once Copy-on-Write is enabled.
df['A'] = df['A'].fillna(value=df['A'].mean())

In [13]:
# A's missing value is now 1.5 (the mean of 1 and 2); B still has its NaNs.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,1.5,,3


In [14]:
# Re-check the summary after the fill: A's count is now 3 and its mean is
# still 1.5, but its std fell from 0.707 to 0.5. Filling with the mean
# shrinks the spread.
df.describe()

Unnamed: 0,A,B,C
count,3.0,1.0,3.0
mean,1.5,5.0,2.0
std,0.5,,1.0
min,1.0,5.0,1.0
25%,1.25,5.0,1.5
50%,1.5,5.0,2.0
75%,1.75,5.0,2.5
max,2.0,5.0,3.0


# Great Job!