In [4]:
import pandas as pd
import numpy as np
# Small demo table with missing values (NaN) in the 'gender' and 'age' columns.
df = pd.DataFrame(
    [
        ['F', 'M', np.nan],
        ['M', np.nan, np.nan],
        ['T', 'M', 35],
        ['t', 'M', 35],
        ['J', np.nan, 21],
        ['L', 'F', 20],
    ],
    columns=['name', 'gender', 'age'],
)
df

Unnamed: 0,name,gender,age
0,F,M,
1,M,,
2,T,M,35.0
3,t,M,35.0
4,J,,21.0
5,L,F,20.0


In [5]:
# Boolean mask: True where 'age' is missing (isna is the same method as isnull).
df['age'].isna()

0     True
1     True
2    False
3    False
4    False
5    False
Name: age, dtype: bool

In [6]:
# Element-wise missing-value mask for the whole frame.
df.isna()

Unnamed: 0,name,gender,age
0,False,False,True
1,False,True,True
2,False,False,False
3,False,False,False
4,False,True,False
5,False,False,False


In [7]:
# Boolean mask: True where 'gender' is present (notna is the same method as notnull).
df['gender'].notna()

0     True
1    False
2     True
3     True
4    False
5     True
Name: gender, dtype: bool

In [8]:
# Element-wise "value is present" mask for the whole frame.
df.notna()

Unnamed: 0,name,gender,age
0,True,True,False
1,True,False,False
2,True,True,True
3,True,True,True
4,True,False,True
5,True,True,True


In [9]:
# True only if *every* cell in the frame is missing.
# (Use .any() instead to ask whether at least one cell is missing.)
df.isna().to_numpy().all()

False

In [11]:
# True only if *every* 'age' value is missing.
# (Use .any() instead to ask whether at least one value is missing.)
df['age'].isna().to_numpy().all()

False

In [12]:
# Number of missing 'gender' values (True counts as 1 when summed).
df['gender'].isna().sum()

2

In [13]:
# Missing-value count for each column.
df.isna().sum()

name      0
gender    2
age       2
dtype: int64

In [14]:
# Total number of missing cells: per-column counts, then summed across columns.
df.isna().sum().sum()

4

## 舍弃缺失值

In [20]:
import pandas as pd
import numpy as np
# Demo table for dropping missing values; row 1 is missing in every column.
df = pd.DataFrame(
    [
        ['F', 'M', np.nan],
        [np.nan, np.nan, np.nan],
        ['T', 'M', 35],
        ['t', 'M', 35],
        ['J', np.nan, 21],
        ['L', 'F', 20],
    ],
    columns=['name', 'gender', 'age'],
)
df

Unnamed: 0,name,gender,age
0,F,M,
1,,,
2,T,M,35.0
3,t,M,35.0
4,J,,21.0
5,L,F,20.0


In [21]:
# Drop every row that contains at least one missing value (the defaults, spelled out).
df.dropna(axis=0, how='any')

Unnamed: 0,name,gender,age
2,T,M,35.0
3,t,M,35.0
5,L,F,20.0


In [22]:
# Drop only the rows in which every column is missing.
df.dropna(axis=0, how='all')

Unnamed: 0,name,gender,age
0,F,M,
2,T,M,35.0
3,t,M,35.0
4,J,,21.0
5,L,F,20.0


In [23]:
# Keep only the rows that have at least 2 non-missing values.
df.dropna(axis=0, thresh=2)

Unnamed: 0,name,gender,age
0,F,M,
2,T,M,35.0
3,t,M,35.0
4,J,,21.0
5,L,F,20.0


In [26]:
# Add an 'employee' column in which every value is NaN, then display the frame.
# Note: this mutates df in place, so earlier displays of df are now stale.
df['employee'] = np.nan
df

Unnamed: 0,name,gender,age,employee
0,F,M,,
1,,,,
2,T,M,35.0,
3,t,M,35.0,
4,J,,21.0,
5,L,F,20.0,


In [28]:
# Drop the columns (not rows) whose values are all missing.
df.dropna(axis='columns', how='all')


Unnamed: 0,name,gender,age
0,F,M,
1,,,
2,T,M,35.0
3,t,M,35.0
4,J,,21.0
5,L,F,20.0


## 填补缺失值

In [29]:
# Display the current frame; the dropna calls above returned new frames and left df unchanged.
df

Unnamed: 0,name,gender,age,employee
0,F,M,,
1,,,,
2,T,M,35.0,
3,t,M,35.0,
4,J,,21.0,
5,L,F,20.0,


In [30]:
# Replace every missing cell with 0. This also puts the integer 0 into the
# string columns ('name', 'gender'), which is rarely what you want on real data.
df.fillna(value=0)

Unnamed: 0,name,gender,age,employee
0,F,M,0.0,0.0
1,0,0,0.0,0.0
2,T,M,35.0,0.0
3,t,M,35.0,0.0
4,J,0,21.0,0.0
5,L,F,20.0,0.0


In [31]:
# Fill missing ages with the overall mean age (mean() skips NaN by default).
df['age'].fillna(value=df['age'].mean())

0    27.75
1    27.75
2    35.00
3    35.00
4    21.00
5    20.00
Name: age, dtype: float64

In [32]:
# Per-row mean age of that row's gender group, aligned to df's index.
# Rows whose gender is missing belong to no group and therefore get NaN.
df.groupby(by='gender')['age'].transform('mean')

0    35.0
1     NaN
2    35.0
3    35.0
4     NaN
5    20.0
Name: age, dtype: float64

In [2]:
import pandas as pd
import numpy as np
# Rebuild the demo table from scratch (row 1 is missing in every column).
df = pd.DataFrame(
    [
        ['F', 'M', np.nan],
        [np.nan, np.nan, np.nan],
        ['T', 'M', 35],
        ['t', 'M', 35],
        ['J', np.nan, 21],
        ['L', 'F', 20],
    ],
    columns=['name', 'gender', 'age'],
)
df

Unnamed: 0,name,gender,age
0,F,M,
1,,,
2,T,M,35.0
3,t,M,35.0
4,J,,21.0
5,L,F,20.0


In [4]:
# Fill missing ages with the mean age of the row's gender group.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['age']: an in-place call on a column
# selection is chained assignment, which warns in pandas 2.x and no longer
# updates df under Copy-on-Write.
# Rows whose gender is also missing have no group mean and stay NaN.
df['age'] = df['age'].fillna(df.groupby('gender')['age'].transform('mean'))

In [5]:
# Display the frame after the group-mean fill of 'age'.
df

Unnamed: 0,name,gender,age
0,F,M,35.0
1,,,
2,T,M,35.0
3,t,M,35.0
4,J,,21.0
5,L,F,20.0


In [6]:
# Add an 'employee' column in which every value is NaN (mutates df in place).
df['employee'] = np.nan

In [7]:
# Display the frame, now including the all-NaN 'employee' column.
df

Unnamed: 0,name,gender,age,employee
0,F,M,35.0,
1,,,,
2,T,M,35.0,
3,t,M,35.0,
4,J,,21.0,
5,L,F,20.0,


In [8]:
# Forward fill: propagate the last valid value downwards into each gap.
# ffill() replaces fillna(method='pad'), whose `method` argument is deprecated.
# A column with no valid value at all (here 'employee') stays NaN.
df.ffill()

Unnamed: 0,name,gender,age,employee
0,F,M,35.0,
1,F,M,35.0,
2,T,M,35.0,
3,t,M,35.0,
4,J,M,21.0,
5,L,F,20.0,


In [9]:
# Backward fill: pull the next valid value upwards into each gap.
# bfill() replaces fillna(method='backfill'), whose `method` argument is deprecated.
df.bfill()

Unnamed: 0,name,gender,age,employee
0,F,M,35.0,
1,T,M,35.0,
2,T,M,35.0,
3,t,M,35.0,
4,J,F,21.0,
5,L,F,20.0,


In [10]:
# Backward fill, but fill at most 2 consecutive missing values per gap.
# bfill(limit=...) replaces fillna(method='backfill', limit=...), whose
# `method` argument is deprecated.
df.bfill(limit=2)

Unnamed: 0,name,gender,age,employee
0,F,M,35.0,
1,T,M,35.0,
2,T,M,35.0,
3,t,M,35.0,
4,J,F,21.0,
5,L,F,20.0,


In [11]:
# Small (time, val) series with one fully missing row, used to demonstrate interpolation.
# No backslash continuations are needed inside brackets.
df2 = pd.DataFrame(
    [
        [1, 870],
        [2, 900],
        [np.nan, np.nan],
        [4, 950],
        [5, 1080],
        [6, 1200],
    ],
    columns=['time', 'val'],
)
df2

Unnamed: 0,time,val
0,1.0,870.0
1,2.0,900.0
2,,
3,4.0,950.0
4,5.0,1080.0
5,6.0,1200.0


In [13]:
# Fill the gap by linear interpolation between the neighbouring rows
# ('linear' is the default method, spelled out here).
df2.interpolate(method='linear')

Unnamed: 0,time,val
0,1.0,870.0
1,2.0,900.0
2,3.0,925.0
3,4.0,950.0
4,5.0,1080.0
5,6.0,1200.0
