处理Pandas中的NaN

In [1]:
import pandas as pd

In [2]:
# One sales record per store. Not every store lists every product
# (e.g. the first record has no 'glasses' entry), so the resulting
# DataFrame contains NaN in those cells.
items = [
    {'bikes': 20, 'pants': 30, 'watches': 35, 'shirts': 15, 'shoes': 8, 'suits': 45},
    {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants': 5, 'shirts': 2, 'shoes': 5, 'suits': 7},
    {'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4, 'shoes': 10},
]

# Each dict becomes one row, labelled with the matching store name.
store_items = pd.DataFrame(data=items, index=['store1', 'store2', 'store3'])
store_items

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,,10,,35


In [3]:
## Locate the NaN values in the DataFrame.
## The result is a boolean mask with the same shape as store_items:
## False means the cell is not NaN, True means the cell is NaN.
pd.isnull(store_items)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,False,True,False,False,False,False,False
store2,False,False,False,False,False,False,False
store3,False,False,False,True,False,True,False


In [4]:
## Number of NaN values in each column (True counts as 1 when summed).
store_items.isnull().sum(axis=0)

bikes      0
glasses    1
pants      0
shirts     1
shoes      0
suits      1
watches    0
dtype: int64

In [5]:
## Total number of NaN values in the whole DataFrame:
## sum the boolean mask over every cell at once.
store_items.isnull().values.sum()

3

In [6]:
## Number of non-NaN values in each column.
print("Number of non-NaN:\n", store_items.count(axis=0))

Number of non-NaN:
 bikes      3
glasses    2
pants      3
shirts     2
shoes      3
suits      2
watches    3
dtype: int64


In [7]:
## Drop every column that contains at least one NaN.
## Returns a new DataFrame; the original object is not modified.
store_items.dropna(axis='columns')

Unnamed: 0,bikes,pants,shoes,watches
store1,20,30,8,35
store2,15,5,5,10
store3,20,30,10,35


In [8]:
## Display the original DataFrame again: it still contains all columns
## and NaN values, confirming that dropna() above did not modify it.
store_items

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,,10,,35


In [9]:
## Drop every row that contains at least one NaN.
store_items.dropna(axis='index')

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store2,15,50.0,5,2.0,5,7.0,10


替换DataFrame中的NaN

In [10]:
## Replace every NaN with one specified value.
## The original object is not modified; a new DataFrame is returned.
## fillna() accepts a scalar or a dict (a Series works as well).
store_items.fillna(value=0)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,0.0,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,0.0,10,0.0,35


In [14]:
## The fill value does not have to be numeric: here every NaN becomes 'FF'.
store_items.fillna(value='FF')

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,FF,30,15,8,45,35
store2,15,50,5,2,5,7,10
store3,20,4,30,FF,10,FF,35


In [18]:
## Per-column fill values: each key is a column name and its value
## replaces the NaN entries of that column only.
store_items.fillna(value={'glasses': 3, 'shirts': 5, 'suits': 6})

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,3.0,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,5.0,10,6.0,35


前向替换：使用上一行或列的值替换NaN

In [20]:
## Forward fill along axis 0: each NaN is replaced with the value from
## the previous row of the same column. A NaN in the first row cannot
## be replaced because there is no earlier row to copy from.
## DataFrame.ffill() is used instead of fillna(method='ffill'), whose
## `method` keyword is deprecated in recent pandas releases.
store_items.ffill(axis=0)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,2.0,10,7.0,35


In [21]:
## Forward fill along axis 1: each NaN is replaced with the value from
## the previous column of the same row. A NaN in the first column
## cannot be replaced because there is no earlier column to copy from.
## DataFrame.ffill() is used instead of fillna(method='ffill'), whose
## `method` keyword is deprecated in recent pandas releases.
store_items.ffill(axis=1)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20.0,20.0,30.0,15.0,8.0,45.0,35.0
store2,15.0,50.0,5.0,2.0,5.0,7.0,10.0
store3,20.0,4.0,30.0,30.0,10.0,10.0,35.0


后向替换：使用后一行或列的值替换NaN

In [22]:
## Backward fill along axis 0: each NaN is replaced with the value from
## the next row of the same column. A NaN in the last row stays NaN
## because there is no later row to copy from.
## DataFrame.bfill() is used instead of fillna(method='backfill'), whose
## `method` keyword is deprecated in recent pandas releases.
store_items.bfill(axis=0)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,50.0,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,,10,,35


In [23]:
## Backward fill along axis 1: each NaN is replaced with the value from
## the next column of the same row.
## DataFrame.bfill() is used instead of fillna(method='backfill'), whose
## `method` keyword is deprecated in recent pandas releases.
store_items.bfill(axis=1)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20.0,30.0,30.0,15.0,8.0,45.0,35.0
store2,15.0,50.0,5.0,2.0,5.0,7.0,10.0
store3,20.0,4.0,30.0,10.0,10.0,35.0,35.0


使用不同的插值方式来替换NaN

In [24]:
## Linear interpolation down each column (along the row index).
store_items.interpolate(method='linear', axis='index')

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,2.0,10,7.0,35


In [25]:
## Linear interpolation across each row (along the columns).
store_items.interpolate(method='linear', axis='columns')

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20.0,25.0,30.0,15.0,8.0,45.0,35.0
store2,15.0,50.0,5.0,2.0,5.0,7.0,10.0
store3,20.0,4.0,30.0,20.0,10.0,22.5,35.0


In [26]:
### Replace NaN with the mean of each column.
### First show the per-column means that will be used as fill values.
print(store_items.mean(axis=0))

bikes      18.333333
glasses    27.000000
pants      21.666667
shirts      8.500000
shoes       7.666667
suits      26.000000
watches    26.666667
dtype: float64


In [27]:
## Fill each column's NaN entries with that column's mean: the Series of
## means is matched to the DataFrame columns by label.
store_items.fillna(value=store_items.mean(axis=0))

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,27.0,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,8.5,10,26.0,35


In [28]:
## Keep only the cells equal to 10; every other cell becomes NaN.
store_items.where(store_items == 10)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,,,,,,,
store2,,,,,,,10.0
store3,,,,,10.0,,


In [32]:
## Select the rows in which at least one cell equals 10.
store_items.loc[store_items.eq(10).any(axis='columns')]

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,,10,,35
