# 数据清洗

In [51]:
import numpy as np
import pandas as pd
import scipy.interpolate as si

## 检测与处理重复值
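`drop_duplicates` 根据 `subset` 指定的列检测重复行（不指定时使用全部列）。
`keep='first'`（默认值）保留每组重复行中第一次出现的那一行，并删除其余重复行；`keep='last'` 保留最后一次出现的行；`keep=False` 则删除所有重复行。
`inplace` 默认为 False，返回新的 DataFrame 而不修改原始数据；如果为 True，则直接修改原始数据。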

In [52]:
# Build a small sample frame: (zs, 10) appears twice as an exact duplicate,
# while ww appears twice with different ages (14 and 20).
df = pd.DataFrame(
    {
        'Name': ['zs', 'zs', 'ls', 'ww', 'ww'],
        'Age': [10, 10, 12, 14, 20],
    }
)
df

Unnamed: 0,Name,Age
0,zs,10
1,zs,10
2,ls,12
3,ww,14
4,ww,20


In [53]:
# Rows that share the same Name count as duplicates; only the first
# occurrence of each name is kept.
df.drop_duplicates(
    subset=['Name'],
    keep='first',
)

Unnamed: 0,Name,Age
0,zs,10
2,ls,12
3,ww,14


In [54]:
# A row is dropped only when both Name and Age match an earlier row;
# the first occurrence of each (Name, Age) pair is kept.
df.drop_duplicates(
    subset=['Name', 'Age'],
    keep='first',
)

Unnamed: 0,Name,Age
0,zs,10
2,ls,12
3,ww,14
4,ww,20


## 检测与处理缺失值

In [55]:
# Load the ratings table from ratings.json (path is relative to the
# notebook's working directory) and display it.
ratings = pd.read_json(path_or_buf='ratings.json')
ratings

Unnamed: 0,John Carson,Michelle Peterson,William Reynolds,Jillian Hobart,Melissa Jones,Alex Roberts,Michael Henry
Inception,2.5,3.0,2.5,,3,3.0,
Pulp Fiction,3.5,3.5,3.0,3.5,4,4.0,4.5
Anger Management,3.0,1.5,,3.0,2,,
Fracture,3.5,5.0,3.5,4.0,3,5.0,4.0
Serendipity,2.5,3.5,,2.5,2,3.5,1.0
Jerry Maguire,3.0,3.0,4.0,4.5,3,3.0,


In [56]:
# Count the missing (null) entries in each column.
ratings.isna().sum(axis=0)

John Carson          0
Michelle Peterson    0
William Reynolds     2
Jillian Hobart       1
Melissa Jones        0
Alex Roberts         1
Michael Henry        3
dtype: int64

### 删除法

In [57]:
# Deletion approach: drop every column that contains at least one missing value.
ratings.dropna(axis='columns', how='any')

Unnamed: 0,John Carson,Michelle Peterson,Melissa Jones
Inception,2.5,3.0,3
Pulp Fiction,3.5,3.5,4
Anger Management,3.0,1.5,2
Fracture,3.5,5.0,3
Serendipity,2.5,3.5,2
Jerry Maguire,3.0,3.0,3


### 替换法

In [58]:
# Replacement approach: substitute the constant 0 for every missing value.
ratings.fillna(0)

Unnamed: 0,John Carson,Michelle Peterson,William Reynolds,Jillian Hobart,Melissa Jones,Alex Roberts,Michael Henry
Inception,2.5,3.0,2.5,0.0,3,3.0,0.0
Pulp Fiction,3.5,3.5,3.0,3.5,4,4.0,4.5
Anger Management,3.0,1.5,0.0,3.0,2,0.0,0.0
Fracture,3.5,5.0,3.5,4.0,3,5.0,4.0
Serendipity,2.5,3.5,0.0,2.5,2,3.5,1.0
Jerry Maguire,3.0,3.0,4.0,4.5,3,3.0,0.0


In [59]:
# Forward fill along each row (axis=1): a missing entry takes the last valid
# value from the columns to its left, i.e. the PREVIOUS value, not the next one.
# A missing entry with no valid value to its left stays missing.
# `ffill` is the non-deprecated spelling of `fillna(method='pad')`.
ratings.ffill(axis=1)

Unnamed: 0,John Carson,Michelle Peterson,William Reynolds,Jillian Hobart,Melissa Jones,Alex Roberts,Michael Henry
Inception,2.5,3.0,2.5,2.5,3.0,3.0,3.0
Pulp Fiction,3.5,3.5,3.0,3.5,4.0,4.0,4.5
Anger Management,3.0,1.5,1.5,3.0,2.0,2.0,2.0
Fracture,3.5,5.0,3.5,4.0,3.0,5.0,4.0
Serendipity,2.5,3.5,3.5,2.5,2.0,3.5,1.0
Jerry Maguire,3.0,3.0,4.0,4.5,3.0,3.0,3.0


### 插值法
`scipy.interpolate.interp1d` 返回的是一个插值函数：传入一个 x 值，即可计算出对应的 y 值。

In [66]:
# Known sample points (x, y) that the interpolant must pass through.
x = np.array([30, 40, 50, 60, 65])
y = np.array([100, 120, 135, 155, 170])

# interp1d returns a callable; kind='cubic' requests cubic-spline interpolation.
# `si` (scipy.interpolate) is imported in the notebook's top import cell.
# NOTE(review): with interp1d's defaults, querying outside [30, 65] raises
# ValueError; SciPy also documents interp1d as legacy - confirm before reuse.
func = si.interp1d(x, y, kind='cubic')

# Evaluate the interpolant at x = 62, which lies between the samples 60 and 65.
func(62)

array(160.48)