In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)

# 数据清洗
* 空值(null, NA)
* 重复值
* 缺失值处理
* 异常值
* 数据转换(transform)

建议阅读Hadley Wickham的[Tidy Data](http://vita.had.co.nz/papers/tidy-data.pdf)
* The names of the variables are different from what you require
* There is missing data
* Values are not in the units that you require
* The period of sampling of records is not what you need
* Variables are categorical and you need quantitative values
* There is noise in the data,
* Information is of an incorrect type
* Data is organized around incorrect axes
* Data is at the wrong level of normalization
* Data is duplicated

## 读取文件时的NA

In [2]:
eu2012 = pd.read_csv('./data/Eueo2012_na.csv')
eu2012

Unnamed: 0,Team,Goals,Shots on target,Shots off target,Shooting Accuracy,...,Yellow Cards,Red Cards,Subs on,Subs off,Players Used
0,Croatia,4,13,12,51.9%,...,9,0,9,9,16
1,Czech Republic,4,-999,18,41.9%,...,7,0,11,11,19
2,Denmark,4,10,10,50.0%,...,4,0,7,7,15
3,England,5,11,18,50.0%,...,5,0,11,11,16
4,France,3,22,24,37.9%,...,6,0,11,11,19
...,...,...,...,...,...,...,...,...,...,...,...
11,Republic of Ireland,1,7,12,36.8%,...,6,1,10,10,17
12,Russia,5,9,31,22.5%,...,6,0,7,7,16
13,Spain,12,42,33,55.9%,...,11,0,17,17,18
14,xyz,5,17,19,47.2%,...,7,0,9,9,18


In [4]:
eu2012.isnull().sum()    # 如果每一行有不为0的情况，就说明有缺失值。

Team                 0
Goals                0
Shots on target      0
Shots off target     0
Shooting Accuracy    0
                    ..
Yellow Cards         0
Red Cards            0
Subs on              0
Subs off             0
Players Used         0
Length: 35, dtype: int64

In [5]:
eu2012.isnull().sum().sum()     # 这个可以检测整个数据库是否有缺失值，有两个NA和-999

2

>可以通过**na_values**来指定哪些值是代表空值的。

In [6]:
eu2012 = pd.read_csv('./data/Eueo2012_na.csv', na_values=['xyz'])
eu2012.isnull().sum().sum()

3

In [7]:
eu2012= pd.read_csv('./data/Eueo2012_na.csv', na_values=['NA', 'xyz', '-999'])
eu2012.isnull().sum().sum()

4

## 构建一个带NaN的data frame

In [8]:
df = pd.DataFrame(np.random.randint(0, 100, 15).reshape(5,3), 
                 index=['a', 'b', 'c', 'd', 'e'], 
                  columns=['c1', 'c2', 'c3'])
df

Unnamed: 0,c1,c2,c3
a,5,28,48
b,90,38,35
c,24,89,33
d,59,42,85
e,87,38,18


In [9]:
df['c4'] = np.nan
df.loc['f'] = np.arange(10, 14)
df.loc['g'] = np.nan
df['c5'] = np.nan
df['c4']['a'] = 18
df

Unnamed: 0,c1,c2,c3,c4,c5
a,5.0,28.0,48.0,18.0,
b,90.0,38.0,35.0,,
c,24.0,89.0,33.0,,
d,59.0,42.0,85.0,,
e,87.0,38.0,18.0,,
f,10.0,11.0,12.0,13.0,
g,,,,,


### 判断NaN

In [10]:
df.isnull()

Unnamed: 0,c1,c2,c3,c4,c5
a,False,False,False,False,True
b,False,False,False,True,True
c,False,False,False,True,True
d,False,False,False,True,True
e,False,False,False,True,True
f,False,False,False,False,True
g,True,True,True,True,True


In [11]:
df.isnull().sum()

c1    1
c2    1
c3    1
c4    5
c5    7
dtype: int64

In [12]:
df.isnull().sum(1)      # 也就是axis=1

a    1
b    2
c    2
d    2
e    2
f    1
g    5
dtype: int64

In [13]:
df.isnull().sum().sum()     # 总的空值的个数

15

In [14]:
df.count()     # 统计not null的个数

c1    6
c2    6
c3    6
c4    2
c5    0
dtype: int64

In [15]:
df.notnull()

Unnamed: 0,c1,c2,c3,c4,c5
a,True,True,True,True,False
b,True,True,True,False,False
c,True,True,True,False,False
d,True,True,True,False,False
e,True,True,True,False,False
f,True,True,True,True,False
g,False,False,False,False,False


## 过滤或者去掉缺失值

In [16]:
df.c4[df.c4.notnull()]

a    18.0
f    13.0
Name: c4, dtype: float64

In [17]:
df.c4.dropna()      # 直接drop掉缺失值，inplace=False

a    18.0
f    13.0
Name: c4, dtype: float64

In [18]:
df.dropna()     # 因为每一列都有na，所以会整个drop掉。

Unnamed: 0,c1,c2,c3,c4,c5


>可以在`dropna`的方法里面指定`**how**`参数来说明过滤的方式。

In [19]:
df.dropna(how='all')

Unnamed: 0,c1,c2,c3,c4,c5
a,5.0,28.0,48.0,18.0,
b,90.0,38.0,35.0,,
c,24.0,89.0,33.0,,
d,59.0,42.0,85.0,,
e,87.0,38.0,18.0,,
f,10.0,11.0,12.0,13.0,


In [20]:
# 下面这个是按列的方式drop
df.dropna(how='all', axis=1)

Unnamed: 0,c1,c2,c3,c4
a,5.0,28.0,48.0,18.0
b,90.0,38.0,35.0,
c,24.0,89.0,33.0,
d,59.0,42.0,85.0,
e,87.0,38.0,18.0,
f,10.0,11.0,12.0,13.0
g,,,,


In [22]:
df2 = df.copy()
df2.ix['g'].c1=0
df2.ix['g'].c3=0
df2

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,c1,c2,c3,c4,c5
a,5.0,28.0,48.0,18.0,
b,90.0,38.0,35.0,,
c,24.0,89.0,33.0,,
d,59.0,42.0,85.0,,
e,87.0,38.0,18.0,,
f,10.0,11.0,12.0,13.0,
g,0.0,,0.0,,


> 另外一种处理的规则`**any**`,只要列或行存在nan，就drop掉

In [23]:
df2.dropna(how='any', axis=1)

Unnamed: 0,c1,c3
a,5.0,48.0
b,90.0,35.0
c,24.0,33.0
d,59.0,85.0
e,87.0,18.0
f,10.0,12.0
g,0.0,0.0


> 可以通过`**thresh**`来指定超过多少个的时候舍弃掉。

In [24]:
df2.dropna(thresh=5, axis=1)      # 指定超过多少个空值的时候舍弃掉。

Unnamed: 0,c1,c2,c3
a,5.0,28.0,48.0
b,90.0,38.0,35.0
c,24.0,89.0,33.0
d,59.0,42.0,85.0
e,87.0,38.0,18.0
f,10.0,11.0,12.0
g,0.0,,0.0


# 计算中的NaN的处理

In [25]:
a = np.array([np.nan, 1, 2, np.nan, 3])
a

array([ nan,   1.,   2.,  nan,   3.])

In [26]:
s = pd.Series(a)
s

0    NaN
1    1.0
2    2.0
3    NaN
4    3.0
dtype: float64

In [27]:
a.mean(), s.mean()    # numpy中直接去不都是nan，Pandas中会把nan的数据去掉

(nan, 2.0)

In [28]:
df2.c4 + 1

a    19.0
b     NaN
c     NaN
d     NaN
e     NaN
f    14.0
g     NaN
Name: c4, dtype: float64

In [29]:
df2.c4.cumsum()

a    18.0
b     NaN
c     NaN
d     NaN
e     NaN
f    31.0
g     NaN
Name: c4, dtype: float64

# 填充缺失值

In [30]:
fill_0 = df.fillna(0)       # 把nan替换成的值
fill_0

Unnamed: 0,c1,c2,c3,c4,c5
a,5.0,28.0,48.0,18.0,0.0
b,90.0,38.0,35.0,0.0,0.0
c,24.0,89.0,33.0,0.0,0.0
d,59.0,42.0,85.0,0.0,0.0
e,87.0,38.0,18.0,0.0,0.0
f,10.0,11.0,12.0,13.0,0.0
g,0.0,0.0,0.0,0.0,0.0


In [31]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,5.0,28.0,48.0,18.0,
b,90.0,38.0,35.0,,
c,24.0,89.0,33.0,,
d,59.0,42.0,85.0,,
e,87.0,38.0,18.0,,
f,10.0,11.0,12.0,13.0,
g,,,,,


> 通过在`fillna`方法里面传入参数`**limit**`来指定要替换最多几个nan！

In [32]:
df.fillna(0, limit=3)

Unnamed: 0,c1,c2,c3,c4,c5
a,5.0,28.0,48.0,18.0,0.0
b,90.0,38.0,35.0,0.0,0.0
c,24.0,89.0,33.0,0.0,0.0
d,59.0,42.0,85.0,0.0,
e,87.0,38.0,18.0,,
f,10.0,11.0,12.0,13.0,
g,0.0,0.0,0.0,,


In [33]:
df.c4

a    18.0
b     NaN
c     NaN
d     NaN
e     NaN
f    13.0
g     NaN
Name: c4, dtype: float64

> 在`fillna`方法中，使用`**method**`参数来指定替换的规则，其中`ffill`是指用前一个的填充；`bfill`是指用后一个值来填充。**这两种方式是基于时间的**

In [34]:
df.c4.fillna(method='ffill')

a    18.0
b    18.0
c    18.0
d    18.0
e    18.0
f    13.0
g    13.0
Name: c4, dtype: float64

In [35]:
df.c4.fillna(method='bfill')

a    18.0
b    13.0
c    13.0
d    13.0
e    13.0
f    13.0
g     NaN
Name: c4, dtype: float64

### 除了上面的填充的方式外，还可以通过索引来进行填充

In [36]:
fill_values = pd.Series([1, 2], index=['b', 'c'])

In [37]:
df.c4.fillna(fill_values)

a    18.0
b     1.0
c     2.0
d     NaN
e     NaN
f    13.0
g     NaN
Name: c4, dtype: float64

> 一般进行填充的话，更常见的是使用均值去填充

In [38]:
df.fillna(df.mean())

Unnamed: 0,c1,c2,c3,c4,c5
a,5.0,28.0,48.0,18.0,
b,90.0,38.0,35.0,15.5,
c,24.0,89.0,33.0,15.5,
d,59.0,42.0,85.0,15.5,
e,87.0,38.0,18.0,15.5,
f,10.0,11.0,12.0,13.0,
g,45.833333,41.0,38.5,15.5,


## 插值(也是为了解决空值的方式)

In [39]:
s = pd.Series([1, 2, np.nan, 5, np.nan, 9])
s

0    1.0
1    2.0
2    NaN
3    5.0
4    NaN
5    9.0
dtype: float64

In [40]:
s.interpolate()    # 根据现有的序列，来对缺失值进行添加

0    1.0
1    2.0
2    3.5
3    5.0
4    7.0
5    9.0
dtype: float64

In [41]:
import datetime

In [42]:
ts = pd.Series([1, np.nan, 2], index=[datetime.datetime(2016, 1, 1), 
                                     datetime.datetime(2016, 2, 1),
                                     datetime.datetime(2016, 4, 1)])
ts

2016-01-01    1.0
2016-02-01    NaN
2016-04-01    2.0
dtype: float64

In [43]:
ts.interpolate()

2016-01-01    1.0
2016-02-01    1.5
2016-04-01    2.0
dtype: float64

> 如果在`interpolate`方法里面传入参数`**method**`，并且赋值是`**'time'**`的话，就会根据时间进行插值!

In [45]:
ts.interpolate(method='time')

2016-01-01    1.000000
2016-02-01    1.340659
2016-04-01    2.000000
dtype: float64

In [46]:
s = pd.Series([0, np.nan, 20], index=[0, 1, 10])
s

0      0.0
1      NaN
10    20.0
dtype: float64

In [47]:
s.interpolate()

0      0.0
1     10.0
10    20.0
dtype: float64

> 如果在`interpolate`方法里面传入参数`**method**`，并且赋值是`**values**`的话，就会根据数值进行插值！根据index的值进行插值！！

In [48]:
s.interpolate(method='values')

0      0.0
1      2.0
10    20.0
dtype: float64

# 重复值

In [49]:
data = pd.DataFrame({'a': ['x'] * 3 + ['y'] * 4, 'b': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,a,b
0,x,1
1,x,1
2,x,2
3,y,3
4,y,3
5,y,4
6,y,4


> 在pandas中，可以通过`**duplicated**`方法来过滤重复的数据

In [50]:
data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

> 在Pandas里面，可以通过`**drop_duplicates**`方法来过滤掉重复的数据。

In [51]:
data.drop_duplicates()

Unnamed: 0,a,b
0,x,1
2,x,2
3,y,3
5,y,4


> 在`drop_duplicates`方法里面，通过指定`**keep**`参数的值为`'last'`来过滤前面的。

In [52]:
data.drop_duplicates(keep='last')

Unnamed: 0,a,b
1,x,1
2,x,2
4,y,3
6,y,4


In [53]:
data['c'] = np.arange(7)
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
dtype: bool

In [55]:
data

Unnamed: 0,a,b,c
0,x,1,0
1,x,1,1
2,x,2,2
3,y,3,3
4,y,3,4
5,y,4,5
6,y,4,6


> 通过在`drop_duplicates`方法里面传入直接的值，可以直接以这个具体的值作为判断重复的条件。

In [54]:
data.drop_duplicates(['a', 'b'])

Unnamed: 0,a,b,c
0,x,1,0
2,x,2,2
3,y,3,3
5,y,4,5


## transform