## Pandas-数据清洗和准备

内容介绍：本笔记演示 pandas 中缺失值的处理，重点是用 `dropna` 丢弃缺失值、用 `fillna` 填充缺失值。

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Sample data: a labelled Series and a small integer DataFrame.
s0 = pd.Series(range(5), index=['d', 'b', 'c', 'a', 'e'])
print(s0)
# Draw from a locally seeded generator so that "Restart & Run All" reproduces
# the same frame; integers() samples from the half-open range [-9, 9),
# matching the bounds of the previous np.random.randint(-9, 9, ...) call.
rng = np.random.default_rng(42)
df0 = pd.DataFrame(
    rng.integers(-9, 9, size=(4, 3)),
    index=['d', 'b', 'c', 'a'],
    columns=['B', 'A', 'C'],
)
df0

d    0
b    1
c    2
a    3
e    4
dtype: int64


Unnamed: 0,B,A,C
d,-4,-2,1
b,-1,-9,-7
c,-9,-3,2
a,4,7,-8


### 1.缺失值处理

笔记15#针对缺失值的处理流程:（1）发现缺失值、（2）丢弃缺失值、（3）填充/替换缺失值。

### 1-（2）丢弃缺失值

`dropna` 的函数签名：`dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False)`

In [5]:
# Inject missing values into df0 so the dropna examples have something to drop.
# Note: these assignments modify df0 in place.
df0.loc['c','A'] = None  # a single missing cell (row 'c', column 'A')
df0.loc['b'] = None  # an entire row of missing values
df0

Unnamed: 0,B,A,C
d,-4.0,-2.0,1.0
b,,,
c,-9.0,,2.0
a,4.0,7.0,-8.0


In [6]:
# Supplementary dropna usage: drop only the rows in which every value is NaN.
# how : {'any', 'all'}, default 'any'; axis=0 (rows) is the default axis.
df0.dropna(axis=0, how='all')

Unnamed: 0,B,A,C
d,-4.0,-2.0,1.0
c,-9.0,,2.0
a,4.0,7.0,-8.0


In [7]:
# Drop the columns in which every value is NaN.
# No column of df0 is entirely NaN, so no column is removed here.
df0.dropna(axis='columns', how='all')

Unnamed: 0,B,A,C
d,-4.0,-2.0,1.0
b,,,
c,-9.0,,2.0
a,4.0,7.0,-8.0


In [8]:
# Use the subset parameter to drop rows based on NaN in specific columns only:
# a row is removed when its value in column 'A' is missing.
df0.dropna(axis='index', subset=['A'])

Unnamed: 0,B,A,C
d,-4.0,-2.0,1.0
a,4.0,7.0,-8.0


In [5]:
# Build a 7x3 frame of uniform [0, 1) floats, then blank out the top of
# columns 1 and 2. A locally seeded generator keeps the values identical
# across kernel restarts, so the outputs of later cells stay consistent.
rng = np.random.default_rng(42)
df1 = pd.DataFrame(rng.random((7, 3)))
# .loc slices by label and includes the end label: rows 0-4 and rows 0-2.
df1.loc[:4, 1] = np.nan
df1.loc[:2, 2] = np.nan
df1

Unnamed: 0,0,1,2
0,0.8641,,
1,0.771696,,
2,0.406316,,
3,0.533904,,0.727263
4,0.727816,,0.767004
5,0.847191,0.499481,0.548473
6,0.770602,0.358454,0.519032


In [24]:
# thresh : int, optional
#        Require that many non-NA values.
# thresh is the minimum number of non-NA values a row needs in order to be
# kept; here, rows with fewer than 2 non-NA values are dropped.
df1.dropna(thresh=2)

Unnamed: 0,0,1,2
3,0.533904,,0.727263
4,0.727816,,0.767004
5,0.847191,0.499481,0.548473
6,0.770602,0.358454,0.519032


In [25]:
# dropna keeps the original index labels of the surviving rows;
# the result is not renumbered. how='any' is the default.
df1.dropna(how='any')

Unnamed: 0,0,1,2
5,0.847191,0.499481,0.548473
6,0.770602,0.358454,0.519032


In [7]:
# dropna() does not modify the original data (inplace defaults to False):
# df1 still contains all of its rows and NaN values.
df1

Unnamed: 0,0,1,2
0,0.8641,,
1,0.771696,,
2,0.406316,,
3,0.533904,,0.727263
4,0.727816,,0.767004
5,0.847191,0.499481,0.548473
6,0.770602,0.358454,0.519032


### 1-（3）缺失值填充

In [23]:
# Simplest fill: replace every missing value with a single scalar,
# without passing any other arguments.
df1.fillna(value=0)

Unnamed: 0,0,1,2
0,0.792499,0.0,0.0
1,0.555113,0.0,0.0
2,0.57894,0.0,0.0
3,0.669352,0.0,0.937611
4,0.329004,0.0,0.443623
5,0.501356,0.122923,0.587066
6,0.666037,0.617876,0.434711


In [13]:
# Fill different columns with different values by passing a
# {column label: fill value} mapping.
df1.fillna(value={1: 0.9, 2: 0.8})

Unnamed: 0,0,1,2
0,0.8641,0.9,0.8
1,0.771696,0.9,0.8
2,0.406316,0.9,0.8
3,0.533904,0.9,0.727263
4,0.727816,0.9,0.767004
5,0.847191,0.499481,0.548473
6,0.770602,0.358454,0.519032


In [14]:
# Display df1 again: it still contains its NaN values, because the fillna
# calls returned new frames instead of modifying df1.
df1

Unnamed: 0,0,1,2
0,0.8641,,
1,0.771696,,
2,0.406316,,
3,0.533904,,0.727263
4,0.727816,,0.767004
5,0.847191,0.499481,0.548473
6,0.770602,0.358454,0.519032


In [11]:
# By default fillna does not change the data it is called on; pass
# inplace=True when the original data itself should be modified.
df2 = df1.copy()
df2.fillna(value={1: 0.9, 2: 0.8})

Unnamed: 0,0,1,2
0,0.8641,0.9,0.8
1,0.771696,0.9,0.8
2,0.406316,0.9,0.8
3,0.533904,0.9,0.727263
4,0.727816,0.9,0.767004
5,0.847191,0.499481,0.548473
6,0.770602,0.358454,0.519032


In [15]:
# Display df2: it still contains its NaN values, since fillna was called
# on it without inplace=True.
df2

Unnamed: 0,0,1,2
0,0.8641,,
1,0.771696,,
2,0.406316,,
3,0.533904,,0.727263
4,0.727816,,0.767004
5,0.847191,0.499481,0.548473
6,0.770602,0.358454,0.519032


In [19]:
# Build a 6x3 frame of standard-normal floats, then blank out the bottom of
# columns 1 and 2 for the forward-fill examples. A locally seeded generator
# keeps the values identical across kernel restarts.
rng = np.random.default_rng(42)
df3 = pd.DataFrame(rng.standard_normal((6, 3)))
# .iloc slices by position: rows 2 onward in column 1, rows 4 onward in column 2.
df3.iloc[2:, 1] = np.nan
df3.iloc[4:, 2] = np.nan
df3

Unnamed: 0,0,1,2
0,-1.434808,-0.319881,1.019222
1,0.019279,-1.243475,-0.466217
2,0.532758,,-0.273716
3,1.63372,,0.7735
4,-1.104303,,
5,0.480561,,


In [22]:
# Forward fill: within each column, replace every missing value with the
# last valid value above it.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in recent pandas releases.
df3.ffill()

Unnamed: 0,0,1,2
0,-1.434808,-0.319881,1.019222
1,0.019279,-1.243475,-0.466217
2,0.532758,-1.243475,-0.273716
3,1.63372,-1.243475,0.7735
4,-1.104303,-1.243475,0.7735
5,0.480561,-1.243475,0.7735


In [23]:
# Forward fill within each column using the last valid value above, but fill
# at most `limit` consecutive missing values; the remaining ones stay NaN.
# DataFrame.ffill(limit=...) replaces fillna(method='ffill', limit=...),
# whose `method` argument is deprecated in recent pandas releases.
df3.ffill(limit=2)

Unnamed: 0,0,1,2
0,-1.434808,-0.319881,1.019222
1,0.019279,-1.243475,-0.466217
2,0.532758,-1.243475,-0.273716
3,1.63372,-1.243475,0.7735
4,-1.104303,,0.7735
5,0.480561,,0.7735


In [11]:
# Print the full docstring of DataFrame.dropna for reference
# (parameters axis, how, thresh, subset and inplace).
help(pd.DataFrame.dropna)

Help on function dropna in module pandas.core.frame:

dropna(self, axis: 'Axis' = 0, how: 'str' = 'any', thresh=None, subset=None, inplace: 'bool' = False)
    Remove missing values.
    
    See the :ref:`User Guide <missing_data>` for more on which values are
    considered missing, and how to work with missing data.
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine if rows or columns which contain missing values are
        removed.
    
        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing value.
    
        .. versionchanged:: 1.0.0
    
           Pass tuple or list to drop on multiple axes.
           Only a single axis is allowed.
    
    how : {'any', 'all'}, default 'any'
        Determine if row or column is removed from DataFrame, when we have
        at least one NA or all NA.
    
        * 'any' : If any NA values are present, drop that row