## Pandas:数据规整

内容介绍:

In [2]:
import numpy as np
import pandas as pd

In [3]:
# 示例数据
s0 = pd.Series(range(10),index=['d','b','c','a','e','d','b','c','a','e'])
print(s0)
df0 = pd.DataFrame(np.random.randint(-9,9,size=(4,3)),index=['d','b','c','a'],columns=['B','A','C'])
df0

d    0
b    1
c    2
a    3
e    4
d    5
b    6
c    7
a    8
e    9
dtype: int64


Unnamed: 0,B,A,C
d,-7,-1,-9
b,2,-6,6
c,8,-5,-4
a,-1,-7,-1


### 1.层次化索引-知识回顾(可以查看06文档详细讲解)

In [4]:
#添加一个层次化索引
dic = {'a':1,'b':2,'c':3,'d':4,'e':5}
s0_index_2 = s0.index.map(dic)
#构建一个新的带有层次化索引的数据
s = pd.Series(s0.values,index=[s0.index, s0_index_2])

In [5]:
s0_index_2

Int64Index([4, 2, 3, 1, 5, 4, 2, 3, 1, 5], dtype='int64')

In [None]:
# 索引排序
s1 = s.sort_index()
s1

In [29]:
#层次化索引的选取操作
s1['a':'c']

a  1    3
   1    8
b  2    1
   2    6
c  3    2
   3    7
dtype: int64

In [32]:
#离散选取
s1[['b','d']]

b  2    1
   2    6
d  4    0
   4    5
dtype: int64

In [34]:
#直接索引内层
s1.loc[:,2]

b    1
b    6
dtype: int64

In [46]:
#设置新索引
df_idx = df0.index.map(dic)
df0['new_idx'] = df_idx
#set_index()函数中的参数,drop=False时原来转换成索引的列继续保留
df1 = df0.set_index(['new_idx',df0.index])
df1.sort_index

<bound method DataFrame.sort_index of            B  A  C
new_idx           
4       d -8 -2 -2
2       b -1  2 -2
3       c  7  6 -1
1       a -1  3  5>

In [47]:
#将转换成索引的列还原成原来的列
df1.reset_index

<bound method DataFrame.reset_index of            B  A  C
new_idx           
4       d -8 -2 -2
2       b -1  2 -2
3       c  7  6 -1
1       a -1  3  5>

In [51]:
df1.index

MultiIndex([(4, 'd'),
            (2, 'b'),
            (3, 'c'),
            (1, 'a')],
           names=['new_idx', None])

### 2.数据连接-连接列的方式

merge(left, right, how: str = 'inner', on=None, left_on=None, right_on=None, left_index: bool = False, right_index: bool = False, sort: bool = False, suffixes=('_x', '_y'), copy: bool = True, indicator: bool = False, validate=None) -> 'DataFrame'

* left,需要连接的左方数据
* right,需要连接的右方数据
* how:连接方式，包括inner,outer,left,right

In [80]:
df21 = pd.DataFrame(np.random.randint(0,9,size=(4,3)),columns=['D','E','F'])
df21['key'] = ['yy','xx','mm','snn']
df21

Unnamed: 0,D,E,F,key
0,5,8,1,yy
1,3,0,1,xx
2,7,4,4,mm
3,5,0,7,snn


In [77]:
df22 = pd.DataFrame(np.random.randint(0,9,size=(4,3)),columns=['B','A','C'])
df22['key'] = ['xx','yy','mm','nn']
df22

Unnamed: 0,B,A,C,key
0,1,4,7,xx
1,6,4,7,yy
2,0,8,3,mm
3,8,4,0,nn


In [89]:
#pd.merge根据单个或多个键连接两个表格。默认列明相同的列为连接键。
#需要关键列一致
#也可以使用参数on设定需要连接的键列
#默认：在键列中，两者不一致的数据，不会被连接。即内连接形式。
pd.merge(df21,df22)

Unnamed: 0,D,E,F,key,B,A,C
0,5,8,1,yy,6,4,7
1,3,0,1,xx,1,4,7
2,7,4,4,mm,0,8,3


In [90]:
#内连接，指定参数进行。内连接也是默认值。-->交集的概念
pd.merge(df21,df22,how='inner',on='key')

Unnamed: 0,D,E,F,key,B,A,C
0,5,8,1,yy,6,4,7
1,3,0,1,xx,1,4,7
2,7,4,4,mm,0,8,3


In [85]:
#左连接
#以前面(即左面的)的参数为主，查找右面的数据，存在相同的则连接数据；如果左边键有数据，右边没有那么不连接数据。
pd.merge(df21,df22,how='left',on='key')

Unnamed: 0,D,E,F,key,B,A,C
0,5,8,1,yy,6.0,4.0,7.0
1,3,0,1,xx,1.0,4.0,7.0
2,7,4,4,mm,0.0,8.0,3.0
3,5,0,7,snn,,,


In [87]:
#右连接，方式同左连接相反。
pd.merge(df21,df22,how='right',on='key')

Unnamed: 0,D,E,F,key,B,A,C
0,3.0,0.0,1.0,xx,1,4,7
1,5.0,8.0,1.0,yy,6,4,7
2,7.0,4.0,4.0,mm,0,8,3
3,,,,nn,8,4,0


In [91]:
#外连接，方式为两者结合最大的范围。-->并集的概念
pd.merge(df21,df22,how='outer',on='key')

Unnamed: 0,D,E,F,key,B,A,C
0,5.0,8.0,1.0,yy,6.0,4.0,7.0
1,3.0,0.0,1.0,xx,1.0,4.0,7.0
2,7.0,4.0,4.0,mm,0.0,8.0,3.0
3,5.0,0.0,7.0,snn,,,
4,,,,nn,8.0,4.0,0.0


In [96]:
#连接中处理重复的列名
df23 = df21.copy()
df23.loc['4'] = [3,5,6,'yy']
df23.loc['5'] = [3,50,99,'yy']
df23.loc['6'] = [13,22,6,'yy']
df23

Unnamed: 0,D,E,F,key
0,5,8,1,yy
1,3,0,1,xx
2,7,4,4,mm
3,5,0,7,snn
4,3,5,6,yy
5,3,50,99,yy
6,13,22,6,yy


In [98]:
#数据中出现重复的key关键字时，处理方式如下：
#关键字匹配时，没有关键字重复的数据进行多次匹配
pd.merge(df23,df22)

Unnamed: 0,D,E,F,key,B,A,C
0,5,8,1,yy,6,4,7
1,3,5,6,yy,6,4,7
2,3,50,99,yy,6,4,7
3,13,22,6,yy,6,4,7
4,3,0,1,xx,1,4,7
5,7,4,4,mm,0,8,3


In [99]:
#按索引连接和关键字key连接。默认使用内连接。
pd.merge(df23,df22,left_on='E',right_index=True)

Unnamed: 0,D,E,F,key_x,B,A,C,key_y
1,3,0,1,xx,1,4,7,xx
3,5,0,7,snn,1,4,7,xx


### 3.

In [3]:
help(pd.merge)

Help on function merge in module pandas.core.reshape.merge:

merge(left, right, how: str = 'inner', on=None, left_on=None, right_on=None, left_index: bool = False, right_index: bool = False, sort: bool = False, suffixes=('_x', '_y'), copy: bool = True, indicator: bool = False, validate=None) -> 'DataFrame'
    Merge DataFrame or named Series objects with a database-style join.
    
    The join is done on columns or indexes. If joining columns on
    columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
    on indexes or indexes on a column or columns, the index will be passed on.
    When performing a cross merge, no column specifications to merge on are
    allowed.
    
    Parameters
    ----------
    left : DataFrame
    right : DataFrame or named Series
        Object to merge with.
    how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
        Type of merge to be performed.
    
        * left: use only keys from left frame, similar to 