In [1]:
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def make_df(index,columns):
    data = {col:[col+str(idx) for idx in index] for col in columns}
    df = DataFrame(data=data,index=index)    
    return df

df = make_df(index=[1,2,3,4],columns='ABCD')
df

Unnamed: 0,A,B,C,D
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4


### 使用pd.concat()级联

pandas使用pd.concat函数，与np.concatenate函数类似，只是多了一些参数：  
+ objs  
+ axis  
+ join='outer'  
+ join_axes=None  
+ ignore_index=False  
+ key=[value1,value2...]

### 简单级联

In [3]:
df1 = make_df(index=[1,2,3,4],columns='ABCD')
df2 = make_df(index=[1,2,3,4],columns='ABCD')

In [4]:
# axis=0 默认增加行数
pd.concat(objs=[df1,df2]) # 索引重复

Unnamed: 0,A,B,C,D
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4


In [5]:
# ignore_index=True 丢弃原来的索引，重新分配索引
pd.concat(objs=[df1,df2],ignore_index=True)

Unnamed: 0,A,B,C,D
0,A1,B1,C1,D1
1,A2,B2,C2,D2
2,A3,B3,C3,D3
3,A4,B4,C4,D4
4,A1,B1,C1,D1
5,A2,B2,C2,D2
6,A3,B3,C3,D3
7,A4,B4,C4,D4


In [6]:
# keys=['df1','df2'] 使用多层索引区分
pd.concat(objs=[df1,df2],keys=['df1','df2'])

Unnamed: 0,Unnamed: 1,A,B,C,D
df1,1,A1,B1,C1,D1
df1,2,A2,B2,C2,D2
df1,3,A3,B3,C3,D3
df1,4,A4,B4,C4,D4
df2,1,A1,B1,C1,D1
df2,2,A2,B2,C2,D2
df2,3,A3,B3,C3,D3
df2,4,A4,B4,C4,D4


In [7]:
pd.concat(objs=[df1,df2],axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
1,A1,B1,C1,D1,A1,B1,C1,D1
2,A2,B2,C2,D2,A2,B2,C2,D2
3,A3,B3,C3,D3,A3,B3,C3,D3
4,A4,B4,C4,D4,A4,B4,C4,D4


In [8]:
pd.concat(objs=[df1,df2],axis=1,ignore_index=True)

Unnamed: 0,0,1,2,3,4,5,6,7
1,A1,B1,C1,D1,A1,B1,C1,D1
2,A2,B2,C2,D2,A2,B2,C2,D2
3,A3,B3,C3,D3,A3,B3,C3,D3
4,A4,B4,C4,D4,A4,B4,C4,D4


In [9]:
pd.concat(objs=[df1,df2],axis=1,keys=['df1','df2'])

Unnamed: 0_level_0,df1,df1,df1,df1,df2,df2,df2,df2
Unnamed: 0_level_1,A,B,C,D,A,B,C,D
1,A1,B1,C1,D1,A1,B1,C1,D1
2,A2,B2,C2,D2,A2,B2,C2,D2
3,A3,B3,C3,D3,A3,B3,C3,D3
4,A4,B4,C4,D4,A4,B4,C4,D4


=========================================  

练习13：  
1. 想一想级联的应用场景  
2. 使用昨日的额知识，建立一个期中考试张三李四成绩表ddd  
3. 假设新增考试学科‘计算机’，如何实现
4. 新增王老五同学，如何实现

=========================================  

In [10]:
# 2
data = np.random.randint(1,150,size=(4,4))
index = ['张三','李四','王五','赵六']
columns = ['语文','数学','英语','体育']
ddd = DataFrame(data,index,columns)
ddd

Unnamed: 0,语文,数学,英语,体育
张三,117,127,74,128
李四,86,123,76,98
王五,56,25,48,124
赵六,106,13,147,69


In [11]:
# 3
data = np.random.randint(1,150,size=(4,1))
index = ['张三','李四','王五','赵六']
columns = ['计算机']
computer = DataFrame(data,index,columns)
pd.concat(objs=[ddd,computer],axis=1)

Unnamed: 0,语文,数学,英语,体育,计算机
张三,117,127,74,128,19
李四,86,123,76,98,74
王五,56,25,48,124,110
赵六,106,13,147,69,137


In [12]:
data = np.random.randint(1,150,size=(1,4))
index = ['王老五']
columns = ['语文','数学','英语','体育']
wlw = DataFrame(data,index,columns)
pd.concat(objs=[ddd,wlw],axis=0)

Unnamed: 0,语文,数学,英语,体育
张三,117,127,74,128
李四,86,123,76,98
王五,56,25,48,124
赵六,106,13,147,69
王老五,93,68,21,91


### 不匹配级联

In [13]:
df1 = make_df(index=[1,2,3,4],columns='ABCD')
df2 = make_df(index=[2,3,4,5],columns='BCDE')
display(df1,df2)

Unnamed: 0,A,B,C,D
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4


Unnamed: 0,B,C,D,E
2,B2,C2,D2,E2
3,B3,C3,D3,E3
4,B4,C4,D4,E4
5,B5,C5,D5,E5


In [14]:
pd.concat(objs=[df1,df2])

Unnamed: 0,A,B,C,D,E
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
4,A4,B4,C4,D4,
2,,B2,C2,D2,E2
3,,B3,C3,D3,E3
4,,B4,C4,D4,E4
5,,B5,C5,D5,E5


In [15]:
pd.concat(objs=[df1,df2],axis=1)

Unnamed: 0,A,B,C,D,B.1,C.1,D.1,E
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,B2,C2,D2,E2
3,A3,B3,C3,D3,B3,C3,D3,E3
4,A4,B4,C4,D4,B4,C4,D4,E4
5,,,,,B5,C5,D5,E5


有3种连接方式  
+ 外连接：全部匹配，为空的补NaN（默认方式）

In [16]:
pd.concat(objs=[df1,df2],axis=0,join='outer')

Unnamed: 0,A,B,C,D,E
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
4,A4,B4,C4,D4,
2,,B2,C2,D2,E2
3,,B3,C3,D3,E3
4,,B4,C4,D4,E4
5,,B5,C5,D5,E5


+ 内连接：inner(匹配到的显示出来)

In [17]:
pd.concat(objs=[df1,df2],axis=0,join='inner')

Unnamed: 0,B,C,D
1,B1,C1,D1
2,B2,C2,D2
3,B3,C3,D3
4,B4,C4,D4
2,B2,C2,D2
3,B3,C3,D3
4,B4,C4,D4
5,B5,C5,D5


+ 指定连接轴

In [18]:
pd.concat(objs=[df1,df2],axis=0,join='inner',ignore_index=True).reindex(index=df1.index)

Unnamed: 0,B,C,D
1,B2,C2,D2
2,B3,C3,D3
3,B4,C4,D4
4,B2,C2,D2


=========================================  

练习14：  
1. 假设【期末】考试ddd2的成绩表没有张三的，只有李四、王老五，赵小六的，使用多种方法级联ddd

=========================================  

In [19]:
# 1
ddd

Unnamed: 0,语文,数学,英语,体育
张三,117,127,74,128
李四,86,123,76,98
王五,56,25,48,124
赵六,106,13,147,69


In [20]:
data = np.random.randint(1,150,size=(3,4))
index = ['李四','王老五','赵小六']
columns = ['语文','数学','英语','体育']
ddd2 = DataFrame(data,index,columns)
ddd2

Unnamed: 0,语文,数学,英语,体育
李四,80,33,123,8
王老五,48,22,85,73
赵小六,50,29,87,55


In [21]:
pd.concat(objs=[ddd,ddd2],keys=['期中','期末'])

Unnamed: 0,Unnamed: 1,语文,数学,英语,体育
期中,张三,117,127,74,128
期中,李四,86,123,76,98
期中,王五,56,25,48,124
期中,赵六,106,13,147,69
期末,李四,80,33,123,8
期末,王老五,48,22,85,73
期末,赵小六,50,29,87,55


In [22]:
pd.concat(objs=[ddd,ddd2],axis=1,keys=['期中','期末'])

Unnamed: 0_level_0,期中,期中,期中,期中,期末,期末,期末,期末
Unnamed: 0_level_1,语文,数学,英语,体育,语文,数学,英语,体育
张三,117.0,127.0,74.0,128.0,,,,
李四,86.0,123.0,76.0,98.0,80.0,33.0,123.0,8.0
王五,56.0,25.0,48.0,124.0,,,,
赵六,106.0,13.0,147.0,69.0,,,,
王老五,,,,,48.0,22.0,85.0,73.0
赵小六,,,,,50.0,29.0,87.0,55.0


In [23]:
pd.concat(objs=[ddd,ddd2],axis=1,keys=['期中','期末'],sort=True).reindex(index=ddd2.index)

Unnamed: 0_level_0,期中,期中,期中,期中,期末,期末,期末,期末
Unnamed: 0_level_1,语文,数学,英语,体育,语文,数学,英语,体育
李四,86.0,123.0,76.0,98.0,80.0,33.0,123.0,8.0
王老五,,,,,48.0,22.0,85.0,73.0
赵小六,,,,,50.0,29.0,87.0,55.0


### 使用append()函数添加  

由于在后面级联的使用非常普遍，因此有一个函数append专门用来在后面添加  

【注意】：concat是pd的方法，append是DataFrame对象的方法

In [24]:
ddd.append(ddd2)

Unnamed: 0,语文,数学,英语,体育
张三,117,127,74,128
李四,86,123,76,98
王五,56,25,48,124
赵六,106,13,147,69
李四,80,33,123,8
王老五,48,22,85,73
赵小六,50,29,87,55


In [25]:
# verify_integrity=True 表示有重复的就会报错
# ddd.append(ddd2,verify_integrity=True)

In [26]:
# ignore_index=True 表示有重复的就会生成索引
ddd.append(ddd2,ignore_index=True)

Unnamed: 0,语文,数学,英语,体育
0,117,127,74,128
1,86,123,76,98
2,56,25,48,124
3,106,13,147,69
4,80,33,123,8
5,48,22,85,73
6,50,29,87,55


### 归纳总结：  
pandas的级联：  
+ pd.concat(objs=[df1,df2,df3,...])
+ ignore_index(重新生成索引)
+ keys (生成二级索引)
+ join (外连接：outner,内连接：inner)

### 2.使用pd.merge()合并  
merge与concat的区别在于，merge需要依据某一共同的行或列来进行合并  
使用pd.merge()合并时，会自动根据两者相同的column名称那一列，作为key来进行合并  
【注意】：每一列元素顺序不要求一致

#### (1).一对一合并

In [30]:
data={'name':['张三','李四','Chales'],'id':[1,2,3],'age':[22,25,28]}
df1 = DataFrame(data=data)
data={'sex':['男','男','女'],'id':[2,3,4],'group':['sale','search','service']}
df2 = DataFrame(data=data)
display(df1,df2)

Unnamed: 0,name,id,age
0,张三,1,22
1,李四,2,25
2,Chales,3,28


Unnamed: 0,sex,id,group
0,男,2,sale
1,男,3,search
2,女,4,service


In [31]:
pd.merge(left=df1,right=df2)

Unnamed: 0,name,id,age,sex,group
0,李四,2,25,男,sale
1,Chales,3,28,男,search


In [32]:
df1.merge(df2)

Unnamed: 0,name,id,age,sex,group
0,李四,2,25,男,sale
1,Chales,3,28,男,search


#### (2).多对一合并

In [35]:
data={'name':['张三','李四','Chales'],'id':[1,2,2],'age':[22,25,28]}
df1 = DataFrame(data=data)
data={'sex':['男','男','女'],'id':[2,3,4],'group':['sale','search','service']}
df2 = DataFrame(data=data)
display(df1,df2)

Unnamed: 0,name,id,age
0,张三,1,22
1,李四,2,25
2,Chales,2,28


Unnamed: 0,sex,id,group
0,男,2,sale
1,男,3,search
2,女,4,service


In [36]:
df1.merge(df2)

Unnamed: 0,name,id,age,sex,group
0,李四,2,25,男,sale
1,Chales,2,28,男,sale


#### (3).多对多合并

In [38]:
data={'name':['张三','李四','张三'],'salary':[10000,20000,25000],'age':[22,25,28]}
df1 = DataFrame(data=data)
data={'sex':['男','男','女'],'name':['张三','张三','帆帆'],'group':['sale','search','service']}
df2 = DataFrame(data=data)
display(df1,df2)

Unnamed: 0,name,salary,age
0,张三,10000,22
1,李四,20000,25
2,张三,25000,28


Unnamed: 0,sex,name,group
0,男,张三,sale
1,男,张三,search
2,女,帆帆,service


In [39]:
df1.merge(df2)

Unnamed: 0,name,salary,age,sex,group
0,张三,10000,22,男,sale
1,张三,10000,22,男,search
2,张三,25000,28,男,sale
3,张三,25000,28,男,search


#### (4).key的规划范

In [42]:
data={'name':['张三','李四','张三'],'salary':[10000,20000,25000],'age':[22,25,28]}
df1 = DataFrame(data=data)
data={'age':[25,30,28],'name':['张三','张三','帆帆'],'group':['sale','search','service']}
df2 = DataFrame(data=data)
display(df1,df2)

Unnamed: 0,name,salary,age
0,张三,10000,22
1,李四,20000,25
2,张三,25000,28


Unnamed: 0,age,name,group
0,25,张三,sale
1,30,张三,search
2,28,帆帆,service


In [44]:
# 因为age和name列都有相同的元素，merge不知道该合并哪一列了
df1.merge(df2)

Unnamed: 0,name,salary,age,group


In [45]:
# 当有多个列相同时，需要使用on来指定具体使用哪一列合并
df1.merge(df2,on='name')

Unnamed: 0,name,salary,age_x,age_y,group
0,张三,10000,22,25,sale
1,张三,10000,22,30,search
2,张三,25000,28,25,sale
3,张三,25000,28,30,search


当列名不同，但值相同时，使用left_on和right_on

In [46]:
data={'name':['张三','李四','张三'],'salary':[10000,20000,25000],'age':[22,25,28]}
df1 = DataFrame(data=data)
data={'年龄':[25,30,28],'姓名':['张三','张三','帆帆'],'group':['sale','search','service']}
df2 = DataFrame(data=data)
display(df1,df2)

Unnamed: 0,name,salary,age
0,张三,10000,22
1,李四,20000,25
2,张三,25000,28


Unnamed: 0,年龄,姓名,group
0,25,张三,sale
1,30,张三,search
2,28,帆帆,service


In [50]:
df1.merge(df2,left_on='name',right_on='姓名')

Unnamed: 0,name,salary,age,年龄,姓名,group
0,张三,10000,22,25,张三,sale
1,张三,10000,22,30,张三,search
2,张三,25000,28,25,张三,sale
3,张三,25000,28,30,张三,search


当左边列值的和右边的index值相同的时候，使用right_index=True

In [51]:
data={'name':['张三','李四','张三'],'salary':[10000,20000,25000],'age':[22,25,28]}
df1 = DataFrame(data=data)
data={'年龄':[25,30,28],'姓名':['张三','张三','帆帆'],'group':['sale','search','service']}
df2 = DataFrame(data=data,index=[22,25,28])
display(df1,df2)

Unnamed: 0,name,salary,age
0,张三,10000,22
1,李四,20000,25
2,张三,25000,28


Unnamed: 0,年龄,姓名,group
22,25,张三,sale
25,30,张三,search
28,28,帆帆,service


In [53]:
df1.merge(df2,left_on='age',right_index=True)

Unnamed: 0,name,salary,age,年龄,姓名,group
0,张三,10000,22,25,张三,sale
1,李四,20000,25,30,张三,search
2,张三,25000,28,28,帆帆,service


===============================================  

练习16：  
1. 假设有两份成绩单，除了ddd是张三李四王老五之外，还有ddd2是张三和赵小六的成绩单，如何合并？
2. 如果ddd中张三的名字被打错了，成了张十三，该怎么办？
3. 自行练习多对一，多对多的情况
4. 自学left_index,right_index

===============================================

In [85]:
# 1
data = [[112,55,40],[55,50,81],[125,115,44]]
index = ['张三','李四','王老五']
columns = ['语文','数学','英语']
ddd = DataFrame(data,index,columns)
ddd

Unnamed: 0,语文,数学,英语
张三,112,55,40
李四,55,50,81
王老五,125,115,44


In [86]:
data = [[112,55,40],[130,68,98]]
index = ['张三','赵小六']
columns = ['语文','数学','英语']
ddd2 = DataFrame(data,index,columns)
ddd2

Unnamed: 0,语文,数学,英语
张三,112,55,40
赵小六,130,68,98


In [102]:
# ddd.merge(ddd2,how='outer')
ddd.join(ddd2,how='outer',lsuffix='ddd')

Unnamed: 0,语文ddd,数学ddd,英语ddd,语文,数学,英语
张三,112.0,55.0,40.0,112.0,55.0,40.0
李四,55.0,50.0,81.0,,,
王老五,125.0,115.0,44.0,,,
赵小六,,,,130.0,68.0,98.0


#### 外合并  
默认内合并

In [77]:
data={'name':['张三','李四','张三'],'salary':[10000,20000,25000],'age':[22,25,28]}
df1 = DataFrame(data=data)
data={'年龄':[25,30,28],'姓名':['张三','张三','帆帆'],'group':['sale','search','service']}
df2 = DataFrame(data=data,index=[22,25,28])
display(df1,df2)

Unnamed: 0,name,salary,age
0,张三,10000,22
1,李四,20000,25
2,张三,25000,28


Unnamed: 0,年龄,姓名,group
22,25,张三,sale
25,30,张三,search
28,28,帆帆,service


In [88]:
df1.merge(df2,left_on='name',right_on='姓名',how='outer')

Unnamed: 0,name,salary,age,年龄,姓名,group
0,张三,10000.0,22.0,25.0,张三,sale
1,张三,10000.0,22.0,30.0,张三,search
2,张三,25000.0,28.0,25.0,张三,sale
3,张三,25000.0,28.0,30.0,张三,search
4,李四,20000.0,25.0,,,
5,,,,28.0,帆帆,service


pd.merge()  
+ left,合并的左表
+ right,合并的右表
+ how='inner',左右内外合并
+ on=None,指定合并的列
+ left_on=None,指定左表合并的列
+ right_on=None,指定右表合并的列
+ left_index=False,跟左表的索引值匹配合并
+ right_index=False,跟右表的索引值匹配合并
+ sort=False,
+ suffixes=('_x', '_y'),指定相同列名后缀

## 案例分析：每个各州人口数据分析

unique() 去重函数  
只能用于一维

In [89]:
s = Series(['Tom','Lucy','Tom','Kangkang','Lucy'])
s

0         Tom
1        Lucy
2         Tom
3    Kangkang
4        Lucy
dtype: object

In [90]:
s.unique()

array(['Tom', 'Lucy', 'Kangkang'], dtype=object)

query条件查询函数

In [98]:
df = DataFrame(data={'name':['Tom','Lucy','Tom','Kangkang','Lucy'],
                     'age':[20,25,27,26,12]})
df

Unnamed: 0,name,age
0,Tom,20
1,Lucy,25
2,Tom,27
3,Kangkang,26
4,Lucy,12


In [99]:
# 查询名字是Lucy，并且年龄大于22
df.query("name=='Lucy'&age>13")

Unnamed: 0,name,age
1,Lucy,25
