In [None]:
"""
pd.merge(): join the rows of different DataFrames on one or more keys, similar to a database join
pd.concat(): combine multiple objects along an axis
duplicated(): flag duplicate rows    drop_duplicates(): filter out duplicate rows
replace(): substitute values based on their content
"""

In [1]:
import numpy as np
import pandas as pd

In [41]:
# Build two sample frames that share a "key" column for the merge demos.
# df1 repeats keys a/b/c; df2 has each key once plus an extra "d".
df1 = pd.DataFrame(dict(
    key=list("abcbabc"),
    data1=np.random.randint(1, 10, size=7),
))
df2 = pd.DataFrame(dict(
    key=list("abcd"),
    data2=np.random.randint(10, 20, size=4),
))

In [42]:
# Display the left frame (a bare last expression is rendered as the cell output).
df1

Unnamed: 0,data1,key
0,6,a
1,6,b
2,4,c
3,1,b
4,6,a
5,4,b
6,8,c


In [43]:
# Display the right frame.
df2

Unnamed: 0,data2,key
0,12,a
1,16,b
2,14,c
3,14,d


In [44]:
# 1. Join on the shared "key" column. When `on` is omitted pandas joins on the
#    column names common to both frames; the default join type is inner.
df1.merge(df2, on="key")

Unnamed: 0,data1,key,data2
0,6,a,12
1,6,a,12
2,6,b,16
3,1,b,16
4,4,b,16
5,4,c,14
6,8,c,14


In [45]:
# rename() returns a new frame; rebind the names so the two frames no longer
# share a key column name.
df1 = df1.rename({"key": "key1"}, axis="columns")
df2 = df2.rename({"key": "key2"}, axis="columns")

In [46]:
# 2. When the two frames have no column name in common, name the join column
#    of each side explicitly with left_on / right_on.
df1.merge(df2, left_on="key1", right_on="key2")  # how defaults to "inner"

Unnamed: 0,data1,key1,data2,key2
0,6,a,12,a
1,6,a,12,a
2,6,b,16,b
3,1,b,16,b
4,4,b,16,b
5,4,c,14,c
6,8,c,14,c


In [47]:
# 3. `how` selects the join type: "inner" (the default, key intersection),
#    "outer", "left" or "right".
df1.merge(df2, how="outer", left_on="key1", right_on="key2")  # outer join

Unnamed: 0,data1,key1,data2,key2
0,6.0,a,12,a
1,6.0,a,12,a
2,6.0,b,16,b
3,1.0,b,16,b
4,4.0,b,16,b
5,4.0,c,14,c
6,8.0,c,14,c
7,,,14,d


In [48]:
# Left join: every row of df1 is kept.
df1.merge(df2, how="left", left_on="key1", right_on="key2")

Unnamed: 0,data1,key1,data2,key2
0,6,a,12,a
1,6,b,16,b
2,4,c,14,c
3,1,b,16,b
4,6,a,12,a
5,4,b,16,b
6,8,c,14,c


In [49]:
# Right join: every row of df2 is kept.
df1.merge(df2, how="right", left_on="key1", right_on="key2")

Unnamed: 0,data1,key1,data2,key2
0,6.0,a,12,a
1,6.0,a,12,a
2,6.0,b,16,b
3,1.0,b,16,b
4,4.0,b,16,b
5,4.0,c,14,c
6,8.0,c,14,c
7,,,14,d


In [51]:
# Give both frames a data column with the same name ("data") so the next
# merge has overlapping non-key column names.
df1 = df1.rename({"data1": "data"}, axis="columns")
df2 = df2.rename({"data2": "data"}, axis="columns")

In [53]:
# Overlapping non-key column names: `suffixes` is a length-2 ordered sequence
# (left suffix, right suffix), e.g. a tuple or list, appended to the clashing
# column names of the left and right frame respectively.
# NOTE: do not pass a set here -- it has no defined order (recent pandas rejects it).
df1.merge(df2, suffixes=("_left", "_right"), left_on="key1", right_on="key2")

Unnamed: 0,data_left,key1,data_right,key2
0,6,a,12,a
1,6,a,12,a
2,6,b,16,b
3,1,b,16,b
4,4,b,16,b
5,4,c,14,c
6,8,c,14,c


In [54]:
# Print a 100-character rule as a visual separator between sections.
print("=" * 100)



In [60]:
# Sample frames for joining a column against an index:
# df1 carries the keys in a column, df2 carries them as its row index.
df1 = pd.DataFrame(dict(key=list("abcdab"), data1=np.random.randn(6)))
df2 = pd.DataFrame(dict(data2=np.random.randint(10, 20, size=3)), index=list("abc"))

In [61]:
# Display the frame whose join key is a regular column.
df1

Unnamed: 0,data1,key
0,-0.579363,a
1,-0.123259,b
2,0.385364,c
3,-1.031612,d
4,1.956418,a
5,0.081332,b


In [62]:
# Display the frame whose join key is its row index.
df2

Unnamed: 0,data2
a,11
b,17
c,12


In [63]:
# Join a column against an index: right_index=True uses df2's row index as
# the right-hand join key, matched against df1's "key" column.
df1.merge(df2, how="left", left_on="key", right_index=True)

Unnamed: 0,data1,key,data2
0,-0.579363,a,11.0
1,-0.123259,b,17.0
2,0.385364,c,12.0
3,-1.031612,d,
4,1.956418,a,11.0
5,0.081332,b,17.0


In [64]:
# 1. Two random 3x4 integer arrays (values 10..19) for the concatenate demos.
arr1 = np.random.randint(low=10, high=20, size=(3, 4))
arr2 = np.random.randint(low=10, high=20, size=(3, 4))

In [65]:
# axis=0 is the default: arr2's rows are appended below arr1's
# (row count grows, column count stays the same).
np.concatenate((arr1, arr2), axis=0)

array([[12, 17, 17, 10],
       [10, 12, 14, 15],
       [17, 16, 19, 17],
       [14, 17, 18, 11],
       [16, 10, 14, 13],
       [14, 11, 13, 11]])

In [66]:
# axis=1: arr2's columns are appended to the right of arr1's
# (column count grows, row count stays the same).
np.concatenate((arr1, arr2), axis=1)

array([[12, 17, 17, 10, 14, 17, 18, 11],
       [10, 12, 14, 15, 16, 10, 14, 13],
       [17, 16, 19, 17, 14, 11, 13, 11]])

In [67]:
# Print a 100-character rule as a visual separator between sections.
print("=" * 100)



In [68]:
# 2. Sample Series for the concat demos.
# s1..s3 get disjoint, consecutive index labels (0-1, 2-4, 5-8);
# s4..s6 keep the default index, so their labels overlap.
s1 = pd.Series(np.random.randint(10, 20, size=2), index=range(2))
s2 = pd.Series(np.random.randint(10, 20, size=3), index=range(2, 5))
s3 = pd.Series(np.random.randint(10, 20, size=4), index=range(5, 9))
s4 = pd.Series(np.random.randint(10, 20, size=2))
s5 = pd.Series(np.random.randint(10, 20, size=3))
s6 = pd.Series(np.random.randint(10, 20, size=4))

In [69]:
# Concatenate Series whose index labels do not overlap (stacked end to end).
pd.concat([s1, s2, s3], axis=0)

0    16
1    13
2    17
3    12
4    15
5    10
6    18
7    13
8    13
dtype: int32

In [70]:
# Concatenate Series whose index labels overlap: the default axis=0 still
# stacks them end to end, so duplicate labels appear in the result.
pd.concat([s4, s5, s6], axis=0)

0    19
1    14
0    12
1    18
2    14
0    16
1    11
2    18
3    16
dtype: int32

In [71]:
# axis=1 puts each Series in its own column, aligned on index labels
# (outer join by default, so missing positions become NaN).
pd.concat([s4, s5, s6], axis=1, join="outer")

Unnamed: 0,0,1,2
0,19.0,12.0,16
1,14.0,18.0,11
2,,14.0,18
3,,,16


In [72]:
# The default join is "outer" (all labels kept, gaps filled with NaN);
# join="inner" keeps only the labels present in every Series.
pd.concat([s4, s5, s6], join="inner", axis=1)

Unnamed: 0,0,1,2
0,19,12,16
1,14,18,11


In [73]:
# Print a 100-character rule as a visual separator between sections.
print("=" * 100)



In [82]:
# 3. Sample frames for the concat demos: df2 has one extra row ("C") and one
#    extra column ("d") compared with df1.
df1 = pd.DataFrame(np.random.randint(10, 20, size=(2, 3)), index=list("AB"), columns=list("abc"))
df2 = pd.DataFrame(np.random.randint(10, 20, size=(3, 4)), index=list("ABC"), columns=list("abcd"))

In [83]:
# DataFrame defaults: stack along axis=0 with an outer join on the columns
# (column "d" is NaN for df1's rows). sort=True sorts the column axis when
# the inputs' columns are not already aligned.
pd.concat([df1, df2], axis=0, join="outer", sort=True)

Unnamed: 0,a,b,c,d
A,14,10,16,
B,12,14,12,
A,14,13,13,10.0
B,10,10,13,14.0
C,12,12,12,10.0


In [84]:
# axis=1 lays the frames side by side; join="inner" keeps only the row
# labels common to both frames.
pd.concat([df1, df2], join="inner", axis=1)

Unnamed: 0,a,b,c,a.1,b.1,c.1,d
A,14,10,16,14,13,13,10
B,12,14,12,10,10,13,14


In [85]:
# 1. A Series of 8 random integers drawn from 10..14, so repeats are likely.
s1 = pd.Series(np.random.randint(low=10, high=15, size=8))

In [86]:
# Boolean Series: True where the value already appeared earlier in s1
# (the first occurrence is not flagged).
s1.duplicated(keep="first")

0    False
1     True
2    False
3    False
4     True
5     True
6    False
7    False
dtype: bool

In [87]:
# Drop repeated values, keeping only the first occurrence of each.
s1.drop_duplicates(keep="first")

0    10
2    11
3    13
6    14
7    12
dtype: int32

In [88]:
# Print a 100-character rule as a visual separator between sections.
print("=" * 100)



In [94]:
# 2. A frame with a random integer column (10..14) and a label column that
#    contains repeated letters.
df = pd.DataFrame(dict(
    data1=np.random.randint(10, 15, size=8),
    data2=list("abcbbaac"),
))

In [95]:
# Return a copy of column data2 with every "b" replaced by "d"
# (the result is not assigned back, so df is left unchanged).
df["data2"].replace(to_replace="b", value="d")

0    a
1    d
2    c
3    d
4    d
5    a
6    a
7    c
Name: data2, dtype: object

In [96]:
# Flag rows whose data1 value already appeared in an earlier row.
# `subset` is optional: without it, rows are compared across all columns.
df.duplicated(subset="data1")

0    False
1    False
2    False
3    False
4    False
5     True
6     True
7     True
dtype: bool

In [97]:
# Keep the first row for each distinct data2 value. Dropped rows are removed
# whole, so their values in the other columns are gone as well.
df.drop_duplicates(subset="data2")

Unnamed: 0,data1,data2
0,11,a
1,10,b
2,12,c
