In [1]:

import pandas as pd

# Concatenate

![Concatenate](./pic/08_concat_row.svg)

In [2]:
# Each state's file lives at: base_url + <state code> + end_url
base_url = "http://www.stata-press.com/data/r14/"
state_codes = ["ca", "il"]
end_url = "pop.dta"

# Read one Stata file per state code into a list of dataframes
# (same order as state_codes).
list_of_state_dfs = [
    pd.read_stata(f"{base_url}{code}{end_url}") for code in state_codes
]

# Preview the first dataframe in the list
print(list_of_state_dfs[0])

# Stack the per-state dataframes on top of each other; `keys=` labels the
# rows of each input with its state code as an outer index level.
# Rule of thumb: for frames that share the same columns, use axis=0 to
# combine along the index; for frames that share the same index, use axis=1
# to combine the columns.
df = pd.concat(list_of_state_dfs, axis=0, keys=state_codes)
df

        county      pop
0  Los Angeles  9878554
1       Orange  2997033
2      Ventura   798364


Unnamed: 0,Unnamed: 1,county,pop
ca,0,Los Angeles,9878554
ca,1,Orange,2997033
ca,2,Ventura,798364
il,0,Cook,5285107
il,1,DeKalb,103729
il,2,Will,673586


In [3]:
# Two toy frames with identical columns ('letter', 'number'), specified
# column by column; they are the inputs for the row-wise concat example.
df1 = pd.DataFrame({'letter': ['a', 'b'], 'number': [1, 2]})

df2 = pd.DataFrame({'letter': ['c', 'd'], 'number': [3, 4]})

In [5]:
# Stack df1 on top of df2; ignore_index=True discards the inputs' own row
# labels and numbers the combined rows 0..n-1.
pd.concat(objs=[df1, df2], ignore_index=True)

Unnamed: 0,letter,number
0,a,1
1,b,2
2,c,3
3,d,4


# Merge

将哪些合并（`on=`），以什么样的方式合并（`how=`）。

![merge](./pic/08_merge_left.svg)


- how='left' uses keys from the left dataframe only to merge.
- how='right' uses keys from the right dataframe only to merge.
- how='inner' uses keys that appear in both dataframes to merge.
- how='outer' uses the union of keys from both dataframes to merge on.

In [7]:
# Two example frames sharing the key columns key1/key2: `left` carries the
# value columns A and B, `right` carries C and D.
left = pd.DataFrame({
    "key1": ["K0", "K0", "K1", "K2"],
    "key2": ["K0", "K1", "K0", "K1"],
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"],
})
right = pd.DataFrame({
    "key1": ["K0", "K1", "K1", "K2"],
    "key2": ["K0", "K0", "K0", "K0"],
    "C": ["C0", "C1", "C2", "C3"],
    "D": ["D0", "D1", "D2", "D3"],
})

# Show both inputs, one after the other
print(left, right, sep="\n")

# Right merge: the (key1, key2) pairs found in `right` decide which rows
# end up in the result; the two dataframes are combined on those pairs.
left.merge(right, how="right", on=["key1", "key2"])

  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3


Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2
3,K2,K0,,,C3,D3


In [10]:
# Left merge: keep every (key1, key2) pair of `left`; C and D stay empty
# for pairs that have no match in `right`.
left.merge(right, how="left", on=["key1", "key2"])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2
4,K2,K1,A3,B3,,


In [8]:
# Inner merge: only (key1, key2) pairs present in both frames survive.
# The pair (K2, K0) from `right` does not exist in `left`, so it is not
# merged into the result.
left.merge(right, how="inner", on=["key1", "key2"])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2


In [9]:
# Outer merge: keep the (key1, key2) pairs from either frame.
# indicator=True adds a `_merge` column marking each row as
# both / left_only / right_only.
left.merge(right, how="outer", on=["key1", "key2"], indicator=True)

Unnamed: 0,key1,key2,A,B,C,D,_merge
0,K0,K0,A0,B0,C0,D0,both
1,K0,K1,A1,B1,,,left_only
2,K1,K0,A2,B2,C1,D1,both
3,K1,K0,A2,B2,C2,D2,both
4,K2,K1,A3,B3,,,left_only
5,K2,K0,,,C3,D3,right_only
