< [Online Version Python Data Science Handbook](https://github.com/jakevdp/PythonDataScienceHandbook)| [Menu](https://)>
# Merge, join, concatenate and compare 

* http://pandas.pydata.org/
* [Merge, join, concatenate and compare](https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html)
    * [Concat > Concatenating objects](https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#concatenating-objects)
    * [Concatenating using append](https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#concatenating-using-append)
    * [Merging > Database-style DataFrame or named Series joining/merging](https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#database-style-dataframe-or-named-series-joining-merging) 

## CONCAT

In [472]:
import pandas as pd

class display(object):
    """Display HTML representation of multiple objects.

    Each positional argument is a *string* containing a Python expression
    (usually just a variable name such as ``'df1'``). The expression text is
    shown as a label and the evaluated object is rendered next to the others,
    so several frames can be compared side by side in one output cell.

    Notes
    -----
    * The expressions are run with ``eval``; pass only trusted, hard-coded
      strings, never text that comes from outside the notebook.
    * Defining this class under the name ``display`` replaces IPython's own
      ``display`` helper in the notebook namespace.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Expression strings, kept in the order they should be shown.
        self.args = args

    def _cell(self, label, obj):
        """Format one (expression text, evaluated object) pair as HTML."""
        import html  # function-scope import keeps this cell self-contained

        render = getattr(obj, '_repr_html_', None)
        body = render() if callable(render) else None
        if body is None:
            # No rich repr available (plain Python objects, or HTML output
            # turned off): fall back to the escaped text repr.
            body = '<pre>{}</pre>'.format(html.escape(repr(obj)))
        # Escape the label so expressions containing <, > or & stay visible
        # instead of being parsed as markup.
        return self.template.format(html.escape(label), body)

    def _repr_html_(self):
        # eval() stays inside the generator so names are resolved exactly
        # where they were before (the module / notebook globals).
        return '\n'.join(self._cell(a, eval(a)) for a in self.args)

    def __repr__(self):
        # Plain-text fallback: expression, newline, repr of its value.
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

In [234]:
# Two toy frames with the same column names (A, B, C) and the same row
# labels (0, 1, 2); only the cell values differ.
df1 = pd.DataFrame(
    [
        ["A0", "B0", "C0"],
        ["A1", "B1", "C1"],
        ["A2", "B2", "C2"],
    ],
    columns=["A", "B", "C"],
    index=[0, 1, 2],
)
df2 = pd.DataFrame(
    [
        ["A3", "B3", "C3"],
        ["A4", "B4", "C4"],
        ["A5", "B5", "C5"],
    ],
    columns=["A", "B", "C"],
    index=[0, 1, 2],
)

In [68]:
# Stack df1 and df2 row-wise (the default axis). Row labels are kept as they
# are, so the index of the result repeats 0, 1, 2.
pd.concat([df1, df2])

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
0,A3,B3,C3
1,A4,B4,C4
2,A5,B5,C5


In [274]:
# Ignoring the index: the original row labels are discarded and the result
# is renumbered 0..5. sort=False asks pandas not to sort the column axis.
pd.concat([df1, df2], ignore_index=True, sort=False)

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
3,A3,B3,C3
4,A4,B4,C4
5,A5,B5,C5


In [275]:
# Place the frames side by side (axis=1). With ignore_index=True the column
# labels are replaced by the positions 0..5.
pd.concat([df1, df2], axis=1, ignore_index=True)

Unnamed: 0,0,1,2,3,4,5
0,A0,B0,C0,A3,B3,C3
1,A1,B1,C1,A4,B4,C4
2,A2,B2,C2,A5,B5,C5


In [323]:
# Adding MultiIndex row keys: tag the rows of each source frame with an
# outer index level. Two equivalent spellings are shown -- a dict of frames,
# or a list of frames plus `keys=`. The second assignment is the one kept.
df = pd.concat({"df1": df1, "df2": df2})
df = pd.concat([df1, df2], keys=["df1", "df2"])
df

Unnamed: 0,Unnamed: 1,A,B,C
df1,0,A0,B0,C0
df1,1,A1,B1,C1
df1,2,A2,B2,C2
df2,0,A3,B3,C3
df2,1,A4,B4,C4
df2,2,A5,B5,C5


In [300]:
# Select the block of rows stored under the outer row key "df2".
df.loc["df2"]

Unnamed: 0,A,B,C
0,A3,B3,C3
1,A4,B4,C4
2,A5,B5,C5


In [324]:
# Labels of each row-index level: the concat keys first, then the original
# row labels.
df.index.levels

FrozenList([['df1', 'df2'], [0, 1, 2]])

In [321]:
# Adding MultiIndex column keys: same idea as for rows, but along axis=1, so
# the outer level sits on top of the column labels. Dict form and `keys=`
# form are equivalent; the second assignment is the one kept.
df = pd.concat({"df1": df1, "df2": df2}, axis=1)
df = pd.concat([df1, df2], axis=1, keys=["df1", "df2"])
df

Unnamed: 0_level_0,df1,df1,df1,df2,df2,df2
Unnamed: 0_level_1,A,B,C,A,B,C
0,A0,B0,C0,A3,B3,C3
1,A1,B1,C1,A4,B4,C4
2,A2,B2,C2,A5,B5,C5


In [305]:
# Select the block of columns stored under the outer column key "df2".
df.loc[:, "df2"]

Unnamed: 0,A,B,C
0,A3,B3,C3
1,A4,B4,C4
2,A5,B5,C5


In [322]:
# Labels of each column-index level: the concat keys first, then the original
# column names.
df.columns.levels

FrozenList([['df1', 'df2'], ['A', 'B', 'C']])

### JOIN = "OUTER" > opción por defecto: no se pierde información

In [276]:
# Variant of df2 whose row labels (3, 4, 5) and column names (B, C, D) only
# partly overlap with df1 -- used below to compare outer and inner joins.
df_test = (
    df2
    .set_index(pd.Index([3, 4, 5]))
    .rename(columns={"A": "B", "B": "C", "C": "D"})
)
df_test

In [243]:
# Three equivalent spellings: join="outer" and axis=0 are the defaults.
# Only the last expression is displayed. The result keeps the union of the
# column names; cells a frame does not provide come out as missing values.
pd.concat([df1, df_test], join="outer")
pd.concat([df1, df_test], axis=0)
pd.concat([df1, df_test])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,
1,A1,B1,C1,
2,A2,B2,C2,
3,,A3,B3,C3
4,,A4,B4,C4
5,,A5,B5,C5


In [239]:
# Side-by-side concat (axis=1); join="outer" is the default, so both lines
# are equivalent. Rows are aligned on the union of the row labels (0..5) and
# the frames share none, hence the missing values.
pd.concat([df1, df_test], axis=1)
pd.concat([df1, df_test], axis=1, join="outer")

Unnamed: 0,A,B,C,B.1,C.1,D
0,A0,B0,C0,,,
1,A1,B1,C1,,,
2,A2,B2,C2,,,
3,,,,A3,B3,C3
4,,,,A4,B4,C4
5,,,,A5,B5,C5


In [252]:
# Reuse the exact row index of the original DataFrame (df1): either reindex
# the second frame before concatenating, or reindex the concatenated result.
# Both keep only rows 0, 1, 2.
pd.concat([df1, df_test.reindex(df1.index)], axis=1)
pd.concat([df1, df_test], axis=1).reindex(df1.index)

Unnamed: 0,A,B,C,B.1,C.1,D
0,A0,B0,C0,,,
1,A1,B1,C1,,,
2,A2,B2,C2,,,


In [249]:
# Reuse the exact columns of the original DataFrame (df1): either reindex the
# second frame's columns before concatenating, or reindex the concatenated
# result. Both keep only columns A, B, C.
pd.concat([df1, df_test.reindex(columns=df1.columns)], axis=0)
pd.concat([df1, df_test], axis=0).reindex(columns=df1.columns)

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
3,,A3,B3
4,,A4,B4
5,,A5,B5


### JOIN = 'INNER' > toma la intersección

In [260]:
# Inner join while stacking rows (axis=0 is the default, so both lines are
# equivalent): only the columns present in both frames (B and C) are kept.
pd.concat([df1, df_test], join="inner")
pd.concat([df1, df_test], join="inner", axis=0)

Unnamed: 0,B,C
0,B0,C0
1,B1,C1
2,B2,C2
3,A3,B3
4,A4,B4
5,A5,B5


In [261]:
# Inner join side by side (axis=1): rows are limited to labels present in
# both frames. df1 has rows 0-2 and df_test has rows 3-5, so the result has
# all six columns but no rows.
pd.concat([df1, df_test], join="inner", axis=1)

Unnamed: 0,A,B,C,B.1,C.1,D


### Concatenating using append (removed in pandas 2.0 — use `pd.concat` instead)

In [55]:
# DataFrame.append (row-wise concatenation, axis=0) was deprecated in
# pandas 1.4 and removed in pandas 2.0; pd.concat is the direct replacement.
# Several frames at once: pd.concat([df1, df2, df3])
pd.concat([df1, df2])

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
0,A3,B3,C3
1,A4,B4,C4
2,A5,B5,C5


In [266]:
# Replacement for the removed df1.append(df_test, sort=True): stack the rows
# and sort the resulting column labels (A, B, C, D).
pd.concat([df1, df_test], sort=True)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,
1,A1,B1,C1,
2,A2,B2,C2,
3,,A3,B3,C3
4,,A4,B4,C4
5,,A5,B5,C5


In [267]:
# Replacement for the removed df1.append([df2, df_test]): pd.concat takes all
# frames in a single list and stacks them in order.
pd.concat([df1, df2, df_test])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,
1,A1,B1,C1,
2,A2,B2,C2,
0,A3,B3,C3,
1,A4,B4,C4,
2,A5,B5,C5,
3,,A3,B3,C3
4,,A4,B4,C4
5,,A5,B5,C5


## MERGING

In [504]:
# dataset: both frames get a shared "key" column (1, 2, 3); the right-hand
# frame's value columns are renamed to D, E, F so that "key" is the only
# column name the two frames have in common.
df_left = df1.assign(key=[1, 2, 3])
df_right = (
    df2
    .assign(key=[1, 2, 3])
    .rename(columns={"A": "D", "B": "E", "C": "F"})
)
display('df_left', 'df_right')

Unnamed: 0,A,B,C,key
0,A0,B0,C0,1
1,A1,B1,C1,2
2,A2,B2,C2,3

Unnamed: 0,D,E,F,key
0,A3,B3,C3,1
1,A4,B4,C4,2
2,A5,B5,C5,3


In [477]:
# Join by column: three equivalent spellings (how="inner" is the default, and
# on="key" is shorthand for left_on="key", right_on="key").
# Only the last expression is displayed.
pd.merge(df_left, df_right, on="key") # several key columns: on=["key1", "key2"]
pd.merge(df_left, df_right, how="inner", on="key")
pd.merge(df_left, df_right, how="inner", left_on="key", right_on="key")

Unnamed: 0,A,B,C,key,D,E,F
0,A0,B0,C0,1,A3,B3,C3
1,A1,B1,C1,2,A4,B4,C4
2,A2,B2,C2,3,A5,B5,C5


In [515]:
# The merged result can carry columns we do not need; drop them if desired.
# Two equivalent spellings: the `columns=` keyword, or labels plus axis=1
# (when `columns=` is given, `axis` is ignored, so they are not combined).
pd.merge(df_left, df_right, on="key").drop(columns=["D", "E", "F"])
pd.merge(df_left, df_right, on="key").drop(["D", "E", "F"], axis=1)

Unnamed: 0,A,B,C,key
0,A0,B0,C0,1
1,A1,B1,C1,2
2,A2,B2,C2,3


### Merge by index

In [463]:
# dataset: move "key" from a column into the row index of both frames.
# The right-hand frame is given keys 2, 3, 4 here so that the two indexes
# only partly overlap; setting them explicitly makes the cell give the same
# result on a fresh top-to-bottom run (df_right itself carries keys 1, 2, 3).
df_left_test = df_left.set_index('key')
df_right_test = df_right.assign(key=[2, 3, 4]).set_index('key')
display('df_left_test', 'df_right_test')

Unnamed: 0_level_0,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,A0,B0,C0
2,A1,B1,C1
3,A2,B2,C2

Unnamed: 0_level_0,D,E,F
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,A3,B3,C3
3,A4,B4,C4
4,A5,B5,C5


In [468]:
# Join on the row index of both frames instead of on a column; how="inner"
# keeps only the index labels present in both.
pd.merge(df_left_test, df_right_test, how="inner", left_index=True, right_index=True)

Unnamed: 0_level_0,A,B,C,D,E,F
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,A1,B1,C1,A3,B3,C3
3,A2,B2,C2,A4,B4,C4


### outer, left and right join

In [506]:
# dataset: same right-hand frame, but with keys 2, 3, 4, so only keys 2 and 3
# also exist in df_left (whose keys are 1, 2, 3).
df_right_test = df_right.assign(key=[2, 3, 4])
display('df_left', 'df_right_test')

Unnamed: 0,A,B,C,key
0,A0,B0,C0,1
1,A1,B1,C1,2
2,A2,B2,C2,3

Unnamed: 0,D,E,F,key
0,A3,B3,C3,2
1,A4,B4,C4,3
2,A5,B5,C5,4


In [509]:
# Inner join: use the intersection of keys from both frames (2 and 3).
# Without `on=`, merge joins on the columns the frames have in common --
# here only "key" -- so both lines are equivalent.
pd.merge(df_left, df_right_test, how="inner")
pd.merge(df_left, df_right_test, how="inner", on="key")

Unnamed: 0,A,B,C,key,D,E,F
0,A1,B1,C1,2,A3,B3,C3
1,A2,B2,C2,3,A4,B4,C4


In [455]:
# Outer join: use the union of keys from both frames (1, 2, 3, 4); cells
# with no match on the other side come out as missing values.
# Both lines are equivalent ("key" is the only shared column).
pd.merge(df_left, df_right_test, how="outer")
pd.merge(df_left, df_right_test, how="outer", on="key")

Unnamed: 0,A,B,C,key,D,E,F
0,A0,B0,C0,1,,,
1,A1,B1,C1,2,A3,B3,C3
2,A2,B2,C2,3,A4,B4,C4
3,,,,4,A5,B5,C5


In [452]:
# Left join: use keys from the left frame only (1, 2, 3); key 1 has no match
# on the right, so D, E, F are missing for it.
# Both lines are equivalent ("key" is the only shared column).
pd.merge(df_left, df_right_test, how="left")
pd.merge(df_left, df_right_test, how="left", on="key")

Unnamed: 0,A,B,C,key,D,E,F
0,A0,B0,C0,1,,,
1,A1,B1,C1,2,A3,B3,C3
2,A2,B2,C2,3,A4,B4,C4


In [453]:
# Right join: use keys from the right frame only (2, 3, 4); key 4 has no
# match on the left, so A, B, C are missing for it.
# Both lines are equivalent ("key" is the only shared column).
pd.merge(df_left, df_right_test, how="right")
pd.merge(df_left, df_right_test, how="right", on="key")

Unnamed: 0,A,B,C,key,D,E,F
0,A1,B1,C1,2,A3,B3,C3
1,A2,B2,C2,3,A4,B4,C4
2,,,,4,A5,B5,C5


### Overlapping Column Names

In [500]:
# dataset: df1 and df2 share all their column names. Merging them on the row
# index keeps both sets of columns, and the clashing names are told apart
# with the suffixes _x (left frame) and _y (right frame).
df_test = pd.merge(df1, df2, left_index=True, right_index=True)
display('df1', 'df2', 'df_test')

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,A,B,C
0,A3,B3,C3
1,A4,B4,C4
2,A5,B5,C5

Unnamed: 0,A_x,B_x,C_x,A_y,B_y,C_y
0,A0,B0,C0,A3,B3,C3
1,A1,B1,C1,A4,B4,C4
2,A2,B2,C2,A5,B5,C5


In [501]:
# Three ways to control the names of clashing columns:
#   1. suffixes=...  -> merge appends the given suffixes itself
#   2. add_prefix    -> rename the columns of each frame before merging
#   3. add_suffix    -> same, with a suffix; gives the same names as (1) here
df_test_suffixes = pd.merge(df1, df2, left_index=True, right_index=True, suffixes=["_L", "_R"])
df_test_add_prefix = pd.merge(df1.add_prefix("L_"), df2.add_prefix("R_"), left_index=True, right_index=True)
df_test_add_suffix = pd.merge(df1.add_suffix("_L"), df2.add_suffix("_R"), left_index=True, right_index=True)
display('df_test_suffixes', 'df_test_add_prefix', 'df_test_add_suffix')

Unnamed: 0,A_L,B_L,C_L,A_R,B_R,C_R
0,A0,B0,C0,A3,B3,C3
1,A1,B1,C1,A4,B4,C4
2,A2,B2,C2,A5,B5,C5

Unnamed: 0,L_A,L_B,L_C,R_A,R_B,R_C
0,A0,B0,C0,A3,B3,C3
1,A1,B1,C1,A4,B4,C4
2,A2,B2,C2,A5,B5,C5

Unnamed: 0,A_L,B_L,C_L,A_R,B_R,C_R
0,A0,B0,C0,A3,B3,C3
1,A1,B1,C1,A4,B4,C4
2,A2,B2,C2,A5,B5,C5


In [525]:
# dataset: left join of df_left with a right-hand frame keyed 2, 3, 4. Key 1
# has no partner on the right, so D, E, F are missing in that row.
df_right_test = df_right.assign(key=[2, 3, 4])
df_test = pd.merge(
    df_left,
    df_right_test,
    how="left",
    on="key",
)
display('df_left', 'df_right_test', 'pd.merge(df_left, df_right_test, how="left", on="key")')

Unnamed: 0,A,B,C,key
0,A0,B0,C0,1
1,A1,B1,C1,2
2,A2,B2,C2,3

Unnamed: 0,D,E,F,key
0,A3,B3,C3,2
1,A4,B4,C4,3
2,A5,B5,C5,4

Unnamed: 0,A,B,C,key,D,E,F
0,A0,B0,C0,1,,,
1,A1,B1,C1,2,A3,B3,C3
2,A2,B2,C2,3,A4,B4,C4


In [528]:
# isnull() marks every missing cell; any() then reduces per column, giving
# one boolean per column (True if the column has at least one missing value).
# NOTE(review): the output recorded for this cell is a full True/False table,
# which looks like isnull() without any() -- re-run the cell to confirm.
df_test.isnull().any()

Unnamed: 0,A,B,C,key,D,E,F
0,False,False,False,False,True,True,True
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
