In [49]:
# Reference: 
# online free docs:          https://pandas.pydata.org/pandas-docs/stable/
#                                      https://pandas.pydata.org/pandas-docs/stable/merging.html
#                                      https://pandas.pydata.org/pandas-docs/stable/comparison_with_sql.html
# book old edition free:  https://www.safaribooksonline.com/library/view/python-data-science/9781491912126/
# book new edition pay: https://smile.amazon.com/Python-Data-Science-Handbook-Essential/dp/1491912057/

In [1]:
import numpy as np
import pandas as pd

In [2]:
##### Concatenation #####
# essentially glues dataframes together

In [3]:
# Row-wise concatenation setup: both frames share columns C1..C3 and carry
# disjoint row labels (R1..R3 vs. R4..R6).
df1 = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index="R1 R2 R3".split(),
    columns="C1 C2 C3".split(),
)
df2 = pd.DataFrame(
    data=np.arange(10, 19).reshape(3, 3),
    index="R4 R5 R6".split(),
    columns="C1 C2 C3".split(),
)
# A single newline-separated print emits the same text as printing each frame.
print(df1, df2, sep="\n")

    C1  C2  C3
R1   0   1   2
R2   3   4   5
R3   6   7   8
    C1  C2  C3
R4  10  11  12
R5  13  14  15
R6  16  17  18


In [4]:
# Stack df2 underneath df1 (axis 0 / "index" is the row axis).
pd.concat(objs=[df1, df2], axis="index")

Unnamed: 0,C1,C2,C3
R1,0,1,2
R2,3,4,5
R3,6,7,8
R4,10,11,12
R5,13,14,15
R6,16,17,18


In [5]:
# concatenate columns
df1 = pd.DataFrame(np.arange(0,9).reshape(3,3),
                   index="R1 R2 R3".split(),
                   columns="C1 C2 C3".split())
df2 = pd.DataFrame(np.arange(10,19).reshape(3,3),
                   index="R1 R2 R3".split(),
                   columns="C4 C5 C6".split())
print (df1)
print (df2)

    C1  C2  C3
R1   0   1   2
R2   3   4   5
R3   6   7   8
    C4  C5  C6
R1  10  11  12
R2  13  14  15
R3  16  17  18


In [6]:
# Place df2 to the right of df1 (axis 1 / "columns" is the column axis).
pd.concat(objs=[df1, df2], axis="columns")

Unnamed: 0,C1,C2,C3,C4,C5,C6
R1,0,1,2,10,11,12
R2,3,4,5,13,14,15
R3,6,7,8,16,17,18


In [9]:
# Concatenating frames whose labels only partly overlap.
# Note: concat preserves the original labels by default, so the result can
# contain duplicate row or column labels.
df1 = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index="R1 R2 R3".split(),
    columns="C1 C2 C3".split(),
)
df2 = pd.DataFrame(
    data=np.arange(10, 19).reshape(3, 3),
    index="R2 R3 R4".split(),
    columns="C2 C3 C4".split(),
)
# A single newline-separated print emits the same text as printing each frame.
print(df1, df2, sep="\n")

    C1  C2  C3
R1   0   1   2
R2   3   4   5
R3   6   7   8
    C2  C3  C4
R2  10  11  12
R3  13  14  15
R4  16  17  18


In [8]:
# Row-wise concat of the misaligned frames.
# sort=False is passed explicitly to avoid a pandas warning.
pd.concat(objs=[df1, df2], axis="index", sort=False)

Unnamed: 0,C1,C2,C3,C4
R1,0.0,1,2,
R2,3.0,4,5,
R3,6.0,7,8,
R2,,10,11,12.0
R3,,13,14,15.0
R4,,16,17,18.0


In [7]:
# Column-wise concat of the misaligned frames.
# sort=False is passed explicitly to avoid a pandas warning.
pd.concat(objs=[df1, df2], axis="columns", sort=False)

Unnamed: 0,C1,C2,C3,C2.1,C3.1,C4
R1,0.0,1.0,2.0,,,
R2,3.0,4.0,5.0,10.0,11.0,12.0
R3,6.0,7.0,8.0,13.0,14.0,15.0
R4,,,,16.0,17.0,18.0


In [10]:
# Two ways to avoid duplicate row/column labels in the result of concat:
#
#   - either pass verify_integrity=True, which raises an error when the
#     concatenated axis would contain duplicates, e.g.
#       pd.concat([df1, df2], axis=0, sort=False, verify_integrity=True)
#
#   - or pass ignore_index=True, which discards the original labels along the
#     concatenation axis and creates new default ones (demonstrated here).
# sort=False is passed explicitly to avoid a pandas warning.
pd.concat(objs=[df1, df2], axis="columns", sort=False, ignore_index=True)

Unnamed: 0,0,1,2,3,4,5
R1,0.0,1.0,2.0,,,
R2,3.0,4.0,5.0,10.0,11.0,12.0
R3,6.0,7.0,8.0,13.0,14.0,15.0
R4,,,,16.0,17.0,18.0


In [25]:
# When gluing several DataFrames together you choose how the *other* axis
# (the one not being concatenated along) is handled.
# concat uses join='outer' by default; join='inner' overrides it.
df1 = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index="R1 R2 R3".split(),
    columns="C1 C2 C3".split(),
)
df2 = pd.DataFrame(
    data=np.arange(10, 19).reshape(3, 3),
    index="R2 R3 R4".split(),
    columns="C2 C3 C4".split(),
)
# A single newline-separated print emits the same text as printing each frame.
print(df1, df2, sep="\n")

    C1  C2  C3
R1   0   1   2
R2   3   4   5
R3   6   7   8
    C2  C3  C4
R2  10  11  12
R3  13  14  15
R4  16  17  18


In [26]:
# Default (outer) handling of the row axis: every row label from either frame
# is kept, and cells with no source value are NaN.
pd.concat(objs=[df1, df2], axis="columns", sort=False)

Unnamed: 0,C1,C2,C3,C2.1,C3.1,C4
R1,0.0,1.0,2.0,,,
R2,3.0,4.0,5.0,10.0,11.0,12.0
R3,6.0,7.0,8.0,13.0,14.0,15.0
R4,,,,16.0,17.0,18.0


In [27]:
# join='inner' keeps only the row labels present in both frames.
pd.concat(objs=[df1, df2], axis="columns", join='inner', sort=False)

Unnamed: 0,C1,C2,C3,C2.1,C3.1,C4
R2,3,4,5,10,11,12
R3,6,7,8,13,14,15


In [14]:
##### Joining #####
# Joins are performed with merge().
# merge() takes parameters that specify the type of join to perform (left, right, inner, outer)
# and the columns to join on (column names or indexes).

In [15]:
# Two small frames sharing a 'key' column, used by the join examples.
# Keys B and D occur in both frames; D occurs twice in df2.
# Seed NumPy's global RNG first so the random 'value' columns, and every merge
# result built from them, are identical on each Restart & Run All.
np.random.seed(42)
df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], 'value': np.random.randn(4)})
df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], 'value': np.random.randn(4)})
print(df1)
print(df2)

  key     value
0   A -0.030497
1   B  0.774900
2   C  1.211334
3   D -0.683547
  key     value
0   B -0.952134
1   D  1.100892
2   D  1.046609
3   E  0.324856


In [16]:
# Inner join: keep only the keys present in both frames.
df1.merge(right=df2, how='inner', on='key')

Unnamed: 0,key,value_x,value_y
0,B,0.7749,-0.952134
1,D,-0.683547,1.100892
2,D,-0.683547,1.046609


In [17]:
# Outer (full) join: keep every key from either frame.
df1.merge(right=df2, how='outer', on='key')

Unnamed: 0,key,value_x,value_y
0,A,-0.030497,
1,B,0.7749,-0.952134
2,C,1.211334,
3,D,-0.683547,1.100892
4,D,-0.683547,1.046609
5,E,,0.324856


In [18]:
# Left join: keep every key of df1, matched against df2 where possible.
df1.merge(right=df2, how='left', on='key')

Unnamed: 0,key,value_x,value_y
0,A,-0.030497,
1,B,0.7749,-0.952134
2,C,1.211334,
3,D,-0.683547,1.100892
4,D,-0.683547,1.046609


In [19]:
# Right join: keep every key of df2, matched against df1 where possible.
df1.merge(right=df2, how='right', on='key')

Unnamed: 0,key,value_x,value_y
0,B,0.7749,-0.952134
1,D,-0.683547,1.100892
2,D,-0.683547,1.046609
3,E,,0.324856


In [20]:
# merge() can also join a column of one DataFrame against the index of another:
# here df1's 'key' column is matched with the index of idf2.
idf2 = df2.set_index('key')
df1.merge(right=idf2, left_on='key', right_index=True)

Unnamed: 0,key,value_x,value_y
1,B,0.7749,-0.952134
3,D,-0.683547,1.100892
3,D,-0.683547,1.046609


In [21]:
# Merging on the indexes of both frames: move 'key' into the index on each
# side, then match index against index.
idf1 = df1.set_index('key')
idf2 = df2.set_index('key')
idf1.merge(right=idf2, left_index=True, right_index=True)

Unnamed: 0_level_0,value_x,value_y
key,Unnamed: 1_level_1,Unnamed: 2_level_1
B,0.7749,-0.952134
D,-0.683547,1.100892
D,-0.683547,1.046609
