In [2]:
import numpy as np
import pandas as pd

In [2]:
# Build two 4-row frames: `one` has columns A/B, `two` has columns C/D.
# Every cell value is its column letter followed by its row number (e.g. 'A0').
data_one = {col: [f'{col}{row}' for row in range(4)] for col in ('A', 'B')}
data_two = {col: [f'{col}{row}' for row in range(4)] for col in ('C', 'D')}
one, two = pd.DataFrame(data_one), pd.DataFrame(data_two)

In [3]:
# Display `one`: columns A and B on the default integer index 0..3.
one

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [4]:
# Display `two`: columns C and D on the default integer index 0..3.
two

Unnamed: 0,C,D
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [5]:
# Glue the frames side by side: rows are aligned on the shared index 0..3 and the
# columns of `two` are appended after those of `one` (the most sensible join here).
pd.concat([one, two], axis='columns')

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [6]:
# Same side-by-side join with the list order swapped: the columns of `two` now come first.
pd.concat([two, one], axis='columns')

Unnamed: 0,C,D,A,B
0,C0,D0,A0,B0
1,C1,D1,A1,B1
2,C2,D2,A2,B2
3,C3,D3,A3,B3


In [10]:
# Stack the frames on top of each other. They share no column labels, so every row
# is missing half of the columns -> lots of NaN, probably not a good concatenation.
pd.concat([one, two], axis='index')

Unnamed: 0,A,B,C,D
0,A0,B0,,
1,A1,B1,,
2,A2,B2,,
3,A3,B3,,
0,,,C0,D0
1,,,C1,D1
2,,,C2,D2
3,,,C3,D3


In [11]:
# The stacked frame keeps each input's own row labels, so label 0 exists twice
# and .loc[0] returns two rows (one from each input frame).
pd.concat([one, two], axis='index').loc[0]

Unnamed: 0,A,B,C,D
0,A0,B0,,
0,,,C0,D0


In [12]:
# example: we want to concatenate by row, where the data of C belongs to A and the
# data of D belongs to B, i.e. C and A as well as D and B are the same features.
# Relabel two's columns with one's labels (A, B); the values themselves are unchanged.
two.columns = one.columns

In [13]:
# `two` now carries the labels A and B while still holding the C*/D* values.
two

Unnamed: 0,A,B
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [15]:
# With matching column labels, stacking by row lines the values up under A and B.
# Each input keeps its own row labels, so the index of mydf repeats 0..3.
mydf = pd.concat([one, two], axis='index')

In [17]:
# The original row labels are kept, so 0..3 appears twice in the index.
mydf

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [31]:
# Replace the repeated 0..3 labels left over from the concatenation with a fresh
# 0..n-1 index. drop=True discards the old labels; a plain mydf.reset_index()
# would instead keep them as an extra column named 'index'.
mydf = mydf.reset_index(drop=True)
mydf

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
4,C0,D0
5,C1,D1
6,C2,D2
7,C3,D3


In [3]:
# example: data of a conference
# people register online beforehand and then log in on site on the day of the conference
# afterwards we have two tables: the registrations and the on-site logins
# each id column records the order in which people registered / logged in
# not everyone who registered logged in on site
# some people who logged in on site had never registered

# assumption: names are unique
# note that the names in the registration table start with A, B, C, D

# run help(pd.merge) to see the full manual

reg = pd.DataFrame(dict(
    reg_id=[1, 2, 3, 4],
    name=['Andrew', 'Bobo', 'Claire', 'David'],
))
log = pd.DataFrame(dict(
    log_id=[1, 2, 3, 4],
    name=['Xavier', 'Andrew', 'Yolanda', 'Bobo'],
))

In [4]:
# Registration table: one row per registered person (reg_id, name).
reg

Unnamed: 0,reg_id,name
0,1,Andrew
1,2,Bobo
2,3,Claire
3,4,David


In [5]:
# Login table: one row per person who logged in on site (log_id, name).
log

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


In [6]:
# Inner join: keep only the names that appear in both tables.
# Swapping the tables returns the same matches; only row/column order can differ.
pd.merge(left=reg, right=log, on='name', how='inner')

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2
1,2,Bobo,4


In [7]:
# Left join: keep every row of the left table (reg); people who registered but
# never logged in get NaN in log_id. Here the order of the tables does matter.
pd.merge(left=reg, right=log, on='name', how='left')

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2.0
1,2,Bobo,4.0
2,3,Claire,
3,4,David,


In [8]:
# Right join: keep every row of the right table (log); people who logged in
# without registering get NaN in reg_id. Here the order of the tables does matter.
pd.merge(left=reg, right=log, on='name', how='right')

Unnamed: 0,reg_id,name,log_id
0,,Xavier,1
1,1.0,Andrew,2
2,,Yolanda,3
3,2.0,Bobo,4


In [9]:
# Outer join: keep every name from either table, with NaN wherever one side has no match.
# Swapping the tables returns the same rows, so their order does not matter here.
# Quick and dirty way to join everything regardless of NaN and missing data.
pd.merge(left=reg, right=log, on='name', how='outer')

Unnamed: 0,reg_id,name,log_id
0,1.0,Andrew,2.0
1,2.0,Bobo,4.0
2,3.0,Claire,
3,4.0,David,
4,,Xavier,1.0
5,,Yolanda,3.0


In [10]:
# Move `name` out of the columns and use it as the row index of reg.
reg = reg.set_index('name')

In [13]:
# reg is now indexed by name; reg_id is its only remaining column.
reg

Unnamed: 0_level_0,reg_id
name,Unnamed: 1_level_1
Andrew,1
Bobo,2
Claire,3
David,4


In [14]:
# log is untouched: name is still an ordinary column there.
log

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


In [15]:
# A join key can also live in the index: match reg's index (the names)
# against the 'name' column of log.
pd.merge(left=reg, right=log, left_index=True, right_on='name', how='inner')

Unnamed: 0,reg_id,log_id,name
1,1,2,Andrew
3,2,4,Bobo


In [16]:
# Turn the name index back into a regular column and restore the default integer index.
reg = reg.reset_index()

In [17]:
# name is a column again, now placed before reg_id.
reg

Unnamed: 0,name,reg_id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


In [18]:
# Rename reg's columns by position (name -> reg_name) so that the join key
# is called something different in each table.
reg.columns = ['reg_name', 'reg_id']

In [19]:
# reg now has the columns reg_name and reg_id.
reg

Unnamed: 0,reg_name,reg_id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


In [20]:
# When the key column has a different name in each table, name both sides explicitly.
# Note that the result keeps both reg_name and name; one of them can be dropped afterwards.
pd.merge(left=reg, right=log, left_on='reg_name', right_on='name', how='inner')

Unnamed: 0,reg_name,reg_id,log_id,name
0,Andrew,1,2,Andrew
1,Bobo,2,4,Bobo


In [21]:
# Rename reg's columns by position again: reg_name -> name, reg_id -> id.
reg.columns = ['name', 'id']

In [22]:
# reg now has the columns name and id.
reg

Unnamed: 0,name,id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


In [23]:
# Rename log's columns by position: log_id -> id (name keeps its label).
log.columns = ['id', 'name']

In [24]:
# log now has the columns id and name.
log

Unnamed: 0,id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


In [25]:
# Here columns in different tables have identical names (id).
# The merge tells them apart with suffixes: _x marks the left table, _y the right table.
# Alternatively we can specify the suffixes explicitly.
pd.merge(left=reg, right=log, on='name', how='inner')

Unnamed: 0,name,id_x,id_y
0,Andrew,1,2
1,Bobo,2,4


In [26]:
# Same join, but with explicit suffixes so the two id columns show which table they came from.
pd.merge(left=reg, right=log, on='name', how='inner', suffixes=('_reg', '_log'))

Unnamed: 0,name,id_reg,id_log
0,Andrew,1,2
1,Bobo,2,4
