In [191]:
import pandas as pd
import numpy as np

In [192]:
data_one = {'A': ['A0', 'A1', 'A2', 'A3'],'B': ['B0', 'B1', 'B2', 'B3']}

In [193]:
data_two = {'C': ['C0', 'C1', 'C2', 'C3'], 'D': ['D0', 'D1', 'D2', 'D3']}

In [194]:
one = pd.DataFrame(data_one)

In [195]:
one

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [196]:
two= pd.DataFrame(data_two)

In [197]:
two

Unnamed: 0,C,D
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [198]:
# concatenation two table
# concatenate based on the index
# because you can't have two values sitting at the same position for index 0
# => create duplicate index
pd.concat(objs=[one,two], axis=0)

Unnamed: 0,A,B,C,D
0,A0,B0,,
1,A1,B1,,
2,A2,B2,,
3,A3,B3,,
0,,,C0,D0
1,,,C1,D1
2,,,C2,D2
3,,,C3,D3


In [199]:
# concatenation two table
pd.concat(objs=[one,two], axis=1)


Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [200]:
# match the column name to concatenate the two dataframe
# make the column A, B = C, D
two.columns = one.columns

In [201]:
two


Unnamed: 0,A,B
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [202]:
one

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [203]:
# concatenate the two dataframe after match up the column
list = pd.concat(objs=[one, two], axis=0)

In [204]:
# replace the index after concatenate
list.set_index = range(len(list))

In [205]:
list

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [206]:
# After the conference we have these two table
# The respective id columns indicate what order they registered or logged in on site
# Assume name is unique
# Registrations names' first letter go A,B,C,D
registrations = pd.DataFrame({'reg_id':[1,2,3,4],'name':['Andrew','Bobo','Claire','David']})
logins = pd.DataFrame({'log_id':[1,2,3,4],'name':['Xavier','Andrew','Yolanda','Bobo']})

# Inner merger

In [207]:
registrations

Unnamed: 0,reg_id,name
0,1,Andrew
1,2,Bobo
2,3,Claire
3,4,David


In [208]:
logins

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


* Notice that there are some people who have been registered but not actually login
* And people not registered but login into the reference

<i> Need to use the merger to figure out who both registered and logined </i>

In [209]:
# Inner merger with the name column is present in both tables
# right is the registration table
# left is the reg_id table
pd.merge(right=registrations, left=logins, on='name', how='inner')

Unnamed: 0,log_id,name,reg_id
0,2,Andrew,1
1,4,Bobo,2


In [210]:
# left is the registration table
# right is the reg_id table
pd.merge(left=registrations, right=logins, on='name', how='inner')


Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2
1,2,Bobo,4


# Left and right merger


In [211]:
# Left merger
# Left table is the logins
# Every factors of login will present in the table after merger
# The factors not have in the registrations => NaN
pd.merge(left=registrations, right=logins, how='left', on='name')

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2.0
1,2,Bobo,4.0
2,3,Claire,
3,4,David,


* Let's explore an <b>how="right"</b> condition with our two example tables
* The same as the left table but the rows not actually in exact same order because the pandas sorts the left hand tale first

In [212]:
pd.merge(left=registrations, right=logins, how='right', on='name')

Unnamed: 0,reg_id,name,log_id
0,,Xavier,1
1,1.0,Andrew,2
2,,Yolanda,3
3,2.0,Bobo,4


In [213]:
registrations

Unnamed: 0,reg_id,name
0,1,Andrew
1,2,Bobo
2,3,Claire
3,4,David


In [214]:
# The pandas is smart enough to figure out what is the same column in both table
# could be not specific the "on"
pd.merge(left=registrations, right=logins, how='right')


Unnamed: 0,reg_id,name,log_id
0,,Xavier,1
1,1.0,Andrew,2
2,,Yolanda,3
3,2.0,Bobo,4


# Outer merger
* Merger everything in both tables
* If one table do not contain the value => return Nan

In [215]:
pd.merge(left=registrations, right=logins, how='outer', on='name')

Unnamed: 0,reg_id,name,log_id
0,1.0,Andrew,2.0
1,2.0,Bobo,4.0
2,3.0,Claire,
3,4.0,David,
4,,Xavier,1.0
5,,Yolanda,3.0


In [216]:

pd.merge(right=registrations, left=logins, how='outer', on='name')

Unnamed: 0,log_id,name,reg_id
0,1.0,Xavier,
1,2.0,Andrew,1.0
2,3.0,Yolanda,
3,4.0,Bobo,2.0
4,,Claire,3.0
5,,David,4.0


#Join index instead of column


In [217]:
# switch the name from column into index
registrations = registrations.set_index('name')

In [218]:
registrations

Unnamed: 0_level_0,reg_id
name,Unnamed: 1_level_1
Andrew,1
Bobo,2
Claire,3
David,4


In [219]:
# merger the index with the column
# left_on= is the column of left table
# right_on= is the column of right table
# left_index= is the index of the left table
pd.merge(left=registrations, right=logins, left_index=True, right_on='name', how='right')

Unnamed: 0,reg_id,log_id,name
0,,1,Xavier
1,1.0,2,Andrew
2,,3,Yolanda
3,2.0,4,Bobo


In [220]:
# reset the index if change permanently wrong
registrations = registrations.reset_index()


In [221]:
registrations

Unnamed: 0,name,reg_id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


# Dealing with different names of column when using merger


In [222]:
# change the name of column
registrations.columns = ['reg_name', 'reg_id']

In [223]:
registrations

Unnamed: 0,reg_name,reg_id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


In [224]:
# merger two table without the same column name
# due to the different column name => create duplicate column
result = pd.merge(left=registrations, right=logins, left_on='reg_name', right_on='name')

In [225]:
result

Unnamed: 0,reg_name,reg_id,log_id,name
0,Andrew,1,2,Andrew
1,Bobo,2,4,Bobo


In [226]:
# drop the duplicate column
result.drop('reg_name', axis=1)

Unnamed: 0,reg_id,log_id,name
0,1,2,Andrew
1,2,4,Bobo


In [227]:
# rename the registration table into same with login table
registrations.columns = ['name', 'id']

In [231]:
logins.columns = ['id', 'name']

In [233]:
# both of tables contain column with the name 'id'
# however, the merger table contains 2 id with different name tag
# The pandas automatically the assign the duplicate id name tag
# using the suffix= - a tuple
pd.merge(left=registrations, right=logins, how='inner', on='name', suffixes=('_reg','_log'))




Unnamed: 0,name,id_reg,id_log
0,Andrew,1,2
1,Bobo,2,4
