In [11]:
import numpy as np
import pandas as pd

# One-to-one join

In [12]:
df1 = pd.DataFrame({'employees':['Bob', 
                                 'Michael', 
                                 'George',
                                'James'],
                   'group':['Engineering',
                            'Accounting', 
                            'HR',
                           'Engineering']})

In [13]:
df1

Unnamed: 0,employees,group
0,Bob,Engineering
1,Michael,Accounting
2,George,HR
3,James,Engineering


In [14]:
df2 = pd.DataFrame({'employees':['Bob', 
                                 'George', 
                                 'Michael',
                                'James'],
                   'Hire_Date':[2004,
                            2005, 
                            2006,
                           2008]})

In [15]:
df2

Unnamed: 0,employees,Hire_Date
0,Bob,2004
1,George,2005
2,Michael,2006
3,James,2008


In [18]:
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,employees,group,Hire_Date
0,Bob,Engineering,2004
1,Michael,Accounting,2006
2,George,HR,2005
3,James,Engineering,2008


In [19]:
# Notice that pandas automatically recognies that the two dataframes
# have an 'employees' column and merges them accordingly even though
# the order of the employees is different in the two DFs

# Many-to-one joins

In [22]:
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
'supervisor': ['Carly', 'Guido', 'Steve']})

In [23]:
df4

Unnamed: 0,group,supervisor
0,Accounting,Carly
1,Engineering,Guido
2,HR,Steve


In [24]:
pd.merge(df3, df4)

Unnamed: 0,employees,group,Hire_Date,supervisor
0,Bob,Engineering,2004,Guido
1,James,Engineering,2008,Guido
2,Michael,Accounting,2006,Carly
3,George,HR,2005,Steve


In [20]:
# Here the join is many-to-one because we have repeating values (Engineering)
# in the group column and as you can see, the result is some duplicated values
# as it should be

# Many-to-many joins

In [26]:
# If the key column in both the left and right array contains duplicates, then
#the result is a many-to-many merge.

In [24]:
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',
'Engineering', 'Engineering', 'HR', 'HR'], 'skills': ['math', 'spreadsheets', 'coding', 'linux',
'spreadsheets', 'organization']})

In [25]:
df1

Unnamed: 0,employees,group
0,Bob,Engineering
1,Michael,Accounting
2,George,HR
3,James,Engineering


In [26]:
df5

Unnamed: 0,group,skills
0,Accounting,math
1,Accounting,spreadsheets
2,Engineering,coding
3,Engineering,linux
4,HR,spreadsheets
5,HR,organization


In [27]:
pd.merge(df1, df5)

Unnamed: 0,employees,group,skills
0,Bob,Engineering,coding
1,Bob,Engineering,linux
2,James,Engineering,coding
3,James,Engineering,linux
4,Michael,Accounting,math
5,Michael,Accounting,spreadsheets
6,George,HR,spreadsheets
7,George,HR,organization


# The 'on' keyword

In [28]:
pd.merge(df1, df2, on='employees') # only if both dfs have this column

Unnamed: 0,employees,group,Hire_Date
0,Bob,Engineering,2004
1,Michael,Accounting,2006
2,George,HR,2005
3,James,Engineering,2008


# 'left_on' and 'right_on' keywords

In [29]:
df3 = pd.DataFrame({'name': ['Bob', 'Michael', 'George', 'James'],
'salary': [70000, 80000, 120000, 90000]})

In [30]:
pd.merge(df1, df3, left_on='employees', right_on='name')

Unnamed: 0,employees,group,name,salary
0,Bob,Engineering,Bob,70000
1,Michael,Accounting,Michael,80000
2,George,HR,George,120000
3,James,Engineering,James,90000


In [31]:
# If desired, we can drop the redundant name column

In [32]:
df6 = pd.merge(df1, df3, left_on='employees', right_on='name').drop('name', axis=1)

In [33]:
df6

Unnamed: 0,employees,group,salary
0,Bob,Engineering,70000
1,Michael,Accounting,80000
2,George,HR,120000
3,James,Engineering,90000


# join() method

In [34]:
# the join() method defaults to indices merging

In [35]:
df1a = df1.set_index('employees')

In [36]:
df2a = df2.set_index('employees')

In [37]:
df1a

Unnamed: 0_level_0,group
employees,Unnamed: 1_level_1
Bob,Engineering
Michael,Accounting
George,HR
James,Engineering


In [38]:
df2a

Unnamed: 0_level_0,Hire_Date
employees,Unnamed: 1_level_1
Bob,2004
George,2005
Michael,2006
James,2008


In [39]:
df1.join(df2a)

Unnamed: 0,employees,group,Hire_Date
0,Bob,Engineering,
1,Michael,Accounting,
2,George,HR,
3,James,Engineering,
