# Pandas - Merge and Join

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Left table: employee -> group. 'Lisa' appears twice (Engineering and HR),
# so 'employee' is not a unique key in df1.
df1 = pd.DataFrame(
    data={
        "employee": ["Bob", "Jake", "Lisa", "Sue", "Lisa"],
        "group": ["Accounting", "Engineering", "Engineering", "HR", "HR"],
    }
)

# Right table: employee -> hire year, one row per employee.
df2 = pd.DataFrame(
    data={
        "employee": ["Lisa", "Bob", "Jake", "Sue"],
        "hire_date": [2004, 2008, 2012, 2014],
    }
)

In [3]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR
4,Lisa,HR


In [4]:
df2

Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [5]:
# No `on=` given: merge joins on the column name the two frames share ('employee').
pd.merge(df1,df2)

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Lisa,HR,2004
4,Sue,HR,2014


In [6]:
# Swapping the arguments changes the column order and the row order of the
# result; the set of matched rows is the same.
pd.merge(df2,df1)

Unnamed: 0,employee,hire_date,group
0,Lisa,2004,Engineering
1,Lisa,2004,HR
2,Bob,2008,Accounting
3,Jake,2012,Engineering
4,Sue,2014,HR


In [7]:
# Keep the merged employee / hire_date / group table for the many-to-one example.
df3 = pd.merge(df2,df1)

### Many-to-one joins

In [8]:
df3

Unnamed: 0,employee,hire_date,group
0,Lisa,2004,Engineering
1,Lisa,2004,HR
2,Bob,2008,Accounting
3,Jake,2012,Engineering
4,Sue,2014,HR


In [9]:
# Lookup table: exactly one supervisor row per group.
df4 = pd.DataFrame(
    data={
        "group": ["Accounting", "Engineering", "HR"],
        "supervisor": ["Carly", "Guido", "Steve"],
    }
)

In [10]:
df4

Unnamed: 0,group,supervisor
0,Accounting,Carly
1,Engineering,Guido
2,HR,Steve


In [11]:
# Many-to-one: several df3 rows share a group while df4 has a single row per
# group, so the supervisor is repeated for every employee in that group.
df3.merge(df4)

Unnamed: 0,employee,hire_date,group,supervisor
0,Lisa,2004,Engineering,Guido
1,Jake,2012,Engineering,Guido
2,Lisa,2004,HR,Steve
3,Sue,2014,HR,Steve
4,Bob,2008,Accounting,Carly


### Many-to-many joins

In [12]:
# Skills table: every group is listed twice, once per skill.
df5 = pd.DataFrame(
    data={
        "group": [
            "Accounting", "Accounting",
            "Engineering", "Engineering",
            "HR", "HR",
        ],
        "skills": [
            "math", "spreadsheets",
            "coding", "linux",
            "spreadsheets", "organization",
        ],
    }
)

In [13]:
df5

Unnamed: 0,group,skills
0,Accounting,math
1,Accounting,spreadsheets
2,Engineering,coding
3,Engineering,linux
4,HR,spreadsheets
5,HR,organization


In [14]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR
4,Lisa,HR


In [15]:
# Many-to-many: 'group' repeats in both df1 and df5, so each employee row is
# paired with every skill row of the same group.
df1.merge(df5)

Unnamed: 0,employee,group,skills
0,Bob,Accounting,math
1,Bob,Accounting,spreadsheets
2,Jake,Engineering,coding
3,Jake,Engineering,linux
4,Lisa,Engineering,coding
5,Lisa,Engineering,linux
6,Sue,HR,spreadsheets
7,Sue,HR,organization
8,Lisa,HR,spreadsheets
9,Lisa,HR,organization


In [16]:
# Specifying the merge key explicitly: `on` names the column to join on.
# The named column must exist in both frames.

pd.merge(df1,df2,on='employee')

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Lisa,HR,2004
4,Sue,HR,2014


In [17]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR
4,Lisa,HR


In [18]:
df2

Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [19]:
# left_on / right_on keywords:
# used when the key columns are named differently in the two frames.
#
# NOTE(review): this rebinds df3, which earlier in the notebook held the result
# of pd.merge(df2, df1). Earlier cells that read df3 will see this salary table
# instead if they are re-run after this cell.


df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'salary': [70000, 80000, 120000, 90000]})

In [20]:
df3

Unnamed: 0,name,salary
0,Bob,70000
1,Jake,80000
2,Lisa,120000
3,Sue,90000


In [21]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR
4,Lisa,HR


In [22]:
# The key is called 'employee' in df1 (left) and 'name' in df3 (right).
# Both key columns are kept in the result.
pd.merge(df1,df3,left_on="employee",right_on="name")  

Unnamed: 0,employee,group,name,salary
0,Bob,Accounting,Bob,70000
1,Jake,Engineering,Jake,80000
2,Lisa,Engineering,Lisa,120000
3,Lisa,HR,Lisa,120000
4,Sue,HR,Sue,90000


In [23]:
# Deliberate failure: left_on is looked up in the first (left) frame and
# right_on in the second (right) frame. Here they are swapped -- df1 has no
# 'name' column and df3 has no 'employee' column -- so pandas raises KeyError.
# The error is caught and printed so the notebook still completes under
# "Restart Kernel -> Run All".
try:
    pd.merge(df1, df3, left_on="name", right_on="employee")
except KeyError as err:
    print("KeyError: {}".format(err))

KeyError: 'employee'

In [25]:
# left_on names a column of the left frame (df3);
# right_on names a column of the right frame (df1).
pd.merge(df3,df1,left_on="name",right_on="employee")

Unnamed: 0,name,salary,employee,group
0,Bob,70000,Bob,Accounting
1,Jake,80000,Jake,Engineering
2,Lisa,120000,Lisa,Engineering
3,Lisa,120000,Lisa,HR
4,Sue,90000,Sue,HR


In [26]:
# After the join, 'name' holds the same values as 'employee', so drop the
# redundant column.
pd.merge(df1,df3,left_on="employee",right_on="name").drop('name',axis=1)

Unnamed: 0,employee,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,120000
3,Lisa,HR,120000
4,Sue,HR,90000


In [27]:
# left_index / right_index: join on the index instead of on a column.
# Start from df1, which still has 'employee' as an ordinary column.

df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR
4,Lisa,HR


In [28]:
# Move 'employee' out of the columns and into the index (returns a new frame).
df1x = df1.set_index('employee')

In [29]:
df1x

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR
Lisa,HR


In [30]:
# Same for df2: index it by 'employee', then display the result.
df2x = df2.set_index('employee')
df2x

Unnamed: 0_level_0,hire_date
employee,Unnamed: 1_level_1
Lisa,2004
Bob,2008
Jake,2012
Sue,2014


In [31]:
# Both frames are indexed by employee, so join index-to-index.
pd.merge(df1x, df2x, left_index=True, right_index=True)

Unnamed: 0_level_0,group,hire_date
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2008
Jake,Engineering,2012
Lisa,Engineering,2004
Lisa,HR,2004
Sue,HR,2014


In [32]:
# Deliberate failure: left_index=True alone is not enough -- pandas also needs
# a key for the right frame (right_on=... or right_index=True).
# The error is caught and printed so the notebook still completes under
# "Restart Kernel -> Run All".
try:
    pd.merge(df1x, df2x, left_index=True, right_index=False)
except ValueError as err:  # pandas' MergeError is a ValueError subclass
    print("{}: {}".format(type(err).__name__, err))

MergeError: Must pass right_on or right_index=True

In [33]:
# Deliberate failure: right_index=True alone is not enough -- pandas also needs
# a key for the left frame (left_on=... or left_index=True).
# The error is caught and printed so the notebook still completes under
# "Restart Kernel -> Run All".
try:
    pd.merge(df1x, df2x, left_index=False, right_index=True)
except ValueError as err:  # pandas' MergeError is a ValueError subclass
    print("{}: {}".format(type(err).__name__, err))

MergeError: Must pass left_on or left_index=True

In [34]:
# Deliberate failure: df1x and df2x share no column names ('employee' is the
# index in both), so the default column-based merge has nothing to join on.
# The error is caught and printed so the notebook still completes under
# "Restart Kernel -> Run All".
try:
    pd.merge(df1x, df2x)
except ValueError as err:  # pandas' MergeError is a ValueError subclass
    print("{}: {}".format(type(err).__name__, err))

MergeError: No common columns to perform merge on

In [35]:
# Working form again, for contrast with the failing variants: both sides must
# be told to use their index.
pd.merge(df1x, df2x, left_index=True, right_index=True)

Unnamed: 0_level_0,group,hire_date
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2008
Jake,Engineering,2012
Lisa,Engineering,2004
Lisa,HR,2004
Sue,HR,2014


In [36]:
# Rebuild df1 and df2 for the join-type examples. df1's last employee is now
# 'Lisay', a name that does not occur in df2.
# Note: this replaces the df1 / df2 defined at the top of the notebook.
df1 = pd.DataFrame(
    data={
        "employee": ["Bob", "Jake", "Lisa", "Sue", "Lisay"],
        "group": ["Accounting", "Engineering", "Engineering", "HR", "HR"],
    }
)

df2 = pd.DataFrame(
    data={
        "employee": ["Lisa", "Bob", "Jake", "Sue"],
        "hire_date": [2004, 2008, 2012, 2014],
    }
)

In [37]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR
4,Lisay,HR


In [38]:
df2

Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [39]:
# Default join is how='inner': only employees present in both frames are kept,
# so 'Lisay' (in df1 only) does not appear in the result.
pd.merge(df1,df2)

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [40]:
# Two small tables that overlap on a single name ('Mary'), used to compare the
# different `how=` join types.
df6 = pd.DataFrame(
    data={
        "name": ["Peter", "Paul", "Mary"],
        "food": ["fish", "beans", "bread"],
    },
    columns=["name", "food"],
)

df7 = pd.DataFrame(
    data={
        "name": ["Mary", "Joseph"],
        "drink": ["wine", "beer"],
    },
    columns=["name", "drink"],
)

In [41]:
df6

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread


In [42]:
df7

Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer


In [43]:
# Inner join: keep only names found in both frames (here just 'Mary').
pd.merge(df6,df7, how='inner')

Unnamed: 0,name,food,drink
0,Mary,bread,wine


In [44]:
# Outer join: keep every name from either frame; values with no match are
# left missing.
pd.merge(df6,df7, how='outer')

Unnamed: 0,name,food,drink
0,Peter,fish,
1,Paul,beans,
2,Mary,bread,wine
3,Joseph,,beer


In [45]:
# Left join: keep every row of df6; 'drink' is missing where df7 has no match.
pd.merge(df6,df7,how='left')

Unnamed: 0,name,food,drink
0,Peter,fish,
1,Paul,beans,
2,Mary,bread,wine


In [46]:
# Right join: keep every row of df7; 'food' is missing where df6 has no match.
pd.merge(df6,df7,how='right')

Unnamed: 0,name,food,drink
0,Mary,bread,wine
1,Joseph,,beer


In [47]:
# Two tables with identical column names ('name', 'rank') but different rank
# values, used to show how overlapping non-key columns are handled.
df8 = pd.DataFrame(
    data={
        "name": ["Bob", "Jake", "Lisa", "Sue"],
        "rank": [1, 2, 3, 4],
    }
)

df9 = pd.DataFrame(
    data={
        "name": ["Bob", "Jake", "Lisa", "Sue"],
        "rank": [3, 1, 4, 2],
    }
)

In [48]:
df8

Unnamed: 0,name,rank
0,Bob,1
1,Jake,2
2,Lisa,3
3,Sue,4


In [49]:
df9

Unnamed: 0,name,rank
0,Bob,3
1,Jake,1
2,Lisa,4
3,Sue,2


In [50]:
# Both frames have a 'rank' column besides the key; `suffixes` is appended to
# the left / right copies so they stay distinguishable in the result.
pd.merge(df8,df9,on='name', suffixes=['_L','_R'])

Unnamed: 0,name,rank_L,rank_R
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


In [51]:
# Small frame for the groupby examples: three keys, each appearing twice,
# with a running integer in 'data'.
df = pd.DataFrame(
    data={
        "key": ["A", "B", "C", "A", "B", "C"],
        "data": range(6),
    },
    columns=["key", "data"],
)

In [52]:
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [53]:
# groupby on its own returns a DataFrameGroupBy object, not a table; an
# aggregation has to be applied to get values back.
df.groupby('key')

<pandas.core.groupby.DataFrameGroupBy object at 0x0000000008D9A1D0>

In [54]:
# Count the non-null 'data' values within each key group.
df.groupby('key').count()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,2
B,2
C,2


In [55]:
# Every 'data' value is distinct, so each group contains exactly one row.
df.groupby('data').count()

Unnamed: 0_level_0,key
data,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,1
5,1


In [56]:
# Again only the DataFrameGroupBy object is shown; no aggregation is applied here.
df.groupby('data')

<pandas.core.groupby.DataFrameGroupBy object at 0x0000000008DA2668>