## Combining Datasets: Concat and Append

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Convenience function

def make_dict(cols, ind):
    """Build a dict of labelled string cells, one list per column.

    Parameters
    ----------
    cols : iterable
        Column labels; each one becomes a key of the result.
    ind : iterable
        Row labels. They are materialised once up front so that a one-shot
        iterator (e.g. a generator) is not used up by the first column.

    Returns
    -------
    dict
        Maps every column label ``c`` to ``[str(c) + str(i) for i in ind]``.
    """
    row_labels = list(ind)
    return {c: [str(c) + str(i) for i in row_labels] for c in cols}
    
def make_df(cols, ind):
    """Quickly make a DataFrame of labelled string cells.

    Parameters
    ----------
    cols : iterable
        Column labels of the frame.
    ind : iterable
        Row labels; they are used both to build the cell values and as the
        index of the frame.

    Returns
    -------
    pandas.DataFrame
        The cell in column ``c`` and row ``i`` holds ``str(c) + str(i)``.
    """
    # A one-shot iterator would be exhausted by the first column and leave
    # nothing for the remaining columns or for the index, so snapshot it.
    # Re-iterable inputs (lists, ranges, Index objects) pass through unchanged.
    if iter(ind) is ind:
        ind = list(ind)
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, index=ind)

In [3]:
make_dict(range(3), 'ABC')

{0: ['0A', '0B', '0C'], 1: ['1A', '1B', '1C'], 2: ['2A', '2B', '2C']}

In [4]:
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [5]:
# Two string Series whose integer labels do not overlap.
ser1 = pd.Series(list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(list('DEF'), index=[4, 5, 6])

# Stack the second Series underneath the first; every value keeps its own label.
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [6]:
# Two small frames with different column labels (A/B vs C/D) and different index labels.
df1 = make_df('AB', [1, 2])
df2 = make_df('CD', [3, 4])

# Concat the two df's row-wise
# df2's columns are relabelled to match df1 first, so the stacked rows share
# columns A and B (the cell values themselves still read C3, D3, ...).
df2.columns = df1.columns
# keys adds an outer index level ('a' / 'b') recording which input each row came from.
pd.concat([df1, df2], axis=0, keys=['a', 'b'])

Unnamed: 0,Unnamed: 1,A,B
a,1,A1,B1
a,2,A2,B2
b,3,C3,D3
b,4,C4,D4


In [7]:
# Concat df1 and df2 columnwise
# Give df2 the same index labels as df1 so the rows line up side by side.
df2.index = df1.index
# With axis=1 the keys become the outer level of the column labels.
pd.concat([df1, df2], axis=1, keys=['a', 'b'])

Unnamed: 0_level_0,a,a,b,b
Unnamed: 0_level_1,A,B,A,B
1,A1,B1,C3,D3
2,A2,B2,C4,D4


In [8]:
# Frames with different columns and, to begin with, different index labels.
x = make_df('AB', [0, 1])
y = make_df('CD', [2, 3])

# Make x and y have the same index
# (only the labels change; x's cell values keep their original A0/A1 text)
x.index = y.index

# Concatenate them to show that indexes can be the same / repeated
# The labels are kept as they are, so 2 and 3 each appear twice in the result.
pd.concat([x, y])

Unnamed: 0,A,B,C,D
2,A0,B0,,
3,A1,B1,,
2,,,C2,D2
3,,,C3,D3


In [9]:
# Set an option to raise an error if any index labels are repeated after concatenation

pd.concat([x, y])  # TODO: pass verify_integrity=True to enable the check; it is left off here, so the repeated labels pass through silently

Unnamed: 0,A,B,C,D
2,A0,B0,,
3,A1,B1,,
2,,,C2,D2
3,,,C3,D3


In [10]:
# Concat x and y, but ignore the index to make a new integer index after concatenation

pd.concat([x, y], ignore_index=True)

Unnamed: 0,A,B,C,D
0,A0,B0,,
1,A1,B1,,
2,,,C2,D2
3,,,C3,D3


In [11]:
# Concat x and y, but still have them be separated through multi-indexing

pd.concat([x, y], keys=['a', 'b'])

Unnamed: 0,Unnamed: 1,A,B,C,D
a,2,A0,B0,,
a,3,A1,B1,,
b,2,,,C2,D2
b,3,,,C3,D3


In [12]:
# Concat row-wise, but keep only the set-wise intersection of the columns instead of the union

# df5 and df6 have only columns B and C in common.
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])

# join='inner' drops A and D, which each exist in just one of the two frames.
pd.concat([df5, df6], join='inner')

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [13]:
%whos

Variable    Type         Data/Info
----------------------------------
df1         DataFrame        A   B\n1  A1  B1\n2  A2  B2
df2         DataFrame        A   B\n1  C3  D3\n2  C4  D4
df5         DataFrame        A   B   C\n1  A1  B1  C1\n2  A2  B2  C2
df6         DataFrame        B   C   D\n3  B3  C3  D3\n4  B4  C4  D4
make_df     function     <function make_df at 0x00000223A06D9840>
make_dict   function     <function make_dict at 0x00000223A06D97B8>
np          module       <module 'numpy' from 'C:\<...>ges\\numpy\\__init__.py'>
pd          module       <module 'pandas' from 'C:<...>es\\pandas\\__init__.py'>
ser1        Series       1    A\n2    B\n3    C\ndtype: object
ser2        Series       4    D\n5    E\n6    F\ndtype: object
x           DataFrame        A   B\n2  A0  B0\n3  A1  B1
y           DataFrame        C   D\n2  C2  D2\n3  C3  D3


In [14]:
df5

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2


In [15]:
df6

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4


In [16]:
# Concatenate df5 and df6, but only keep columns of df6


In [17]:
# append df2 to df1
#
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the rows
# are stacked with pd.concat instead; the result is the same frame that
# df1.append(df2) produced. Index labels 1 and 2 are repeated in the result
# because both frames carry them.
pd.concat([df1, df2])

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
1,C3,D3
2,C4,D4


## Combining Datasets: Merge and Join

In [18]:
# Employee -> group lookup.
df1 = pd.DataFrame({
    'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'group': ['Accounting', 'Engineering', 'Engineering', 'HR'],
})
# Employee -> hire year; the rows are in a different order than in df1.
df2 = pd.DataFrame({
    'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
    'hire date': [2004, 2008, 2012, 2014],
})

In [19]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [20]:
df2

Unnamed: 0,employee,hire date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [21]:
# Merge df1 and df2 into df3 so that the hire dates etc are all on the same dataframe
# Make the index of df3 be the 'employee' column
#
# The steps are chained instead of mutating df3 with inplace=True; the
# resulting frame is the same.
df3 = (
    pd.merge(df1, df2)        # joins on 'employee', the only column both frames share
    .set_index('employee')    # Set the index to 'employee'
    .reset_index()            # Send 'employee' to column, set the index to integer values
)
df3

Unnamed: 0,employee,group,hire date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [22]:
df3 = pd.merge(df1, df2)

In [23]:
df3

Unnamed: 0,employee,group,hire date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [24]:
# One supervisor per group.
df4 = pd.DataFrame({
    'group': ['Accounting', 'Engineering', 'HR'],
    'supervisor': ['Carly', 'Guido', 'Steve'],
})

# Merge df4 and df3 in a many-to-one join
# ('group' is the only shared column; both Engineering rows of df3 pick up the
# same supervisor).
pd.merge(df3, df4)

Unnamed: 0,employee,group,hire date,supervisor
0,Bob,Accounting,2008,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


In [25]:
# Two skills are listed for every group, so 'group' values repeat in this frame.
df5 = pd.DataFrame({
    'group': ['Accounting', 'Accounting', 'Engineering', 'Engineering', 'HR', 'HR'],
    'skills': ['math', 'spreadsheets', 'coding', 'linux', 'spreadsheets', 'organization'],
})
df5

# Merge df1 and df5 in a many-to-many join. This happens when the key column in both the left and right frames contains duplicates.


Unnamed: 0,group,skills
0,Accounting,math
1,Accounting,spreadsheets
2,Engineering,coding
3,Engineering,linux
4,HR,spreadsheets
5,HR,organization


### Specification of the Merge key

In [26]:
# Merge df1 and df2 specifically by employee as the key

In [27]:
# Rename the key column from 'employee' to 'name'.
# rename() replaces the copy-then-drop sequence: the values are the same, the
# column keeps its position instead of moving to the end, and running the cell
# a second time is a no-op rather than a KeyError.
df3 = df3.rename(columns={'employee': 'name'})

In [28]:
# Add one salary per row, then drop 'group' and 'hire date' so that only
# 'name' and 'Salary' remain. Written as one chained expression instead of
# mutating df3 with inplace=True.
df3 = (
    df3
    .assign(Salary=[70000, 80000, 120000, 90000])
    .drop(['group', 'hire date'], axis=1)
)

In [29]:
df3

Unnamed: 0,name,Salary
0,Bob,70000
1,Jake,80000
2,Lisa,120000
3,Sue,90000


In [30]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [31]:
# Merge df1 and df3, using the 'name' and 'employee' columns as the key

In [32]:
# Move 'employee' out of the columns and into the index of each frame.
# The results are bound to new names (df1a / df2a) next to df1 and df2.
df1a = df1.set_index(keys='employee')
df2a = df2.set_index(keys='employee')

In [33]:
df1a

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR


In [34]:
df2a

Unnamed: 0_level_0,hire date
employee,Unnamed: 1_level_1
Lisa,2004
Bob,2008
Jake,2012
Sue,2014


In [39]:
# Merge df1a and df2a by their indices
pd.merge(df1a, df2a, left_index=True, right_index=True)

Unnamed: 0_level_0,group,hire date
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2008
Jake,Engineering,2012
Lisa,Engineering,2004
Sue,HR,2014


In [42]:
# Merge df1a and df2a by their indices with fewer lines of code, using an object method

In [56]:
# Each frame is built from row records; `columns` names the two fields.
# 'Mary' is the only name that appears in both frames.
df6 = pd.DataFrame(
    [('Peter', 'fish'), ('Paul', 'beans'), ('Mary', 'bread')],
    columns=['name', 'food'],
)
df7 = pd.DataFrame(
    [('Mary', 'wine'), ('Joseph', 'beer')],
    columns=['name', 'drink'],
)

In [54]:
df6

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread


In [57]:
df7

Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer


In [58]:
pd.merge(df6, df7)  # Default join is an intersect

Unnamed: 0,name,food,drink
0,Mary,bread,wine


In [60]:
# Merge by taking union

In [63]:
# Merge, but only care about adding to the rows of df6

In [66]:
# Two rankings of the same four people: the 'rank' column exists in both
# frames but holds different values in each.
df8 = pd.DataFrame({
    'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'rank': [1, 2, 3, 4],
})
df9 = pd.DataFrame({
    'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'rank': [3, 1, 4, 2],
})

In [68]:
df8

Unnamed: 0,name,rank
0,Bob,1
1,Jake,2
2,Lisa,3
3,Sue,4


In [69]:
df9

Unnamed: 0,name,rank
0,Bob,3
1,Jake,1
2,Lisa,4
3,Sue,2


In [73]:
pd.merge(df8, df9)

Unnamed: 0,name,rank


In [74]:
# Merge, but just make new columns for each 'rank'

In [80]:
# Merge in the same way as above, but change the names of each new column