# Join, Merge, and Combine Datasets using Pandas

## Introduction

In this notebook, we will cover various techniques to join, merge, and combine datasets using pandas. This includes methods such as `merge`, `join`, `concat`, and others. These methods are essential for data manipulation and combining different datasets for analysis.

## Importing Libraries

In [2]:
import pandas as pd
import numpy as np

## Creating Sample DataFrames

In [3]:
# Creating sample DataFrames
raw_data1 = {
    'subject_id': ['1', '2', '3', '4', '5'],
    'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']}
df1 = pd.DataFrame(raw_data1, columns = ['subject_id', 'first_name', 'last_name'])

raw_data2 = {
    'subject_id': ['4', '5', '6', '7', '8'],
    'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']}
df2 = pd.DataFrame(raw_data2, columns = ['subject_id', 'first_name', 'last_name'])

raw_data3 = {
    'subject_id': ['1', '2', '3', '6', '7'],
    'test_id': [51, 15, 15, 61, 16]}
df3 = pd.DataFrame(raw_data3, columns = ['subject_id', 'test_id'])

# Display the dataframes
df1, df2, df3

(  subject_id first_name last_name
 0          1       Alex  Anderson
 1          2        Amy  Ackerman
 2          3      Allen       Ali
 3          4      Alice      Aoni
 4          5     Ayoung   Atiches,
   subject_id first_name last_name
 0          4      Billy    Bonder
 1          5      Brian     Black
 2          6       Bran   Balwner
 3          7      Bryce     Brice
 4          8      Betty    Btisan,
   subject_id  test_id
 0          1       51
 1          2       15
 2          3       15
 3          6       61
 4          7       16)

## Concatenating DataFrames

In [4]:
# Concatenating DataFrames
df_concat = pd.concat([df1, df2])
df_concat

Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


## Merging DataFrames

In [5]:
# Merging DataFrames on a Key
df_merged = pd.merge(df1, df2, on='subject_id', how='inner')
df_merged

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,4,Alice,Aoni,Billy,Bonder
1,5,Ayoung,Atiches,Brian,Black


In [6]:
# Merging DataFrames on Multiple Keys
df_merge_multikey = pd.merge(df1, df2, on=['subject_id', 'first_name'], how='inner')
df_merge_multikey

Unnamed: 0,subject_id,first_name,last_name_x,last_name_y


In [7]:
# Merging with Different Join Types
# Inner Join
df_inner = pd.merge(df1, df2, on='subject_id', how='inner')
df_inner

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,4,Alice,Aoni,Billy,Bonder
1,5,Ayoung,Atiches,Brian,Black


In [8]:
# Outer Join
df_outer = pd.merge(df1, df2, on='subject_id', how='outer')
df_outer

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,1,Alex,Anderson,,
1,2,Amy,Ackerman,,
2,3,Allen,Ali,,
3,4,Alice,Aoni,Billy,Bonder
4,5,Ayoung,Atiches,Brian,Black
5,6,,,Bran,Balwner
6,7,,,Bryce,Brice
7,8,,,Betty,Btisan


In [9]:
# Left Join
df_left = pd.merge(df1, df2, on='subject_id', how='left')
df_left

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,1,Alex,Anderson,,
1,2,Amy,Ackerman,,
2,3,Allen,Ali,,
3,4,Alice,Aoni,Billy,Bonder
4,5,Ayoung,Atiches,Brian,Black


In [10]:
# Right Join
df_right = pd.merge(df1, df2, on='subject_id', how='right')
df_right

Unnamed: 0,subject_id,first_name_x,last_name_x,first_name_y,last_name_y
0,4,Alice,Aoni,Billy,Bonder
1,5,Ayoung,Atiches,Brian,Black
2,6,,,Bran,Balwner
3,7,,,Bryce,Brice
4,8,,,Betty,Btisan


## Joining DataFrames

In [11]:
# Set the index to be the key to join on
df1.set_index('subject_id', inplace=True)
df2.set_index('subject_id', inplace=True)

# Join the two DataFrames
df_join = df1.join(df2, lsuffix='_caller', rsuffix='_other')
df_join

Unnamed: 0_level_0,first_name_caller,last_name_caller,first_name_other,last_name_other
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Alex,Anderson,,
2,Amy,Ackerman,,
3,Allen,Ali,,
4,Alice,Aoni,Billy,Bonder
5,Ayoung,Atiches,Brian,Black


## Concatenating Along Columns

In [12]:
# Concatenate along columns
df_concat_col = pd.concat([df1, df2], axis=1)
df_concat_col

Unnamed: 0_level_0,first_name,last_name,first_name,last_name
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Alex,Anderson,,
2,Amy,Ackerman,,
3,Allen,Ali,,
4,Alice,Aoni,Billy,Bonder
5,Ayoung,Atiches,Brian,Black
6,,,Bran,Balwner
7,,,Bryce,Brice
8,,,Betty,Btisan


## Append Rows of DataFrames

In [15]:
# Append rows of DataFrames using pd.concat instead of append
df_append = pd.concat([df1, df2])
df_append


Unnamed: 0,subject_id,first_name,last_name
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
3,4,Alice,Aoni
4,5,Ayoung,Atiches
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner
3,7,Bryce,Brice
4,8,Betty,Btisan


## Combining DataFrames with Combine_first

In [16]:
# Combining DataFrames with combine_first
df1.reset_index(inplace=True)
df2.reset_index(inplace=True)
df_combine = df1.combine_first(df2)
df_combine

Unnamed: 0,index,subject_id,first_name,last_name
0,0,1,Alex,Anderson
1,1,2,Amy,Ackerman
2,2,3,Allen,Ali
3,3,4,Alice,Aoni
4,4,5,Ayoung,Atiches
