In [1]:
import numpy as np
import pandas as pd

## Concatenation

Directly  "glue" together dataframes.

##### https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

In [2]:
data_one = {'A': ['A0', 'A1', 'A2', 'A3'],'B': ['B0', 'B1', 'B2', 'B3']}

In [3]:
data_two = {'C': ['C0', 'C1', 'C2', 'C3'], 'D': ['D0', 'D1', 'D2', 'D3']}

In [4]:
one = pd.DataFrame(data_one)

In [5]:
two = pd.DataFrame(data_two)

In [6]:
one

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [7]:
two

Unnamed: 0,C,D
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [8]:
# Concatenate along columns
pd.concat([one,two], axis=1)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [9]:
# Concatenate along rows
pd.concat([one,two], axis=0)

Unnamed: 0,A,B,C,D
0,A0,B0,,
1,A1,B1,,
2,A2,B2,,
3,A3,B3,,
0,,,C0,D0
1,,,C1,D1
2,,,C2,D2
3,,,C3,D3


### Axis 0 , but columns match up
**In case you wanted this:**

In [10]:
two.columns = one.columns

In [11]:
two

Unnamed: 0,A,B
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [12]:
mydf = pd.concat([one,two], axis=0)

In [13]:
mydf

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [14]:
mydf.index

Index([0, 1, 2, 3, 0, 1, 2, 3], dtype='int64')

In [15]:
mydf.index = range(len(mydf))

In [16]:
mydf

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
4,C0,D0
5,C1,D1
6,C2,D2
7,C3,D3


# Merge

## Data Tables
- The .merge() method takes in a key argument labeled how
- There are 3 main ways of merging tables together using the how parameter:

1.   Inner
2.   Outer
3. Left or Right

The main idea behind the argument is to decide how to deal with information only present in one of the joined tables.



In [17]:
registrations = pd.DataFrame({'reg_id':[1,2,3,4],'name':['Andrew','Bobo','Claire','David']})
logins = pd.DataFrame({'log_id':[1,2,3,4],'name':['Xavier','Andrew','Yolanda','Bobo']})

In [18]:
registrations

Unnamed: 0,reg_id,name
0,1,Andrew
1,2,Bobo
2,3,Claire
3,4,David


In [19]:
logins

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


## pd.merge()

Merge pandas DataFrames based on key columns, similar to a SQL join. Results based on the **how** parameter.

In [20]:
help(pd.merge)

Help on function merge in module pandas.core.reshape.merge:

merge(left: 'DataFrame | Series', right: 'DataFrame | Series', how: 'MergeHow' = 'inner', on: 'IndexLabel | AnyArrayLike | None' = None, left_on: 'IndexLabel | AnyArrayLike | None' = None, right_on: 'IndexLabel | AnyArrayLike | None' = None, left_index: 'bool' = False, right_index: 'bool' = False, sort: 'bool' = False, suffixes: 'Suffixes' = ('_x', '_y'), copy: 'bool | None' = None, indicator: 'str | bool' = False, validate: 'str | None' = None) -> 'DataFrame'
    Merge DataFrame or named Series objects with a database-style join.
    
    A named Series object is treated as a DataFrame with a single named column.
    
    The join is done on columns or indexes. If joining columns on
    columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
    on indexes or indexes on a column or columns, the index will be passed on.
    When performing a cross merge, no column specifications to merge on are
    allo

-----

# Inner,Left, Right, and Outer Joins

## Inner Join

**Match up where the key is present in BOTH tables. There should be no NaNs due to the join, since by definition to be part of the Inner Join they need info in both tables.**
**Only Andrew and Bobo both registered and logged in.**

In [21]:
# Notice pd.merge doesn't take in a list like concat
pd.merge(registrations,logins,how='inner',on='name')

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2
1,2,Bobo,4


In [22]:
# Pandas smart enough to figure out key column (on parameter) if only one column name matches up
pd.merge(registrations,logins,how='inner')

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2
1,2,Bobo,4


In [40]:
# Pandas reports an error if "on" key column isn't in both dataframes
# pd.merge(registrations,logins,how='inner',on='reg_id')

## Left Join

- Order of the tables passed in as arguments does matter here!!
- The first table passed in is the left one, and the second table passed in is the right one.

In [24]:
# left merge:registrations, rigt merge:logins
pd.merge(registrations,logins,how='left', on="name")

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2.0
1,2,Bobo,4.0
2,3,Claire,
3,4,David,


## Right Join
**Match up AND include all rows from Right Table.**

In [25]:
pd.merge(registrations,logins,how='right', on="name")

Unnamed: 0,reg_id,name,log_id
0,,Xavier,1
1,1.0,Andrew,2
2,,Yolanda,3
3,2.0,Bobo,4


## Outer Join
- Setting **how="outer"** allows us to include everything present in both tables

In [26]:
pd.merge(registrations,logins,how='outer')

Unnamed: 0,reg_id,name,log_id
0,1.0,Andrew,2.0
1,2.0,Bobo,4.0
2,3.0,Claire,
3,4.0,David,
4,,Xavier,1.0
5,,Yolanda,3.0


In [27]:
pd.merge(registrations,logins,how='outer', on="name")

Unnamed: 0,reg_id,name,log_id
0,1.0,Andrew,2.0
1,2.0,Bobo,4.0
2,3.0,Claire,
3,4.0,David,
4,,Xavier,1.0
5,,Yolanda,3.0


## Join on Index or Column

**Use combinations of left_on,right_on,left_index,right_index to merge a column or index on each other**

In [41]:
registrations = registrations.set_index("name")

In [42]:
registrations

Unnamed: 0_level_0,index,reg_id
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Andrew,0,1
Bobo,1,2
Claire,2,3
David,3,4


In [50]:
# I want to join on the registrations index and on the logins name column.
# pd.merge(registrations,logins,left_index=True,right_on="name",how="inner")

# Fix: Merge on the 'name' column which is present in both dataframes
# Dropping the extra 'reg_name' column after merge
results = pd.merge(registrations, logins, how='inner', on='name')
display(results)

Unnamed: 0,level_0,name,index,reg_id,log_id
0,0,Andrew,0,1,2
1,1,Bobo,1,2,4


In [53]:
registrations

Unnamed: 0,level_0,name,index,reg_id
0,0,Andrew,0,1
1,1,Bobo,1,2
2,2,Claire,2,3
3,3,David,3,4


In [55]:
registrations.columns = ["reg_name","reg_id"]

In [62]:
registrations

Unnamed: 0,reg_name,reg_id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


In [57]:
logins

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


In [64]:
results = pd.merge(registrations,logins,how="inner",left_on="reg_name",right_on="name")

In [65]:
results

Unnamed: 0,reg_name,reg_id,log_id,name
0,Andrew,1,2,Andrew
1,Bobo,2,4,Bobo


In [66]:
results.drop("reg_name",axis=1)

Unnamed: 0,reg_id,log_id,name
0,1,2,Andrew
1,2,4,Bobo


In [67]:
registrations.columns = ["name","id"]

In [69]:
logins.columns = ["id","name"]

In [70]:
registrations

Unnamed: 0,name,id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


In [71]:
logins

Unnamed: 0,id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


In [72]:
pd.merge(registrations,logins,how="inner",on="name",suffixes=("_reg","_log"))

Unnamed: 0,name,id_x,id_y
0,Andrew,1,2
1,Bobo,2,4
