In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

## Merging DataFrames

In [None]:
"""
Setting up df from dict is like:

{
    'column1': ['row_1', 'row_2', 'row_3'],
    'column2': ['row_1', 'row_2', 'row_3'],
}
"""

# Left-hand frame for the merge examples: a 'key' column of repeated
# labels and a 'dataset' column holding the integers 0..6.
df = DataFrame(
    data={
        'key': list('XYZXXZT'),
        'dataset': np.arange(7),
    }
)
df

Unnamed: 0,dataset,key
0,0,X
1,1,Y
2,2,Z
3,3,X
4,4,X
5,5,Z
6,6,T


In [79]:
# Right-hand frame for the merge examples. Its 'key' column overlaps
# with df on X, Y and Z; 'L' appears only here. The two numeric columns
# are unseeded standard-normal draws, so their values differ per run.
df2 = DataFrame(
    dict(
        key=list('XYZL'),
        data=np.random.randn(4),
        HOLA=np.random.randn(4),
    )
)
df2

Unnamed: 0,HOLA,data,key
0,-0.255739,0.275839,X
1,-0.382845,-0.766646,Y
2,-0.569887,-0.812562,Z
3,0.667605,-1.117636,L


In [80]:
# pd.merge combines two frames the way a SQL JOIN does; `how` selects
# the join type ('inner' is the default):
#   how='left'  -> only keys from the left frame   (SQL: left outer join)
#   how='right' -> only keys from the right frame  (SQL: right outer join)
#   how='outer' -> union of keys from both frames  (SQL: full outer join)
#   how='inner' -> intersection of the keys        (SQL: inner join)
# Note: a cell renders only its last expression, so of the three merges
# below just the final left join is displayed.

# Default call: inner join.
pd.merge(left=df, right=df2)

# `on` names the column to join on explicitly.
pd.merge(left=df, right=df2, on='key')

# `how` switches the join type. With 'left', every row of df is kept;
# key 'T' has no partner in df2, so its df2 columns come back as NaN.
pd.merge(left=df, right=df2, how='left', on='key')

Unnamed: 0,dataset,key,HOLA,data
0,0,X,-0.255739,0.275839
1,1,Y,-0.382845,-0.766646
2,2,Z,-0.569887,-0.812562
3,3,X,-0.255739,0.275839
4,4,X,-0.255739,0.275839
5,5,Z,-0.569887,-0.812562
6,6,T,,


In [81]:
# Two frames that share both a 'key' and a 'zum' column, set up for a
# merge on several key columns at once. 'key' alone repeats inside each
# frame, but every ('key', 'zum') pair is unique within its frame.
# Note: this rebinds `df` and `df2` from the earlier cells.
df = DataFrame({
    'key': [1, 1, 2, 3, 4],
    'zum': [10, 40, 30, 40, 50],
    'name1': ['a', 'b', 'c', 'd', 'e']
})

df2 = DataFrame({
    'key': [1, 1, 6, 3, 1],
    'zum': [10, 50, 310, 70, 80],
    'nam2e': ['z', 'bsfd', 'cfs', 'dsf', 'esfd']
})
# A cell renders only its last expression, so only df2 is shown below.
df2

Unnamed: 0,key,nam2e,zum
0,1,z,10
1,1,bsfd,50
2,6,cfs,310
3,3,dsf,70
4,1,esfd,80


In [82]:
# Passing a list to `on` joins on the combination of those columns:
# rows match only when both 'key' and 'zum' agree. how='outer' keeps
# the unmatched rows from either side, with NaN in the missing columns.
pd.merge(left=df, right=df2, how='outer', on=['key', 'zum'])

Unnamed: 0,key,name1,zum,nam2e
0,1.0,a,10.0,z
1,1.0,b,40.0,
2,2.0,c,30.0,
3,3.0,d,40.0,
4,4.0,e,50.0,
5,1.0,,50.0,bsfd
6,6.0,,310.0,cfs
7,3.0,,70.0,dsf
8,1.0,,80.0,esfd


## Merging on indices

In [83]:
# Left frame for the index-merge examples: the join key is an ordinary
# column ('key'), next to a 'data' column holding 0..4.
df_left = DataFrame(
    data={
        'key': list('XYZXY'),
        'data': range(5),
    }
)
df_left

Unnamed: 0,data,key
0,0,X
1,1,Y
2,2,Z
3,3,X
4,4,Y


In [84]:
# Right frame for the index-merge examples: the join key is the row
# index ('X', 'Y'), not a column.
df_right = DataFrame(
    data={'group_data': [10, 20]},
    index=list('XY'),
)
df_right

Unnamed: 0,group_data
X,10
Y,20


In [85]:
# right_index=True tells merge to use df_right's row index as the
# right-hand join key, matched against df_left's 'key' column
# (given via left_on).
# pd.merge(df_left, df_right, on='key') raises an error instead,
# because df_right has no 'key' column.
pd.merge(left=df_left, right=df_right,
         right_index=True, left_on='key')

Unnamed: 0,data,key,group_data
0,0,X,10
3,3,X,10
1,1,Y,20
4,4,Y,20


In [86]:
# Left frame for the hierarchical-index merge: the two key parts sit in
# separate columns ('key1' = city code, 'key2' = number), alongside a
# float 'data_set' column holding 0.0..4.0.
df_left_hr = DataFrame(
    data={
        'key1': ['SF'] * 3 + ['LA'] * 2,
        'key2': [10, 20, 30, 20, 30],
        'data_set': np.arange(5.),
    }
)
df_left_hr

Unnamed: 0,data_set,key1,key2
0,0.0,SF,10
1,1.0,SF,20
2,2.0,SF,30
3,3.0,LA,20
4,4.0,LA,30


In [87]:
# Right frame for the hierarchical-index merge: the integers 0..9 laid
# out as 5 rows x 2 columns, indexed by a two-level row index built
# from the two parallel lists below.
df_right_hr = DataFrame(
    data=np.arange(10).reshape((5, 2)),
    index=[
        ['LA'] * 3 + ['SF'] * 2,  # outer level
        [20, 2, 4, 10, 20],       # inner level
    ],
    columns=['col_1', 'col2'],
)
df_right_hr

Unnamed: 0,Unnamed: 1,col_1,col2
LA,20,0,1
LA,2,2,3
LA,4,4,5
SF,10,6,7
SF,20,8,9


In [88]:
# Join df_left_hr's two key columns against the two levels of
# df_right_hr's hierarchical row index: left_on lists the columns in
# level order, and right_index=True selects the index as the right key.
pd.merge(left=df_left_hr, right=df_right_hr,
         right_index=True,
         left_on=['key1', 'key2'])

Unnamed: 0,data_set,key1,key2,col_1,col2
0,0.0,SF,10,6,7
1,1.0,SF,20,8,9
3,3.0,LA,20,0,1


## Concatenation

In [89]:
# 3x3 array holding 0..8 in row-major order, used by the concatenation
# example that follows.
arr = np.arange(9).reshape(3, 3)
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [91]:
# np.concatenate joins arrays along the given axis:
#   axis=0 -> stacks row-wise / vertically (more rows), as done here
#   axis=1 -> joins column-wise / horizontally (more columns)
np.concatenate((arr, arr), axis=0)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8],
       [0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [92]:
# First frame for the pd.concat example: 4 rows of unseeded uniform
# [0, 1) draws in columns X, Y, Z (values differ on every run).
# Note: this rebinds `df` from the earlier cells.
df = DataFrame(
    data=np.random.rand(4, 3),
    columns=list('XYZ'),
)
df

Unnamed: 0,X,Y,Z
0,0.175578,0.501016,0.082598
1,0.90982,0.841436,0.217494
2,0.596405,0.696432,0.114723
3,0.374545,0.698198,0.967364


In [93]:
# Second frame for the pd.concat example: 3 rows of unseeded uniform
# [0, 1) draws in columns Q, P, X. Only 'X' is shared with df.
# Note: this rebinds `df2` from the earlier cells.
df2 = DataFrame(
    data=np.random.rand(3, 3),
    columns=list('QPX'),
)
df2

Unnamed: 0,Q,P,X
0,0.983263,0.555483,0.170222
1,0.738323,0.751799,0.400367
2,0.86006,0.985577,0.090441


In [95]:
# Plain concat stacks the rows of df2 under df and keeps each frame's
# own row labels, so labels repeat in the result. Columns present in
# only one frame are filled with NaN. (Not rendered: a cell shows only
# its last expression.)
pd.concat(objs=[df, df2])

# ignore_index=True discards the original labels and renumbers the
# rows 0..n-1 instead.
pd.concat(objs=[df, df2], ignore_index=True)

Unnamed: 0,P,Q,X,Y,Z
0,,,0.175578,0.501016,0.082598
1,,,0.90982,0.841436,0.217494
2,,,0.596405,0.696432,0.114723
3,,,0.374545,0.698198,0.967364
4,0.555483,0.983263,0.170222,,
5,0.751799,0.738323,0.400367,,
6,0.985577,0.86006,0.090441,,
