# 6 Working with multiple sources

In [1]:
import numpy as np
import pandas as pd

# Sample frames for the concat examples: identical column names ("a", "b"),
# different lengths (six rows vs. five rows).
df1 = pd.DataFrame({"a": [1, 2, 1, 2, 1, 2], "b": [3, 4, 3, 4, 3, 4]})
df2 = pd.DataFrame({"a": [1, 3, 1, 3, 1], "b": [5, 4, 5, 4, 5]})

## 6.1 Concat two dataframes

In [2]:
# pd.concat takes the frames as ONE list argument; with the default axis=0
# the rows of df2 are stacked below the rows of df1.
temp = pd.concat([df1, df2])
temp

Unnamed: 0,a,b
0,1,3
1,2,4
2,1,3
3,2,4
4,1,3
5,2,4
0,1,5
1,3,4
2,1,5
3,3,4


Notice the index: each frame keeps its original row labels, so the labels 0–4 now appear twice.

In [3]:
# row labels of the concatenated frame: df1's labels followed by df2's
temp.index

Int64Index([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4], dtype='int64')

In [4]:
temp.loc[0, :] # .loc selects by index label, so both rows labelled 0 are returned

Unnamed: 0,a,b
0,1,3
0,1,5


In [5]:
temp.iloc[0, :] # .iloc selects by integer position, so only the very first row is returned

a    1
b    3
Name: 0, dtype: int64

You can reset the index

In [6]:
# builds a fresh 0..n-1 index; the previous labels are moved into a new "index" column
temp.reset_index()

Unnamed: 0,index,a,b
0,0,1,3
1,1,2,4
2,2,1,3
3,3,2,4
4,4,1,3
5,5,2,4
6,0,1,5
7,1,3,4
8,2,1,5
9,3,3,4


In [7]:
# side-by-side concat: axis=1 puts df2's columns to the right of df1's, aligning rows on the index
temp = pd.concat([df1, df2], axis=1)
temp # NaN stands for "Not a Number"; pandas uses it to mark missing values (df2 has no row labelled 5)

Unnamed: 0,a,b,a.1,b.1
0,1,3,1.0,5.0
1,2,4,3.0,4.0
2,1,3,1.0,5.0
3,2,4,3.0,4.0
4,1,3,1.0,5.0
5,2,4,,


Note the column names: `a` and `b` each appear twice, so selecting `temp['a']` returns both columns named `a`.

In [8]:
# the label "a" is duplicated, so this returns a two-column frame instead of a single column
temp['a']

Unnamed: 0,a,a.1
0,1,1.0
1,2,3.0
2,1,1.0
3,2,3.0
4,1,1.0
5,2,


## 6.2 Joining 

In [9]:
# Frames for the join examples. Their indexes overlap only on labels 3 and 4,
# and their columns overlap only on "first" and "second".
df1 = pd.DataFrame(
    {"first": [1, 1, 1, 1], "second": [1, 1, 1, 1], "third": [1, 2, 1, 2]},
    index=[1, 2, 3, 4],
)

df2 = pd.DataFrame(
    {"first": [2, 2, 2, 2], "second": [2, 2, 2, 2], "fourth": [1, 3, 1, 3]},
    index=[3, 4, 5, 6],
)

In [10]:
# df1: index labels 1-4, columns first / second / third
df1

Unnamed: 0,first,second,third
1,1,1,1
2,1,1,2
3,1,1,1
4,1,1,2


In [11]:
# df2: index labels 3-6, columns first / second / fourth
df2

Unnamed: 0,first,second,fourth
3,2,2,1
4,2,2,3
5,2,2,1
6,2,2,3


In [12]:
# default axis=0 stacks the rows; join='outer' keeps the union of the columns,
# so "third" is NaN for df2's rows and "fourth" is NaN for df1's rows
pd.concat([df1, df2], join='outer')

Unnamed: 0,first,second,third,fourth
1,1,1,1.0,
2,1,1,2.0,
3,1,1,1.0,
4,1,1,2.0,
3,2,2,,1.0
4,2,2,,3.0
5,2,2,,1.0
6,2,2,,3.0


In [13]:
# axis=1 places the frames side by side; join='outer' keeps the union of the index labels (1-6),
# filling NaN where a frame has no row for that label
pd.concat([df1, df2], join='outer', axis=1)

Unnamed: 0,first,second,third,first.1,second.1,fourth
1,1.0,1.0,1.0,,,
2,1.0,1.0,2.0,,,
3,1.0,1.0,1.0,2.0,2.0,1.0
4,1.0,1.0,2.0,2.0,2.0,3.0
5,,,,2.0,2.0,1.0
6,,,,2.0,2.0,3.0


In [14]:
# inner join: only the index labels present in both frames (3 and 4) are kept
pd.concat([df1, df2], join='inner', axis=1)

Unnamed: 0,first,second,third,first.1,second.1,fourth
3,1,1,1,2,2,1
4,1,1,2,2,2,3


In [15]:
# left join with DataFrame.join: every row of df1 is kept and df2 is matched on the index;
# rsuffix is appended to df2's overlapping column names ("first", "second").
# Unlike join, pd.concat can combine any number of frames in one call.
df1.join(df2, how="left", rsuffix="_second")

Unnamed: 0,first,second,third,first_second,second_second,fourth
1,1,1,1,,,
2,1,1,2,,,
3,1,1,1,2.0,2.0,1.0
4,1,1,2,2.0,2.0,3.0


## 6.3 Reading
There is so much more to understand about joins. I highly recommend checking the original pandas documentation:<br>
https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html