In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

In [4]:
df1 = pd.DataFrame(data= 1,
                   columns=['A','B','C','D'],
                   index=[0,1,2,3])

df2 = pd.DataFrame(data= 2 ,
                   columns=['A','B','C','D'],
                   index=[4,5,6,7])

df3 = pd.DataFrame(data= 3,
                   columns=['A','B','C','D'],
                   index=[4,5,6,7])

In [3]:
df1

Unnamed: 0,A,B,C,D
0,1,1,1,1
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1


In [4]:
df2

Unnamed: 0,A,B,C,D
4,2,2,2,2
5,2,2,2,2
6,2,2,2,2
7,2,2,2,2


In [5]:
df3

Unnamed: 0,A,B,C,D
4,3,3,3,3
5,3,3,3,3
6,3,3,3,3
7,3,3,3,3


## Concat

### `pandas.concat(objs, *, axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=False, copy=None)`

    Useful link
    - https://pandas.pydata.org/docs/user_guide/merging.html

In [12]:
pd.concat([df1,df2,df3],axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,1.0,1.0,1.0,1.0,,,,,,,,
1,1.0,1.0,1.0,1.0,,,,,,,,
2,1.0,1.0,1.0,1.0,,,,,,,,
3,1.0,1.0,1.0,1.0,,,,,,,,
4,,,,,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0
5,,,,,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0
6,,,,,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0
7,,,,,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0


## Merge

### `DataFrame.merge(right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=None, indicator=False, validate=None)`

    - how : {‘left’, ‘right’, ‘outer’, ‘inner’, ‘cross’}, default ‘inner’
    - suffixes : list-like, default is (“_x”, “_y”)

In [16]:
df_1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
   
df_2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K4'],
                          'C': ['C0', 'C1', 'C2', 'C4'],
                          'D': ['D0', 'D1', 'D2', 'D4']})

In [18]:
df_1

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K2,A2,B2
3,K3,A3,B3


In [20]:
df_2

Unnamed: 0,key,C,D
0,K0,C0,D0
1,K1,C1,D1
2,K2,C2,D2
3,K4,C4,D4


  * The first is referenced as left. The second, right.
  * 'how' is the type of merge to be performed - how: 'left','right', 'outer', 'inner'. In case you are familiar with SQL, this is equivalent to when combining SQL tables
    * It will be your job in the workplace to decide which type of merge should be performed
  * 'on' is the column to merge on

The example below shows: `how='left'`. Note that the result doesn't consider the 'key' column value, which df_1 doesn't have, like K4

In [23]:
pd.merge(df_1, df_2, how='left', on='key')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K3,A3,B3,,


In [25]:
pd.merge(df_1, df_2, how='right', on='key')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K4,,,C4,D4


In [27]:
# Inner takes only common values

pd.merge(df_1, df_2, how='inner', on='key')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2


In [29]:
# Outer takes all values
# Indicator creates another column & shows which data is from which dataset

pd.merge(df_1, df_2, how='outer', on='key', indicator=True) 

Unnamed: 0,key,A,B,C,D,_merge
0,K0,A0,B0,C0,D0,both
1,K1,A1,B1,C1,D1,both
2,K2,A2,B2,C2,D2,both
3,K3,A3,B3,,,left_only
4,K4,,,C4,D4,right_only


In [31]:
df_3 = pd.DataFrame({'Column_key': ['K0', 'K1', 'K2', 'K4'],
                          'C': ['C10', 'C20', 'C30', 'C40'],
                          'D': ['D10', 'D20', 'D30', 'D40']}) 

df_3

Unnamed: 0,Column_key,C,D
0,K0,C10,D10
1,K1,C20,D20
2,K2,C30,D30
3,K4,C40,D40


### If you are interested in merging `df_3` and `df_1`, considering only the data that is mutual to both: `how='inner'`, where column `Key` for each dataset is different, like `key`in `df_1` and `Column_key` in `df_3`, we can do follow,

* `left_on` uses the column named `key`, which is a column name of the DataFrame `df_1`.
* `right_on` uses the column named `Column_key`, which is a column name of the DataFrame `df_3`


In [34]:
pd.merge(left=df_1, right=df_3, how='inner', left_on='key', right_on='Column_key')

Unnamed: 0,key,A,B,Column_key,C,D
0,K0,A0,B0,K0,C10,D10
1,K1,A1,B1,K1,C20,D20
2,K2,A2,B2,K2,C30,D30


## Join

### `DataFrame.join(other, on=None, how='left', lsuffix='', rsuffix='', sort=False, validate=None)`


In [38]:
df_left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                        'B': ['B0', 'B1', 'B2']},
                       index=['K0', 'K1', 'K2']) 

df_right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                         'D': ['D0', 'D2', 'D3']},
                        index=['K0', 'K2', 'K3'])

In [40]:
df_left

Unnamed: 0,A,B
K0,A0,B0
K1,A1,B1
K2,A2,B2


In [42]:
df_right

Unnamed: 0,C,D
K0,C0,D0
K2,C2,D2
K3,C3,D3


In [44]:
df_left.join(df_right,how='left')

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2


In [46]:
df1 = pd.read_csv("/Users/raihannasir/Documents/Pandas/New Materials/sales-data-set.csv")
df1

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-05-02,24924.50,False
1,1,1,2010-05-03,21827.90,False
2,1,1,2010-05-14,18926.74,False
3,1,1,2010-05-21,14773.04,False
4,1,1,2010-05-28,15580.43,False
...,...,...,...,...,...
20472,45,98,2010-05-03,4.50,False
20473,45,98,2010-05-21,55.86,False
20474,45,98,2010-05-28,97.28,False
20475,45,98,2010-06-08,8.25,False


In [48]:
df1['Date'].dtype

dtype('O')

In [7]:
df2 = pd.read_csv("/Users/raihannasir/Documents/Pandas/New Materials/fuel price.csv")
df2

Unnamed: 0.1,Unnamed: 0,Date,Pump price in pence/litre (ULSP),Pump price in pence/litre (ULSD),Duty rate in pence/litre (ULSP),Duty rate in pence/litre (ULSD),VAT percentage rate (ULSP),VAT percentage rate (ULSD)
0,2,09/06/2003,74.59,76.77,45.82,45.82,17.5,17.5
1,3,16/06/2003,74.47,76.69,45.82,45.82,17.5,17.5
2,4,23/06/2003,74.42,76.62,45.82,45.82,17.5,17.5
3,5,30/06/2003,74.35,76.51,45.82,45.82,17.5,17.5
4,6,07/07/2003,74.28,76.46,45.82,45.82,17.5,17.5
...,...,...,...,...,...,...,...,...
904,906,05/10/2020,113.26,118.11,57.95,57.95,20.0,20.0
905,907,12/10/2020,113.19,118.05,57.95,57.95,20.0,20.0
906,908,19/10/2020,113.18,118.08,57.95,57.95,20.0,20.0
907,909,26/10/2020,113.14,118.08,57.95,57.95,20.0,20.0


In [9]:
df2 = df2.rename(columns={'Unnamed: 0': 'Store'}, inplace=True)
df2

In [11]:
df2

In [17]:
df2 = df2.astype({'Date':"datetime64[s]"})
df2

AttributeError: 'NoneType' object has no attribute 'astype'

In [373]:
df2['Date'].dtypes

dtype('<M8[s]')

In [387]:
df1['Date'] = df1['Date'].astype({'Date':"datetime64[s]"})
df1

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-05-02,24924.50,False
1,1,1,2010-05-03,21827.90,False
2,1,1,2010-05-14,18926.74,False
3,1,1,2010-05-21,14773.04,False
4,1,1,2010-05-28,15580.43,False
...,...,...,...,...,...
20472,45,98,2010-05-03,4.50,False
20473,45,98,2010-05-21,55.86,False
20474,45,98,2010-05-28,97.28,False
20475,45,98,2010-06-08,8.25,False


In [389]:
df1['Date'].dtype

dtype('<M8[s]')

In [391]:
pd.merge(df1,df2,how='left', on='Date')

Unnamed: 0.1,Store,Dept,Date,Weekly_Sales,IsHoliday,Unnamed: 0,Pump price in pence/litre (ULSP),Pump price in pence/litre (ULSD),Duty rate in pence/litre (ULSP),Duty rate in pence/litre (ULSD),VAT percentage rate (ULSP),VAT percentage rate (ULSD)
0,1,1,2010-05-02,24924.50,False,,,,,,,
1,1,1,2010-05-03,21827.90,False,,,,,,,
2,1,1,2010-05-14,18926.74,False,,,,,,,
3,1,1,2010-05-21,14773.04,False,,,,,,,
4,1,1,2010-05-28,15580.43,False,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
20472,45,98,2010-05-03,4.50,False,,,,,,,
20473,45,98,2010-05-21,55.86,False,,,,,,,
20474,45,98,2010-05-28,97.28,False,,,,,,,
20475,45,98,2010-06-08,8.25,False,,,,,,,
