# Joining and Merging Dataframes - p.6 Data Analysis with Python and Pandas Tutorial

https://www.youtube.com/watch?v=XMjSGGej9y8

https://www.youtube.com/watch?v=h4hOPGo4UVU

- Use merge when index does not matter
- Use join when index does matter
- Use concat or append to elongate (stack the rows of one dataframe below another)


- Concat gives the flexibility to join along either axis (all rows or all columns)
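- Append is the specific case (axis=0, join='outer') of concat. Note: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so use pd.concat in current versions.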
- Join is based on the indexes (set by set_index); its `how` argument can be one of ['left', 'right', 'inner', 'outer']


## Functions Covered

pd.merge(df1, df2, on='HPI')

pd.merge(df1, df2, on=['HPI', 'Int_Rate'])

---

df1.set_index('HPI', inplace=True)

df3.set_index('HPI', inplace=True)

"Now the dataframes share an index but no columns.  Important for join"

joined = df1.join(df3)

---

merged = pd.merge(df4, df5, on = 'Year', how='inner', indicator=True)

merged.set_index('Year', inplace=True)

---

merged = pd.merge(df4, df5, on = 'Year', how='outer', indicator=True)

merged.set_index('Year', inplace=True)

---

pd.merge(df6, df7, on='city')

pd.merge(df6, df7, on='city', suffixes=['_left', '_right'])

In [7]:
import pandas as pd

# Sample frames for the merge/join examples.
# df1 and df2 hold the same columns and values but are labelled with
# different years; df3 shares df1's year labels and only the 'HPI' column.
df1 = pd.DataFrame(
    data=dict(
        HPI=[80, 85, 88, 85],
        Int_Rate=[2, 3, 2, 2],
        US_GDP_Thousands=[50, 55, 65, 55],
    ),
    index=[2001, 2002, 2003, 2004],
)
print(df1)

df2 = pd.DataFrame(
    data=dict(
        HPI=[80, 85, 88, 85],
        Int_Rate=[2, 3, 2, 2],
        US_GDP_Thousands=[50, 55, 65, 55],
    ),
    index=[2005, 2006, 2007, 2008],
)
print(df2)

df3 = pd.DataFrame(
    data=dict(
        HPI=[80, 85, 88, 85],
        Unemployment=[7, 8, 9, 6],
        Low_tier_HPI=[50, 52, 50, 53],
    ),
    index=[2001, 2002, 2003, 2004],
)
print(df3)

      HPI  Int_Rate  US_GDP_Thousands
2001   80         2                50
2002   85         3                55
2003   88         2                65
2004   85         2                55
      HPI  Int_Rate  US_GDP_Thousands
2005   80         2                50
2006   85         3                55
2007   88         2                65
2008   85         2                55
      HPI  Unemployment  Low_tier_HPI
2001   80             7            50
2002   85             8            52
2003   88             9            50
2004   85             6            53


In [10]:
# Inner-merge df1 and df2 on the 'HPI' column only. The year indexes are
# ignored and the result is numbered 0..n-1.
pd.merge(df1, df2, on='HPI')

# If you do NOT set the `on` argument, the merge will choose the columns
# with the same names in both frames and use those as the join keys.
# Non-key columns present in both frames get the _x / _y suffixes.

# Each df1 row is paired with every df2 row that has the same HPI:
# df1[HPI] = 80  Matches 1 in df2[HPI]
# df1[HPI] = 85  Matches 2 in df2[HPI]
# df1[HPI] = 88  Matches 1 in df2[HPI]
# df1[HPI] = 85  Matches 2 in df2[HPI]
#
# Total matches = 1 + 2 + 1 + 2 = 6

Unnamed: 0,HPI,Int_Rate_x,US_GDP_Thousands_x,Int_Rate_y,US_GDP_Thousands_y
0,80,2,50,2,50
1,85,3,55,3,55
2,85,3,55,2,55
3,85,2,55,3,55
4,85,2,55,2,55
5,88,2,65,2,65


In [9]:
# Merge on the pair (HPI, Int_Rate): rows match only when BOTH values agree,
# so the duplicated HPI of 85 no longer multiplies rows.
pd.merge(df1, df2, on=['HPI', 'Int_Rate'])

# df1[HPI, Int_Rate] = 80,2  Matches 1 in df2[HPI, Int_Rate]
# df1[HPI, Int_Rate] = 85,3  Matches 1 in df2[HPI, Int_Rate]
# df1[HPI, Int_Rate] = 88,2  Matches 1 in df2[HPI, Int_Rate]
# df1[HPI, Int_Rate] = 85,2  Matches 1 in df2[HPI, Int_Rate]
#
# Total matches = 4

Unnamed: 0,HPI,Int_Rate,US_GDP_Thousands_x,US_GDP_Thousands_y
0,80,2,50,50
1,85,3,55,55
2,88,2,65,65
3,85,2,55,55


In [11]:
# Move 'HPI' from a regular column into the index of both frames, in place.
# NOTE: re-running this cell fails with KeyError, because after the first run
# 'HPI' is no longer a column of df1 / df3.
df1.set_index('HPI', inplace=True)
df3.set_index('HPI', inplace=True)

# Now the dataframes share an index but no columns.  Important for join:
# join aligns rows on the index, and overlapping column names would require
# lsuffix/rsuffix. HPI 85 appears twice in each frame, so it produces
# 2 x 2 = 4 rows in the result.
joined = df1.join(df3)
print(joined)

     Int_Rate  US_GDP_Thousands  Unemployment  Low_tier_HPI
HPI                                                        
80          2                50             7            50
85          3                55             8            52
85          3                55             6            53
85          2                55             8            52
85          2                55             6            53
88          2                65             9            50


In [14]:
# Two frames keyed by a 'Year' column. Their years only partly overlap:
# 2002 appears only in df4 and 2005 only in df5.
df4 = pd.DataFrame(
    data=dict(
        Year=[2001, 2002, 2003, 2004],
        Int_Rate=[2, 3, 2, 2],
        US_GDP_Thousands=[50, 55, 65, 55],
    )
)
print(df4)


df5 = pd.DataFrame(
    data=dict(
        Year=[2001, 2003, 2004, 2005],
        Unemployment=[7, 8, 9, 6],
        Low_tier_HPI=[50, 52, 50, 53],
    )
)
print(df5)

   Year  Int_Rate  US_GDP_Thousands
0  2001         2                50
1  2002         3                55
2  2003         2                65
3  2004         2                55
   Year  Unemployment  Low_tier_HPI
0  2001             7            50
1  2003             8            52
2  2004             9            50
3  2005             6            53


In [19]:
# Inner merge: keep only the years present in BOTH df4 and df5.
# indicator=True adds a '_merge' column recording where each row came from
# (always 'both' for an inner merge).
merged = pd.merge(df4, df5, on = 'Year', how='inner', indicator=True)
# Use Year as the row label of the result instead of 0..n-1.
merged.set_index('Year', inplace=True)
print(merged)

      Int_Rate  US_GDP_Thousands  Unemployment  Low_tier_HPI _merge
Year                                                               
2001         2                50             7            50   both
2003         2                65             8            52   both
2004         2                55             9            50   both


In [20]:
# Outer merge: keep every year found in EITHER frame. Values missing on one
# side become NaN, which is why the integer columns are printed as floats.
# The '_merge' column shows 'both', 'left_only' (df4) or 'right_only' (df5).
merged = pd.merge(df4, df5, on = 'Year', how='outer', indicator=True)
# Use Year as the row label of the result instead of 0..n-1.
merged.set_index('Year', inplace=True)
print(merged)

      Int_Rate  US_GDP_Thousands  Unemployment  Low_tier_HPI      _merge
Year                                                                    
2001       2.0              50.0           7.0          50.0        both
2002       3.0              55.0           NaN           NaN   left_only
2003       2.0              65.0           8.0          52.0        both
2004       2.0              55.0           9.0          50.0        both
2005       NaN               NaN           6.0          53.0  right_only


# Left, Right, Outer, Inner merges

how = 'left' merge based on df4 keys

how = 'right' merge based on df5 keys

how = 'outer' merge based on union of df4 & df5 keys

how = 'inner' merge based on intersection of df4 & df5 keys (default for pd.merge; DataFrame.join defaults to 'left')


In [23]:
# Two weather tables that share all column names. Only 'new york' and
# 'chicago' occur in both, and their readings differ between the tables.
df6 = pd.DataFrame(
    data=dict(
        city=['new york', 'chicago', 'orlando', 'baltimore'],
        temperature=[21, 14, 35, 36],
        humidity=[65, 68, 71, 75],
    )
)
df7 = pd.DataFrame(
    data=dict(
        city=['chicago', 'new york', 'san diego'],
        temperature=[21, 14, 35],
        humidity=[65, 68, 71],
    )
)
# Show both tables separated by one blank line.
print(df6, df7, sep='\n\n')

        city  temperature  humidity
0   new york           21        65
1    chicago           14        68
2    orlando           35        71
3  baltimore           36        75 

         city  temperature  humidity
0    chicago           21        65
1   new york           14        68
2  san diego           35        71


In [24]:
# Inner merge on 'city': only cities present in both tables survive.
# The overlapping temperature/humidity columns get the default suffixes
# _x (from df6) and _y (from df7).
pd.merge(df6, df7, on='city')

Unnamed: 0,city,temperature_x,humidity_x,temperature_y,humidity_y
0,new york,21,65,14,68
1,chicago,14,68,21,65


In [25]:
# Same merge, but `suffixes` replaces the default _x / _y with explicit
# labels: the first applies to df6's columns, the second to df7's.
pd.merge(df6, df7, on='city', suffixes=['_left', '_right'])

Unnamed: 0,city,temperature_left,humidity_left,temperature_right,humidity_right
0,new york,21,65,14,68
1,chicago,14,68,21,65
