https://www.datacamp.com/courses/merging-dataframes-with-pandas
# 1. Merging DataFrames
pd.merge(df1, df2): extends concat( ) with the ability to align rows using multiple columns. The default strategy is an inner join.

## 4 methods of merging data
- df1.append(df2): stacking vertically
- pd.concat([df1, df2]): stacking horizontally or vertically with simple inner/outer joins on indexes
- df1.join(df2): left/right/inner/outer joins on indexes
- pd.merge([df1, df2]): many joins on multiple columns

In [82]:
import pandas as pd
import numpy as np

In [83]:
pop = pd.read_csv('datasets/pa_zipcode_population.csv')
pop.head(3)

Unnamed: 0,zipcode,2010 Census Population
0,16855,282
1,15681,5241
2,18657,11985


In [25]:
cities = pd.read_csv('datasets/pa_zipcode_city.csv')
cities.head(3)

Unnamed: 0,zipcode,City,State
0,17545,MANHEIM,PA
1,18455,PRESTON PARK,PA
2,17307,BIGLERVILE,PA


In [26]:
pd.merge(pop, cities)

Unnamed: 0,zipcode,2010 Census Population,City,State
0,16855,282,MINERAL SPRINGS,PA
1,15681,5241,SALTBURG,PA
2,18657,11985,TUNKHANNOCK,PA
3,17307,5899,BIGLERVILE,PA
4,15635,220,HANNASTOWN,PA


In [27]:
bronze = pd.read_csv('datasets/Summer Olympic medals/Bronze.csv')
bronze.head()

Unnamed: 0,NOC,Country,Total
0,USA,United States,1052.0
1,URS,Soviet Union,584.0
2,GBR,United Kingdom,505.0
3,FRA,France,475.0
4,GER,Germany,454.0


In [28]:
silver = pd.read_csv('datasets/Summer Olympic medals/Silver.csv')
gold = pd.read_csv('datasets/Summer Olympic medals/Gold.csv')

### pd.merge method 1:

In [30]:
pd.merge(bronze, gold, on ='NOC').head(8)

Unnamed: 0,NOC,Country_x,Total_x,Country_y,Total_y
0,USA,United States,1052.0,United States,2088.0
1,URS,Soviet Union,584.0,Soviet Union,838.0
2,GBR,United Kingdom,505.0,United Kingdom,498.0
3,FRA,France,475.0,France,378.0
4,GER,Germany,454.0,Germany,407.0
5,AUS,Australia,413.0,Australia,293.0
6,ITA,Italy,374.0,Italy,460.0
7,HUN,Hungary,345.0,Hungary,400.0


### pd.merge method 2: on=(['col_name1', 'col_name2'])

In [33]:
pd.merge(bronze, gold, on=['NOC', 'Country']).head(8)

Unnamed: 0,NOC,Country,Total_x,Total_y
0,USA,United States,1052.0,2088.0
1,URS,Soviet Union,584.0,838.0
2,GBR,United Kingdom,505.0,498.0
3,FRA,France,475.0,378.0
4,GER,Germany,454.0,407.0
5,AUS,Australia,413.0,293.0
6,ITA,Italy,374.0,460.0
7,HUN,Hungary,345.0,400.0


### pd.merge method 3: suffixes=['new1', 'new2']

In [37]:
pd.merge(bronze, gold, on=['NOC', 'Country'], suffixes=['_bronze', '_gold']).head()

Unnamed: 0,NOC,Country,Total_bronze,Total_gold
0,USA,United States,1052.0,2088.0
1,URS,Soviet Union,584.0,838.0
2,GBR,United Kingdom,505.0,498.0
3,FRA,France,475.0,378.0
4,GER,Germany,454.0,407.0


In [39]:
counties = pd.read_csv('datasets/pa_counties.csv')
counties

Unnamed: 0,city name,county name
0,SALTBURG,INDIANA
1,MINERAL SPRINGS,CLEARFIELD
2,BIGLERVILE,ADAMS
3,HANNASTOWN,WESTMORELAND
4,TUNKHANNOCK,WYOMING


In [41]:
pd.merge(counties, cities, left_on='city name', right_on='City')

Unnamed: 0,city name,county name,zipcode,City,State
0,SALTBURG,INDIANA,15681,SALTBURG,PA
1,MINERAL SPRINGS,CLEARFIELD,16855,MINERAL SPRINGS,PA
2,BIGLERVILE,ADAMS,17307,BIGLERVILE,PA
3,HANNASTOWN,WESTMORELAND,15635,HANNASTOWN,PA
4,TUNKHANNOCK,WYOMING,18657,TUNKHANNOCK,PA


In [44]:
pd.merge(cities, counties, left_on='City', right_on='city name')

Unnamed: 0,zipcode,City,State,city name,county name
0,17307,BIGLERVILE,PA,BIGLERVILE,ADAMS
1,16855,MINERAL SPRINGS,PA,MINERAL SPRINGS,CLEARFIELD
2,15635,HANNASTOWN,PA,HANNASTOWN,WESTMORELAND
3,15681,SALTBURG,PA,SALTBURG,INDIANA
4,18657,TUNKHANNOCK,PA,TUNKHANNOCK,WYOMING


# Practice 1

In [57]:
rev = pd.read_csv('datasets/revenue.csv')
rev

Unnamed: 0,city,branch_id,revenue
0,Austin,10,100
1,Denver,20,83
2,Springfield,30,4
3,Mendocino,47,200


In [58]:
manager = pd.read_csv('datasets/manager.csv')
manager

Unnamed: 0,branch,branch_id,manager
0,Austin,10,Charlers
1,Denver,20,Joel
2,Mendocino,47,Brett
3,Springfield,31,Sally


In [62]:
pd.merge(rev, manager, on='branch_id')

Unnamed: 0,city,branch_id,revenue,branch,manager
0,Austin,10,100,Austin,Charlers
1,Denver,20,83,Denver,Joel
2,Mendocino,47,200,Mendocino,Brett


In [63]:
pd.merge(rev, manager, left_on='city', right_on='branch')

Unnamed: 0,city,branch_id_x,revenue,branch,branch_id_y,manager
0,Austin,10,100,Austin,10,Charlers
1,Denver,20,83,Denver,20,Joel
2,Springfield,30,4,Springfield,31,Sally
3,Mendocino,47,200,Mendocino,47,Brett


In [66]:
rev['state'] = ['TX','CO','IL','CA']
manager['state'] = ['TX','CO','CA','MO']
pd.merge(rev, manager, on=['branch_id', 'state'])

Unnamed: 0,city,branch_id,revenue,state,branch,manager
0,Austin,10,100,TX,Austin,Charlers
1,Denver,20,83,CO,Denver,Joel
2,Mendocino,47,200,CA,Mendocino,Brett


# 2. Joining DataFrames: df1.join(df2, how='outer')
- how='left', or right, or inner, or outer.

In [81]:
pd.merge(bronze, gold, on=['NOC', 'Country'], suffixes=['_bronze', '_gold'], how='left').head(10)

Unnamed: 0,NOC,Country,Total_bronze,Total_gold
0,USA,United States,1052.0,2088.0
1,URS,Soviet Union,584.0,838.0
2,GBR,United Kingdom,505.0,498.0
3,FRA,France,475.0,378.0
4,GER,Germany,454.0,407.0
5,AUS,Australia,413.0,293.0
6,ITA,Italy,374.0,460.0
7,HUN,Hungary,345.0,400.0
8,SWE,Sweden,325.0,347.0
9,NED,Netherlands,320.0,212.0


In [85]:
rev1 = pd.read_csv('datasets/revenue1.csv')
rev1

Unnamed: 0,branch_id,city,state,revenue
0,10,Austin,TX,100
1,20,Denver,CO,83
2,30,Springfield,IL,4
3,47,Mendocino,CA,200


In [86]:
manager1 = pd.read_csv('datasets/manager1.csv')
manager1

Unnamed: 0,branch_id,branch,state,manager
0,10,Austin,TX,Charlers
1,20,Denver,CO,Joel
2,47,Mendocino,IL,Brett
3,31,Springfield,CA,Sally


In [91]:
rev1.join(manager1, lsuffix='_rev', rsuffix='_mng', how='outer')

Unnamed: 0,branch_id_rev,city,state_rev,revenue,branch_id_mng,branch,state_mng,manager
0,10,Austin,TX,100,10,Austin,TX,Charlers
1,20,Denver,CO,83,20,Denver,CO,Joel
2,30,Springfield,IL,4,47,Mendocino,IL,Brett
3,47,Mendocino,CA,200,31,Springfield,CA,Sally


In [92]:
rev1

Unnamed: 0,branch_id,city,state,revenue
0,10,Austin,TX,100
1,20,Denver,CO,83
2,30,Springfield,IL,4
3,47,Mendocino,CA,200


In [89]:
sales = pd.read_csv('datasets/sales1.csv')
sales

Unnamed: 0,city,state,manager
0,Mendocino,CA,1
1,Denver,CO,4
2,Austin,TX,2
3,Springfield,MO,5
4,Springfield,IL,1


In [90]:
pd.merge(rev1, sales, how='right', on=['city', 'state'])

Unnamed: 0,branch_id,city,state,revenue,manager
0,10.0,Austin,TX,100.0,2
1,20.0,Denver,CO,83.0,4
2,30.0,Springfield,IL,4.0,1
3,47.0,Mendocino,CA,200.0,1
4,,Springfield,MO,,5


In [100]:
sales

Unnamed: 0,city,state,manager
0,Mendocino,CA,1
1,Denver,CO,4
2,Austin,TX,2
3,Springfield,MO,5
4,Springfield,IL,1


In [99]:
manager1

Unnamed: 0,branch_id,branch,state,manager
0,10,Austin,TX,Charlers
1,20,Denver,CO,Joel
2,47,Mendocino,IL,Brett
3,31,Springfield,CA,Sally


In [98]:
pd.merge(sales, manager1, how='left', left_on=['city', 'state'], right_on=['branch', 'state'])

Unnamed: 0,city,state,manager_x,branch_id,branch,manager_y
0,Mendocino,CA,1,,,
1,Denver,CO,4,20.0,Denver,Joel
2,Austin,TX,2,10.0,Austin,Charlers
3,Springfield,MO,5,,,
4,Springfield,IL,1,,,


# 3. merge_ordered( )

In [104]:
austin = pd.read_csv('datasets/austin.csv')
houston = pd.read_csv('datasets/houston.csv')
austin

Unnamed: 0,date,ratings
0,1/1/16,Cloudy
1,2/8/16,Cloudy
2,1/17/16,Sunny


In [105]:
houston

Unnamed: 0,date,ratings
0,1/4/16,Rainy
1,1/1/16,Cloudy
2,3/1/16,Sunny


In [103]:
pd.merge_ordered(austin, houston)

Unnamed: 0,date,ratings
0,1/1/16,Cloudy
1,1/17/16,Sunny
2,1/4/16,Rainy
3,2/8/16,Cloudy
4,3/1/16,Sunny


In [108]:
pd.merge_ordered(austin, houston, on='date', suffixes=['_aus', '_hous'])

Unnamed: 0,date,ratings_aus,ratings_hous
0,1/1/16,Cloudy,Cloudy
1,1/17/16,Sunny,
2,1/4/16,,Rainy
3,2/8/16,Cloudy,
4,3/1/16,,Sunny


In [109]:
pd.merge_ordered(austin, houston, on='date', suffixes=['_aus', '_hous'], fill_method='ffill')

Unnamed: 0,date,ratings_aus,ratings_hous
0,1/1/16,Cloudy,Cloudy
1,1/17/16,Sunny,Cloudy
2,1/4/16,Sunny,Rainy
3,2/8/16,Cloudy,Rainy
4,3/1/16,Cloudy,Sunny


In [159]:
auto = pd.read_csv('datasets/automobiles.csv', parse_dates=True, index_col='yr')
auto.head(3)

Unnamed: 0_level_0,mpg,cyl,displ,hp,weight,accel,origin,name
yr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1970-01-01,18.0,8,307.0,130,3504,12.0,US,chevrolet chevelle malibu
1970-01-01,15.0,8,350.0,165,3693,11.5,US,buick skylark 320
1970-01-01,18.0,8,318.0,150,3436,11.0,US,plymouth satellite


In [160]:
oil = pd.read_csv('datasets/oil_price.csv', parse_dates=True, index_col='Date')
oil.head(3)

Unnamed: 0_level_0,Price
Date,Unnamed: 1_level_1
1970-01-01,3.35
1970-02-01,3.35
1970-03-01,3.35


In [161]:
merged = pd.merge_asof(auto, oil, left_on='yr', right_on='Date').head()
merged

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,origin,name,Price
0,18.0,8,307.0,130,3504,12.0,US,chevrolet chevelle malibu,3.35
1,15.0,8,350.0,165,3693,11.5,US,buick skylark 320,3.35
2,18.0,8,318.0,150,3436,11.0,US,plymouth satellite,3.35
3,16.0,8,304.0,150,3433,12.0,US,amc rebel sst,3.35
4,17.0,8,302.0,140,3449,10.5,US,ford torino,3.35


In [162]:
merged.resample('A',on='Date')[['mpg','Price']].mean()

KeyError: 'The grouper name Date is not found'