https://www.datacamp.com/courses/merging-dataframes-with-pandas
# 1. Appending & concatenating Series
## 1) Series1.append(Series2): works for series and dataframes. It can only stack vertically.
## 2) pd.concat([s1, s2, s3]): it accepts a list or sequence of several series or dataframes to concatenate. It can concatenate vertically & horizontally.
- pd.concat([s1, s2, s3]) = s1.append(s2).append(s3)
- pd.concat([s1, s2, s3], axis=0): stacking rows vertically at the bottom. axis=0 & axis='rows' are the same.
- axis=1 & axis='columns': stacking DataFrames columns horizontally.

In [166]:
import pandas as pd
import numpy as np

In [167]:
northeast = pd.Series(['CT', 'ME', 'MA', 'NH', 'RI', 'VT','NJ', 'NY', 'PA'])
south = pd.Series(['DE', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA','DC', 'WV', 'AL', 'KY', 'MS', 'TN', 'AR', 'LA', 'OK', 'TX'])
midwest = pd.Series(['IL', 'IN', 'MN', 'MO', 'NE', 'ND','SD', 'IA', 'KS', 'MI', 'OH', 'WI'])
west = pd.Series(['AZ', 'CO', 'ID', 'MT', 'NV', 'NM','UT', 'WY', 'AK', 'CA', 'HI', 'OR','WA'])

In [168]:
# the index labels remain the same as in the original 2 series
east = northeast.append(south)
east.head(11)

0    CT
1    ME
2    MA
3    NH
4    RI
5    VT
6    NJ
7    NY
8    PA
0    DE
1    FL
dtype: object

In [169]:
east.index

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,  7,
             8,  9, 10, 11, 12, 13, 14, 15, 16],
           dtype='int64')

In [170]:
# It would extract the 4th element from 2 series
east.loc[3]

3    NH
3    MD
dtype: object

In [171]:
# drop=True: it discards the old index and reset the indexes
northeast.append(south).reset_index(drop=True).head(11)

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
9     DE
10    FL
dtype: object

In [172]:
# It also contains repeated indexes
east1 = pd.concat([northeast, south])
east1.head(11)

0    CT
1    ME
2    MA
3    NH
4    RI
5    VT
6    NJ
7    NY
8    PA
0    DE
1    FL
dtype: object

In [173]:
east1.index

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,  7,
             8,  9, 10, 11, 12, 13, 14, 15, 16],
           dtype='int64')

In [174]:
# ignore_index=True: it can reset the indexes easily without using .reset_index(drop=True)
pd.concat([northeast, south], ignore_index=True).head(11)

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
9     DE
10    FL
dtype: object

# Practice 1

In [175]:
jan = pd.read_csv('datasets/sales-jan-2015.csv', parse_dates=True, index_col='Date')
jan.head(2)

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-21 19:13:21,Streeplex,Hardware,11
2015-01-09 05:23:51,Streeplex,Service,8


In [176]:
feb = pd.read_csv('datasets/sales-feb-2015.csv', parse_dates=True, index_col='Date')
mar = pd.read_csv('datasets/sales-mar-2015.csv', parse_dates=True, index_col='Date')

## 1) Merging datasets: series.append(series).append(series)

In [177]:
Q1 = jan.Units.append(feb.Units).append(mar.Units)
Q1.head()

Date
2015-01-21 19:13:21    11
2015-01-09 05:23:51     8
2015-01-06 17:19:34    17
2015-01-02 09:51:06    16
2015-01-11 14:51:02    11
Name: Units, dtype: int64

In [178]:
Q1.loc['jan 27, 2015': 'Feb 2, 2015']

Date
2015-01-27 07:11:55    18
2015-02-02 08:33:01     3
2015-02-02 20:54:49     9
Name: Units, dtype: int64

In [179]:
Q1.loc['feb 26, 2015':'mar 7, 2015']

Date
2015-02-26 08:57:45     4
2015-02-26 08:58:51     1
2015-03-06 10:11:45    17
2015-03-06 02:03:56    17
Name: Units, dtype: int64

In [180]:
Q1.sum()

642

## 2) Merging datasets: looping

In [181]:
units = []
for i in [jan, feb, mar]:
    units.append(i['Units'])
    
quarter1 = pd.concat(units, axis='rows')
quarter1.loc['jan 27, 2015':'feb 2, 2015']

Date
2015-01-27 07:11:55    18
2015-02-02 08:33:01     3
2015-02-02 20:54:49     9
Name: Units, dtype: int64

# Practice 2

In [182]:
name1 = pd.read_csv('datasets/Baby names/names1881.csv', header=None)
name2 = pd.read_csv('datasets/Baby names/names1981.csv', header=None)

In [183]:
name1.head(2)

Unnamed: 0,0,1,2
0,Mary,F,6919
1,Anna,F,2698


In [184]:
name2.head(2)

Unnamed: 0,0,1,2
0,Jennifer,F,57032
1,Jessica,F,42519


In [185]:
names = pd.concat([name1, name2], ignore_index=True)
names.head(3)

Unnamed: 0,0,1,2
0,Mary,F,6919
1,Anna,F,2698
2,Emma,F,2034


In [186]:
labels = ['name', 'gender', 'total']
names.columns = labels

In [187]:
names.shape

(21390, 3)

In [188]:
names.head()

Unnamed: 0,name,gender,total
0,Mary,F,6919
1,Anna,F,2698
2,Emma,F,2034
3,Elizabeth,F,1852
4,Margaret,F,1658


In [189]:
names.loc[names['name']=='Morgan']

Unnamed: 0,name,gender,total
1283,Morgan,M,23
2096,Morgan,F,1769
14390,Morgan,M,766


# 3. Concatenation, keys, & MultiIndexes
- <b>pd.concat([s1, s2], keys=[col_name1, col_name2], axis=0):</b> the argument keys=[col_name1, col_name2] assigns an outer index label associated with each of hte original input df. 
- <b>pd.concat([s1, s2], keys=[col_name1, col_name2], axis=1):</b> the result has a multi-level colunn index.
- <b>pd.concat({col_name1: s1, col_name2: s2}, axis='columns'):</b> using dictionary input.
<p>Note: the order of the list of keys must match the order of the lsit of df.</p>

# Practice 3

In [190]:
medals=[]
for medal in medal_types:
    file_name = "%s_top5.csv"%medal   
    # Read file_name into a DataFrame: medal_df
    medal_df = pd.read_csv(file_name,index_col='Country')   
    # Append medal_df to medals
    medals.append(medal_df)
    
# Concatenate medals: medals
medals = pd.concat(medals,keys=['bronze', 'silver', 'gold'])

# Print medals in entirety
medals.head(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
Unnamed: 0_level_1,Country,Unnamed: 2_level_1
bronze,United States,1052.0
bronze,Soviet Union,584.0
bronze,United Kingdom,505.0
bronze,France,475.0
bronze,Germany,454.0
silver,United States,1195.0


In [191]:
medals.sort_index(level=0).loc[('bronze', 'Germany')]

Total    454.0
Name: (bronze, Germany), dtype: float64

In [192]:
medals.sort_index(level=0).loc['silver']

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,461.0
Italy,394.0
Soviet Union,627.0
United Kingdom,591.0
United States,1195.0


### pd.IndexSlice: to extract specific slices. It is required when slicing on the inner level of a MultiIndex.

In [193]:
medals.sort_index(level=0).loc[pd.IndexSlice[:, 'United Kingdom'], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
Unnamed: 0_level_1,Country,Unnamed: 2_level_1
bronze,United Kingdom,505.0
gold,United Kingdom,498.0
silver,United Kingdom,591.0


## Concatenating horizontally to get MultiIndexed columns

In [194]:
# Concatenate dataframes: february
february = pd.concat([jan, feb, mar],keys=['Hardware', 'Software', 'Service'],axis=1)
february.head(10)

Unnamed: 0_level_0,Hardware,Hardware,Hardware,Software,Software,Software,Service,Service,Service
Unnamed: 0_level_1,Company,Product,Units,Company,Product,Units,Company,Product,Units
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2015-01-01 07:31:20,Acme Coporation,Software,18.0,,,,,,
2015-01-02 09:51:06,Hooli,Hardware,16.0,,,,,,
2015-01-03 18:00:19,Hooli,Service,19.0,,,,,,
2015-01-06 13:47:37,Acme Coporation,Software,16.0,,,,,,
2015-01-06 17:19:34,Initech,Hardware,17.0,,,,,,
2015-01-09 05:23:51,Streeplex,Service,8.0,,,,,,
2015-01-11 14:51:02,Hooli,Hardware,11.0,,,,,,
2015-01-13 05:36:12,Hooli,Service,7.0,,,,,,
2015-01-15 02:38:25,Acme Coporation,Service,16.0,,,,,,
2015-01-15 15:33:40,Mediacore,Hardware,7.0,,,,,,


In [195]:
# Assign pd.IndexSlice: idx
idx = pd.IndexSlice

# Create the slice: slice_2_8
february.loc[' Feb. 2, 2015':'Feb. 8, 2015', idx[:, 'Company']]

Unnamed: 0_level_0,Hardware,Software,Service
Unnamed: 0_level_1,Company,Company,Company
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2015-02-02 08:33:01,,Hooli,
2015-02-02 20:54:49,,Mediacore,
2015-02-03 14:14:18,,Initech,
2015-02-04 15:36:29,,Streeplex,
2015-02-04 21:52:45,,Acme Coporation,
2015-02-05 01:53:06,,Acme Coporation,
2015-02-05 22:05:03,,Hooli,
2015-02-07 22:58:10,,Acme Coporation,


# Concatenating DataFrames from a dict

In [196]:
quarter = pd.concat([jan, feb, mar], axis=0)
quarter.head(2)

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-21 19:13:21,Streeplex,Hardware,11
2015-01-09 05:23:51,Streeplex,Service,8


In [197]:
monthdict = {}
monthlist = [('JAN', jan), ('FEB', feb), ('MAR', mar)]
for i, h in monthlist:
    monthdict[i] = quarter.groupby('Company').sum()
pd.concat(monthdict).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Units
Unnamed: 0_level_1,Company,Unnamed: 2_level_1
FEB,Acme Coporation,115
FEB,Hooli,137
FEB,Initech,135
FEB,Mediacore,128
FEB,Streeplex,127
JAN,Acme Coporation,115
JAN,Hooli,137
JAN,Initech,135
JAN,Mediacore,128
JAN,Streeplex,127


In [198]:
pd.concat(monthdict).loc[pd.IndexSlice[:, 'Mediacore'],:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Units
Unnamed: 0_level_1,Company,Unnamed: 2_level_1
FEB,Mediacore,128
JAN,Mediacore,128
MAR,Mediacore,128


# 4. Outer & inner joins
## 1) Using arrays

In [199]:
np.arange(8)

array([0, 1, 2, 3, 4, 5, 6, 7])

In [200]:
a = np.arange(8).reshape(2, 4) + 0.1
a

array([[0.1, 1.1, 2.1, 3.1],
       [4.1, 5.1, 6.1, 7.1]])

In [201]:
b = np.arange(6).reshape(2,3) + 0.2
b

array([[0.2, 1.2, 2.2],
       [3.2, 4.2, 5.2]])

In [202]:
c = np.arange(12).reshape(3,4) + 0.3
c

array([[ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

### 2) Stacking arrays horizontally

In [203]:
# Method 1:
np.hstack([b, a])

array([[0.2, 1.2, 2.2, 0.1, 1.1, 2.1, 3.1],
       [3.2, 4.2, 5.2, 4.1, 5.1, 6.1, 7.1]])

In [204]:
# Method 2:
np.concatenate([b, a], axis=1)

array([[0.2, 1.2, 2.2, 0.1, 1.1, 2.1, 3.1],
       [3.2, 4.2, 5.2, 4.1, 5.1, 6.1, 7.1]])

### 3) Stacking arrays vertically

In [205]:
# Method 1:
np.vstack([a, c])

array([[ 0.1,  1.1,  2.1,  3.1],
       [ 4.1,  5.1,  6.1,  7.1],
       [ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

In [206]:
# Method 2:
np.concatenate([a, c], axis=0)

array([[ 0.1,  1.1,  2.1,  3.1],
       [ 4.1,  5.1,  6.1,  7.1],
       [ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

### Note above: the arrays must have same # of rows or columns.

# Practice 4

In [208]:
bronze = pd.read_csv('datasets/Summer Olympic medals/Bronze.csv')
bronze.head()

Unnamed: 0,NOC,Country,Total
0,USA,United States,1052.0
1,URS,Soviet Union,584.0
2,GBR,United Kingdom,505.0
3,FRA,France,475.0
4,GER,Germany,454.0


In [209]:
silver = pd.read_csv('datasets/Summer Olympic medals/Silver.csv')
gold = pd.read_csv('datasets/Summer Olympic medals/Gold.csv')

## 1) join='inner': to keep only rows that share common index labels

In [216]:
medals = [bronze, silver, gold]
all_medals = pd.concat(medals, keys=['bronze', 'silver', 'gold'], axis=1, join='inner')
all_medals.head(3)

Unnamed: 0_level_0,bronze,bronze,bronze,silver,silver,silver,gold,gold,gold
Unnamed: 0_level_1,NOC,Country,Total,NOC,Country,Total,NOC,Country,Total
0,USA,United States,1052.0,USA,United States,1195.0,USA,United States,2088.0
1,URS,Soviet Union,584.0,URS,Soviet Union,627.0,URS,Soviet Union,838.0
2,GBR,United Kingdom,505.0,GBR,United Kingdom,591.0,GBR,United Kingdom,498.0


In [217]:
all_medals.shape

(138, 9)

In [219]:
all_medals1 = pd.concat(medals, keys=['bronze', 'silver', 'gold'], axis=1, join='outer')
all_medals1.shape

(138, 9)

## Resampling & concatenating DataFrames with inner join

In [229]:
china = pd.read_csv('datasets/GDP/gdp_china.csv', parse_dates=True, index_col='Year')
china.head(3)

Unnamed: 0_level_0,GDP
Year,Unnamed: 1_level_1
1960-01-01,59.184116
1961-01-01,49.55705
1962-01-01,46.685179


In [230]:
us = pd.read_csv('datasets/GDP/gdp_usa.csv', parse_dates=True, index_col='DATE')
us.head(3)

Unnamed: 0_level_0,VALUE
DATE,Unnamed: 1_level_1
1947-01-01,243.1
1947-04-01,246.3
1947-07-01,250.1


In [235]:
china.resample('A').last().head(3)

Unnamed: 0_level_0,GDP
Year,Unnamed: 1_level_1
1960-12-31,59.184116
1961-12-31,49.55705
1962-12-31,46.685179


In [243]:
china.resample('A').last().pct_change(10).head(12)

Unnamed: 0_level_0,GDP
Year,Unnamed: 1_level_1
1960-12-31,
1961-12-31,
1962-12-31,
1963-12-31,
1964-12-31,
1965-12-31,
1966-12-31,
1967-12-31,
1968-12-31,
1969-12-31,


In [244]:
china_annual = china.resample('A').last().pct_change(10).dropna()
china_annual.head(3)

Unnamed: 0_level_0,GDP
Year,Unnamed: 1_level_1
1970-12-31,0.546128
1971-12-31,0.98886
1972-12-31,1.402472


In [246]:
us_annual = us.resample('A').last().pct_change(10).dropna()
us_annual.head(3)

Unnamed: 0_level_0,VALUE
DATE,Unnamed: 1_level_1
1957-12-31,0.827507
1958-12-31,0.782686
1959-12-31,0.953137


In [253]:
gdp0 = pd.concat([china_annual, us_annual], axis=1, join='outer')
gdp0.head()

Unnamed: 0_level_0,GDP,VALUE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
1957-12-31,,0.827507
1958-12-31,,0.782686
1959-12-31,,0.953137
1960-12-31,,0.689354
1961-12-31,,0.630959


In [254]:
gdp = pd.concat([china_annual, us_annual], axis=1, join='inner')
gdp.head()

Unnamed: 0_level_0,GDP,VALUE
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1970-12-31,0.546128,1.017187
1971-12-31,0.98886,1.05227
1972-12-31,1.402472,1.172566
1973-12-31,1.730085,1.258858
1974-12-31,1.408556,1.295246


Print the result of resampling gdp every decade using .resample('10A') and aggregating with the method .last()

In [259]:
gdp.resample('10A').last()

Unnamed: 0_level_0,GDP,VALUE
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1970-12-31,0.546128,1.017187
1980-12-31,1.072537,1.742556
1990-12-31,0.89282,1.012126
2000-12-31,2.357522,0.738632
2010-12-31,4.011081,0.454332
2020-12-31,3.789936,0.36178
