In [1]:
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr

% matplotlib inline
sns.set()
sns.set_color_codes()

# Construct xarray for real gdp series (PPP)

### Normalize data sets to be constant to 2015

* Construct xarray for PPP values (PWT, WB to start)

    * a) Compute GDP index (PPP-real series) **<span style="color:white; background:maroon;">STARTED</span>**
        * WB **<span style="color:white; background:limegreen;">DONE</span>**

        * PWT
        * any other?? (IMF using PPPs ourselves?)
    * b) Retrieve GDP nominal/current series

* Multiply a) by b) to calculate real series constant to same year! 

* Analysis

    a) world total 
    
    b) world total, countries with missing data removed
    
    c) mortality
    

| source     | PPP year/source|  
|---------|------|
| WB      |  ICP 2011 + OECD/Eurostat |  
| PWT_rgdpne| ICP 2011 + OECD/Eurostat|

#### Table 1: PPP values used by the sources

Status:
**<span style="color:lime; background:green;">In Progress</span>** 
**<span style="color:white; background:maroon;">STARTED</span>**
**<span style="color:white; background:limegreen;">DONE</span>**


In [14]:
# Template used for every data path in this notebook:
# `dir_path.format(relative_path)` gives a path below the project root named by
# the ZERG environment variable (KeyError if ZERG is not set).
# os.path.join adds the path separator when ZERG has no trailing slash; plain
# string concatenation silently produced '<root>data/...' in that case. When
# ZERG already ends with a separator the result is unchanged.
dir_path = os.path.join(os.environ['ZERG'], '{}')

In [66]:
# note

In [143]:
def set_col_idx_name(_df, col_idx_name='year'):
    """
    Name the column axis of `_df` and hand the same frame back.

    The frame is modified in place (its column index gets the new name); it is
    returned only so the call can be used inside an expression.

    Parameters
    ----------
    _df : DataFrame
        frame whose column index should be named.
    col_idx_name : str
        name to give to the column index, 'year' by default.

    Returns
    -------
    DataFrame
        the very same object that was passed in.
    """
    column_index = _df.columns
    column_index.name = col_idx_name
    return _df

def convert_to_float(_df):
    """Return a copy of `_df` with every column cast to float."""
    return _df.astype(float)

def set_col_as_index(_df, _col_name):
    """
    Return a new frame in which column `_col_name` has become the row index.

    Parameters
    ----------
    _df : DataFrame
        input frame; it is left untouched.
    _col_name : str
        label of the column to use as the index.
    """
    indexed = _df.set_index(_col_name)
    return indexed

def stack_year(_df):
    """Move the column level named 'year' into the row index (wide -> long)."""
    return _df.stack(level='year')


In [113]:
# Locations of the available input data - PWT vs WB.
# Nothing is fetched in this cell; it only builds the file paths.

# 1) World Bank (WB) data
# PPP series: the KD indicator suffix marks constant prices, CD current prices.
wb_ppp_constant = dir_path.format(
    'data/wb/gdp/gdp_ppp_constant/API_NY.GDP.MKTP.PP.KD_DS2_en_csv_v2.csv'
)
wb_ppp_current = dir_path.format(
    'data/wb/gdp/gdp_ppp_current/API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv'
)
# Market-rate series, saved in the `compare_gdp_market` file.
wb_mkt_constant = dir_path.format('data/output_data/wb_gdp_constant_prepped.csv')
wb_mkt_current = dir_path.format('data/output_data/wb_gdp_current_prepped.csv')

# 2) Penn World Table (PWT) data: one file holding both rgdpe and rgdpo.
pwt = dir_path.format('data/output_data/pwt_iso_all_cols.csv')

In [63]:
# Load the WB PPP constant-price GDP table (countries-only extract).
wb_ppp_const = pd.read_csv(
    dir_path.format('data/output_data/gdp_ppp_constant_countries_only1.csv')
)

In [123]:
# Index the WB PPP constant-price table by ISO code and name the column axis
# 'year' so the table can later be stacked by that name.
wb_ppp_const_indexed = set_col_idx_name(wb_ppp_const.set_index('iso'))

# The 2017 column is expected to hold no data at all, so it is dropped.
# The check has to be `isnull().all()`: a bare `.all()` treats NaN (and every
# non-zero number) as truthy, so it also passes when real 2017 values exist.
assert wb_ppp_const_indexed['2017'].isnull().all(), \
    "column '2017' contains data and must not be dropped"
wb_ppp_const_indexed = wb_ppp_const_indexed.drop('2017', axis=1)

wb_ppp_const_prepped = convert_to_float(wb_ppp_const_indexed)
# Year labels come from the CSV header as strings. Cast them to int, as is done
# for the WB current-price table, so they have the same type as the integer PWT
# years when the sources are combined. `Index.astype` keeps the 'year' axis name.
wb_ppp_const_prepped.columns = wb_ppp_const_prepped.columns.astype(int)
wb_ppp_const_prepped.to_csv(dir_path.format('data/output_data/wb_ppp_constant_prepped.csv'))

In [101]:
# Load the WB PPP current-price GDP table (countries-only extract).
wb_ppp_curr = pd.read_csv(
    dir_path.format('data/output_data/gdp_ppp_current_countries_only1.csv')
)

In [105]:
# Index the WB PPP current-price table by ISO code and name the column axis
# 'year'.
wb_ppp_curr_indexed = set_col_idx_name(wb_ppp_curr.set_index('iso'))

# The 2017 column is expected to hold no data at all, so it is dropped.
# The check has to be `isnull().all()`: a bare `.all()` treats NaN (and every
# non-zero number) as truthy, so it also passes when real 2017 values exist.
assert wb_ppp_curr_indexed['2017'].isnull().all(), \
    "column '2017' contains data and must not be dropped"
wb_ppp_curr_indexed = wb_ppp_curr_indexed.drop('2017', axis=1)

wb_ppp_curr_prepped = convert_to_float(wb_ppp_curr_indexed)
# Cast the string year labels to int. `Index.astype` is used instead of
# assigning a plain list because a list would discard the 'year' axis name
# set above.
wb_ppp_curr_prepped.columns = wb_ppp_curr_prepped.columns.astype(int)
wb_ppp_curr_prepped.to_csv(dir_path.format('data/output_data/wb_ppp_current_prepped.csv'))

## Prep PWT

In [119]:
# Load the PWT table and key every row by ISO code (a copy of 'countrycode').
pwt9 = pd.read_csv(pwt)
pwt_iso = pwt9.assign(iso=pwt9['countrycode']).set_index('iso')

# One narrow frame per real-GDP measure: rgdpe (labelled expenditure below)
# and rgdpo (labelled output below), each together with its year.
pwt_sub_e = pwt_iso.loc[:, ['year', 'rgdpe']]
pwt_sub_o = pwt_iso.loc[:, ['year', 'rgdpo']]

# Per the original notes, the years are already ints and the GDP values
# already floats, so no dtype conversion is applied here.

# Extend the index to (iso, year) ...
pwt_multiindex_e = pwt_sub_e.set_index('year', append=True)
pwt_multiindex_o = pwt_sub_o.set_index('year', append=True)

# ... and pivot the year level into columns: one row per iso, one column per
# year, with the column axis named 'year'.
pwt_rgdpe_prepped = pwt_multiindex_e['rgdpe'].unstack('year')
pwt_rgdpo_prepped = pwt_multiindex_o['rgdpo'].unstack('year')


## combine data

In [146]:
# Stack every source into long form and concatenate them under an outer
# 'source' index level, giving a (source, iso, year) indexed result.
# - The PWT series are multiplied by 10**6 and the WB series is not
#   (presumably PWT is reported in millions - TODO confirm against the PWT docs).
# - The WB key is 'wb' without the trailing space the original 'wb ' carried;
#   with the space, any lookup by the label 'wb' fails.
#   NOTE(review): update any consumer that already reads the 'wb ' label.
# - WB year labels are coerced to int so they share a type with the integer
#   PWT years (a no-op when they are ints already; `rename` keeps the 'year'
#   axis name that stack_year relies on).
combined = pd.concat({
    'pwt_expenditure': stack_year(pwt_rgdpe_prepped) * 10**6,
    'pwt_output': stack_year(pwt_rgdpo_prepped) * 10**6,
    'wb': stack_year(wb_ppp_const_prepped.rename(columns=int)),
}, axis=0, names=['source'])

In [128]:
# Sanity check: no aggregate or placeholder label may appear among the ISO codes.
assert not any(
    iso in ['Total', 'World', np.nan, 'nan']
    for iso in combined.index.get_level_values('iso')
)
# The check passes, so the filter below is not needed (kept for reference):
# final = combined[np.in1d(combined14.index.get_level_values('iso'), ['Total', 'World', np.nan, 'nan'], invert=True)]


In [145]:
# No entry may have a null ISO code, so there are no null rows to exclude.
assert not pd.isnull(combined.index.get_level_values('iso')).any()

# Exactly three sources must have been combined.
assert 3 == combined.index.get_level_values('source').nunique()

In [155]:
# Output file for the combined real-GDP series.
real_gdp_2011_path = dir_path.format('data/output_data/real_gdp_2011_ppp_two_sources.nc')

# Convert the combined pandas data to its xarray counterpart.
da = combined.to_xarray()

# Delete any earlier output before writing the new file.
if os.path.exists(real_gdp_2011_path):
    os.remove(real_gdp_2011_path)

da.to_netcdf(real_gdp_2011_path, mode='w')

### Calculate GDP index (GDI) - not needed for now (same 2011 base year)

In [120]:
# calculate GDP index
def calculate_real_gdp_index(_df, _source, base_year=None):
    """
        Converts a real constant gdp series to a real gdp index.
        For each country (row), the gdp time series is divided by that row's
        gdp value in the base year (DataFrame.div with axis=0), so the
        base-year column becomes 1.0 wherever it is not missing.
        Keeps nan values for any missing value; a country whose base-year
        value is nan is nan for every year.

        Parameters
        ----------
        _df : DataFrame
            dataframe containing real constant gdp values. Rows are countries
            (iso) and columns are years. Column labels must have the same type
            as the base year (ints for the built-in base years).
        _source : str
            string acronym for the data source, one of ['wb', 'pwt']. Only
            used to look up the base year when `base_year` is not given.
        base_year : optional
            column label to use as the base year. When None (default) the
            base year registered for `_source` is used.

        Returns
        -------
        df_gdi: DataFrame
            contains real gdp series indices, same shape and labels as `_df`.

        Raises
        ------
        KeyError
            if `base_year` is None and `_source` is unknown, or if the base
            year is not a column of `_df`.
    """
    # Base year of a given data source: the year whose prices the real GDP
    # series is held constant to.
    # NOTE(review): the notes in this notebook describe the WB PPP series as
    # 2011-based; confirm 2010 is right before applying this to PPP data.
    source_base_years = {
        'wb': 2010,
        'pwt': 2011,  # pwt9
    }
    if base_year is None:
        if _source not in source_base_years:
            raise KeyError(
                'unknown source {!r}; expected one of {}'.format(
                    _source, sorted(source_base_years)))
        base_year = source_base_years[_source]
    if base_year not in _df.columns:
        raise KeyError(
            'base year {!r} is not a column of the dataframe'.format(base_year))
    df_gdi = _df.div(_df[base_year], axis=0)
    return df_gdi

In [None]:
# pwt_gdi = calculate_real_gdp_index(pwt_rgdpna_prepped, 'pwt')
# ls = [un_gdi_norm_meta, 
#  wb_gdi_norm_meta, 
#  weo_gdi_norm_meta, 
#  pwt_gdi_norm_meta] = normalize_real_gdi([(un_gdi, 'un'), (wb_gdi,'wb'), (weo_gdi, 'weo'), (pwt_gdi, 'pwt')], 2014)
 