In [2]:
import os
from collections import OrderedDict

import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr

% matplotlib inline

## Objective: Save the munged GDP (constant, market prices) data to munged_data/un_gdp_constant_mkt

In [3]:
# Load the World Bank (WDI) GDP files used in this notebook.
# The indicator loaded here is NY.GDP.MKTP.KD. WDI code segments read as:
#   NY   = national accounts: income
#   MKTP = market prices
#   PCAP = per capita
#   PP   = purchasing power (no PP means not PP)
#   KD   = constant (vs CD = current)

# GDP at constant market prices. The first 4 lines of the CSV are skipped
# (presumably a preamble above the header row -- verify against the file).
df_gdp_cst_mkt = pd.read_csv(
    './gdp/gdp_constant/API_NY.GDP.MKTP.KD_DS2_en_csv_v2.csv',
    skiprows=4,
)

# Per-entry metadata shipped alongside the GDP file (read with default options).
meta_data = pd.read_csv(
    './gdp/gdp_constant/Metadata_Country_API_NY.GDP.MKTP.KD_DS2_en_csv_v2.csv'
)

In [4]:
# Sanity check: preview the first three rows of the raw GDP table.
df_gdp_cst_mkt.head(3)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,Unnamed: 62
0,Aruba,ABW,GDP (constant 2010 US$),NY.GDP.MKTP.KD,,,,,,,...,,2467704000.0,,,,,,,,
1,Afghanistan,AFG,GDP (constant 2010 US$),NY.GDP.MKTP.KD,,,,,,,...,14697330000.0,15936800000.0,16911130000.0,19352200000.0,19731340000.0,19990320000.0,20212720000.0,20663920000.0,,
2,Angola,AGO,GDP (constant 2010 US$),NY.GDP.MKTP.KD,,,,,,,...,79753200000.0,82470910000.0,85702620000.0,90120960000.0,96261430000.0,100886300000.0,103919900000.0,103919900000.0,,


## Convenience functions ported from WB analysis

In [5]:
def filter_non_countries(_df, _metadata):
    '''
    Drop aggregate (non-country) entries from a World Bank table.

    Parameters
    ----------
    _df : pd.DataFrame
        table with a 'Country Code' column
        (e.g. income or population data)

    _metadata : pd.DataFrame
        World Bank metadata listing countries and non-countries;
        must have 'Country Code' and 'IncomeGroup' columns, where
        IncomeGroup is not null only for countries (217 of them)

    Returns
    -------
    pd.DataFrame
        rows of the inner merge of _df and _metadata on 'Country Code'
        whose IncomeGroup is not null; metadata columns are included
    '''
    # join on the key shared by both frames
    joined = _df.merge(_metadata, on='Country Code')
    # only real countries carry an IncomeGroup value
    is_country = joined['IncomeGroup'].notnull()
    return joined[is_country]

In [6]:
def select_relevant_cols(_df, first_year=1960, last_year=2016):
    '''
    Keep only the identifying columns and the yearly value columns.

    Parameters
    ----------
    _df : pd.DataFrame
        table with 'Country Name', 'Country Code' and one column per year,
        where each year column is named by the year as a string (e.g. '1960')
    first_year : int, default 1960
        first year to keep (inclusive)
    last_year : int, default 2016
        last year to keep (inclusive)

    Returns
    -------
    pd.DataFrame
        _df restricted to 'Country Name', 'Country Code' and the year
        columns first_year..last_year, in that order

    Raises
    ------
    KeyError
        if any of the requested columns is missing from _df
    '''
    # year columns are stored as strings, so build the labels explicitly
    year_cols = [str(yr) for yr in range(first_year, last_year + 1)]
    return _df[['Country Name', 'Country Code'] + year_cols]

## Look at the metadata

In [7]:
# Distinct IncomeGroup values in the metadata; nan marks entries with no income group.
meta_data['IncomeGroup'].unique()

array(['High income', 'Low income', 'Lower middle income',
       'Upper middle income', nan], dtype=object)

In [8]:
# Number of distinct entries (by Country Code) listed in the metadata.
meta_data['Country Code'].nunique()

263

In [9]:
# False = IncomeGroup present (217 countries); True = IncomeGroup missing (46 non-countries).
meta_data['IncomeGroup'].isnull().value_counts()

False    217
True      46
Name: IncomeGroup, dtype: int64

## 1. Filter out aggregates (non-countries)

In [10]:
# Keep only real countries; the result also carries the metadata columns
# brought in by the merge inside filter_non_countries.
df_gdp_countries = filter_non_countries(df_gdp_cst_mkt, meta_data)
# Expect 217 rows, one per country with a non-null IncomeGroup.
df_gdp_countries.shape

(217, 68)