In [422]:
import os
from collections import OrderedDict

import numpy as np
import pandas as pd
import requests
import seaborn as sns
import xarray as xr

% matplotlib inline

### WB Data Analysis

**<span style="color:red; background:yellow;"></span>**

**Data & Scope:** downloaded on (date/time)
Two types of datasets:

1) Population (easy)
* years 1960-2016
* definition: total # residents (Regardless of citizenship/legal status)
* sources: (mid-year value)

    1) United Nations Population Division. World Population Prospects
    
    2) Census reports and other statistical publications from national statistical offices
    
    3) Eurostat: Demographic Statistics
    
    4) United Nations Statistical Division. Population and Vita

2) GDP (slightly more tricky)
- years: 1960 - 2016
- four types of `gdp per capita` I looked at:

    * current
    * constant
    * PPP current
    * PPP constant

**Source:** World Bank

**Assumptions/Expectations**
-

**Analysis Goals**
- Task 3 Deliverable: ADM0 population & real income estimates from `1950-2017`  **<span style="color:gray; background:lime;">DONE</span>**

- Find out which countries are missing data for which years. 
    Population **<span style="color:gray; background:lime;">DONE</span>**
    GDP
- Or alternatively, which years are missing certain *countries*

Possible convenience functions:
* Find years missing for a given country (to check important countries)
* Create filter for population > 10 mil: look at countries exit/enter data on which years. (Mike's suggesiton)
* From metadata, get a list of country code that are not *countries* to filter them from the list **<span style="color:gray; background:lime;">DONE</span>**


**Conclusion**
1. Population. Population data exists from **1960 to 2016.**
    
```  
Countries out of 217 that are missing population data (nan):

West Bank and Gaza: 1960 - 1989
Serbia: 1960 - 1989
Sint Maarten (Dutch part): 1960 -1998
Kuwait: 1992-1994
Eritrea: 2012-2016

```

2. GDP 

```
Countries out of 217 that are missing gdp data

a. GDP per capita PPP (constant) **<span style="color:gray; background:yellow;">In Progress</span>**

b. GDP per capita PPP (current)

c. GDP per capita (constant)

d. GDP per capita (current)
```


**Questions**
1. How are different sources of population data used to result in one final set? (are there any overlaps between 4 sources?) i.e. what is the methodology for compilation?

2. How often are population/income data updated?



# A. Population data

Total population (absolute units) - based on national census and extrapolation and interpolation for missing values (based on data from United Nations, other census organizations, Eurostats and WB methodology). Subject to undercounting/biases for both high and low/mid income countries. 

Interpolation and extrapolation done by World Bank/UN (??-confirm the responsible party) for certain years/countries that are missing census data, or missing pre/post census information for given time frame. Uses demographic models, etc. 

In [423]:
df2 = pd.read_csv('population/API_SP.POP.TOTL_DS2_en_csv_v2.csv', skiprows=3)
del df2['Unnamed: 62'] # remove extraneous data
df2.columns


Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017'],
      dtype='object')

In [424]:
df2
df2['2017'].unique() # nan -> all 2017 values nan

array([ nan])

In [425]:
cols = ['Country Name'] + ['Country Code'] + [str(yr) for yr in range(1960, 2017)]
#print (cols)
df_pop = df2[cols]
df_pop # includes Country Name and all years (1960-2016)
df_pop.shape # 264 countries???, 58 columns 

(264, 59)

In [426]:
df_pop.head(5)   # Show top 5 rows
df_pop.isnull().sum()
# note that nan values per year changes from 1-4, 1989 and prior are all 4,
# after that there is a shifts six times:
# 4 (60-89) -> 2 (90-91)-> 3 (92-94) -> 2 (95-97) -> 1 (98-2011)-> 2 (2012-2016)-> 264 (2017)

# Compare two years (diff countries)

df_pop['1989'].isnull()
pd.isnull(df_pop).any(1).nonzero()[0]

array([ 67, 108, 125, 194, 212, 223])

NOTE on nans:

* `df.isnull().sum()`: returns total number of nan values for each column (whole df) <- what I used
* comparison of nan == np.nan to find nan values does not work (returns False)
* can use np.isnan(val) for a single value OR possibly use `apply` method for whole col (did not try)

In [427]:
df_pop.iloc[67] # Eritrea missing 2012-2016
df_pop.iloc[108] # Not classified missing all # what is this....?
df_pop.iloc[194] # West Bank and Gaza missing 1960 - 1989
df_pop.iloc[212] # Serbia missing 1960 - 1989
df_pop.iloc[223] # Sint Maarten (Dutch part) 1960 -1998
df_pop.iloc[125] # Kuwait missing 1992-1994

# doing this manual painful way for now
df_pop['1960'].isnull().sum() # 4 
df_pop['1990'].isnull().sum() # expect 2 (excluding West Bank/Serbia)
df_pop['1993'].isnull().sum() # expect 3 (Kuwait missing too)
df_pop['1995'].isnull().sum() # expect 2 (Kuwait back again)
df_pop['1999'].isnull().sum() # expect 1 Sint Maarten back
df_pop['2012'].isnull().sum() # expect 2 (Eritrea goes missing)
df_pop['2016'].isnull().sum() # expect 2

# drop unclassified country since it's contributing 0 population 
#df_pop.drop(df_pop.index[108])

#print(df_pop['Country Name'].tolist())

2

# B. Meta data (Filter non-countries)

Both population and income dataset includes list of countries (217) plus various classifications as entry data. These include:

Task:

- get a list of country codes for *non-countries* from metadata
- use that list to filter those country codes from income or population data



In [437]:
os.listdir("./population/")

# 263 - 46 # total # 217 countries

# Q - why is this suddenly failing?
#meta_country = pd.read_csv('./population/Metadata_Country_API_SP.POP.TOTL_DS2_en_csv_v2.csv')

['API_SP.POP.TOTL_DS2_en_csv_v2.csv',
 'Metadata_Country_API_SP.POP.TOTL_DS2_en_csv_v2.csv',
 'Metadata_Indicator_API_SP.POP.TOTL_DS2_en_csv_v2.csv']

In [438]:
def filter_non_countries(_df, _metadata):
    '''
    _df : pd.DataFrame
        either income or population data
        
    _metadata : pd.DataFrame
        metadata on a list of entries including countries and non-countries 
        data source is from the World Bank
        has IncomeGroup column that is not null for countries (217)
    
    '''
    _merged = _df.merge(_metadata, on='Country Code')
    
    non_country_mask = _merged['IncomeGroup'].isnull()
    merged_country_only = _merged[~non_country_mask]
    return merged_country_only

def select_relevant_cols(_df):
    _cols = ['Country Name'] + ['Country Code'] + [str(yr) for yr in range(1960, 2017)]
    return _df[_cols]

df_country_only = filter_non_countries(df2, meta_country)
select_relevant_cols(df_country_only)[200:205]

Unnamed: 0,Country Name,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
244,Uganda,UGA,6788214.0,7006633.0,7240174.0,7487429.0,7746198.0,8014401.0,8292776.0,8580676.0,...,30590487.0,31663896.0,32771895.0,33915133.0,35093648.0,36306796.0,37553726.0,38833338.0,40144870.0,41487965.0
245,Ukraine,UKR,42662149.0,43203635.0,43749470.0,44285899.0,44794327.0,45261935.0,45682308.0,46060452.0,...,46509350.0,46258200.0,46053300.0,45870700.0,45706100.0,45593300.0,45489600.0,45271947.0,45154029.0,45004645.0
247,Uruguay,URY,2538651.0,2571690.0,2603887.0,2635129.0,2665390.0,2694537.0,2722877.0,2750093.0,...,3339741.0,3350824.0,3362755.0,3374415.0,3385624.0,3396777.0,3408005.0,3419546.0,3431552.0,3444006.0
248,United States,USA,180671000.0,183691000.0,186538000.0,189242000.0,191889000.0,194303000.0,196560000.0,198712000.0,...,301231207.0,304093966.0,306771529.0,309348193.0,311663358.0,313998379.0,316204908.0,318563456.0,320896618.0,323127513.0
249,Uzbekistan,UZB,8549493.0,8837349.0,9138097.0,9454250.0,9788986.0,10143740.0,10520879.0,10917446.0,...,26868000.0,27302800.0,27767400.0,28562400.0,29339400.0,29774500.0,30243200.0,30757700.0,31298900.0,31848200.0


##  Playing around with data

### Getting data for country

In [439]:
# Returns all values of the 'Country Name' column as an array
#df2['Country Name'].values #lists all countries
# United States
# China
# India

china = get_row_by_country_name(df2, 'China')
india = get_row_by_country_name(df2, 'India')
usa = get_row_by_country_name(df2, 'United States')

def get_first_and_last_years(_country):
    # _country is a dataframe (1 row)
    return _country[['1960', '2016']]
    
    
get_first_and_last_years(china)
get_first_and_last_years(usa)
get_first_and_last_years(india)

# manual confirmation: these values match online WB values (https://data.worldbank.org/indicator/SP.POP.TOTL?year_high_desc=true)

Unnamed: 0,1960,2016
107,449480608.0,1324171000.0


In [440]:
def get_missing_years(country_name):
    '''
    Calculate the number of years the country has no
    data points (ie. has nan value)
    
    Parameters
    ----------
    country_name : Str
        name of given country
        
    Returns 
    -------
    Missing years in default range (1960-2017), where 
    each year is a string : List
    
    ex. ['1990',  '2005'']

'''
    country = df.loc[df['Country Name'] == country_name]
    return country

# Get countries as dataframe (subset of original) by first letter
def get_by_first_letter(df, first_letter):
    return df.loc[df['Country Name'].str.startswith(first_letter)]

# Get country by name
def get_row_by_country_name(_df, country_name):
    return _df[_df['Country Name'] == country_name]

# Get country row from dataframe by index
def get_by_index(df, country_idx):
    return df.iloc[country_idx]

# Get all countries that start with S
s_countries = get_by_first_letter(df, 'S')
#s_countries
    
# Get_by_first_letter(df, 'K') # example South Korea is index 124
# get_by_index(df, 124)


# C. GDP data

1) GDP per capita, PPP constant (NY.GDP.PCAP.PP.KD)
### GDP per capita PPP (constant at 2011 `$`)

In [441]:
# Open files
# NY.GDP.PCAP.PP.KD is WDI indicator
'''
PCAP = per capita
PP = purchasing power (no PP means not PP)
KD = constant (vs CD = current)
'''
df = pd.read_csv('./gdp/gdp_pcap_ppp_constant/API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2.csv', skiprows=3)
gdp_meta = pd.read_csv('./gdp/gdp_pcap_ppp_constant/Metadata_Country_API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2.csv')

In [442]:
# Get variables (column names)
df.columns # years 1960-2017
df.shape


(264, 63)

### Filter non countries (should return 217 entries) and subset only interested columns

In [443]:
df_gdp_country_only = filter_non_countries(df, gdp_meta)
df_gdp = select_relevant_cols(df_gdp_country_only)
df_gdp.shape # 217, 59

(217, 59)

### TASK: Get missing years/countries


In [444]:
pd.isnull(df_gdp).any(1).nonzero()[0]

df_gdp

Unnamed: 0,Country Name,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,Aruba,ABW,,,,,,,,,...,,,,,35973.780510,,,,,
1,Afghanistan,AFG,,,,,,,,,...,1284.775213,1298.143159,1531.173993,1614.255001,1660.739856,1839.273579,1814.155825,1780.382366,1747.978457,1739.583177
2,Angola,AGO,,,,,,,,,...,5443.126215,5978.334873,5908.051427,5895.114088,5911.254334,5998.638601,6185.013829,6260.132681,6231.067992,6024.726138
3,Albania,ALB,,,,,,,,,...,8447.882285,9153.981440,9524.609811,9927.135147,10207.700674,10369.761659,10504.093089,10715.329581,11024.915108,11424.628319
4,Andorra,AND,,,,,,,,,...,,,,,,,,,,
6,United Arab Emirates,ARE,,,,,,,,,...,79283.052704,71724.277611,61118.903796,57579.835168,58404.015591,59813.016525,62532.734910,64126.892086,65975.375548,67133.065522
7,Argentina,ARG,,,,,,,,,...,17900.706342,18436.862467,17168.378937,18712.063077,19629.351845,19224.874400,19482.190295,18797.547947,19101.297392,18479.442211
8,Armenia,ARM,,,,,,,,,...,7010.627056,7558.361648,6532.964231,6702.848006,7022.103943,7511.132482,7727.929092,7971.117928,8180.050057,8174.366732
9,American Samoa,ASM,,,,,,,,,...,,,,,,,,,,
10,Antigua and Barbuda,ATG,,,,,,,,,...,24245.754411,23969.566844,20836.179961,19147.970556,18602.425358,19105.115398,18862.816760,19521.300576,20113.753554,20777.613083


In [445]:
xr_gdp = df_gdp.to_xarray()

xr_gdp['Country Code']
xr_gdp['Country Name']
xr_nn = xr_gdp.notnull()
xr_nn.argwhere(~xr_nn)


AttributeError: 'Dataset' object has no attribute 'argwhere'

### TASK: Get list of years missing for a country

Super naive EDA



In [None]:
# Series.value_counts() -> get unique counts of values

# Count nan
sorted(df.isnull().sum(axis=1).unique().tolist()) # 17 unique values
# Why do some countries have randomly missing years? (source of inconsistency?)

In [None]:
# Get all rows 
# Get selected columns (ranging from 1960 to 2017)
df_1960_2017 = df.loc[:, '1960':'2017']

#df_1960_2017.apply(lambda s: s.value_counts(), axis=0)

df_1960_2017.apply(lambda s: s.value_counts(), axis=0)
#print(len(df_1960_2017.isnull().sum(axis=1).tolist())) # 264 = # countries

# returns list of # nan per country
num_nan_per_country = df_1960_2017.isnull().sum(axis=1).tolist()
print(df_1960_2017.isnull().sum(axis=1).tolist())

list_of_countries = [x for x in df['Country Name'].values]
num_countries = len(list_of_countries)

# not really useful!!!!!

#countries = {list_of_countries[i]: {'num_missing': num_nan_per_country[i]} for i in range(0, num_countries)}
#countries

In [None]:
# Write a function that counts values for each column
# Purpose: to count how many NA values are there in each row
df['1960'].value_counts()
type(df.iloc[0]['1960']) # numpy.float64


# single value check
np.isnan(df.iloc[0]['1960']) # True 



# find the countries that have nan values in 1989 and prior years, 
# but don't have 


### Fetching data via API

This approach is abandoned because json data is harder to manipulate for this analysis.
Pandas rocks.

In [None]:
#r = requests.get('http://api.worldbank.org/v2/countries/all/indicators/SP.POP.TOTL?format=json', auth=('user', 'pass'))

In [None]:
#r.headers['Content-Type'] # 'application/json;charset=utf-8'
#r.json()