In [306]:
import os
from collections import OrderedDict

import numpy as np
import pandas as pd
import requests
import seaborn as sns
import xarray as xr

% matplotlib inline

### WB Data Analysis

**<span style="color:red; background:yellow;"></span>**

**Data & Scope:** downloaded on (date/time)
Two types of datasets:

1) Population (easy)
* years 1960-2016
* definition: total # residents (Regardless of citizenship/legal status)
* sources: (mid-year value)

    1) United Nations Population Division. World Population Prospects
    
    2) Census reports and other statistical publications from national statistical offices
    
    3) Eurostat: Demographic Statistics
    
    4) United Nations Statistical Division. Population and Vita

2) GDP (slightly more tricky)
- years: 1960 - 2016
- four types of `gdp per capita` I looked at:

    * current
    * constant
    * PPP current
    * PPP constant

**Source:** World Bank

**Assumptions/Expectations**
-

**Analysis Goals**
- Task 3 Deliverable: ADM0 population & real income estimates from `1950-2017`  **<span style="color:gray; background:lime;">DONE</span>**

- Find out which countries are missing data for which years. 
    Population **<span style="color:gray; background:lime;">DONE</span>**
    GDP
- Or alternatively, which years are missing certain *countries*

Possible convenience functions:
* Find years missing for a given country (to check important countries)
* Create filter for population > 10 mil: look at countries exit/enter data on which years. (Mike's suggesiton)
* From metadata, get a list of country code that are not *countries* to filter them from the list **<span style="color:gray; background:yellow;">In Progress</span>**


**Conclusion**
1. Population. Population data exists from **1960 to 2016.**
    
```  
Countries out of 217 that are missing population data (nan):

West Bank and Gaza: 1960 - 1989
Serbia: 1960 - 1989
Sint Maarten (Dutch part): 1960 -1998
Kuwait: 1992-1994
Eritrea: 2012-2016

```

2. GDP


**Questions**
1. How are different sources of population data used to result in one final set? (are there any overlaps between 4 sources?) i.e. what is the methodology for compilation?

2. How often are population/income data updated?



# A. Population data

Total population (absolute units) - based on national census and extrapolation and interpolation for missing values (based on data from United Nations, other census organizations, Eurostats and WB methodology). Subject to undercounting/biases for both high and low/mid income countries. 

Interpolation and extrapolation done by World Bank/UN (??-confirm the responsible party) for certain years/countries that are missing census data, or missing pre/post census information for given time frame. Uses demographic models, etc. 

In [307]:
df2 = pd.read_csv('population/API_SP.POP.TOTL_DS2_en_csv_v2.csv', skiprows=3)
del df2['Unnamed: 62'] # remove extraneous data
df2.columns


Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017'],
      dtype='object')

In [308]:
df2
df2['2017'].unique() # nan -> all 2017 values nan

array([ nan])

In [309]:
cols = ['Country Name'] + ['Country Code'] + [str(yr) for yr in range(1960, 2017)]
#print (cols)
df_pop = df2[cols]
df_pop # includes Country Name and all years (1960-2016)
df_pop.shape # 264 countries???, 58 columns 

(264, 59)

In [310]:
df_pop.head(5)   # Show top 5 rows
df_pop.isnull().sum()
# note that nan values per year changes from 1-4, 1989 and prior are all 4,
# after that there is a shifts six times:
# 4 (60-89) -> 2 (90-91)-> 3 (92-94) -> 2 (95-97) -> 1 (98-2011)-> 2 (2012-2016)-> 264 (2017)

# Compare two years (diff countries)

df_pop['1989'].isnull()
pd.isnull(df_pop).any(1).nonzero()[0]

array([ 67, 108, 125, 194, 212, 223])

In [311]:
df_pop.iloc[67] # Eritrea missing 2012-2016
df_pop.iloc[108] # Not classified missing all # what is this....?
df_pop.iloc[194] # West Bank and Gaza missing 1960 - 1989
df_pop.iloc[212] # Serbia missing 1960 - 1989
df_pop.iloc[223] # Sint Maarten (Dutch part) 1960 -1998
df_pop.iloc[125] # Kuwait missing 1992-1994

# doing this manual painful way for now
df_pop['1960'].isnull().sum() # 4 
df_pop['1990'].isnull().sum() # expect 2 (excluding West Bank/Serbia)
df_pop['1993'].isnull().sum() # expect 3 (Kuwait missing too)
df_pop['1995'].isnull().sum() # expect 2 (Kuwait back again)
df_pop['1999'].isnull().sum() # expect 1 Sint Maarten back
df_pop['2012'].isnull().sum() # expect 2 (Eritrea goes missing)
df_pop['2016'].isnull().sum() # expect 2

# drop unclassified country since it's contributing 0 population 
#df_pop.drop(df_pop.index[108])

#print(df_pop['Country Name'].tolist())

2

# B. Meta data (Filter non-countries)

Both population and income dataset includes list of countries (217) plus various classifications as entry data. These include:

Task:

- get a list of country codes for *non-countries* from metadata
- use that list to filter those country codes from income or population data



In [312]:
os.listdir("./population/")

# 263 - 46 # total # 217 countries

meta_country = pd.read_csv('./population/Metadata_Country_API_SP.POP.TOTL_DS2_en_csv_v2.csv')
#print(meta_country['Region'].nunique()) #7 excluding nan
print(meta_country['Region'].unique())  
'''['Latin America & Caribbean' 
 'South Asia' 
 'Sub-Saharan Africa'
 'Europe & Central Asia' /// nan 
 'Middle East & North Africa'
 'East Asia & Pacific' 
 'North America']

'''

#print(meta_country['IncomeGroup'].nunique()) #4
meta_country['IncomeGroup'].unique() 
'''['High income', 'Low income', 'Lower middle income', 'Upper middle income', nan]'''

meta_country['Country Code'].nunique() # 263 entries 
meta_country['IncomeGroup'].isnull().value_counts() # 217 False (countries) # 46 nan
# False values are countries -- i.e. all countries belong to an IncomeGroup
# Validation: we know from (World Bank 2017) that total of 217 countries were included


['Latin America & Caribbean' 'South Asia' 'Sub-Saharan Africa'
 'Europe & Central Asia' nan 'Middle East & North Africa'
 'East Asia & Pacific' 'North America']


False    217
True      46
Name: IncomeGroup, dtype: int64

In [329]:
def filter_non_countries(_df, _metadata):
    '''
    _df : pd.DataFrame
        either income or population data
        
    _metadata : pd.DataFrame
        metadata on a list of entries including countries and non-countries 
        data source is from the World Bank
        has IncomeGroup column that is not null for countries (217)
    
    '''
    merged = _df.merge(_metadata, on='Country Code')
    
    non_country_mask = merged['IncomeGroup'].isnull()
    merged_country_only = merged[~non_country_mask]
    return merged_country_only

def select_relevant_cols(_df):
    _cols = ['Country Name'] + ['Country Code'] + [str(yr) for yr in range(1960, 2017)]
    return merged_country_only[_cols]

df_country_only = filter_non_countries(df2, meta_country)
select_relevant_cols(df_country_only).colu

Unnamed: 0,Country Name,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,Aruba,ABW,54211.0,55438.0,56225.0,56695.0,57032.0,57360.0,57715.0,58055.0,...,1.012200e+05,1.013530e+05,1.014530e+05,1.016690e+05,1.020530e+05,1.025770e+05,1.031870e+05,1.037950e+05,1.043410e+05,104822.0
1,Afghanistan,AFG,8996351.0,9166764.0,9345868.0,9533954.0,9731361.0,9938414.0,10152331.0,10372630.0,...,2.661679e+07,2.729403e+07,2.800433e+07,2.880317e+07,2.970860e+07,3.069696e+07,3.173169e+07,3.275802e+07,3.373649e+07,34656032.0
2,Angola,AGO,5643182.0,5753024.0,5866061.0,5980417.0,6093321.0,6203299.0,6309770.0,6414995.0,...,2.099769e+07,2.175942e+07,2.254955e+07,2.336913e+07,2.421856e+07,2.509615e+07,2.599834e+07,2.692047e+07,2.785930e+07,28813463.0
3,Albania,ALB,1608800.0,1659800.0,1711319.0,1762621.0,1814135.0,1864791.0,1914573.0,1965598.0,...,2.970017e+06,2.947314e+06,2.927519e+06,2.913021e+06,2.905195e+06,2.900401e+06,2.895092e+06,2.889104e+06,2.880703e+06,2876101.0
4,Andorra,AND,13411.0,14375.0,15370.0,16412.0,17469.0,18549.0,19647.0,20758.0,...,8.268300e+04,8.386100e+04,8.446200e+04,8.444900e+04,8.375100e+04,8.243100e+04,8.078800e+04,7.922300e+04,7.801400e+04,77281.0
6,United Arab Emirates,ARE,92634.0,101078.0,112472.0,125566.0,138529.0,150362.0,160481.0,170283.0,...,6.044067e+06,6.894278e+06,7.666393e+06,8.270684e+06,8.672475e+06,8.900453e+06,9.006263e+06,9.070867e+06,9.154302e+06,9269612.0
7,Argentina,ARG,20619075.0,20953077.0,21287682.0,21621840.0,21953929.0,22283390.0,22608748.0,22932203.0,...,3.997022e+07,4.038239e+07,4.079941e+07,4.122389e+07,4.165688e+07,4.209674e+07,4.253992e+07,4.298152e+07,4.341776e+07,43847430.0
8,Armenia,ARM,1874120.0,1941491.0,2009526.0,2077575.0,2144998.0,2211316.0,2276031.0,2339124.0,...,2.933056e+06,2.908220e+06,2.888584e+06,2.877311e+06,2.875581e+06,2.881922e+06,2.893509e+06,2.906220e+06,2.916950e+06,2924816.0
9,American Samoa,ASM,20013.0,20486.0,21117.0,21882.0,22698.0,23520.0,24321.0,25116.0,...,5.790300e+04,5.703000e+04,5.622700e+04,5.563700e+04,5.532000e+04,5.523000e+04,5.530700e+04,5.543700e+04,5.553700e+04,55599.0
10,Antigua and Barbuda,ATG,55339.0,56144.0,57144.0,58294.0,59524.0,60781.0,62059.0,63360.0,...,9.138100e+04,9.247800e+04,9.358100e+04,9.466100e+04,9.571900e+04,9.677700e+04,9.782400e+04,9.887500e+04,9.992300e+04,100963.0


In [326]:

   
# entries /rows with no value for IncomeGroup AND no value for Region are
# ones that are not countries (categorization/ region, etc.)
country_mask = meta_country[meta_country['IncomeGroup'].isnull()]
country_mask.shape # 46
#country_mask

#df_pop.join(country_mask, on='Country Code')
meta_country.columns
merged = df2.merge(meta_country, on='Country Code')
merged_country_only = merged[~merged['IncomeGroup'].isnull()]

merged_country_only.shape # 217 / 67
merged_country_only_select_cols = merged_country_only[cols]
#merged_country_only_select_cols

# Just for fun, convert df to xarray
#merged_country_only_select_cols.to_xarray()

### Getting data for country

In [314]:
# Returns all values of the 'Country Name' column as an array
#df2['Country Name'].values #lists all countries
# United States
# China
# India

china = get_row_by_country_name(df2, 'China')
india = get_row_by_country_name(df2, 'India')
usa = get_row_by_country_name(df2, 'United States')

def get_first_and_last_years(_country):
    # _country is a dataframe (1 row)
    return _country[['1960', '2016']]
    
    
get_first_and_last_years(china)
get_first_and_last_years(usa)
get_first_and_last_years(india)

# manual confirmation: these values match online WB values (https://data.worldbank.org/indicator/SP.POP.TOTL?year_high_desc=true)

Unnamed: 0,1960,2016
107,449480608.0,1324171000.0


# GDP data

### GDP per capita PPP (constant at 2011 `$`)

In [315]:
# Open file
df = pd.read_csv('./gdp/gdp_pcap_ppp_constant/API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2.csv', skiprows=3)

In [316]:
# Get variables (column names)
len(df.columns) # 63 variables

df.columns # years 1960-2017

#len(df['Country Name']) # 264 entries (217 are countries and 46 other representations)

df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,Unnamed: 62
0,Aruba,ABW,"GDP per capita, PPP (constant 2011 internation...",NY.GDP.PCAP.PP.KD,,,,,,,...,,,35973.780510,,,,,,,
1,Afghanistan,AFG,"GDP per capita, PPP (constant 2011 internation...",NY.GDP.PCAP.PP.KD,,,,,,,...,1531.173993,1614.255001,1660.739856,1839.273579,1814.155825,1780.382366,1747.978457,1739.583177,,
2,Angola,AGO,"GDP per capita, PPP (constant 2011 internation...",NY.GDP.PCAP.PP.KD,,,,,,,...,5908.051427,5895.114088,5911.254334,5998.638601,6185.013829,6260.132681,6231.067992,6024.726138,,
3,Albania,ALB,"GDP per capita, PPP (constant 2011 internation...",NY.GDP.PCAP.PP.KD,,,,,,,...,9524.609811,9927.135147,10207.700674,10369.761659,10504.093089,10715.329581,11024.915108,11424.628319,,
4,Andorra,AND,"GDP per capita, PPP (constant 2011 internation...",NY.GDP.PCAP.PP.KD,,,,,,,...,,,,,,,,,,
5,Arab World,ARB,"GDP per capita, PPP (constant 2011 internation...",NY.GDP.PCAP.PP.KD,,,,,,,...,14048.787528,14379.003480,14463.512827,14852.626033,15037.569575,15140.192880,15328.978041,15533.694766,,
6,United Arab Emirates,ARE,"GDP per capita, PPP (constant 2011 internation...",NY.GDP.PCAP.PP.KD,,,,,,,...,61118.903796,57579.835168,58404.015591,59813.016525,62532.734910,64126.892086,65975.375548,67133.065522,,
7,Argentina,ARG,"GDP per capita, PPP (constant 2011 internation...",NY.GDP.PCAP.PP.KD,,,,,,,...,17168.378937,18712.063077,19629.351845,19224.874400,19482.190295,18797.547947,19101.297392,18479.442211,,
8,Armenia,ARM,"GDP per capita, PPP (constant 2011 internation...",NY.GDP.PCAP.PP.KD,,,,,,,...,6532.964231,6702.848006,7022.103943,7511.132482,7727.929092,7971.117928,8180.050057,8174.366732,,
9,American Samoa,ASM,"GDP per capita, PPP (constant 2011 internation...",NY.GDP.PCAP.PP.KD,,,,,,,...,,,,,,,,,,


### Get list of years missing for a country



In [317]:
# Series.value_counts() -> get unique counts of values

# Count nan
sorted(df.isnull().sum(axis=1).unique().tolist()) # 17 unique values
# Why do some countries have randomly missing years? (source of inconsistency?)

[32, 33, 34, 35, 36, 37, 38, 39, 42, 43, 44, 46, 49, 51, 58, 59]

In [318]:
# Get all rows 
# Get selected columns (ranging from 1960 to 2017)
df_1960_2017 = df.loc[:, '1960':'2017']

#df_1960_2017.apply(lambda s: s.value_counts(), axis=0)

df_1960_2017.shape # 264, 58 -> 264 countries and 58 years
num_years = 2017 - 1960 + 1 # 58
#print('num years is {}'.format(num_years))
#df_1960_2017.shape[1] # 58 (counts columns) # 1-> colum

df_1960_2017.apply(lambda s: s.value_counts(), axis=0)
#print(len(df_1960_2017.isnull().sum(axis=1).tolist())) # 264 = # countries

# returns list of # nan per country
num_nan_per_country = df_1960_2017.isnull().sum(axis=1).tolist()
print(df_1960_2017.isnull().sum(axis=1).tolist())

list_of_countries = [x for x in df['Country Name'].values]
num_countries = len(list_of_countries)

# not really useful

countries = {list_of_countries[i]: {'num_missing': num_nan_per_country[i]} for i in range(0, num_countries)}
#countries

[57, 43, 31, 31, 58, 31, 31, 31, 31, 58, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 35, 31, 31, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 58, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 58, 58, 57, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 38, 31, 36, 31, 31, 31, 31, 31, 31, 58, 31, 31, 31, 31, 31, 58, 31, 31, 31, 31, 31, 31, 58, 31, 58, 31, 31, 31, 31, 31, 36, 37, 32, 31, 31, 31, 31, 31, 31, 58, 31, 58, 31, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 34, 31, 31, 31, 37, 31, 31, 31, 31, 45, 31, 31, 31, 31, 58, 31, 31, 31, 31, 31, 36, 31, 36, 31, 58, 31, 58, 36, 31, 42, 31, 31, 31, 31, 31, 31, 31, 31, 32, 38, 31, 58, 31, 31, 31, 31, 31, 31, 31, 58, 31, 31, 31, 31, 31, 31, 48, 31, 31, 32, 41, 31, 31, 31, 31, 32, 33, 31, 31, 34, 58, 31, 31, 58, 31, 31, 58, 41, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 58, 58, 36, 31, 50, 31, 41, 41, 31, 33, 36, 31, 31, 57, 31, 58, 58, 31, 31, 31, 31, 31, 31, 31, 31, 41, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,

In [319]:
def get_missing_years(country_name):
    '''
    Given a start_year and an end_year, calculate the number of years in 
    the given range (inclusive at both ends) the country has no
    data points (ie. has nan value)
    
    Parameters
    ----------
    country_name : Str
        name of given country
        
    Returns 
    -------
    Missing years in default range (1960-2017), where 
    each year is a string : List
    
    ex. ['1990',  '2005'']

'''
    country = df.loc[df['Country Name'] == country_name]
    return country

# Get countries as dataframe (subset of original) by first letter
def get_by_first_letter(df, first_letter):
    return df.loc[df['Country Name'].str.startswith(first_letter)]

# Get country by name
def get_row_by_country_name(_df, country_name):
    return _df[_df['Country Name'] == country_name]

# Get country row from dataframe by index
def get_by_index(df, country_idx):
    return df.iloc[country_idx]

# Get all countries that start with S
s_countries = get_by_first_letter(df, 'S')
#s_countries
    
# Get_by_first_letter(df, 'K') # example South Korea is index 124
get_by_index(df, 124)


Country Name                                            Korea, Rep.
Country Code                                                    KOR
Indicator Name    GDP per capita, PPP (constant 2011 internation...
Indicator Code                                    NY.GDP.PCAP.PP.KD
1960                                                            NaN
1961                                                            NaN
1962                                                            NaN
1963                                                            NaN
1964                                                            NaN
1965                                                            NaN
1966                                                            NaN
1967                                                            NaN
1968                                                            NaN
1969                                                            NaN
1970                                            

In [320]:
# Write a function that counts values for each column
# Purpose: to count how many NA values are there in each row
df['1960'].value_counts()
type(df.iloc[0]['1960']) # numpy.float64

# note: comparison of nan == np.nan does not work (returns False)
# need to check np.isnan(val)

# single value check
np.isnan(df.iloc[0]['1960']) # True 

# returns total number of nan values for each column (whole df)
df.isnull().sum() 

# find the countries that have nan values in 1989 and prior years, 
# but don't have 


Country Name        0
Country Code        0
Indicator Name      0
Indicator Code      0
1960              264
1961              264
1962              264
1963              264
1964              264
1965              264
1966              264
1967              264
1968              264
1969              264
1970              264
1971              264
1972              264
1973              264
1974              264
1975              264
1976              264
1977              264
1978              264
1979              264
1980              264
1981              264
1982              264
1983              264
1984              264
1985              264
                 ... 
1989              264
1990               55
1991               53
1992               51
1993               50
1994               49
1995               41
1996               40
1997               39
1998               39
1999               37
2000               32
2001               31
2002               30
2003      

In [321]:
189 + 28 # 217 number of countries

217

## B. Population DATA

### Getting data for country

In [322]:
# Returns all values of the 'Country Name' column as an array
#df2['Country Name'].values #lists all countries
# United States
# China
# India

china = get_row_by_country_name(df2, 'China')
india = get_row_by_country_name(df2, 'India')
usa = get_row_by_country_name(df2, 'United States')

def get_first_and_last_years(_country):
    # _country is a dataframe (1 row)
    return _country[['1960', '2016']]
    
    
get_first_and_last_years(china)
get_first_and_last_years(usa)
get_first_and_last_years(india)

# manual confirmation: these values match online WB values (https://data.worldbank.org/indicator/SP.POP.TOTL?year_high_desc=true)

Unnamed: 0,1960,2016
107,449480608.0,1324171000.0


### Creating timeseries visualization

In [323]:
sns.set(color_codes=True)
x = np.linspace(1960, 2016, 57) # returns array of values from 1960 to 2016 (inclusive)
# 57 determines the total number of values we want (i.e. (2016-1960+1)/57 = 1)
# same as np.arange(1960, 2017, 1) # 1960 to 2017 (not including 2017), specifies step=1
# same as np.arange(1960, 2017) # default step is 1

# For fun, plot China, US, and India population change
#ax = sns.tsplot(data=china)


### Fetching data via API

This approach is abandoned because json data is harder to manipulate for this analysis.
Pandas rocks.

In [324]:
#r = requests.get('http://api.worldbank.org/v2/countries/all/indicators/SP.POP.TOTL?format=json', auth=('user', 'pass'))

In [325]:
#r.headers['Content-Type'] # 'application/json;charset=utf-8'
#r.json()