In [120]:
import os
from collections import OrderedDict

import numpy as np
import pandas as pd
import requests
import seaborn as sns
import xarray as xr

% matplotlib inline

### WB Data Analysis

**<span style="color:red; background:yellow;"></span>**

**Data & Scope:** downloaded on (date/time)
Two types of datasets:

1) Population (easy)
* years 1960-2016
* definition: total # residents (Regardless of citizenship/legal status)
* sources: (mid-year value)

    1) United Nations Population Division. World Population Prospects
    
    2) Census reports and other statistical publications from national statistical offices
    
    3) Eurostat: Demographic Statistics
    
    4) United Nations Statistical Division. Population and Vita

2) GDP (slightly more tricky)
- years: 
- version:
- variables

**Source:** World Bank ()

**Assumptions/Expectations**
-

**Analysis Goals**
- [] Task 3 Deliverable: ADM0 population & real income estimates from `1950-2017`
- [] Evaluate which years data exist for (and that `1950-2017` are present). 
- [] Note any missing *years*
- [] Evaluate if missing *countries* in any years. why?
- [] Quantify any data availability differences between countries (years per country, etc.)
- [] Compare WB (Chicago) data with WB (from website)

Possible convenience functions:

* Find years missing for a given country
* Return missing countries for a year
* Evaluate whether 1950-2017 data is present
  given a range, evaluate whether data exists for the range
  * per country
  * all countries
* ...
* Get a starting year for each country
* Calculate slope of population increase per country -> rank depending on country growth

*Currently Out of Scope*
- future projection

**Conclusion**
1. Population. Population data exists from 1960 to 2016. 
    There are few 'nan'/missing population values each year. Number of missing values change six times:
```  
4 (60-89) -> 2 (90-91)-> 3 (92-94) -> 2 (95-97) -> 1 (98-'11)-> 2 ('12-'16)-> 264 ('17)
```
**Task: Find out why numbers go missing those years**
2. GDP


**Questions**
1. How are different sources of population data used to result in one final set? (are there any overlaps between 4 sources?) i.e. what is the methodology for compilation?

2. How often are population data updated?



In [11]:
# Open file
df = pd.read_csv('./WB_API_v2/API_NY.GDP.MKTP.PP.CD_DS2_en_csv_v2.csv', skiprows=3)

In [36]:
# Get variables (column names)
len(df.columns) # 63 variables

df.columns # years 1960-2017

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', 'Unnamed: 62'],
      dtype='object')

### Get list of years missing for a country



In [152]:
# Series.value_counts() -> get unique counts of values

# Count nan
sorted(df.isnull().sum(axis=1).unique().tolist()) # 17 unique values
# Why do some countries have randomly missing years? (source of inconsistency?)

[32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 43, 44, 46, 49, 51, 58, 59]

In [110]:
# Get all rows 
# Get selected columns (ranging from 1960 to 2017)
df_1960_2017 = df.loc[:, '1960':'2017']

#df_1960_2017.apply(lambda s: s.value_counts(), axis=0)

df_1960_2017.shape # 264, 58 -> 264 countries and 58 years
num_years = 2017 - 1960 + 1 # 58
#print('num years is {}'.format(num_years))
#df_1960_2017.shape[1] # 58 (counts columns) # 1-> colum

df_1960_2017.apply(lambda s: s.value_counts(), axis=0)
#print(len(df_1960_2017.isnull().sum(axis=1).tolist())) # 264 = # countries

# returns list of # nan per country
num_nan_per_country = df_1960_2017.isnull().sum(axis=1).tolist()
print(df_1960_2017.isnull().sum(axis=1).tolist())

list_of_countries = [x for x in df['Country Name'].values]
num_countries = len(list_of_countries)

countries = {list_of_countries[i]: {'num_missing': num_nan_per_country[i]} for i in range(0, num_countries)}

countries

[57, 43, 31, 31, 58, 31, 31, 31, 31, 58, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 35, 31, 31, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 58, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 58, 58, 57, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 38, 31, 36, 31, 31, 31, 31, 31, 31, 58, 31, 31, 31, 31, 31, 58, 31, 31, 31, 31, 31, 31, 58, 31, 58, 31, 31, 31, 31, 31, 36, 37, 32, 31, 31, 31, 31, 31, 31, 58, 31, 58, 31, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 39, 45, 31, 31, 31, 31, 58, 31, 31, 31, 31, 31, 36, 31, 36, 31, 58, 31, 58, 36, 31, 42, 31, 31, 31, 31, 31, 31, 31, 31, 32, 41, 31, 58, 31, 31, 31, 31, 31, 31, 31, 58, 31, 31, 31, 31, 31, 31, 48, 31, 31, 32, 41, 31, 31, 31, 31, 32, 33, 31, 31, 34, 58, 31, 31, 58, 31, 31, 58, 41, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 58, 58, 36, 31, 50, 31, 41, 42, 31, 33, 36, 31, 31, 57, 31, 58, 58, 31, 31, 31, 31, 31, 31, 31, 31, 42, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,

{'Afghanistan': {'num_missing': 43},
 'Albania': {'num_missing': 31},
 'Algeria': {'num_missing': 31},
 'American Samoa': {'num_missing': 58},
 'Andorra': {'num_missing': 58},
 'Angola': {'num_missing': 31},
 'Antigua and Barbuda': {'num_missing': 31},
 'Arab World': {'num_missing': 31},
 'Argentina': {'num_missing': 31},
 'Armenia': {'num_missing': 31},
 'Aruba': {'num_missing': 57},
 'Australia': {'num_missing': 31},
 'Austria': {'num_missing': 31},
 'Azerbaijan': {'num_missing': 31},
 'Bahamas, The': {'num_missing': 31},
 'Bahrain': {'num_missing': 32},
 'Bangladesh': {'num_missing': 31},
 'Barbados': {'num_missing': 31},
 'Belarus': {'num_missing': 31},
 'Belgium': {'num_missing': 31},
 'Belize': {'num_missing': 31},
 'Benin': {'num_missing': 31},
 'Bermuda': {'num_missing': 34},
 'Bhutan': {'num_missing': 31},
 'Bolivia': {'num_missing': 31},
 'Bosnia and Herzegovina': {'num_missing': 35},
 'Botswana': {'num_missing': 31},
 'Brazil': {'num_missing': 31},
 'British Virgin Islands':

In [220]:
def get_missing_years(country_name, start_year, end_year):
    '''
    Given a start_year and an end_year, calculate the number of years in 
    the given range (inclusive at both ends) the country has no
    data points (ie. has nan value)
    
    Parameters
    ----------
    country_name : Str
        name of given co=untry
             name ofc duu
               str_format
            
    
    Returns 
    -------
    Missing years, each year is a string : List
    
    ex. ['1990',  '2005'']

'''
    country = df.loc[df['Country Name'] == country_name]
    return country

# Get countries as dataframe (subset of original) by first letter
def get_by_first_letter(df, first_letter):
    return df.loc[df['Country Name'].str.startswith(first_letter)]

# Get country by name
def get_row_by_country_name(_df, country_name):
    return _df[_df['Country Name'] == country_name]

# Get country row from dataframe by index
def get_by_index(df, country_idx):
    return df.iloc[country_idx]

# Get all countries that start with S
s_countries = get_by_first_letter(df, 'S')
#s_countries
    
#get_by_first_letter(df, 'K') # example South Korea is inde 124
get_by_index(df, 124)

from collections import OrderedDict
dd = OrderedDict()

def count_num_missing_years():
    dd[df['Country Name']: df['']]    



In [26]:
# Write a function that counts values for each column
# Purpose: to count how many NA values are there in each row
df['1960'].value_counts()
type(df.iloc[0]['1960']) # numpy.float64

# note: comparison of nan == np.nan does not work (returns False)
# need to check np.isnan(val)

# single value check
np.isnan(df.iloc[0]['1960']) # True 

# returns total number of nan values for each column (whole df)
df.isnull().sum() 

# find the countries that have nan values in 1989 and prior years, 
# but don't have 


Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', 'Unnamed: 62'],
      dtype='object')

In [111]:
189 + 28 # 217 number of countries

217

## B. Population DATA

In [165]:
os.listdir('./')

df2 = pd.read_csv('./WB_total_pop_v2/API_SP.POP.TOTL_DS2_en_csv_v2.csv', skiprows=3)
df2.columns
del df2['Unnamed: 62'] # remove extraneous data

In [187]:
#df2['2017'].unique() # returns array([nan]) -> all 2017 values currently nan

#df2.head(5)   # Show top 5 rows
#df2.isnull().sum()
# note that nan values per year changes from 1-4, 1989 and prior are all 4,
# after that there is a shifts six times:
# 4 (60-89) -> 2 (90-91)-> 3 (92-94) -> 2 (95-97) -> 1 (98-2011)-> 2 (2012-2016)-> 264 (2017)

df.shape # 264 rows and 63 columns -> 264 countries

(264, 63)

### Getting data for country

In [None]:
# Returns all values of the 'Country Name' column as an array
#df2['Country Name'].values #lists all countries
# United States
# China
# India

china = get_row_by_country_name(df2, 'China')
india = get_row_by_country_name(df2, 'India')
usa = get_row_by_country_name(df2, 'United States')

def get_first_and_last_years(_country):
    # _country is a dataframe (1 row)
    return _country[['1960', '2016']]
    
    
get_first_and_last_years(china)
get_first_and_last_years(usa)
get_first_and_last_years(india)

# manual confirmation: these values match online WB values (https://data.worldbank.org/indicator/SP.POP.TOTL?year_high_desc=true)

### Creating timeseries visualization

In [229]:
sns.set(color_codes=True)
x = np.linspace(1960, 2016, 57) # returns array of values from 1960 to 2016 (inclusive)
# 57 determines the total number of values we want (i.e. (2016-1960+1)/57 = 1)
# same as np.arange(1960, 2017, 1) # 1960 to 2017 (not including 2017), specifies step=1
# same as np.arange(1960, 2017) # default step is 1

# For fun, plot China, US, and India population change
#ax = sns.tsplot(data=china)


In [239]:
china_x = china[:,'1960':'2016']

# does not work
# def get_years_1960_2016(_df):
#     return _df.loc[:, '1960' : '2016']

# get_years_1960_2016(china)


TypeError: unhashable type: 'slice'

In [214]:
sns.set(color_codes=True)

sns.tsplot(china, x=)
'''
sns.tsplot
 sns.set(color_codes=True)
>>> x = np.linspace(0, 15, 31)
>>> data = np.sin(x) + np.random.rand(10, 31) + np.random.randn(10, 1)
>>> ax = sns.tsplot(data=data)

>>> gammas = sns.load_dataset("gammas")
>>> ax = sns.tsplot(time="timepoint", value="BOLD signal",
...                 unit="subject", condition="ROI",
...                 data=gammas)

'''

'\nsns.tsplot\n sns.set(color_codes=True)\n>>> x = np.linspace(0, 15, 31)\n>>> data = np.sin(x) + np.random.rand(10, 31) + np.random.randn(10, 1)\n>>> ax = sns.tsplot(data=data)\n\n>>> gammas = sns.load_dataset("gammas")\n>>> ax = sns.tsplot(time="timepoint", value="BOLD signal",\n...                 unit="subject", condition="ROI",\n...                 data=gammas)\n\n'

### Fetching data via API

This approach is abandoned because json data is harder to manipulate for this analysis.
Pandas rocks.

In [130]:
#r = requests.get('http://api.worldbank.org/v2/countries/all/indicators/SP.POP.TOTL?format=json', auth=('user', 'pass'))

In [133]:
#r.headers['Content-Type'] # 'application/json;charset=utf-8'
#r.json()