In [39]:
# Data handling libraries
import pandas as pd
pd.options.display.max_rows = 10000
import numpy as np
import boto3
import io
import requests as req

# --- S3 locations used throughout this notebook ---
# Bucket holding every file this notebook reads or writes.
s3_bucket = "wri-public-data"

# Key prefixes inside the bucket: exported World Bank tables, and lookup tables.
WB_DATA = "resourcewatch/world_bank_data_long_and_wide/"
CONVERSIONS = "resourcewatch/blog_data/GHG-GDP_Divergence_D3/Conversions/"

# boto3 handles: the client is used for downloads, the resource for uploads.
s3_resource = boto3.resource("s3")
s3_client = boto3.client("s3")

# Functions for reading and uploading data to/from S3
def read_from_S3(bucket, key, index_col=0):
    """Download a CSV object from S3 and parse it into a DataFrame.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket.
    key : str
        Key of the CSV object inside the bucket.
    index_col : int, default 0
        Position of the column to use as the DataFrame index.

    Returns
    -------
    pandas.DataFrame
        The object's contents, decoded as UTF-8 and parsed with ``pd.read_csv``.
    """
    response = s3_client.get_object(Bucket=bucket, Key=key)
    raw_bytes = response['Body'].read()
    byte_stream = io.BytesIO(raw_bytes)
    return pd.read_csv(byte_stream, index_col=[index_col], encoding="utf8")

def write_to_S3(df, bucket, key):
    """Serialize a DataFrame to CSV and upload it to S3.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to upload; written with the ``DataFrame.to_csv`` defaults.
    bucket : str
        Name of the destination S3 bucket.
    key : str
        Key under which the CSV text is stored.
    """
    # Render the CSV in memory first so the whole text is uploaded in one put().
    with io.StringIO() as text_buffer:
        df.to_csv(text_buffer)
        csv_text = text_buffer.getvalue()
    target = s3_resource.Object(bucket, key)
    target.put(Body=csv_text)

In [14]:
## World Bank data series codes and names

# Maps each World Bank indicator code to its human-readable series name.
# The names are used as column headers and in the exported file names below.
data_names_and_codes = {
    "EG.ELC.ACCS.ZS": "Access to electricity (% of population)",
    "EG.FEC.RNEW.ZS": "Renewable energy consumption (% of total final energy consumption)",
    "IT.NET.USER.ZS": "Individuals using the Internet (% of population)",
    "NE.CON.PRVT.PC.KD": "Household final consumption expenditure per capita (constant 2010 US$)",
    "NV.IND.TOTL.KD": "Industry, value added (constant 2010 US$)",
    "NY.GDP.TOTL.RT.ZS": "Total natural resources rents (% of GDP)",
    "SG.GEN.PARL.ZS": "Proportion of seats held by women in national parliaments (%)",
    "SL.EMP.TOTL.SP.ZS": "Employment to population ratio, 15+, total (%) (modeled ILO estimate)",
    "SM.POP.NETM": "Net migration",
    "SP.DYN.LE00.IN": "Life expectancy at birth, total (years)",
    "SP.URB.TOTL.IN.ZS": "Urban population (% of total)",
    "TM.VAL.MRCH.CD.WT": "Merchandise imports (current US$)",
    "NY.GDP.MKTP.CD": "GDP (current US$)",
}

In [17]:
# Lookup table read from S3: indexed by the first CSV column (World Bank
# country name), with the ISO3 code in the "ISO" column.
wb_name_to_iso3_conversion = read_from_S3(
    bucket=s3_bucket,
    key=CONVERSIONS + "World Bank to ISO3 name conversion.csv",
)

# Provide function to map from wb_name to ISO3
def add_iso(name):
    """Look up the ISO3 code for a World Bank country name.

    Parameters
    ----------
    name : str
        Country name as it appears in the index of
        ``wb_name_to_iso3_conversion``.

    Returns
    -------
    The value of the "ISO" column for ``name``, or ``np.nan`` when the name
    is not present in the conversion table.
    """
    try:
        return wb_name_to_iso3_conversion.loc[name, "ISO"]
    except (KeyError, TypeError):
        # Only a failed lookup means "no ISO3 code". Anything else (e.g. the
        # conversion table never having been loaded) should surface as an
        # error instead of silently turning every country into NaN.
        return np.nan

In [24]:
# Preview only the first rows: display.max_rows is set to 10000 at the top of
# the notebook, so a bare expression would render the entire lookup table.
wb_name_to_iso3_conversion.head(10)

Unnamed: 0,ISO
Afghanistan,AFG
Albania,ALB
Algeria,DZA
American Samoa,ASM
Andorra,AND
Angola,AGO
Anguila,AIA
Antigua and Barbuda,ATG
Argentina,ARG
Armenia,ARM


## Data in long form

In [21]:
# Endpoint template; the indicator code is substituted for "{}".
WB_API_URL = "http://api.worldbank.org/countries/all/indicators/{}?format=json&per_page=10000"


def fetch_indicator(indicator):
    """Download every page of one World Bank indicator as a long-form frame.

    Results are paginated: the response is a two-element JSON list whose first
    element is paging metadata and whose second element is the list of records
    for that page. All pages are requested so that indicators with more rows
    than ``per_page`` are not truncated.

    Parameters
    ----------
    indicator : str
        World Bank indicator code; must be a key of ``data_names_and_codes``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ("Country Name", "Year") with a single column named after
        the indicator's entry in ``data_names_and_codes``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    ValueError
        If the first page contains no records.
    """
    # pd.io.json.json_normalize is deprecated in newer pandas; prefer the
    # top-level function when it exists and fall back otherwise.
    normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize

    records = []
    page = 1
    while True:
        res = req.get(WB_API_URL.format(indicator), params={"page": page}, timeout=60)
        res.raise_for_status()
        payload = res.json()
        page_records = payload[1] if isinstance(payload, list) and len(payload) > 1 else None
        if not page_records:
            if page == 1:
                raise ValueError("No data returned for indicator {}".format(indicator))
            break
        records.extend(page_records)
        # "pages" is the total page count reported in the paging metadata.
        if page >= int(payload[0].get("pages", 1)):
            break
        page += 1

    data = normalize(records)
    data = data[["country.value", "date", "value"]]
    data.columns = ["Country Name", "Year", data_names_and_codes[indicator]]
    return data.set_index(["Country Name", "Year"])


indicators = list(data_names_and_codes.keys())

# Outer-join the indicators one by one so every (country, year) pair seen in
# any series is kept; column order follows the order of `indicators`.
all_world_bank_data = None
for indicator in indicators:
    print(indicator)
    data = fetch_indicator(indicator)
    if all_world_bank_data is None:
        all_world_bank_data = data
    else:
        all_world_bank_data = all_world_bank_data.join(data, how="outer")

# Attach ISO3 codes and drop rows whose name has no entry in the conversion
# table, then restore the (Country Name, Year) index.
all_world_bank_data = all_world_bank_data.reset_index()
all_world_bank_data["ISO3"] = all_world_bank_data["Country Name"].map(add_iso)
all_world_bank_data = all_world_bank_data.loc[all_world_bank_data["ISO3"].notnull()]
all_world_bank_data = all_world_bank_data.set_index(["Country Name", "Year"])

EG.ELC.ACCS.ZS
EG.FEC.RNEW.ZS
IT.NET.USER.ZS
NE.CON.PRVT.PC.KD
NV.IND.TOTL.KD
NY.GDP.TOTL.RT.ZS
SG.GEN.PARL.ZS
SL.EMP.TOTL.SP.ZS
SM.POP.NETM
SP.DYN.LE00.IN
SP.URB.TOTL.IN.ZS
TM.VAL.MRCH.CD.WT
NY.GDP.MKTP.CD


In [23]:
# Show the first five rows of the combined table as a quick sanity check.
all_world_bank_data.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Access to electricity (% of population),Renewable energy consumption (% of total final energy consumption),Individuals using the Internet (% of population),Household final consumption expenditure per capita (constant 2010 US$),"Industry, value added (constant 2010 US$)",Total natural resources rents (% of GDP),Proportion of seats held by women in national parliaments (%),"Employment to population ratio, 15+, total (%) (modeled ILO estimate)",Net migration,"Life expectancy at birth, total (years)",Urban population (% of total),Merchandise imports (current US$),GDP (current US$),ISO3
Country Name,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
European Union,2016,,,80.7691050337866,19968.1369826023,3988821725434.18,,28.418549063461,52.3869840451532,,,75.0256292946603,5238161617520,16397979816576.1,EUN
European Union,2015,,,78.4329000006293,19583.8758139882,3933732615792.92,0.209417246513485,28.3813206756945,52.0274972322686,,81.1263925006756,74.7994125343407,5240014279500,16334844026788.0,EUN
European Union,2014,100.0,16.053026214829,77.1139914118683,19245.8917008979,3807777737722.81,0.319622359791876,27.7298367973218,51.647309286029,,80.9222240747457,74.5762777055544,6039961349150,18588239232261.0,EUN
European Union,2013,100.0,15.1442661475667,75.5428368269042,19082.6827482441,3731658584714.89,0.373596153789426,27.0906547133931,51.1848462482815,,80.5325582676824,74.3643904260975,5919933634170,18002706275463.5,EUN
European Union,2012,100.0,14.3811492809133,73.7061519154293,19160.1520852802,3776290013765.67,0.441742736714659,25.7243875884594,51.4305598227984,2939221.0,80.2510983094042,74.1557351122214,5863894780480,17271715977529.1,EUN


In [41]:
# Upload one long-form CSV per indicator: the index levels become regular
# columns and rows without a value for that indicator are left out.
for code, name in data_names_and_codes.items():
    long_form = (
        all_world_bank_data[name]
        .reset_index()
        .dropna(subset=[name])
    )
    write_to_S3(
        long_form,
        s3_bucket,
        WB_DATA + "wb_data_long_{}.csv".format(name.replace(" ", "_")),
    )

## Data in wide form

In [45]:
df = all_world_bank_data.reset_index()

# After reset_index the first two columns are the former index levels.
index_by_countryName, columns_by_year = df.columns[:2]

# Everything between those two columns and the trailing ISO column is an
# indicator series; the ISO column itself is not pivoted.
names = df.columns[2:-1]

for name in names:
    # One row per country, one column per year; years with no data at all
    # for this indicator are dropped.
    wide_form = (
        df.pivot(index=index_by_countryName, columns=columns_by_year, values=name)
        .dropna(how="all", axis=1)
        .reset_index()
    )
    wide_form["ISO3"] = [add_iso(country) for country in wide_form["Country Name"]]
    wide_form = wide_form[wide_form["ISO3"].notnull()]
    print(wide_form.head())
    write_to_S3(
        wide_form,
        s3_bucket,
        WB_DATA + "wb_data_wide_{}.csv".format(name.replace(" ", "_")),
    )

Year    Country Name              1990              1991             1992  \
0        Afghanistan              None              None             None   
1            Albania               100               100              100   
2            Algeria  92.9900512695313  93.3932571411133  93.796012878418   
3     American Samoa              None              None             None   
4            Andorra               100               100              100   

Year              1993             1994              1995              1996  \
0                 None             None              None              None   
1                  100              100               100               100   
2     94.1960525512695  94.590690612793  94.9772109985352  95.3529052734375   
3                 None             None              None              None   
4                  100              100               100               100   

Year              1997              1998 ...               200