In [1]:
# Data handling libraries
import pandas as pd
pd.options.display.max_rows = 10000
import numpy as np
import boto3
import io
import requests as req

In [None]:
## Select the World Bank datasets to download, keyed by their unique series code

In [2]:
# World Bank series code -> full indicator title.
data_names_and_codes = {
    'EG.ELC.ACCS.ZS': 'Access to electricity (% of population)',
    'EG.FEC.RNEW.ZS': 'Renewable energy consumption (% of total final energy consumption)',
    'IT.NET.USER.ZS': 'Individuals using the Internet (% of population)',
    'NE.CON.PRVT.PC.KD': 'Household final consumption expenditure per capita (constant 2010 US$)',
    'NV.IND.TOTL.KD': 'Industry, value added (constant 2010 US$)',
    'NY.GDP.TOTL.RT.ZS': 'Total natural resources rents (% of GDP)',
    'SG.GEN.PARL.ZS': 'Proportion of seats held by women in national parliaments (%)',
    'SL.EMP.TOTL.SP.ZS': 'Employment to population ratio, 15+, total (%) (modeled ILO estimate)',
    'SM.POP.NETM': 'Net migration',
    'SP.DYN.LE00.IN': 'Life expectancy at birth, total (years)',
    'SP.URB.TOTL.IN.ZS': 'Urban population (% of total)',
    'TM.VAL.MRCH.CD.WT': 'Merchandise imports (current US$)',
    'NY.GDP.MKTP.CD': 'GDP (current US$)',
}

# Full indicator title -> short identifier used as the value-column name in
# the cells below. Several identifiers stop mid-word; they are kept verbatim
# (presumably a column-name length limit downstream -- TODO confirm).
column_long_name_to_short_name = {
    'Renewable energy consumption (% of total final energy consumption)':
        'renewable_energy_consumption_of_total_final_energy_consumpti',
    'Household final consumption expenditure per capita (constant 2010 US$)':
        'household_final_consumption_expenditure_per_capita_constant_20',
    'Merchandise imports (current US$)':
        'merchandise_imports_current_us_tm_val_mrch_cd_wt',
    'Industry, value added (constant 2010 US$)':
        'industry_value_added_constant_2010_us_nv_ind_totl_kd',
    'Access to electricity (% of population)':
        'access_to_electricity_of_population_eg_elc_accs_zs',
    'Urban population (% of total)':
        'urban_population_of_total_sp_urb_totl_in_zs',
    'Employment to population ratio, 15+, total (%) (modeled ILO estimate)':
        'employment_to_population_ratio_15_total_modeled_ilo_est',
    'Total natural resources rents (% of GDP)':
        'total_natural_resources_rents_of_gdp_ny_gdp_totl_rt_zs',
    'Life expectancy at birth, total (years)':
        'life_expectancy_at_birth_total_years_sp_dyn_le00_in',
    'Net migration':
        'net_migration_sm_pop_netm',
    'Proportion of seats held by women in national parliaments (%)':
        'proportion_of_seats_held_by_women_in_national_parliaments',
    'Individuals using the Internet (% of population)':
        'individuals_using_the_internet_of_population_it_net_user_z',
    'GDP (current US$)':
        'GDP',
}

# Compose the two lookups: series code -> short identifier.
# A title missing from column_long_name_to_short_name raises KeyError here.
series_code_to_data_viz_name = {
    series_code: column_long_name_to_short_name[long_name]
    for series_code, long_name in data_names_and_codes.items()
}

series_code_to_data_viz_name

{'EG.ELC.ACCS.ZS': 'access_to_electricity_of_population_eg_elc_accs_zs',
 'EG.FEC.RNEW.ZS': 'renewable_energy_consumption_of_total_final_energy_consumpti',
 'IT.NET.USER.ZS': 'individuals_using_the_internet_of_population_it_net_user_z',
 'NE.CON.PRVT.PC.KD': 'household_final_consumption_expenditure_per_capita_constant_20',
 'NV.IND.TOTL.KD': 'industry_value_added_constant_2010_us_nv_ind_totl_kd',
 'NY.GDP.MKTP.CD': 'GDP',
 'NY.GDP.TOTL.RT.ZS': 'total_natural_resources_rents_of_gdp_ny_gdp_totl_rt_zs',
 'SG.GEN.PARL.ZS': 'proportion_of_seats_held_by_women_in_national_parliaments',
 'SL.EMP.TOTL.SP.ZS': 'employment_to_population_ratio_15_total_modeled_ilo_est',
 'SM.POP.NETM': 'net_migration_sm_pop_netm',
 'SP.DYN.LE00.IN': 'life_expectancy_at_birth_total_years_sp_dyn_le00_in',
 'SP.URB.TOTL.IN.ZS': 'urban_population_of_total_sp_urb_totl_in_zs',
 'TM.VAL.MRCH.CD.WT': 'merchandise_imports_current_us_tm_val_mrch_cd_wt'}

## Data in long form

In [None]:
# Endpoint for one indicator across all countries and all available years.
# NOTE(review): this is the un-versioned URL the notebook already used; the
# World Bank also documents a versioned "/v2/" form -- confirm which one
# should be used going forward before changing it.
WB_ALL_YEARS_URL = "http://api.worldbank.org/countries/all/indicators/{}?format=json&per_page=10000"
WB_REQUEST_TIMEOUT = 60  # seconds; avoids hanging the kernel on a stalled request

# Human-readable column label for each series code. Keyed by code so the
# final rename never depends on the iteration order of a dict or list.
series_code_to_clean_name = {
    "EG.ELC.ACCS.ZS": "Access to electricity",
    "EG.FEC.RNEW.ZS": "Share of Renewable Energy in Total Consumption",
    "IT.NET.USER.ZS": "Percentage of Individuals Using the Internet",
    "NE.CON.PRVT.PC.KD": "Household Consumption Expenditure per Capita",
    "NV.IND.TOTL.KD": "Industry Value Added",
    "NY.GDP.TOTL.RT.ZS": "Total Natural Resource Rents as Percentage of GDP",
    "SG.GEN.PARL.ZS": "Proportion of Women in National Parliament",
    "SL.EMP.TOTL.SP.ZS": "Employment to Population Ratio",
    "SM.POP.NETM": "Net Migration",
    "SP.DYN.LE00.IN": "Life Expectancy at Birth",
    "SP.URB.TOTL.IN.ZS": "Urban Population",
    "TM.VAL.MRCH.CD.WT": "Merchandise Import Value",
    "NY.GDP.MKTP.CD": "GDP",
}


def fetch_indicator_series(indicator, value_name):
    """Download one World Bank indicator, all pages, as a long-form frame.

    Parameters
    ----------
    indicator : str
        World Bank series code, e.g. "NY.GDP.MKTP.CD".
    value_name : str
        Name given to the single value column of the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ("Country Name", "Year") with one column, `value_name`.

    Raises
    ------
    requests.HTTPError
        If the API answers with a non-2xx status.
    ValueError
        If the body is not the expected two-element [metadata, records] list.
    """
    rows = []
    page, pages = 1, 1
    while page <= pages:
        url = WB_ALL_YEARS_URL.format(indicator) + "&page={}".format(page)
        res = req.get(url, timeout=WB_REQUEST_TIMEOUT)
        res.raise_for_status()
        payload = res.json()
        if not isinstance(payload, list) or len(payload) < 2:
            raise ValueError(
                "Unexpected World Bank response for {}: {!r}".format(indicator, payload)
            )
        meta, records = payload[0], payload[1]
        # The metadata element reports how many pages exist. Reading only the
        # first page silently drops every row past `per_page`.
        pages = int(meta.get("pages") or 1)
        rows.extend(
            (rec["country"]["value"], rec["date"], rec["value"])
            for rec in (records or [])
        )
        page += 1

    frame = pd.DataFrame(rows, columns=["Country Name", "Year", value_name])
    return frame.set_index(["Country Name", "Year"])


indicators = list(series_code_to_data_viz_name.keys())

# Outer-join every indicator on (Country Name, Year) so that a country/year
# missing from one series is kept as NaN instead of being dropped.
all_world_bank_data = None
for indicator in indicators:
    print(indicator)
    data = fetch_indicator_series(indicator, series_code_to_data_viz_name[indicator])
    if all_world_bank_data is None:
        all_world_bank_data = data
    else:
        all_world_bank_data = all_world_bank_data.join(data, how="outer")

# Rename by explicit mapping, not by position.
viz_name_to_clean_name = {
    series_code_to_data_viz_name[code]: series_code_to_clean_name[code]
    for code in indicators
}
all_world_bank_data = all_world_bank_data.rename(columns=viz_name_to_clean_name)

# Kept as a list because the export cell iterates over it.
clean_column_names = [series_code_to_clean_name[code] for code in indicators]

In [None]:
# Write one CSV per indicator, containing only the rows where that indicator
# has a value. The index levels come back as ordinary columns via
# reset_index(); the remaining integer index is written out by to_csv as well.
csv_path_template = "/Users/nathansuberi/Desktop/RW_Data/wb_data/wb_data_{}.csv"

for column_label in clean_column_names:
    indicator_rows = (
        all_world_bank_data[column_label]
        .reset_index()
        .dropna(subset=[column_label])
    )
    # Spaces in the label become underscores in the file name.
    indicator_rows.to_csv(csv_path_template.format(column_label.replace(" ", "_")))

## Data in wide form

In [None]:
# Endpoint for one indicator across all countries, restricted to 1999-2016.
# 1999 is requested only so that a 2000 percent change can be computed for
# GDP; both 1999 and 2016 are dropped from the final table below.
WB_1999_2016_URL = "http://api.worldbank.org/countries/all/indicators/{}?date=1999:2016&format=json&per_page=10000"
WB_WIDE_REQUEST_TIMEOUT = 60  # seconds


def fetch_indicator_records(indicator):
    """Return (country name, year, value) tuples for one indicator, 1999-2016.

    Follows every page reported in the response metadata, so the result is
    complete even if it does not fit in a single `per_page` window.

    Raises
    ------
    requests.HTTPError
        If the API answers with a non-2xx status.
    ValueError
        If the body is not the expected two-element [metadata, records] list.
    """
    rows = []
    page, pages = 1, 1
    while page <= pages:
        url = WB_1999_2016_URL.format(indicator) + "&page={}".format(page)
        res = req.get(url, timeout=WB_WIDE_REQUEST_TIMEOUT)
        res.raise_for_status()
        payload = res.json()
        if not isinstance(payload, list) or len(payload) < 2:
            raise ValueError(
                "Unexpected World Bank response for {}: {!r}".format(indicator, payload)
            )
        meta, records = payload[0], payload[1]
        pages = int(meta.get("pages") or 1)
        rows.extend(
            (rec["country"]["value"], rec["date"], rec["value"])
            for rec in (records or [])
        )
        page += 1
    return rows


indicators = series_code_to_data_viz_name

# NOTE(review): `add_iso` is not defined in the cells visible here. It must be
# defined by an earlier cell (country name -> ISO code, apparently returning a
# null value for names it does not recognise -- confirm) or this cell fails
# on a fresh kernel.

# Collect one wide frame per indicator and concatenate once at the end.
# DataFrame.append in a loop copies the growing table on every iteration and
# is no longer available in pandas 2.x.
wide_frames = []
for indicator in indicators:
    print(indicator)
    value_name = series_code_to_data_viz_name[indicator]
    data = pd.DataFrame(
        fetch_indicator_records(indicator),
        columns=["Country Name", "Year", value_name],
    )
    # One row per country, one column per year (year labels are strings).
    data = data.pivot(index="Country Name", columns="Year", values=value_name).astype(float)
    data["ISO"] = [add_iso(country) for country in data.index]
    # Keep only rows that received an ISO code. copy() makes the filtered
    # frame independent so the column assignments below are unambiguous.
    data = data.loc[pd.notnull(data["ISO"])].copy()
    data["Indicator"] = value_name
    wide_frames.append(data)

    if indicator == "NY.GDP.MKTP.CD":
        # Year-over-year GDP change, stored as an extra pseudo-indicator.
        year_cols = [str(yr) for yr in range(1999, 2016)]
        # The first year has no predecessor, so its change is dropped.
        gdp_change = data.loc[:, year_cols].pct_change(axis=1).loc[:, year_cols[1:]]
        gdp_change["Indicator"] = "GDP percent change"
        gdp_change["ISO"] = [add_iso(country) for country in gdp_change.index]
        wide_frames.append(gdp_change)

# sort=False keeps the column order of the first frame (years, ISO, Indicator).
all_world_bank_data = pd.concat(wide_frames, sort=False)
all_world_bank_data.index.name = "Country Name"
all_world_bank_data = all_world_bank_data.reset_index()
all_world_bank_data.columns.name = ""
all_world_bank_data = all_world_bank_data.drop(["1999", "2016"], axis=1)