In [3]:
import requests as req
import pandas as pd
import numpy as np
from rw_api_tools.rw_api_tools import rw_api_tools as rw
# Instantiate the API helper. Note that this rebinds the name `rw` from the
# imported class to an instance of it.
rw = rw()

In [4]:
# Raise pandas' display limits so that up to 4000 rows / 4000 columns are rendered.
pd.set_option("display.max_rows", 4000)
pd.set_option("display.max_columns", 4000)

In [None]:
# Ask the RW API helper for the datasets whose provider is "cartodb".
# NOTE(review): the result is used below like a DataFrame (.loc / .index) — confirm.
carto_data = rw.get_rw_datasets(provider="cartodb")

In [None]:
# Keep only the catalogue rows whose table name contains "wb".
# A vectorised, literal (non-regex) substring match replaces the per-row .loc
# lookups: rows with a missing table_name count as non-matches instead of
# raising TypeError, column dtypes are preserved, and .copy() gives the later
# column assignments an independent frame to write into.
wb_datasets = carto_data.loc[
    carto_data["table_name"].str.contains("wb", regex=False, na=False)
].copy()

In [None]:
# Display the filtered dataset listing for inspection.
wb_datasets

In [None]:
def create_query(rw_id, table_name):
    """Return the RW API query URL that selects every row of a table.

    rw_id      -- dataset identifier placed in the URL path.
    table_name -- table named in the ``SELECT * FROM`` statement.
    """
    sql_template = "SELECT * FROM {}"
    url_template = "https://api.resourcewatch.org/v1/query/{}?sql={}"
    return url_template.format(rw_id, sql_template.format(table_name))

# One query URL per dataset row, built from its rw_id / table_name pair.
wb_datasets["query"] = [
    create_query(rw_id, table_name)
    for rw_id, table_name in zip(wb_datasets["rw_id"], wb_datasets["table_name"])
]

In [None]:
def retrieve_data(query):
    """Run one RW API query and return the ``"data"`` entry of its JSON body.

    Prints the query URL before the request and the number of results after.

    Raises:
        requests.HTTPError: when the API answers with a 4xx/5xx status.
    """
    print("RW API query:", query)
    res = req.get(query)
    # Surface HTTP failures directly; without this an error response only
    # shows up as an opaque KeyError / JSON decoding error on the next line.
    res.raise_for_status()
    data = res.json()["data"]
    print("Num results: ", len(data))
    return data

# Download the records for every query, in row order.
wb_datasets["data"] = [retrieve_data(query) for query in wb_datasets["query"]]

In [None]:
# Seed the combined table with the first dataset's records, ordered by time
# and then country code.
starter = pd.DataFrame(wb_datasets["data"].iloc[0])
starter = starter.sort_values(by=["time", "country_code"])

# Index the rows by (year, country_code), taken from the time / country_code columns.
tuples = list(zip(starter["time"], starter["country_code"]))
multi_index = pd.MultiIndex.from_tuples(tuples, names=["year", "country_code"])
starter.index = multi_index

# Drop the geometry/bookkeeping columns and the two columns now held in the index.
starter = starter.drop(
    ["the_geom", "the_geom_webmercator", "cartodb_id", "time_code", "time", "country_code"],
    axis=1,
)

In [None]:
def add_wb_col(old_data, new_data):
    """Join the value columns of one more dataset onto `old_data`.

    `new_data` is turned into a DataFrame, sorted by time and country code,
    indexed by (year, country_code) and stripped of its geometry, bookkeeping,
    key and country_name columns; what remains is left-joined onto `old_data`
    by index.
    """
    discard = ["the_geom", "the_geom_webmercator", "cartodb_id", "time_code",
               "time", "country_code", "country_name"]

    frame = pd.DataFrame(new_data).sort_values(by=["time", "country_code"])
    index_pairs = list(zip(frame["time"], frame["country_code"]))
    frame.index = pd.MultiIndex.from_tuples(index_pairs, names=["year", "country_code"])

    return old_data.join(frame.drop(discard, axis=1))

# Start from the first dataset and join every remaining dataset onto it in order.
wb_data = starter

for dataset_records in wb_datasets["data"].iloc[1:]:
    wb_data = add_wb_col(wb_data, dataset_records)

In [None]:
# Move the index levels of the joined table back into ordinary columns.
data_for_nate = wb_data.reset_index()

In [None]:
# Replace every ".." entry with NaN.
# NOTE(review): ".." presumably marks missing values in the source data — confirm.
data_for_nate = data_for_nate.replace("..", np.nan)

In [None]:
# Array of the country_code column, one entry per row of data_for_nate.
data_for_nate_iso3 = data_for_nate["country_code"].values

In [None]:
# Import country boundary shapefile to make sure unique ID matches
country_boundaries = pd.read_json("/Users/nathansuberi/Desktop/RW_Data/all_primary_countries.json")["features"]
# pandas.io.json.json_normalize is deprecated since pandas 1.0 in favour of the
# top-level pd.json_normalize; prefer the new name and fall back to the old
# location only on installs that predate it.
_json_normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize
# Flatten each feature into one row with dotted column names.
country_boundaries = _json_normalize(country_boundaries)

In [None]:
# Preview the first rows of the flattened boundary table.
country_boundaries.head()

In [None]:
# Array of the "properties.iso_a3" column of the flattened boundary table.
shapefile_iso3 = country_boundaries["properties.iso_a3"].values

In [None]:
# Display the boundary-file codes for inspection.
shapefile_iso3

In [None]:
# Number of entries in each code array, printed one per line.
data_for_nate_iso3_total_pos_matches = data_for_nate_iso3.shape[0]
shapefile_iso3_total_pos_matches = shapefile_iso3.shape[0]

print(data_for_nate_iso3_total_pos_matches, shapefile_iso3_total_pos_matches, sep="\n")

In [None]:
# Distinct codes found on both sides, computed once from each direction.
data_shapefile_matches = np.unique(
    list(filter(lambda iso: iso in shapefile_iso3, data_for_nate_iso3))
)
shapefile_data_matches = np.unique(
    list(filter(lambda iso: iso in data_for_nate_iso3, shapefile_iso3))
)

print(len(data_shapefile_matches), len(shapefile_data_matches), sep="\n")

# Size of the overlap between the two match sets.
len(np.unique(list(filter(lambda iso: iso in shapefile_data_matches, data_shapefile_matches))))

In [None]:
## Load CDIAC Data and make sure it matches here too

# Folder holding the CDIAC CSV; the same folder is reused later in the
# notebook as the destination of wb_data.csv.
root_folder = "/Users/nathansuberi/Desktop/WRI_Programming/Py_Scripts/Data Packs/Materials for Nate/"
file_name = root_folder + "wb_names_only_with_iso_territory_gcb.csv"

cdiac_data = pd.read_csv(file_name)
# Array of the "ISO" column and its number of entries.
cdiac_data_iso3 = cdiac_data["ISO"].values
cdiac_data_iso3_total_pos_matches = len(cdiac_data_iso3)

In [None]:
print(cdiac_data_iso3_total_pos_matches)

# CDIAC codes absent from the shapefile and from the joined API data.
cdiac_shapefile_misses = list(filter(lambda iso: iso not in shapefile_iso3, cdiac_data_iso3))
cdiac_wb_misses = list(filter(lambda iso: iso not in data_for_nate_iso3, cdiac_data_iso3))

# CDIAC codes present in the shapefile and in the joined API data.
cdiac_shapefile_matches = list(filter(lambda iso: iso in shapefile_iso3, cdiac_data_iso3))
cdiac_wb_matches = list(filter(lambda iso: iso in data_for_nate_iso3, cdiac_data_iso3))

# Number of distinct matching codes on each side, one per line.
print(len(np.unique(cdiac_shapefile_matches)), len(np.unique(cdiac_wb_matches)), sep="\n")

In [None]:
# Display the distinct shapefile codes that also occur in the joined data.
shapefile_data_matches

In [None]:
# Boolean flag per row: is the row's country code one of the matched codes?
keep = list(map(lambda iso: iso in shapefile_data_matches, data_for_nate_iso3))
data_for_nate_keep = data_for_nate.loc[keep]

# Number of distinct country codes that survive the filter.
len(np.unique(data_for_nate_keep["country_code"]))

In [None]:
# Display the CDIAC codes with no match in the shapefile; the author's notes
# on which entries these are follow.
cdiac_shapefile_misses
# Zaire
# Bermuda
# Greenland
# Hong Kong
# Macau
# Romania
# World

In [None]:
# Display the CDIAC codes with no match in the joined API data; the author's
# notes on which entries these are follow.
cdiac_wb_misses 
# Zaire
# Romania

In [None]:
# Write the filtered table to wb_data.csv inside root_folder.
data_for_nate_keep.to_csv(root_folder + "wb_data.csv")

In [None]:
# Display the filtered table.
data_for_nate_keep

In [None]:
### From Nate's excel file, with tabs

In [None]:
# Open the Excel workbook; its sheets are parsed one by one later in the notebook.
wb_data_from_nate = pd.ExcelFile("/Users/nathansuberi/Desktop/RW_Data/correlations 9.26.17.xlsx")

In [None]:
### From Nate's csv, no tabs

# Make sure same columns are correct

# Make a table:
## rows = wb variables
## columns = variable name, whether it is % change or absolute

# Send to just Nate, with date at end of file name

# Add column for 2000-2015 % change


In [140]:
# Load the compiled export: one row per (series, country), one column per year.
wb_data_joined = pd.read_csv("/Users/nathansuberi/Desktop/RW_Data/compiled independent variable absolute data 1999-2015.csv")

# The last three distinct "Series Name" values are removed below.
# NOTE(review): presumably footer/metadata rows of the export — confirm.
indicators_to_drop = wb_data_joined["Series Name"].unique()[-3:]

# Shorten the year headers to plain year strings.
wb_data_joined.columns = ["Series Name","Series Code", "Country Name", "Country Code",'1999', '2000', '2001',
               '2002','2003','2004','2005',
               '2006', '2007', '2008', '2009',
               '2010', '2011', '2012', '2013',
               '2014', '2015', '2016']

# Index by series name, drop the unwanted series, turn ".." into NaN and
# discard the descriptive columns that are not needed downstream.
wb_data_joined = (
    wb_data_joined
    .set_index("Series Name")
    .drop(indicators_to_drop)
    .replace("..", np.nan)
    .drop(["Series Code", "Country Name"], axis=1)
)

time_steps = [str(year) for year in range(1999, 2017)]
countries = wb_data_joined["Country Code"].unique()

# One (country, year) pair per output row, countries outermost.
tuples = [(country, time_step) for country in countries for time_step in time_steps]

# The level names must follow the order of the pairs: country first, year
# second. They were previously given as ["year", "country_code"], which
# labelled each level (and the exported CSV columns) with the other's name.
multi_index = pd.MultiIndex.from_tuples(tuples, names=["country_code", "year"])

In [36]:
#wb_data_joined.index.unique()
#time_steps
#countries

In [141]:
# Maps each indicator's series name to the output column name used for it.
column_name = {
    'Renewable energy consumption (% of total final energy consumption)': 'renewable_energy_consumption_of_total_final_energy_consumpti',
    'Household final consumption expenditure per capita (constant 2010 US$)': 'household_final_consumption_expenditure_per_capita_constant_20',
    'Merchandise imports (current US$)': 'merchandise_imports_current_us_tm_val_mrch_cd_wt',
    'Industry, value added (constant 2010 US$)': 'industry_value_added_constant_2010_us_nv_ind_totl_kd',
    'Access to electricity (% of population)': 'access_to_electricity_of_population_eg_elc_accs_zs',
    'Urban population (% of total)': 'urban_population_of_total_sp_urb_totl_in_zs',
    'Employment to population ratio, 15+, total (%) (modeled ILO estimate)': 'employment_to_population_ratio_15_total_modeled_ilo_est',
    'Total natural resources rents (% of GDP)': 'total_natural_resources_rents_of_gdp_ny_gdp_totl_rt_zs',
    'Life expectancy at birth, total (years)': 'life_expectancy_at_birth_total_years_sp_dyn_le00_in',
    'Net migration': 'net_migration_sm_pop_netm',
    'Proportion of seats held by women in national parliaments (%)': 'proportion_of_seats_held_by_women_in_national_parliaments',
    'Individuals using the Internet (% of population)': 'individuals_using_the_internet_of_population_it_net_user_z'
}

# Indicators whose yearly values are converted to year-over-year percent change.
pct_change_indicators = ['Renewable energy consumption (% of total final energy consumption)',
    'Industry, value added (constant 2010 US$)',
    'Merchandise imports (current US$)']

# Ordered list of indicators to process. Later cells index and iterate over
# `indicators`, but no cell in the notebook defines it, so a fresh kernel
# fails with NameError. Deriving it from column_name (insertion order)
# reproduces the order shown in the recorded cell output.
indicators = list(column_name)

def add_indicator_column(indicator):
    """Reshape one indicator into a single column indexed by `multi_index`.

    Reads the rows of `wb_data_joined` labelled `indicator`, keys them by
    "Country Code" and casts the year columns to float. Indicators listed in
    `pct_change_indicators` are converted to year-over-year percent change.
    One value is then looked up per (country, year) pair in `tuples`.

    Returns a one-column DataFrame named `column_name[indicator]` whose index
    is `multi_index`.
    """
    data = wb_data_joined.loc[indicator].set_index("Country Code").astype(float)
    if indicator in pct_change_indicators:
        # fill_method=None: pct_change forward-fills NaN by default, which
        # reports a fabricated 0% change for every missing year (and a change
        # measured against a stale value after a gap). Missing years must
        # stay missing.
        data = data.pct_change(axis=1, fill_method=None)

    # Each pair in `tuples` is (country, year); avoid shadowing the builtin `tuple`.
    values = [data.loc[country, year] for country, year in tuples]
    data_reformed = pd.DataFrame(values, columns=[column_name[indicator]])
    data_reformed.index = multi_index
    return data_reformed

In [142]:
# Preview the first indicator as a float table keyed by country code.
wb_data_joined.loc[indicators[0]].set_index("Country Code").astype(float)

Unnamed: 0_level_0,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ALB,42.718314,41.445409,39.125663,35.896295,33.752734,35.935872,36.869483,31.710194,32.100937,35.912903,37.216639,37.11533,35.962531,40.048297,41.288978,38.689497,,
AND,14.549895,14.890669,15.773873,16.221819,16.912308,16.874424,16.902442,17.485996,16.940777,17.42274,17.515951,19.090727,18.971541,19.195534,19.563701,19.886327,,
AGO,73.410799,74.618176,73.75785,72.125121,67.30612,65.493058,70.954187,65.022575,61.59969,58.107985,55.748971,54.193825,52.715687,52.245733,50.686111,50.797466,,
ATG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
ARG,10.306173,11.077953,13.040375,13.287981,12.518129,8.348046,8.957602,10.187843,8.88765,7.609687,8.57406,8.961446,10.116848,9.8667,10.220855,10.773125,,
ARM,8.358797,7.164263,5.419113,8.899368,9.811615,8.707899,6.491174,7.652836,7.017262,6.405168,7.800556,9.359817,8.009615,6.565712,6.771218,7.724734,,
AUS,8.507273,8.423342,8.370282,8.738721,7.139003,6.668828,6.711976,6.857191,6.951586,6.797059,6.765441,8.552761,8.472198,8.443705,9.320593,9.497568,,
AUT,26.438306,26.387853,25.483679,24.684865,22.578798,23.219369,24.064839,26.346414,28.815659,29.495122,30.773628,30.948879,31.272889,33.701203,34.687002,35.783927,,
AZE,2.143936,2.065646,1.752523,2.351899,2.944782,3.070785,3.365196,2.861587,3.7887,3.086307,3.269145,4.449734,3.556332,2.845522,2.526095,2.116818,,
BHS,,,0.449768,0.41596,1.688629,1.704699,1.836391,2.168173,3.706604,2.146394,1.915094,1.775283,1.539654,1.233432,0.843903,1.098197,,


In [143]:
# Build the combined table: one column per indicator, joined on the shared index.
starter_data = add_indicator_column(indicators[0])

for indicator in indicators[1:]:
    print(indicator)
    starter_data = starter_data.join(add_indicator_column(indicator))

starter_data.sort_index(inplace=True)

# How each indicator's "2000-2015" row is computed:
# indicator -> (year whose value is used, baseline year subtracted from it, or None).
summary_years = {
    'Renewable energy consumption (% of total final energy consumption)': ("2014", "2000"),
    'Household final consumption expenditure per capita (constant 2010 US$)': ("2015", None),
    'Merchandise imports (current US$)': ("2015", "2000"),
    'Industry, value added (constant 2010 US$)': ("2015", "2000"),
    'Access to electricity (% of population)': ("2014", None),
    'Urban population (% of total)': ("2015", None),
    'Employment to population ratio, 15+, total (%) (modeled ILO estimate)': ("2015", "2000"),
    'Total natural resources rents (% of GDP)': ("2015", None),
    'Life expectancy at birth, total (years)': ("2015", None),
    'Net migration': ("2012", None),
    'Proportion of seats held by women in national parliaments (%)': ("2015", None),
    'Individuals using the Internet (% of population)': ("2015", None),
}

for indicator in indicators:
    tmp = wb_data_joined.loc[indicator].set_index("Country Code").astype(float)

    if indicator not in summary_years:
        continue
    end_year, base_year = summary_years[indicator]

    for country in countries:
        val = tmp.loc[country, end_year]
        if base_year is not None:
            val = val - tmp.loc[country, base_year]
        # Writing to a not-yet-existing (country, "2000-2015") label appends the row.
        starter_data.loc[(country, "2000-2015"), column_name[indicator]] = val
        

Household final consumption expenditure per capita (constant 2010 US$)
Merchandise imports (current US$)
Industry, value added (constant 2010 US$)
Access to electricity (% of population)
Urban population (% of total)
Employment to population ratio, 15+, total (%) (modeled ILO estimate)
Total natural resources rents (% of GDP)
Life expectancy at birth, total (years)
Net migration
Proportion of seats held by women in national parliaments (%)
Individuals using the Internet (% of population)


In [147]:
# Export the combined table, with the index levels written as ordinary columns.
starter_data.reset_index().to_csv("/Users/nathansuberi/Desktop/RW_Data/World Bank Data from 1999 - 2015 9-28-17 with 2000-2015 values.csv")

In [146]:
# Display the combined table.
starter_data

renewable_energy_consumption_of_total_final_energy_consumpti     -2.382071e+01
household_final_consumption_expenditure_per_capita_constant_20             NaN
merchandise_imports_current_us_tm_val_mrch_cd_wt                  1.765254e+10
industry_value_added_constant_2010_us_nv_ind_totl_kd                       NaN
access_to_electricity_of_population_eg_elc_accs_zs                3.200000e+01
urban_population_of_total_sp_urb_totl_in_zs                       4.405000e+01
employment_to_population_ratio_15_total_modeled_ilo_est          -1.381004e+00
total_natural_resources_rents_of_gdp_ny_gdp_totl_rt_zs            1.125142e+01
life_expectancy_at_birth_total_years_sp_dyn_le00_in               6.118934e+01
net_migration_sm_pop_netm                                         8.732200e+04
proportion_of_seats_held_by_women_in_national_parliaments         3.680000e+01
individuals_using_the_internet_of_population_it_net_user_z        1.240000e+01
Name: (AGO, 2000-2015), dtype: float64

In [None]:
sheet_names = wb_data_from_nate.sheet_names

# Columns kept from every sheet. The list does not change between sheets, so
# it is built once instead of on every loop iteration.
columns = ["Series Name","Series Code", "Country Name", "Country Code",'1999 [YR1999]', '2000 [YR2000]', '2001 [YR2001]',
           '2002 [YR2002]','2003 [YR2003]','2004 [YR2004]','2005 [YR2005]',
           '2006 [YR2006]', '2007 [YR2007]', '2008 [YR2008]', '2009 [YR2009]',
           '2010 [YR2010]', '2011 [YR2011]', '2012 [YR2012]', '2013 [YR2013]',
           '2014 [YR2014]', '2015 [YR2015]']

# Sheet name -> that sheet's table keyed by country code; the "index" sheet is skipped.
wb_dataframes = {}
for name in sheet_names:
    print(name)
    if name != "index":
        data = wb_data_from_nate.parse(name)
        data = data.loc[:, columns].set_index("Country Code")
        # Discard rows without a country code. Filtering on the index gives the
        # same result as drop(np.nan) but does not raise KeyError when a sheet
        # has no such rows.
        wb_dataframes[name] = data.loc[data.index.notnull()]

In [None]:
# Display the table parsed from the 'internet use' sheet.
wb_dataframes['internet use']

In [None]:
def add_column(col_name):
    """Reshape the sheet stored under `col_name` into one long column.

    Looks the sheet up in `wb_dataframes`, keeps the index labels that are
    exactly three characters long (and are not the text "nan"), and reads one
    value per (year label, country) pair, countries outermost.

    Returns a one-column DataFrame named `col_name` indexed by a
    (year, country_code) MultiIndex.
    """
    sheet = wb_dataframes[col_name]

    year_labels = [
        '1999 [YR1999]', '2000 [YR2000]', '2001 [YR2001]', '2002 [YR2002]',
        '2003 [YR2003]', '2004 [YR2004]', '2005 [YR2005]', '2006 [YR2006]',
        '2007 [YR2007]', '2008 [YR2008]', '2009 [YR2009]', '2010 [YR2010]',
        '2011 [YR2011]', '2012 [YR2012]', '2013 [YR2013]', '2014 [YR2014]',
        '2015 [YR2015]',
    ]

    # "nan" is itself three characters long, hence the explicit exclusion.
    valid_codes = [
        code for code in sheet.index.values
        if len(str(code)) == 3 and str(code) != "nan"
    ]

    index_pairs = [(label, code) for code in valid_codes for label in year_labels]
    cell_values = [sheet.loc[code, label] for label, code in index_pairs]

    long_index = pd.MultiIndex.from_tuples(index_pairs, names=["year", "country_code"])

    reshaped = pd.DataFrame(cell_values, columns=[col_name])
    reshaped.index = long_index
    return reshaped

In [None]:
# Sheet used to seed the combined table, followed by the sheets joined onto it in order.
starter = "renewable energy share 2000-14"
column_names = [
    "household expenditure 2015",
    "merchandise imports 2000-15",
    "imports of goods and services",
    "industry value added 2000-15",
    "electricity access 2014",
    "urbanization rate",
    "employment ratio",
    "natural resource rents",
    "life expectancy",
    "net migration",
    "women in parliament",
    "internet use",
]

starter_data = add_column(starter)

for column in column_names:
    print(column)
    starter_data = starter_data.join(add_column(column))


In [None]:
# Export the sheet-based table: index levels become columns and ".." entries
# become NaN before writing. The relative path resolves against the kernel's
# working directory.
starter_data.reset_index().replace("..", np.nan).to_csv("World Bank Data from 2000 - 2015.csv")