# Import Libraries

In [1]:
# Data fetching library
import requests as req
# used below: 'res' stands for 'response'

# File management library
import os

# Configure logging library
import logging
import sys
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

# Data manipulation libraries
import pandas as pd
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
import json
from datetime import datetime

# Data analysis libraries
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Uploading results to Carto
import cartoframes

# Data visualization library
## Uses Vega-Lite, which can be easily put in websites
from vega3 import Vega

# Load Data from RW API

In [2]:
# Base URL for getting dataset metadata from RW API
url = "https://api.resourcewatch.org/v1/dataset?sort=slug,-provider,userId&status=saved&published=true&includes=metadata,vocabulary,widget,layer"

# page[size] tells the API the maximum number of results to send back
# There are currently between 200 and 300 datasets on the RW API
payload = { "application":"rw", "page[size]": 1000}

# Request all datasets, and extract the data from the response
res = req.get(url, params=payload)
data = res.json()["data"]

### Convert the json object returned by the API into a pandas DataFrame
# Another option: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html
datasets_on_api = {}
for ix, dset in enumerate(data):
    atts = dset["attributes"]
    metadata = atts["metadata"]
    layers = atts["layer"]
    widgets = atts["widget"]
    tags = atts["vocabulary"]
    datasets_on_api[dset["id"]] = {
        "name":atts["name"],
        "table_name":atts["tableName"],
        "provider":atts["provider"],
        "date_updated":atts["updatedAt"],
        "num_metadata":len(metadata),
        "metadata": metadata,
        "num_layers":len(layers),
        "layers": layers,
        "num_widgets":len(widgets),
        "widgets": widgets,
        "num_tags":len(tags),
        "tags":tags
    }

# Create the DataFrame, name the index, and sort by date_updated
# More recently updated datasets at the top
current_datasets_on_api = pd.DataFrame.from_dict(datasets_on_api, orient='index')
current_datasets_on_api.index.rename("Dataset", inplace=True)
current_datasets_on_api.sort_values(by=["date_updated"], inplace=True, ascending = False)

# Select all Carto datasets on the API:
provider = "cartodb"
carto_ids = (current_datasets_on_api["provider"]==provider)
carto_data = current_datasets_on_api.loc[carto_ids]

logging.info("Number of Carto datasets: " + str(carto_data.shape[0]))

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org
DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/dataset?sort=slug,-provider,userId&status=saved&published=true&includes=metadata,vocabulary,widget,layer&application=rw&page%5Bsize%5D=1000 HTTP/1.1" 200 604442
INFO:root:Number of Carto datasets: 139


# Find data sets to run the analysis with

In [3]:
# Select only national level data sets

def pick_spatial_resolution(mdatas):
    for mdata in mdatas:
        if mdata['attributes']['language'] == 'en':
            return mdata['attributes']['info'].get('spatial_resolution', None)

def pick_temporal_resolution(mdatas):
    for mdata in mdatas:
        if mdata['attributes']['language'] == 'en':
            return mdata['attributes']['info'].get('frequency_of_updates', None)
        
carto_data.loc[:,'spatial_resolution'] = list(map(pick_spatial_resolution, carto_data['metadata']))
carto_data.loc[:,'temporal_resolution'] = list(map(pick_temporal_resolution, carto_data['metadata']))

national_datasets = carto_data[carto_data['spatial_resolution'] == 'National']
annual_national_datasets = national_datasets[national_datasets['temporal_resolution'] == 'Annual']

logging.info("{} annual, national level data sets".format(annual_national_datasets.shape[0]))
annual_national_datasets.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
INFO:root:32 annual, national level data sets


(32, 14)

In [4]:
# Exclude MaterialFlows as this is handled in another notebook
annual_national_datasets.drop('082e2262-c58e-46a0-b6b7-56083cfcbd34', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Find which data sets cover the right years and are "simple"

### Find year column

In [5]:
def inquire_columns(info):
    ds_id, table_name = info
    print(ds_id)
    # Template query string used to query RW datasets
    q = "https://api.resourcewatch.org/v1/query/{}?sql=SELECT * FROM {} LIMIT 1".format(ds_id,table_name)
    try:
        res = req.get(q).json()['data'][0]
        keys = res.keys()
        types = [type(val) for key, val in res.items()]
        return (keys, types)
    except:
        return (None, None)
    
# Collect column names
annual_national_datasets.loc[:, 'column_names'], annual_national_datasets.loc[:, 'column_types'] = zip(*list(map(inquire_columns, zip(annual_national_datasets.index,annual_national_datasets['table_name']))))

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


a7067e9f-fe40-4338-85da-13a6071c76fe


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/a7067e9f-fe40-4338-85da-13a6071c76fe?sql=SELECT%20*%20FROM%20soc_005_political_rights_civil_liberties_index%20LIMIT%201 HTTP/1.1" 200 97382
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


03bfb30e-829f-4299-bab9-b2be1b66b5d4


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/03bfb30e-829f-4299-bab9-b2be1b66b5d4?sql=SELECT%20*%20FROM%20for_020_forest_employment_gdp_edit%20LIMIT%201 HTTP/1.1" 200 448
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


62c988a7-1e4d-418e-87bf-a743e24209e8


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/62c988a7-1e4d-418e-87bf-a743e24209e8?sql=SELECT%20*%20FROM%20com_028_effect_of_ag_prices_on_commodity_prices%20LIMIT%201 HTTP/1.1" 200 318674
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


cc354f7f-2622-44cb-91c1-73559373de72


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/cc354f7f-2622-44cb-91c1-73559373de72?sql=SELECT%20*%20FROM%20soc_074_employment_in_agriculture%20LIMIT%201 HTTP/1.1" 200 322
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


c665f519-eef9-4f67-a8bf-7e3e6dc8bfcd


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/c665f519-eef9-4f67-a8bf-7e3e6dc8bfcd?sql=SELECT%20*%20FROM%20ene_028_access_clean_cooking_fuels%20LIMIT%201 HTTP/1.1" 200 402
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


d3a6b89f-cf5c-40cf-b2b3-ac1c8315c648


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/d3a6b89f-cf5c-40cf-b2b3-ac1c8315c648?sql=SELECT%20*%20FROM%20%20soc_023_fragile_states_index%20LIMIT%201 HTTP/1.1" 200 326
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


1b97e47e-ca18-4e50-9aae-a2853acca3f0


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/1b97e47e-ca18-4e50-9aae-a2853acca3f0?sql=SELECT%20*%20FROM%20wat_005_improved_water_access%20LIMIT%201 HTTP/1.1" 200 345
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


6e10074a-a368-4afd-8564-db59814cdb74


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/6e10074a-a368-4afd-8564-db59814cdb74?sql=SELECT%20*%20FROM%20ene_029_gtf_primary_energy_intensity_data%20LIMIT%201 HTTP/1.1" 200 409
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


10337db6-8321-445e-a60b-28fc1e114f29


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/10337db6-8321-445e-a60b-28fc1e114f29?sql=SELECT%20*%20FROM%20cit_029_municipal_waste%20LIMIT%201 HTTP/1.1" 200 332
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


d446a52e-c4c1-4e74-ae30-3204620a0365


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/d446a52e-c4c1-4e74-ae30-3204620a0365?sql=SELECT%20*%20FROM%20ene_012_electricity_access%20LIMIT%201 HTTP/1.1" 200 337
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


52c55378-0484-48c3-92fc-3ee94d21c716


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/52c55378-0484-48c3-92fc-3ee94d21c716?sql=SELECT%20*%20FROM%20com_010_gdp_ppp_usd%20LIMIT%201 HTTP/1.1" 200 378
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


fa6443ff-eb95-4d0f-84d2-f0c91682efdf


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/fa6443ff-eb95-4d0f-84d2-f0c91682efdf?sql=SELECT%20*%20FROM%20cli_029_vulnerability_to_cc%20LIMIT%201 HTTP/1.1" 200 330
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


a290675c-9528-4a51-8201-f6c2d7848744


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/a290675c-9528-4a51-8201-f6c2d7848744?sql=SELECT%20*%20FROM%20cli_008_greenhouse_gas_emissions_country_sector_edit_1%20LIMIT%201 HTTP/1.1" 200 531
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


c18a38cd-94ff-48cd-818f-6ffb05992abb


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/c18a38cd-94ff-48cd-818f-6ffb05992abb?sql=SELECT%20*%20FROM%20com_015_recycling_rates%20LIMIT%201 HTTP/1.1" 200 328
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


fe311144-8c0e-4440-b068-6efd057e0f6a


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/fe311144-8c0e-4440-b068-6efd057e0f6a?sql=SELECT%20*%20FROM%20com_007_fdi_index%20LIMIT%201 HTTP/1.1" 200 25142
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


37d04efc-0ab2-4499-a891-54dca1013c74


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/37d04efc-0ab2-4499-a891-54dca1013c74?sql=SELECT%20*%20FROM%20soc_040_improved_sanitation%20LIMIT%201 HTTP/1.1" 200 347
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


b37048be-9b23-4458-a047-888956c69aa1


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/b37048be-9b23-4458-a047-888956c69aa1?sql=SELECT%20*%20FROM%20soc_039_out_of_school_rate_edit%20LIMIT%201 HTTP/1.1" 200 331
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


8671f536-1979-4b6f-a147-70152fcb44ed


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/8671f536-1979-4b6f-a147-70152fcb44ed?sql=SELECT%20*%20FROM%20soc_036_life_expectancy_at_birth%20LIMIT%201 HTTP/1.1" 200 395
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


00abb46f-34e2-4bf7-be30-1fb0b1de022f


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/00abb46f-34e2-4bf7-be30-1fb0b1de022f?sql=SELECT%20*%20FROM%20soc_008_gdp_per_capita_edit%20LIMIT%201 HTTP/1.1" 200 393
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


a89c95c7-0b82-4162-b9d8-cc0205e9f7ec


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/a89c95c7-0b82-4162-b9d8-cc0205e9f7ec?sql=SELECT%20*%20FROM%20soc_006_multidimensional_poverty_index%20LIMIT%201 HTTP/1.1" 200 343
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


bea122ce-1e4b-465d-8b7b-fa11aadd20f7


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/bea122ce-1e4b-465d-8b7b-fa11aadd20f7?sql=SELECT%20*%20FROM%20soc_004_human_development_index%20LIMIT%201 HTTP/1.1" 200 331
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


95b013a3-389a-4367-83b7-c9d68c28c406


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/95b013a3-389a-4367-83b7-c9d68c28c406?sql=SELECT%20*%20FROM%20foo_43_agriculture_value_added%20LIMIT%201 HTTP/1.1" 200 378
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


d38d0d5c-31b1-47f4-9d2e-d8fba4c7d083


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/d38d0d5c-31b1-47f4-9d2e-d8fba4c7d083?sql=SELECT%20*%20FROM%20cit_025_urban_population%20LIMIT%201 HTTP/1.1" 200 374
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


2e31a1f3-576b-46b4-84f0-3f0cc399f887


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/2e31a1f3-576b-46b4-84f0-3f0cc399f887?sql=SELECT%20*%20FROM%20com_006_national_current_accounts_edit%20LIMIT%201 HTTP/1.1" 200 387
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


11278cb6-b298-49a1-bf71-f1e269f40758


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/11278cb6-b298-49a1-bf71-f1e269f40758?sql=SELECT%20*%20FROM%20soc_025_gender_inequality_index%20LIMIT%201 HTTP/1.1" 200 331
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


7a551dd8-b59c-4f59-9d50-c92cb61c5799


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/7a551dd8-b59c-4f59-9d50-c92cb61c5799?sql=SELECT%20*%20FROM%20foo_042_agricultural_production_value%20LIMIT%201 HTTP/1.1" 200 23724
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


ccfb322a-20aa-4132-b58b-0f76acec8f5a


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/ccfb322a-20aa-4132-b58b-0f76acec8f5a?sql=SELECT%20*%20FROM%20foo_041_agricultural_emissions%20LIMIT%201 HTTP/1.1" 200 67711
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


1d6c6bcc-7787-4b9c-84bb-25a185c124dc


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/1d6c6bcc-7787-4b9c-84bb-25a185c124dc?sql=SELECT%20*%20FROM%20ene_020_clean_energy_investment_potential%20LIMIT%201 HTTP/1.1" 200 146675
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


8bc79a36-d77e-4ee3-b9bc-c77146cfc503


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/8bc79a36-d77e-4ee3-b9bc-c77146cfc503?sql=SELECT%20*%20FROM%20foo_019_child_malnutrition_edit%20LIMIT%201 HTTP/1.1" 200 183029
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


0be2ce12-79b3-434b-b557-d6ea92d787fe


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/0be2ce12-79b3-434b-b557-d6ea92d787fe?sql=SELECT%20*%20FROM%20soc_026_gender_gap_index_1%20LIMIT%201 HTTP/1.1" 200 452
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


e7582657-9c16-4eb1-89e8-0211d94015c6


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/e7582657-9c16-4eb1-89e8-0211d94015c6?sql=SELECT%20*%20FROM%20soc_021_environmental_performance_index%20LIMIT%201 HTTP/1.1" 200 837
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [6]:
ix = 3
print(annual_national_datasets.iloc[ix]['column_names'])
print(annual_national_datasets.iloc[ix]['column_types'])

dict_keys(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'country_name', 'country_code', 'rw_country_code', 'rw_country_name', 'datetime', 'yr_data'])
[<class 'int'>, <class 'NoneType'>, <class 'NoneType'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'NoneType'>]


In [7]:
# Making the choice to only accept datetime and year as column names.
# Datetimes will be string timestamps, years will be ints
def pick_year_col(column_names):
    print("Available column names: {}".format(column_names))
    if 'datetime' in column_names:
        return 'datetime'
    elif 'year' in column_names:
        return 'year'
    else:
        return None
        #year_col = input("What is the year column called? Type nothing for None")
        #return year_col if year_col else None

annual_national_datasets.loc[:, 'year_col'] = list(map(pick_year_col, annual_national_datasets['column_names']))

Available column names: dict_keys(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'a_aggr', 'add_a', 'add_b', 'b_aggr', 'c_aggr', 'cartodb_georef_status', 'cl_aggr', 'cl_rating', 'country', 'd_aggr', 'e_aggr', 'f_aggr', 'g_aggr', 'index', 'pr_aggr', 'pr_rating', 'rw_country_code', 'rw_country_name', 'status', 'total_aggr', 'status_text'])
What is the year column called? Type nothing for None
Available column names: dict_keys(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'country', 'employment_of_total_labour_force', 'employment_pulp_and_paper_1000', 'employment_roundwood_production_1000', 'employment_total_for_the_forest_sector_1000', 'employment_wood_processing_1000', 'gross_value_added_contribution_to_gdp', 'gross_value_added_pulp_and_paper_us_million', 'gross_value_added_roundwood_production_us_million', 'gross_value_added_total_for_the_forest_sector_us_million', 'gross_value_added_wood_processing_us_million', 'year', 'rw_country_code', 'rw_country_name'])
Available column n

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [8]:
annual_national_datasets['year_col']

Dataset
a7067e9f-fe40-4338-85da-13a6071c76fe        None
03bfb30e-829f-4299-bab9-b2be1b66b5d4        year
62c988a7-1e4d-418e-87bf-a743e24209e8        year
cc354f7f-2622-44cb-91c1-73559373de72    datetime
c665f519-eef9-4f67-a8bf-7e3e6dc8bfcd    datetime
d3a6b89f-cf5c-40cf-b2b3-ac1c8315c648    datetime
1b97e47e-ca18-4e50-9aae-a2853acca3f0    datetime
6e10074a-a368-4afd-8564-db59814cdb74    datetime
10337db6-8321-445e-a60b-28fc1e114f29    datetime
d446a52e-c4c1-4e74-ae30-3204620a0365    datetime
52c55378-0484-48c3-92fc-3ee94d21c716    datetime
fa6443ff-eb95-4d0f-84d2-f0c91682efdf    datetime
a290675c-9528-4a51-8201-f6c2d7848744        year
c18a38cd-94ff-48cd-818f-6ffb05992abb    datetime
fe311144-8c0e-4440-b068-6efd057e0f6a        None
37d04efc-0ab2-4499-a891-54dca1013c74    datetime
b37048be-9b23-4458-a047-888956c69aa1    datetime
8671f536-1979-4b6f-a147-70152fcb44ed    datetime
00abb46f-34e2-4bf7-be30-1fb0b1de022f    datetime
a89c95c7-0b82-4162-b9d8-cc0205e9f7ec    datetime
bea122ce-1e4

### Find dates covered by each dataset

In [9]:
def try_datetime_conversion_to_year(possible_dt):
    DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
    try:
        return datetime.strptime(possible_dt, DATE_FORMAT).year
    except:
        return int(possible_dt)

def find_years_covered(info):
    ds_id, year_col, table_name = info
    if year_col:
        print(ds_id)
        # Template query string used to query RW datasets
        q = "https://api.resourcewatch.org/v1/query/{}?sql=SELECT {} FROM {}".format(ds_id,year_col,table_name)
        print(q)
        try:
            res = pd.DataFrame(req.get(q).json()['data'])
            unique_years = res[year_col].unique()
            print(unique_years)
            int_years = list(map(try_datetime_conversion_to_year, unique_years))
            return sorted(int_years)
        except:
            return None
    else:
        return None
    
annual_national_datasets.loc[:, 'years_covered'] = list(map(find_years_covered, zip(annual_national_datasets.index, annual_national_datasets['year_col'], annual_national_datasets['table_name'])))

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


03bfb30e-829f-4299-bab9-b2be1b66b5d4
https://api.resourcewatch.org/v1/query/03bfb30e-829f-4299-bab9-b2be1b66b5d4?sql=SELECT year FROM for_020_forest_employment_gdp_edit


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/03bfb30e-829f-4299-bab9-b2be1b66b5d4?sql=SELECT%20year%20FROM%20for_020_forest_employment_gdp_edit HTTP/1.1" 200 253
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


[None]
62c988a7-1e4d-418e-87bf-a743e24209e8
https://api.resourcewatch.org/v1/query/62c988a7-1e4d-418e-87bf-a743e24209e8?sql=SELECT year FROM com_028_effect_of_ag_prices_on_commodity_prices


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/62c988a7-1e4d-418e-87bf-a743e24209e8?sql=SELECT%20year%20FROM%20com_028_effect_of_ag_prices_on_commodity_prices HTTP/1.1" 200 296
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


[2015 2012 2014 2011 2013 2009 2010]
cc354f7f-2622-44cb-91c1-73559373de72
https://api.resourcewatch.org/v1/query/cc354f7f-2622-44cb-91c1-73559373de72?sql=SELECT datetime FROM soc_074_employment_in_agriculture


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/cc354f7f-2622-44cb-91c1-73559373de72?sql=SELECT%20datetime%20FROM%20soc_074_employment_in_agriculture HTTP/1.1" 200 1178
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1991-01-01T00:00:00Z' '1992-01-01T00:00:00Z' '1995-01-01T00:00:00Z'
 '1996-01-01T00:00:00Z' '1997-01-01T00:00:00Z' '1998-01-01T00:00:00Z'
 '1999-01-01T00:00:00Z' '2000-01-01T00:00:00Z' '2002-01-01T00:00:00Z'
 '2005-01-01T00:00:00Z' '2009-01-01T00:00:00Z' '2011-01-01T00:00:00Z'
 '2012-01-01T00:00:00Z' '2014-01-01T00:00:00Z' '2015-01-01T00:00:00Z'
 '2017-01-01T00:00:00Z' '1993-01-01T00:00:00Z' '1994-01-01T00:00:00Z'
 '2001-01-01T00:00:00Z' '2003-01-01T00:00:00Z' '2004-01-01T00:00:00Z'
 '2006-01-01T00:00:00Z' '2007-01-01T00:00:00Z' '2008-01-01T00:00:00Z'
 '2010-01-01T00:00:00Z' '2013-01-01T00:00:00Z' '2016-01-01T00:00:00Z']
c665f519-eef9-4f67-a8bf-7e3e6dc8bfcd
https://api.resourcewatch.org/v1/query/c665f519-eef9-4f67-a8bf-7e3e6dc8bfcd?sql=SELECT datetime FROM ene_028_access_clean_cooking_fuels


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/c665f519-eef9-4f67-a8bf-7e3e6dc8bfcd?sql=SELECT%20datetime%20FROM%20ene_028_access_clean_cooking_fuels HTTP/1.1" 200 508
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['2011-01-01T00:00:00Z' '2012-01-01T00:00:00Z' '2013-01-01T00:00:00Z'
 '2000-01-01T00:00:00Z' '2007-01-01T00:00:00Z' '2008-01-01T00:00:00Z'
 '2009-01-01T00:00:00Z' '2010-01-01T00:00:00Z' '2014-01-01T00:00:00Z']
d3a6b89f-cf5c-40cf-b2b3-ac1c8315c648
https://api.resourcewatch.org/v1/query/d3a6b89f-cf5c-40cf-b2b3-ac1c8315c648?sql=SELECT datetime FROM %20soc_023_fragile_states_index


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/d3a6b89f-cf5c-40cf-b2b3-ac1c8315c648?sql=SELECT%20datetime%20FROM%20%20soc_023_fragile_states_index HTTP/1.1" 200 415
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


[2006 2011 2014 2016 2007 2008 2009 2010 2012 2013 2015 2017]
1b97e47e-ca18-4e50-9aae-a2853acca3f0
https://api.resourcewatch.org/v1/query/1b97e47e-ca18-4e50-9aae-a2853acca3f0?sql=SELECT datetime FROM wat_005_improved_water_access


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/1b97e47e-ca18-4e50-9aae-a2853acca3f0?sql=SELECT%20datetime%20FROM%20wat_005_improved_water_access HTTP/1.1" 200 894
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1990-01-01T00:00:00Z' '2000-01-01T00:00:00Z' '2009-01-01T00:00:00Z'
 '2010-01-01T00:00:00Z' '2014-01-01T00:00:00Z' '1991-01-01T00:00:00Z'
 '1992-01-01T00:00:00Z' '1993-01-01T00:00:00Z' '1994-01-01T00:00:00Z'
 '1995-01-01T00:00:00Z' '1996-01-01T00:00:00Z' '1997-01-01T00:00:00Z'
 '1998-01-01T00:00:00Z' '1999-01-01T00:00:00Z' '2001-01-01T00:00:00Z'
 '2002-01-01T00:00:00Z' '2003-01-01T00:00:00Z' '2004-01-01T00:00:00Z'
 '2005-01-01T00:00:00Z' '2006-01-01T00:00:00Z' '2007-01-01T00:00:00Z'
 '2008-01-01T00:00:00Z' '2011-01-01T00:00:00Z' '2012-01-01T00:00:00Z'
 '2013-01-01T00:00:00Z' '2015-01-01T00:00:00Z']
6e10074a-a368-4afd-8564-db59814cdb74
https://api.resourcewatch.org/v1/query/6e10074a-a368-4afd-8564-db59814cdb74?sql=SELECT datetime FROM ene_029_gtf_primary_energy_intensity_data


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/6e10074a-a368-4afd-8564-db59814cdb74?sql=SELECT%20datetime%20FROM%20ene_029_gtf_primary_energy_intensity_data HTTP/1.1" 200 1002
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1990-01-01T00:00:00Z' '1997-01-01T00:00:00Z' '2002-01-01T00:00:00Z'
 '2006-01-01T00:00:00Z' '2008-01-01T00:00:00Z' '1991-01-01T00:00:00Z'
 '1992-01-01T00:00:00Z' '1993-01-01T00:00:00Z' '1994-01-01T00:00:00Z'
 '1995-01-01T00:00:00Z' '1996-01-01T00:00:00Z' '1998-01-01T00:00:00Z'
 '1999-01-01T00:00:00Z' '2000-01-01T00:00:00Z' '2001-01-01T00:00:00Z'
 '2004-01-01T00:00:00Z' '2003-01-01T00:00:00Z' '2005-01-01T00:00:00Z'
 '2009-01-01T00:00:00Z' '2007-01-01T00:00:00Z' '2010-01-01T00:00:00Z'
 '2012-01-01T00:00:00Z' '2011-01-01T00:00:00Z' '2013-01-01T00:00:00Z'
 '2014-01-01T00:00:00Z']
10337db6-8321-445e-a60b-28fc1e114f29
https://api.resourcewatch.org/v1/query/10337db6-8321-445e-a60b-28fc1e114f29?sql=SELECT datetime FROM cit_029_municipal_waste


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/10337db6-8321-445e-a60b-28fc1e114f29?sql=SELECT%20datetime%20FROM%20cit_029_municipal_waste HTTP/1.1" 200 476
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1990-01-01T00:00:00Z' '1997-01-01T00:00:00Z' '1991-01-01T00:00:00Z'
 '2014-01-01T00:00:00Z' '1992-01-01T00:00:00Z' '1993-01-01T00:00:00Z'
 '1994-01-01T00:00:00Z' '1995-01-01T00:00:00Z' '1996-01-01T00:00:00Z'
 '1998-01-01T00:00:00Z' '1999-01-01T00:00:00Z' '2000-01-01T00:00:00Z'
 '2001-01-01T00:00:00Z' '2002-01-01T00:00:00Z' '2003-01-01T00:00:00Z'
 '2004-01-01T00:00:00Z' '2005-01-01T00:00:00Z' '2006-01-01T00:00:00Z'
 '2007-01-01T00:00:00Z' '2008-01-01T00:00:00Z' '2009-01-01T00:00:00Z'
 '2010-01-01T00:00:00Z' '2011-01-01T00:00:00Z' '2012-01-01T00:00:00Z'
 '2013-01-01T00:00:00Z' '2015-01-01T00:00:00Z']
d446a52e-c4c1-4e74-ae30-3204620a0365
https://api.resourcewatch.org/v1/query/d446a52e-c4c1-4e74-ae30-3204620a0365?sql=SELECT datetime FROM ene_012_electricity_access


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/d446a52e-c4c1-4e74-ae30-3204620a0365?sql=SELECT%20datetime%20FROM%20ene_012_electricity_access HTTP/1.1" 200 1098
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1990-01-01T00:00:00Z' '1993-01-01T00:00:00Z' '1995-01-01T00:00:00Z'
 '1996-01-01T00:00:00Z' '1997-01-01T00:00:00Z' '1998-01-01T00:00:00Z'
 '1999-01-01T00:00:00Z' '2003-01-01T00:00:00Z' '2006-01-01T00:00:00Z'
 '2010-01-01T00:00:00Z' '2011-01-01T00:00:00Z' '2013-01-01T00:00:00Z'
 '2001-01-01T00:00:00Z' '2008-01-01T00:00:00Z' '1992-01-01T00:00:00Z'
 '2000-01-01T00:00:00Z' '2002-01-01T00:00:00Z' '2007-01-01T00:00:00Z'
 '1991-01-01T00:00:00Z' '1994-01-01T00:00:00Z' '2005-01-01T00:00:00Z'
 '2009-01-01T00:00:00Z' '2004-01-01T00:00:00Z' '2014-01-01T00:00:00Z'
 '2012-01-01T00:00:00Z']
52c55378-0484-48c3-92fc-3ee94d21c716
https://api.resourcewatch.org/v1/query/52c55378-0484-48c3-92fc-3ee94d21c716?sql=SELECT datetime FROM com_010_gdp_ppp_usd


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/52c55378-0484-48c3-92fc-3ee94d21c716?sql=SELECT%20datetime%20FROM%20com_010_gdp_ppp_usd HTTP/1.1" 200 995
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1992-01-01T00:00:00Z' '1993-01-01T00:00:00Z' '1996-01-01T00:00:00Z'
 '2005-01-01T00:00:00Z' '2007-01-01T00:00:00Z' '2009-01-01T00:00:00Z'
 '2013-01-01T00:00:00Z' '2015-01-01T00:00:00Z' '1990-01-01T00:00:00Z'
 '1995-01-01T00:00:00Z' '1998-01-01T00:00:00Z' '2001-01-01T00:00:00Z'
 '2003-01-01T00:00:00Z' '2004-01-01T00:00:00Z' '2006-01-01T00:00:00Z'
 '2008-01-01T00:00:00Z' '2010-01-01T00:00:00Z' '2011-01-01T00:00:00Z'
 '2014-01-01T00:00:00Z' '2016-01-01T00:00:00Z' '1991-01-01T00:00:00Z'
 '1994-01-01T00:00:00Z' '1997-01-01T00:00:00Z' '1999-01-01T00:00:00Z'
 '2000-01-01T00:00:00Z' '2002-01-01T00:00:00Z' '2012-01-01T00:00:00Z']
fa6443ff-eb95-4d0f-84d2-f0c91682efdf
https://api.resourcewatch.org/v1/query/fa6443ff-eb95-4d0f-84d2-f0c91682efdf?sql=SELECT datetime FROM cli_029_vulnerability_to_cc


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/fa6443ff-eb95-4d0f-84d2-f0c91682efdf?sql=SELECT%20datetime%20FROM%20cli_029_vulnerability_to_cc HTTP/1.1" 200 870
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1995-01-01T00:00:00Z' '1997-01-01T00:00:00Z' '1998-01-01T00:00:00Z'
 '1999-01-01T00:00:00Z' '2000-01-01T00:00:00Z' '2001-01-01T00:00:00Z'
 '2002-01-01T00:00:00Z' '2003-01-01T00:00:00Z' '2004-01-01T00:00:00Z'
 '2007-01-01T00:00:00Z' '2008-01-01T00:00:00Z' '2011-01-01T00:00:00Z'
 '2012-01-01T00:00:00Z' '2013-01-01T00:00:00Z' '1996-01-01T00:00:00Z'
 '2005-01-01T00:00:00Z' '2006-01-01T00:00:00Z' '2010-01-01T00:00:00Z'
 '2014-01-01T00:00:00Z' '2009-01-01T00:00:00Z' '2015-01-01T00:00:00Z']
a290675c-9528-4a51-8201-f6c2d7848744
https://api.resourcewatch.org/v1/query/a290675c-9528-4a51-8201-f6c2d7848744?sql=SELECT year FROM cli_008_greenhouse_gas_emissions_country_sector_edit_1


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/a290675c-9528-4a51-8201-f6c2d7848744?sql=SELECT%20year%20FROM%20cli_008_greenhouse_gas_emissions_country_sector_edit_1 HTTP/1.1" 200 922
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


[1990 1992 1999 2014 2002 2001 2000 1991 2003 1996 2008 2010 1993 1994 1995
 1997 1998 2004 2005 2006 2007 2009 2011 2012 2013]
c18a38cd-94ff-48cd-818f-6ffb05992abb
https://api.resourcewatch.org/v1/query/c18a38cd-94ff-48cd-818f-6ffb05992abb?sql=SELECT datetime FROM com_015_recycling_rates


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/c18a38cd-94ff-48cd-818f-6ffb05992abb?sql=SELECT%20datetime%20FROM%20com_015_recycling_rates HTTP/1.1" 200 465
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1992-01-01T00:00:00Z' '1998-01-01T00:00:00Z' '1990-01-01T00:00:00Z'
 '1991-01-01T00:00:00Z' '1993-01-01T00:00:00Z' '1994-01-01T00:00:00Z'
 '1995-01-01T00:00:00Z' '1996-01-01T00:00:00Z' '1997-01-01T00:00:00Z'
 '1999-01-01T00:00:00Z' '2000-01-01T00:00:00Z' '2001-01-01T00:00:00Z'
 '2002-01-01T00:00:00Z' '2003-01-01T00:00:00Z' '2004-01-01T00:00:00Z'
 '2005-01-01T00:00:00Z' '2006-01-01T00:00:00Z' '2007-01-01T00:00:00Z'
 '2008-01-01T00:00:00Z' '2009-01-01T00:00:00Z' '2010-01-01T00:00:00Z'
 '2011-01-01T00:00:00Z' '2012-01-01T00:00:00Z' '2013-01-01T00:00:00Z'
 '2014-01-01T00:00:00Z' '2015-01-01T00:00:00Z']
37d04efc-0ab2-4499-a891-54dca1013c74
https://api.resourcewatch.org/v1/query/37d04efc-0ab2-4499-a891-54dca1013c74?sql=SELECT datetime FROM soc_040_improved_sanitation


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/37d04efc-0ab2-4499-a891-54dca1013c74?sql=SELECT%20datetime%20FROM%20soc_040_improved_sanitation HTTP/1.1" 200 927
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1992-01-01T00:00:00Z' '1996-01-01T00:00:00Z' '2009-01-01T00:00:00Z'
 '2011-01-01T00:00:00Z' '2012-01-01T00:00:00Z' '1990-01-01T00:00:00Z'
 '2015-01-01T00:00:00Z' '1991-01-01T00:00:00Z' '1993-01-01T00:00:00Z'
 '1994-01-01T00:00:00Z' '1995-01-01T00:00:00Z' '1997-01-01T00:00:00Z'
 '1998-01-01T00:00:00Z' '1999-01-01T00:00:00Z' '2000-01-01T00:00:00Z'
 '2001-01-01T00:00:00Z' '2002-01-01T00:00:00Z' '2003-01-01T00:00:00Z'
 '2004-01-01T00:00:00Z' '2005-01-01T00:00:00Z' '2007-01-01T00:00:00Z'
 '2006-01-01T00:00:00Z' '2008-01-01T00:00:00Z' '2010-01-01T00:00:00Z'
 '2013-01-01T00:00:00Z' '2014-01-01T00:00:00Z']
b37048be-9b23-4458-a047-888956c69aa1
https://api.resourcewatch.org/v1/query/b37048be-9b23-4458-a047-888956c69aa1?sql=SELECT datetime FROM soc_039_out_of_school_rate_edit


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/b37048be-9b23-4458-a047-888956c69aa1?sql=SELECT%20datetime%20FROM%20soc_039_out_of_school_rate_edit HTTP/1.1" 200 745
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['2000-01-01T00:00:00Z' '2002-01-01T00:00:00Z' '2014-01-01T00:00:00Z'
 '2005-01-01T00:00:00Z' '2006-01-01T00:00:00Z' '2008-01-01T00:00:00Z'
 '2010-01-01T00:00:00Z' '2013-01-01T00:00:00Z' '2015-01-01T00:00:00Z'
 '1999-01-01T00:00:00Z' '2016-01-01T00:00:00Z' '2001-01-01T00:00:00Z'
 '2003-01-01T00:00:00Z' '2004-01-01T00:00:00Z' '2007-01-01T00:00:00Z'
 '2009-01-01T00:00:00Z' '2011-01-01T00:00:00Z' '2012-01-01T00:00:00Z']
8671f536-1979-4b6f-a147-70152fcb44ed
https://api.resourcewatch.org/v1/query/8671f536-1979-4b6f-a147-70152fcb44ed?sql=SELECT datetime FROM soc_036_life_expectancy_at_birth


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/8671f536-1979-4b6f-a147-70152fcb44ed?sql=SELECT%20datetime%20FROM%20soc_036_life_expectancy_at_birth HTTP/1.1" 200 2157
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1964-01-01T00:00:00Z' '1970-01-01T00:00:00Z' '1972-01-01T00:00:00Z'
 '1978-01-01T00:00:00Z' '2007-01-01T00:00:00Z' '2013-01-01T00:00:00Z'
 '1965-01-01T00:00:00Z' '1975-01-01T00:00:00Z' '1979-01-01T00:00:00Z'
 '1986-01-01T00:00:00Z' '2003-01-01T00:00:00Z' '2008-01-01T00:00:00Z'
 '1993-01-01T00:00:00Z' '1961-01-01T00:00:00Z' '1977-01-01T00:00:00Z'
 '1981-01-01T00:00:00Z' '1983-01-01T00:00:00Z' '1984-01-01T00:00:00Z'
 '1985-01-01T00:00:00Z' '1994-01-01T00:00:00Z' '1995-01-01T00:00:00Z'
 '2005-01-01T00:00:00Z' '1960-01-01T00:00:00Z' '1962-01-01T00:00:00Z'
 '1963-01-01T00:00:00Z' '1966-01-01T00:00:00Z' '1967-01-01T00:00:00Z'
 '1968-01-01T00:00:00Z' '1969-01-01T00:00:00Z' '1971-01-01T00:00:00Z'
 '1973-01-01T00:00:00Z' '1974-01-01T00:00:00Z' '2004-01-01T00:00:00Z'
 '1976-01-01T00:00:00Z' '1980-01-01T00:00:00Z' '1982-01-01T00:00:00Z'
 '2009-01-01T00:00:00Z' '1987-01-01T00:00:00Z' '1988-01-01T00:00:00Z'
 '1989-01-01T00:00:00Z' '1990-01-01T00:00:00Z' '1991-01-01T00:00:00Z'
 '1992-01-01T00:00:0

DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/00abb46f-34e2-4bf7-be30-1fb0b1de022f?sql=SELECT%20datetime%20FROM%20soc_008_gdp_per_capita_edit HTTP/1.1" 200 1020
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1991-01-01T00:00:00Z' '1992-01-01T00:00:00Z' '1994-01-01T00:00:00Z'
 '1995-01-01T00:00:00Z' '1998-01-01T00:00:00Z' '2008-01-01T00:00:00Z'
 '2010-01-01T00:00:00Z' '2013-01-01T00:00:00Z' '2016-01-01T00:00:00Z'
 '1990-01-01T00:00:00Z' '1993-01-01T00:00:00Z' '2002-01-01T00:00:00Z'
 '2003-01-01T00:00:00Z' '2004-01-01T00:00:00Z' '2005-01-01T00:00:00Z'
 '2006-01-01T00:00:00Z' '2007-01-01T00:00:00Z' '2009-01-01T00:00:00Z'
 '2011-01-01T00:00:00Z' '2012-01-01T00:00:00Z' '2014-01-01T00:00:00Z'
 '2015-01-01T00:00:00Z' '1996-01-01T00:00:00Z' '1997-01-01T00:00:00Z'
 '1999-01-01T00:00:00Z' '2000-01-01T00:00:00Z' '2001-01-01T00:00:00Z']
a89c95c7-0b82-4162-b9d8-cc0205e9f7ec
https://api.resourcewatch.org/v1/query/a89c95c7-0b82-4162-b9d8-cc0205e9f7ec?sql=SELECT datetime FROM soc_006_multidimensional_poverty_index


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/a89c95c7-0b82-4162-b9d8-cc0205e9f7ec?sql=SELECT%20datetime%20FROM%20soc_006_multidimensional_poverty_index HTTP/1.1" 200 446
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['2012-01-01T00:00:00Z' '2005-01-01T00:00:00Z' '2006-01-01T00:00:00Z'
 '2007-01-01T00:00:00Z' '2008-01-01T00:00:00Z' '2009-01-01T00:00:00Z'
 '2010-01-01T00:00:00Z' '2011-01-01T00:00:00Z' '2013-01-01T00:00:00Z'
 '2014-01-01T00:00:00Z' '2015-01-01T00:00:00Z']
bea122ce-1e4b-465d-8b7b-fa11aadd20f7
https://api.resourcewatch.org/v1/query/bea122ce-1e4b-465d-8b7b-fa11aadd20f7?sql=SELECT datetime FROM soc_004_human_development_index


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/bea122ce-1e4b-465d-8b7b-fa11aadd20f7?sql=SELECT%20datetime%20FROM%20soc_004_human_development_index HTTP/1.1" 200 894
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1992-01-01T00:00:00Z' '1995-01-01T00:00:00Z' '2010-01-01T00:00:00Z'
 '2011-01-01T00:00:00Z' '1993-01-01T00:00:00Z' '2002-01-01T00:00:00Z'
 '1990-01-01T00:00:00Z' '1991-01-01T00:00:00Z' '1994-01-01T00:00:00Z'
 '1996-01-01T00:00:00Z' '1997-01-01T00:00:00Z' '1998-01-01T00:00:00Z'
 '1999-01-01T00:00:00Z' '2000-01-01T00:00:00Z' '2001-01-01T00:00:00Z'
 '2003-01-01T00:00:00Z' '2004-01-01T00:00:00Z' '2005-01-01T00:00:00Z'
 '2006-01-01T00:00:00Z' '2007-01-01T00:00:00Z' '2008-01-01T00:00:00Z'
 '2009-01-01T00:00:00Z' '2012-01-01T00:00:00Z' '2013-01-01T00:00:00Z'
 '2014-01-01T00:00:00Z' '2015-01-01T00:00:00Z']
95b013a3-389a-4367-83b7-c9d68c28c406
https://api.resourcewatch.org/v1/query/95b013a3-389a-4367-83b7-c9d68c28c406?sql=SELECT datetime FROM foo_43_agriculture_value_added


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/95b013a3-389a-4367-83b7-c9d68c28c406?sql=SELECT%20datetime%20FROM%20foo_43_agriculture_value_added HTTP/1.1" 200 1798
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1962-01-01T00:00:00Z' '1970-01-01T00:00:00Z' '1974-01-01T00:00:00Z'
 '1988-01-01T00:00:00Z' '1998-01-01T00:00:00Z' '2002-01-01T00:00:00Z'
 '2011-01-01T00:00:00Z' '2012-01-01T00:00:00Z' '1964-01-01T00:00:00Z'
 '1960-01-01T00:00:00Z' '1967-01-01T00:00:00Z' '1969-01-01T00:00:00Z'
 '1972-01-01T00:00:00Z' '1973-01-01T00:00:00Z' '1975-01-01T00:00:00Z'
 '1976-01-01T00:00:00Z' '1977-01-01T00:00:00Z' '1979-01-01T00:00:00Z'
 '1980-01-01T00:00:00Z' '1982-01-01T00:00:00Z' '1984-01-01T00:00:00Z'
 '1989-01-01T00:00:00Z' '1990-01-01T00:00:00Z' '1996-01-01T00:00:00Z'
 '1997-01-01T00:00:00Z' '2003-01-01T00:00:00Z' '2005-01-01T00:00:00Z'
 '2008-01-01T00:00:00Z' '2014-01-01T00:00:00Z' '2015-01-01T00:00:00Z'
 '1961-01-01T00:00:00Z' '1966-01-01T00:00:00Z' '1963-01-01T00:00:00Z'
 '1983-01-01T00:00:00Z' '1965-01-01T00:00:00Z' '1991-01-01T00:00:00Z'
 '1985-01-01T00:00:00Z' '1993-01-01T00:00:00Z' '1968-01-01T00:00:00Z'
 '1971-01-01T00:00:00Z' '1978-01-01T00:00:00Z' '1981-01-01T00:00:00Z'
 '1986-01-01T00:00:0

DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/d38d0d5c-31b1-47f4-9d2e-d8fba4c7d083?sql=SELECT%20datetime%20FROM%20cit_025_urban_population HTTP/1.1" 200 1638
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1961-01-01T00:00:00Z' '1962-01-01T00:00:00Z' '1985-01-01T00:00:00Z'
 '2008-01-01T00:00:00Z' '1960-01-01T00:00:00Z' '1963-01-01T00:00:00Z'
 '1964-01-01T00:00:00Z' '1965-01-01T00:00:00Z' '1966-01-01T00:00:00Z'
 '1977-01-01T00:00:00Z' '1981-01-01T00:00:00Z' '1983-01-01T00:00:00Z'
 '1989-01-01T00:00:00Z' '2001-01-01T00:00:00Z' '2015-01-01T00:00:00Z'
 '1972-01-01T00:00:00Z' '1967-01-01T00:00:00Z' '1968-01-01T00:00:00Z'
 '1969-01-01T00:00:00Z' '1970-01-01T00:00:00Z' '1971-01-01T00:00:00Z'
 '1973-01-01T00:00:00Z' '1974-01-01T00:00:00Z' '1975-01-01T00:00:00Z'
 '1976-01-01T00:00:00Z' '1978-01-01T00:00:00Z' '1979-01-01T00:00:00Z'
 '1980-01-01T00:00:00Z' '1982-01-01T00:00:00Z' '1984-01-01T00:00:00Z'
 '1986-01-01T00:00:00Z' '1987-01-01T00:00:00Z' '1988-01-01T00:00:00Z'
 '1990-01-01T00:00:00Z' '1991-01-01T00:00:00Z' '1992-01-01T00:00:00Z'
 '1993-01-01T00:00:00Z' '1994-01-01T00:00:00Z' '1995-01-01T00:00:00Z'
 '1996-01-01T00:00:00Z' '1997-01-01T00:00:00Z' '1998-01-01T00:00:00Z'
 '1999-01-01T00:00:0

DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/2e31a1f3-576b-46b4-84f0-3f0cc399f887?sql=SELECT%20datetime%20FROM%20com_006_national_current_accounts_edit HTTP/1.1" 200 1694
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['1981-01-01T00:00:00Z' '1983-01-01T00:00:00Z' '1987-01-01T00:00:00Z'
 '1994-01-01T00:00:00Z' '1997-01-01T00:00:00Z' '2003-01-01T00:00:00Z'
 '2005-01-01T00:00:00Z' '2007-01-01T00:00:00Z' '2010-01-01T00:00:00Z'
 '2014-01-01T00:00:00Z' '2015-01-01T00:00:00Z' '1962-01-01T00:00:00Z'
 '1963-01-01T00:00:00Z' '1966-01-01T00:00:00Z' '1969-01-01T00:00:00Z'
 '1971-01-01T00:00:00Z' '1975-01-01T00:00:00Z' '1976-01-01T00:00:00Z'
 '1977-01-01T00:00:00Z' '1980-01-01T00:00:00Z' '1986-01-01T00:00:00Z'
 '1993-01-01T00:00:00Z' '1996-01-01T00:00:00Z' '2001-01-01T00:00:00Z'
 '2004-01-01T00:00:00Z' '2013-01-01T00:00:00Z' '2016-01-01T00:00:00Z'
 '1960-01-01T00:00:00Z' '1961-01-01T00:00:00Z' '1964-01-01T00:00:00Z'
 '1965-01-01T00:00:00Z' '1967-01-01T00:00:00Z' '1968-01-01T00:00:00Z'
 '1970-01-01T00:00:00Z' '1972-01-01T00:00:00Z' '1973-01-01T00:00:00Z'
 '1974-01-01T00:00:00Z' '1978-01-01T00:00:00Z' '1979-01-01T00:00:00Z'
 '1982-01-01T00:00:00Z' '1984-01-01T00:00:00Z' '1985-01-01T00:00:00Z'
 '1988-01-01T00:00:0

DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/11278cb6-b298-49a1-bf71-f1e269f40758?sql=SELECT%20datetime%20FROM%20soc_025_gender_inequality_index HTTP/1.1" 200 477
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


['2013-01-01T00:00:00Z' '1995-01-01T00:00:00Z' '2000-01-01T00:00:00Z'
 '2005-01-01T00:00:00Z' '2010-01-01T00:00:00Z' '2011-01-01T00:00:00Z'
 '2012-01-01T00:00:00Z' '2014-01-01T00:00:00Z' '2015-01-01T00:00:00Z']
7a551dd8-b59c-4f59-9d50-c92cb61c5799
https://api.resourcewatch.org/v1/query/7a551dd8-b59c-4f59-9d50-c92cb61c5799?sql=SELECT year FROM foo_042_agricultural_production_value


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/7a551dd8-b59c-4f59-9d50-c92cb61c5799?sql=SELECT%20year%20FROM%20foo_042_agricultural_production_value HTTP/1.1" 200 250
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org


[2014]
8bc79a36-d77e-4ee3-b9bc-c77146cfc503
https://api.resourcewatch.org/v1/query/8bc79a36-d77e-4ee3-b9bc-c77146cfc503?sql=SELECT year FROM foo_019_child_malnutrition_edit


DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/8bc79a36-d77e-4ee3-b9bc-c77146cfc503?sql=SELECT%20year%20FROM%20foo_019_child_malnutrition_edit HTTP/1.1" 200 438
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


[2013 2005 2015 2014 2010 2012 2001 2016 2007 2008 2009 2002 2000 2011 2006
 2004 1995 1994 1999]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [10]:
annual_national_datasets['years_covered']

Dataset
a7067e9f-fe40-4338-85da-13a6071c76fe                                                 None
03bfb30e-829f-4299-bab9-b2be1b66b5d4                                                 None
62c988a7-1e4d-418e-87bf-a743e24209e8           [2009, 2010, 2011, 2012, 2013, 2014, 2015]
cc354f7f-2622-44cb-91c1-73559373de72    [1991, 1992, 1993, 1994, 1995, 1996, 1997, 199...
c665f519-eef9-4f67-a8bf-7e3e6dc8bfcd    [2000, 2007, 2008, 2009, 2010, 2011, 2012, 201...
d3a6b89f-cf5c-40cf-b2b3-ac1c8315c648    [2006, 2007, 2008, 2009, 2010, 2011, 2012, 201...
1b97e47e-ca18-4e50-9aae-a2853acca3f0    [1990, 1991, 1992, 1993, 1994, 1995, 1996, 199...
6e10074a-a368-4afd-8564-db59814cdb74    [1990, 1991, 1992, 1993, 1994, 1995, 1996, 199...
10337db6-8321-445e-a60b-28fc1e114f29    [1990, 1991, 1992, 1993, 1994, 1995, 1996, 199...
d446a52e-c4c1-4e74-ae30-3204620a0365    [1990, 1991, 1992, 1993, 1994, 1995, 1996, 199...
52c55378-0484-48c3-92fc-3ee94d21c716    [1990, 1991, 1992, 1993, 1994, 1995, 1996, 199...
fa

In [11]:
def determine_if_analysis_possible(years_covered):
    flag = False
    try:
        if (2000 in years_covered) and (2015 in years_covered):
            flag = True
    except:
        pass
    return flag

annual_national_datasets.loc[:, 'analysis_possible'] = list(map(determine_if_analysis_possible, annual_national_datasets['years_covered']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [12]:
annual_national_datasets['analysis_possible'] 

Dataset
a7067e9f-fe40-4338-85da-13a6071c76fe    False
03bfb30e-829f-4299-bab9-b2be1b66b5d4    False
62c988a7-1e4d-418e-87bf-a743e24209e8    False
cc354f7f-2622-44cb-91c1-73559373de72     True
c665f519-eef9-4f67-a8bf-7e3e6dc8bfcd    False
d3a6b89f-cf5c-40cf-b2b3-ac1c8315c648    False
1b97e47e-ca18-4e50-9aae-a2853acca3f0     True
6e10074a-a368-4afd-8564-db59814cdb74    False
10337db6-8321-445e-a60b-28fc1e114f29     True
d446a52e-c4c1-4e74-ae30-3204620a0365    False
52c55378-0484-48c3-92fc-3ee94d21c716     True
fa6443ff-eb95-4d0f-84d2-f0c91682efdf     True
a290675c-9528-4a51-8201-f6c2d7848744    False
c18a38cd-94ff-48cd-818f-6ffb05992abb     True
fe311144-8c0e-4440-b068-6efd057e0f6a    False
37d04efc-0ab2-4499-a891-54dca1013c74     True
b37048be-9b23-4458-a047-888956c69aa1     True
8671f536-1979-4b6f-a147-70152fcb44ed     True
00abb46f-34e2-4bf7-be30-1fb0b1de022f     True
a89c95c7-0b82-4162-b9d8-cc0205e9f7ec    False
bea122ce-1e4b-465d-8b7b-fa11aadd20f7     True
95b013a3-389a-4367-83b7-c9

### Pick the value column to use

In [13]:
def pick_value_col(info):
    analysis_possible = info[0]
    column_names = info[1]
    if analysis_possible:
        print("Available column names: {}".format(column_names))
        if 'yr_data' in column_names:
            return 'yr_data'
        elif 'total_data' in column_names:
            return 'total_data'
        else:
            value_col = input("What is the value column called? Type nothing for None")
            return value_col if value_col else None
    else:
        return None

annual_national_datasets.loc[:, 'value_col'] = list(map(pick_value_col, zip(annual_national_datasets['analysis_possible'], annual_national_datasets['column_names'])))

Available column names: dict_keys(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'country_name', 'country_code', 'rw_country_code', 'rw_country_name', 'datetime', 'yr_data'])
Available column names: dict_keys(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'index', 'country_code', 'country_name', 'datetime', 'urban_data', 'rural_data', 'total_data', 'rw_country_code', 'rw_country_name'])
Available column names: dict_keys(['cartodb_id', 'the_geom', 'the_geom_webmercator', '_2012_2015_avg', 'country', 'datetime', 'rw_country_code', 'rw_country_name', 'yr_data'])
Available column names: dict_keys(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'country_code', 'country_name', 'datetime', 'index', 'indicator_code', 'indicator_name', 'rw_country_code', 'rw_country_name', 'yr_data'])
Available column names: dict_keys(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'country', 'datetime', 'gain_data', 'readiness_data', 'rw_country_code', 'rw_country_name', 'vulnerability_data'])
Wh

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


### Pick the country column to use

In [20]:
def pick_country_col(column_names):
    print("Available column names: {}".format(column_names))
    if 'rw_country_code' in column_names:
        return 'rw_country_code'
    else:
        country_col = input("What is the country column called? Type nothing for None")
        return country_col if country_col else None

annual_national_datasets.loc[:, 'country_col'] = list(map(pick_country_col, annual_national_datasets['column_names']))

Available column names: dict_keys(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'a_aggr', 'add_a', 'add_b', 'b_aggr', 'c_aggr', 'cartodb_georef_status', 'cl_aggr', 'cl_rating', 'country', 'd_aggr', 'e_aggr', 'f_aggr', 'g_aggr', 'index', 'pr_aggr', 'pr_rating', 'rw_country_code', 'rw_country_name', 'status', 'total_aggr', 'status_text'])
Available column names: dict_keys(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'country', 'employment_of_total_labour_force', 'employment_pulp_and_paper_1000', 'employment_roundwood_production_1000', 'employment_total_for_the_forest_sector_1000', 'employment_wood_processing_1000', 'gross_value_added_contribution_to_gdp', 'gross_value_added_pulp_and_paper_us_million', 'gross_value_added_roundwood_production_us_million', 'gross_value_added_total_for_the_forest_sector_us_million', 'gross_value_added_wood_processing_us_million', 'year', 'rw_country_code', 'rw_country_name'])
Available column names: dict_keys(['cartodb_id', 'the_geom', 'the_geom_w

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


### Preview datasets to run analysis with

In [15]:
analyse_these = annual_national_datasets[annual_national_datasets['analysis_possible']==True]

analyse_these

Unnamed: 0_level_0,name,table_name,provider,date_updated,num_metadata,metadata,num_layers,layers,num_widgets,widgets,num_tags,tags,spatial_resolution,temporal_resolution,column_names,column_types,year_col,years_covered,analysis_possible,value_col,country_col
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cc354f7f-2622-44cb-91c1-73559373de72,soc.074 Employment_in_agriculture,soc_074_employment_in_agriculture,cartodb,2018-04-09T14:28:42.603Z,1,"[{'id': '5ab72ec06f76b50012aeef07', 'type': 'm...",1,[{'id': '0a8fbc9a-c05d-421d-bedf-15ebb1e0c4f5'...,4,[{'id': 'd4f32fb9-9cc9-44b3-9abd-72ae78b43921'...,1,"[{'type': 'vocabulary', 'attributes': {'resour...",National,Annual,"(cartodb_id, the_geom, the_geom_webmercator, c...","[<class 'int'>, <class 'NoneType'>, <class 'No...",datetime,"[1991, 1992, 1993, 1994, 1995, 1996, 1997, 199...",True,yr_data,rw_country_name
1b97e47e-ca18-4e50-9aae-a2853acca3f0,wat.005 Access to Improved Water Source,wat_005_improved_water_access,cartodb,2018-04-05T20:08:25.411Z,2,"[{'id': '59d79d8ae353f300125c0dec', 'type': 'm...",3,[{'id': '14bfb1ca-a672-4bec-8060-b8e51a5f21bc'...,5,[{'id': '8a648a08-537e-403b-90a6-dc84efff3029'...,1,"[{'type': 'vocabulary', 'attributes': {'resour...",National,Annual,"(cartodb_id, the_geom, the_geom_webmercator, i...","[<class 'int'>, <class 'NoneType'>, <class 'No...",datetime,"[1990, 1991, 1992, 1993, 1994, 1995, 1996, 199...",True,total_data,rw_country_name
10337db6-8321-445e-a60b-28fc1e114f29,cit.029 Municipal Waste,cit_029_municipal_waste,cartodb,2018-04-03T14:37:03.920Z,2,"[{'id': '59dd3a069caa8700129be7cb', 'type': 'm...",1,[{'id': '2800bf8d-6d27-4a77-a621-ef65cf44c7bf'...,2,[{'id': '32caf06d-b02a-440a-ac60-ba4ebec4ffb3'...,1,"[{'type': 'vocabulary', 'attributes': {'resour...",National,Annual,"(cartodb_id, the_geom, the_geom_webmercator, _...","[<class 'int'>, <class 'NoneType'>, <class 'No...",datetime,"[1990, 1991, 1992, 1993, 1994, 1995, 1996, 199...",True,yr_data,rw_country_name
52c55378-0484-48c3-92fc-3ee94d21c716,com.010 Gross Domestic Product,com_010_gdp_ppp_usd,cartodb,2018-03-30T18:30:22.879Z,2,"[{'id': '59dce62e991d67001321650c', 'type': 'm...",1,[{'id': 'f98f8874-3d5d-4818-a75c-29a8d1b57cb3'...,3,[{'id': 'e36177f8-0c00-499d-9036-c98d06f03d47'...,1,"[{'type': 'vocabulary', 'attributes': {'resour...",National,Annual,"(cartodb_id, the_geom, the_geom_webmercator, c...","[<class 'int'>, <class 'NoneType'>, <class 'No...",datetime,"[1990, 1991, 1992, 1993, 1994, 1995, 1996, 199...",True,yr_data,rw_country_name
fa6443ff-eb95-4d0f-84d2-f0c91682efdf,cli.029 Vulnerability to Climate Change Index,cli_029_vulnerability_to_cc,cartodb,2018-03-27T17:12:41.548Z,2,"[{'id': '59a427ab7b6c000012baa707', 'type': 'm...",1,[{'id': '07b6e469-d1c4-4ab8-a8a7-cf37f344ae4c'...,3,[{'id': '8c44a854-63e7-4ce6-b864-4858c7394852'...,1,"[{'type': 'vocabulary', 'attributes': {'resour...",National,Annual,"(cartodb_id, the_geom, the_geom_webmercator, c...","[<class 'int'>, <class 'NoneType'>, <class 'No...",datetime,"[1995, 1996, 1997, 1998, 1999, 2000, 2001, 200...",True,gain_data,rw_country_name
c18a38cd-94ff-48cd-818f-6ffb05992abb,com.015 Recycled waste,com_015_recycling_rates,cartodb,2018-03-13T19:24:38.231Z,2,"[{'id': '59d5390066b9630011465ed0', 'type': 'm...",1,[{'id': '95557b73-07c5-48a6-841b-58f9d782b58e'...,2,[{'id': 'c77bc834-d1e6-4b5b-91de-81e72dcfcb9d'...,1,"[{'type': 'vocabulary', 'attributes': {'resour...",National,Annual,"(cartodb_id, the_geom, the_geom_webmercator, _...","[<class 'int'>, <class 'NoneType'>, <class 'No...",datetime,"[1990, 1991, 1992, 1993, 1994, 1995, 1996, 199...",True,yr_data,rw_country_name
37d04efc-0ab2-4499-a891-54dca1013c74,soc.040 Access to Improved Sanitation Facilities,soc_040_improved_sanitation,cartodb,2018-03-13T16:25:26.530Z,2,"[{'id': '59dd226d26cb7a0013147ab1', 'type': 'm...",3,[{'id': 'e01c4ff1-94a8-466c-a494-85dfc6a54b36'...,3,[{'id': 'ac7c76c3-481e-42f5-9db8-d61d6f47b4d3'...,1,"[{'type': 'vocabulary', 'attributes': {'resour...",National,Annual,"(cartodb_id, the_geom, the_geom_webmercator, c...","[<class 'int'>, <class 'NoneType'>, <class 'No...",datetime,"[1990, 1991, 1992, 1993, 1994, 1995, 1996, 199...",True,total_data,rw_country_name
b37048be-9b23-4458-a047-888956c69aa1,soc.039 Out-of-school rate,soc_039_out_of_school_rate_edit,cartodb,2018-03-13T16:22:44.930Z,2,"[{'id': '59dd23f18e6ae800119b3e4a', 'type': 'm...",1,[{'id': 'da812e1b-573a-45e7-8fbf-6f996133439c'...,1,[{'id': '6ba10980-af9b-4990-8a8b-9f375c416d55'...,1,"[{'type': 'vocabulary', 'attributes': {'resour...",National,Annual,"(cartodb_id, the_geom, the_geom_webmercator, c...","[<class 'int'>, <class 'NoneType'>, <class 'No...",datetime,"[1999, 2000, 2001, 2002, 2003, 2004, 2005, 200...",True,yr_data,rw_country_name
8671f536-1979-4b6f-a147-70152fcb44ed,soc.036 Life Expectancy at Birth,soc_036_life_expectancy_at_birth,cartodb,2018-03-13T16:20:09.948Z,2,"[{'id': '59a427a9ce86740012fcb237', 'type': 'm...",1,[{'id': '562795be-8713-4cc9-acf6-5925f0cbff08'...,2,[{'id': 'cedf5035-2165-4c3c-a7b2-c1f97e041dcc'...,1,"[{'type': 'vocabulary', 'attributes': {'resour...",National,Annual,"(cartodb_id, the_geom, the_geom_webmercator, c...","[<class 'int'>, <class 'NoneType'>, <class 'No...",datetime,"[1960, 1961, 1962, 1963, 1964, 1965, 1966, 196...",True,yr_data,rw_country_name
00abb46f-34e2-4bf7-be30-1fb0b1de022f,soc.008 Gross Domestic Product Per Capita (PPP...,soc_008_gdp_per_capita_edit,cartodb,2018-03-13T15:58:44.854Z,2,"[{'id': '59dd38fd9caa8700129be7ca', 'type': 'm...",1,[{'id': 'd68e4d68-16d6-4397-a415-de8e080a86ba'...,2,[{'id': '477057db-3d6e-4559-b8e1-5b790653aad3'...,1,"[{'type': 'vocabulary', 'attributes': {'resour...",National,Annual,"(cartodb_id, the_geom, the_geom_webmercator, c...","[<class 'int'>, <class 'NoneType'>, <class 'No...",datetime,"[1990, 1991, 1992, 1993, 1994, 1995, 1996, 199...",True,yr_data,rw_country_name


# Run the analysis

In [31]:
def extract_unique(df, col):
    return df[col].unique()

def run_linear_regressions(datax, xyear, xval_col, xcountry_col, xname,
                           datay_from_rwapi, ystartyear, yendyear,
                           test_size):
    ''' 
    Inputs: Data and needed column names
    Outputs: square matrix of regression coefficients for each indicator
    '''
    
    #datay_from_rw_api
    #yprod_col, yflow_col,yyear_col, yval_col,ycountry_col,
    
    # Result will be an upper right triangular square matrix in 4 dimensions
    results = {}    
    
    all_countries = datax[xcountry_col].unique()
    msg = "regressing GHG-GDP Divergence Index against {}"
    q_yr = "https://api.resourcewatch.org/v1/query/{}?sql=SELECT {}, {} FROM {} WHERE {}={}"#.format(ds_id,value_col,country_col,table_name,year_col,target_year)
    q_dt = "https://api.resourcewatch.org/v1/query/{}?sql=SELECT {}, {} FROM {} WHERE EXTRACT(year from {})={}"#.format(ds_id,value_col,country_col,table_name,year_col,target_year)
    
    for ds in datay_from_rwapi.iterrows():
        
        ds_id = ds[0]        
        table_name = ds[1]['table_name']
        year_col = ds[1]['year_col']
        value_col = ds[1]['value_col']
        country_col = ds[1]['country_col']
        
        if not (year_col and value_col and country_col and table_name):
            results[ds_id] = 'Some missing information'
            continue
        
        logging.info(msg.format(table_name))
    
        # Extract data
        # TO DO: allow for year ranges
        data_x = datax.copy()

        if year_col == 'datetime':
            res = req.get(q_dt.format(ds_id,value_col,country_col,table_name,year_col,ystartyear))
        elif year_col == 'year':
            res = req.get(q_yr.format(ds_id,value_col,country_col,table_name,year_col,ystartyear))
        else:
            continue
            
        try:
            data_y_start = pd.DataFrame(res.json()['data'])
        except:
            print(res.text)
            
        data_y_start = data_y_start.set_index(country_col)
        # Avoid division by 0
        data_y_start = data_y_start.loc[data_y_start[value_col] > 0]

        if year_col == 'datetime':
            res = req.get(q_dt.format(ds_id,value_col,country_col,table_name,year_col,yendyear))
        elif year_col == 'year':
            res = req.get(q_yr.format(ds_id,value_col,country_col,table_name,year_col,yendyear))
        else:
            continue
        
        try:
            data_y_end = pd.DataFrame(res.json()['data'])
        except:
            print(res.text)        
            
        data_y_end = data_y_end.set_index(country_col)

        logging.debug('Start {}'.format(data_y_start.head()))
        logging.debug('End {}'.format(data_y_end.head()))

        data_y = data_y_end[value_col].div(data_y_start[value_col])
        data_y = data_y[pd.notnull(data_y)]
        logging.debug('Change percent: {}'.format(data_y.head()))

        # Throw away all but intersection of countries
        logging.debug('data_x countries: {}'.format(set(data_x[xcountry_col])))
        logging.debug('data_y countries: {}'.format(set(data_y.index)))
        keep_countries = set(data_x[xcountry_col]) & set(data_y.index)
        skipped_countries = [country for country in all_countries if country not in keep_countries]

        data_x = data_x.set_index(xcountry_col).loc[keep_countries, xval_col]
        data_y = data_y.loc[keep_countries]

        # Reshape for regression
        data_x_reg = data_x.values.reshape(-1, 1)
        data_y_reg = data_y.values.reshape(-1, 1)

        if (len(data_x_reg)>test_size) & (len(data_y_reg)>test_size):
            pass
        else:
            results[ds_id] = {
                'r_squared': None,
                'skipped_countries': skipped_countries
            }
            continue

        # Split for training / test set
        X_train, X_test, y_train, y_test = train_test_split(data_x_reg, data_y_reg, 
                                                            test_size=test_size, random_state=42)

        # Run regression
        lm = linear_model.LinearRegression() 
        lm.fit(X_train, y_train)

        # Extract coefficient of determination (r^2)
        r_squared = lm.score(X_test, y_test)
        #y_pred = lm.predict(X_test)
        #r_squared2 = metrics.r2_score(y_test, y_pred)

        logging.info('rsquared: {}'.format(r_squared)) #, r_squared2))
        logging.info('num skipped countries: {}'.format(len(skipped_countries)))
        logging.info('num training countries: {}'.format(len(X_train)))
        logging.info('num testing countries: {}'.format(len(X_test)))

        # Store results
        results[ds_id] = {
            'r_squared': r_squared,
            'skipped_countries': skipped_countries
        }
        
        if r_squared > .1:
            results[ds_id]['data_x'] = data_x
            results[ds_id]['data_y'] = data_y
            
    return results    

In [32]:
DATAX = pd.read_csv('/Users/nathansuberi/Documents/GitHub/nsuberi.github.io/Compass Degrees for Summary Period.csv')
kwargs = {
    'datax': DATAX,
    'xname':'compass of divergence',
    'xyear': None,
    'xval_col': 'prod_degree',
    'xcountry_col': 'country',
    'datay_from_rwapi': annual_national_datasets[annual_national_datasets['analysis_possible']==True],
    'ystartyear': 2000,
    'yendyear': 2015,
    'test_size':30
}

regression_results = run_linear_regressions(**kwargs)

INFO:root:regressing GHG-GDP Divergence Index against soc_074_employment_in_agriculture
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org
DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/cc354f7f-2622-44cb-91c1-73559373de72?sql=SELECT%20yr_data,%20rw_country_code%20FROM%20soc_074_employment_in_agriculture%20WHERE%20EXTRACT(year%20from%20datetime)=2000 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org
DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/query/cc354f7f-2622-44cb-91c1-73559373de72?sql=SELECT%20yr_data,%20rw_country_code%20FROM%20soc_074_employment_in_agriculture%20WHERE%20EXTRACT(year%20from%20datetime)=2015 HTTP/1.1" 200 None
DEBUG:root:Start                    yr_data
rw_country_code           
BEN              45.299999
MLI              43.000000
BLZ              25.900000
KWT               2.400000
AFG              69.400002
DEBUG:

In [34]:
#logging.info('Number of regressions attempted: {}'.format(len(regression_results)))
#logging.info('Results: {}'.format(regression_results))

def pick_not_null(d):
    new_d = dict()
    for key, vals in d.items():
        if vals['r_squared']:
            new_d[key] = vals
    return new_d

notnull_results = pick_not_null(regression_results)


sorted_results = sorted(notnull_results.items(), 
                        key=lambda res: res[1]['r_squared'], 
                        reverse=True)
             
# Only keep non-perfect correlations, 
# and ones for which no more than 10 countries are skipped
filterd_sorted_results = [res for res in sorted_results if 
                  (res[1]['r_squared'] < 1) and 
                  (len(res[1]['skipped_countries']) < 40) ]

filterd_sorted_results

[('37d04efc-0ab2-4499-a891-54dca1013c74', {'data_x': country
   CHE    58.889113
   SRB    32.146467
   ITA    45.456782
   MRT    -6.595028
   GAB    36.665589
   ARE    11.330744
   MDV    26.202291
   FIN    83.149673
   MAR     3.232786
   GHA     2.905620
   QAT    21.001588
   SUR    48.210178
   CAN    49.514016
   VNM   -12.928423
   LKA    16.059468
   DZA     0.483434
   MMR    38.222570
   AND    57.566213
   PHL    19.037050
   VEN    27.183954
   PRY    17.339127
   UGA   -12.179444
   AGO    -5.705432
   FRA    89.199677
   BLZ    20.791564
   SLV    19.735446
   LBR   -25.246585
   PER     4.674379
   CYP    52.184520
   BOL    -3.353080
   BRA    -2.927058
   PAK     8.886589
   BHR     7.561849
   HND    -3.853940
   TGO    -3.749337
   HTI   -32.248478
   SVK    56.648949
   ISR    27.043127
   GRD   -14.889285
   VUT     4.492615
   NAM     7.274337
   KWT    -0.610503
   GNQ   -28.459393
   NIC    25.041628
   NLD    61.926373
   BFA   -11.376092
   PNG   -10.733077

In [35]:
analyse_these.loc['37d04efc-0ab2-4499-a891-54dca1013c74']

name                    soc.040 Access to Improved Sanitation Facilities
table_name                                   soc_040_improved_sanitation
provider                                                         cartodb
date_updated                                    2018-03-13T16:25:26.530Z
num_metadata                                                           2
metadata               [{'id': '59dd226d26cb7a0013147ab1', 'type': 'm...
num_layers                                                             3
layers                 [{'id': 'e01c4ff1-94a8-466c-a494-85dfc6a54b36'...
num_widgets                                                            3
widgets                [{'id': 'ac7c76c3-481e-42f5-9db8-d61d6f47b4d3'...
num_tags                                                               1
tags                   [{'type': 'vocabulary', 'attributes': {'resour...
spatial_resolution                                              National
temporal_resolution                                

# Upload the results to Carto and RW API

In [None]:
cc = cartoframes.CartoContext()

# Configure the vega chart output, which can be uploaded to the RW API

In [None]:
Config=''.join([
    "{\n",
    "  \"range\": {\n",
    "    \"dotSize\": [20, 250],\n",
    "    \"category\": [\n",
    "      \"#3BB2D0\",\n",
    "      \"#2C75B0\",\n",
    "      \"#FAB72E\",\n",
    "      \"#EF4848\",\n",
    "      \"#65B60D\",\n",
    "      \"#717171\"\n",
    "    ],\n",
    "    \"category20\": [\n",
    "      \"#3BB2D0\",\n",
    "      \"#2C75B0\",\n",
    "      \"#FAB72E\",\n",
    "      \"#EF4848\",\n",
    "      \"#65B60D\",\n",
    "      \"#C32D7B\",\n",
    "      \"#F577B9\",\n",
    "      \"#5FD2B8\",\n",
    "      \"#F1800F\",\n",
    "      \"#9F1C00\",\n",
    "      \"#A5E9E3\",\n",
    "      \"#B9D765\",\n",
    "      \"#393F44\",\n",
    "      \"#CACCD0\",\n",
    "      \"#717171\"\n",
    "    ],\n",
    "    \"ordinal\": { \"scheme\": \"greens\" },\n",
    "    \"ramp\": { \"scheme\": \"purples\" }\n",
    "  },\n",
    "  \"axis\": {\n",
    "    \"labelFontSize\": 13,\n",
    "    \"labelFont\": \"Lato\",\n",
    "    \"labelColor\": \"#717171\",\n",
    "    \"labelPadding\": 10,\n",
    "    \"ticks\": true,\n",
    "    \"tickSize\": 5,\n",
    "    \"tickColor\": \"#A9ABAD\",\n",
    "    \"tickOpacity\": 0.5,\n",
    "    \"tickExtra\": false\n",
    "  },\n",
    "  \"axisX\": {\n",
    "    \"bandPosition\": 0.5,\n",
    "    \"domainWidth\": 1.2,\n",
    "    \"domainColor\": \"#A9ABAD\",\n",
    "    \"labelAlign\": \"center\",\n",
    "    \"labelBaseline\": \"top\"\n",
    "  },\n",
    "  \"axisY\": {\n",
    "    \"domain\": false,\n",
    "    \"labelAlign\": \"left\",\n",
    "    \"labelBaseline\": \"bottom\",\n",
    "    \"tickOpacity\": 0.5,\n",
    "    \"grid\": true,\n",
    "    \"ticks\": false,\n",
    "    \"gridColor\": \"#A9ABAD\",\n",
    "    \"gridOpacity\": 0.5\n",
    "  },\n",
    "  \"mark\": {\n",
    "    \"fill\": \"#3BB2D0\"\n",
    "  },\n",
    "  \"symbol\": {\n",
    "    \"fill\": \"#3BB2D0\",\n",
    "    \"stroke\": \"#fff\"\n",
    "  },\n",
    "  \"rect\": {\n",
    "    \"cornerRadius\": 0.3,\n",
    "    \"fill\": \"#3BB2D0\"\n",
    "  },\n",
    "  \"line\": {\n",
    "    \"interpolate\": \"linear\",\n",
    "    \"stroke\": \"#3BB2D0\",\n",
    "    \"fillOpacity\": 0\n",
    "  }\n",
    "}\n"])

In [None]:
specV3 = json.loads(''.join(["{\n",
    "  \"autosize\": {\n",
    "    \"type\": \"pad\",\n",
    "    \"resize\": true,\n",
    "    \"contains\": \"padding\"\n",
    "  },\n",
    "  \"data\": [\n",
    "    {\n",
    "      \"name\": \"table\",\n",
    "      \"url\": \"https://wri-rw.carto.com/api/v2/sql?q=SELECT year as x, bleached_area as y FROM sotp_bleaching_alerts ORDER  BY year\",\n",
    "      \"format\": {\n",
    "        \"type\": \"json\",\n",
    "        \"property\": \"rows\"\n",
    "      },\n",
    "      \"transform\": [\n",
    "        {\n",
    "          \"type\": \"window\",\n",
    "          \"sort\": {\n",
    "            \"field\": \"x\",\n",
    "            \"order\": \"ascending\"\n",
    "          }\n",
    "        }\n",
    "      ]\n",
    "    }\n",
    "  ],\n",
    "  \"scales\": [\n",
    "    {\n",
    "      \"name\": \"x\",\n",
    "      \"type\": \"band\",\n",
    "      \"domain\": {\n",
    "        \"data\": \"table\",\n",
    "        \"field\": \"x\"\n",
    "      },\n",
    "      \"range\": \"width\",\n",
    "      \"padding\": 0.05,\n",
    "      \"round\": true\n",
    "    },\n",
    "    {\"type\": \"linear\",\n",
    "      \"name\": \"y\",\n",
    "      \"domain\": {\n",
    "        \"data\": \"table\",\n",
    "        \"field\": \"y\"\n",
    "      },\n",
    "      \"nice\": true,\n",
    "      \"range\": \"height\"\n",
    "    }\n",
    "  ],\n",
    "  \"axes\": [\n",
    "    {\n",
    "      \"orient\": \"bottom\",\n",
    "      \"scale\": \"x\",\n",
    "      \"labelOverlap\":\"parity\",\n",
    "      \"encode\": {\n",
    "        \"labels\": {\n",
    "          \"update\": {\n",
    "            \"angle\": {\n",
    "              \"value\": 90\n",
    "            },\n",
    "            \"align\": {\n",
    "              \"value\": \"left\"\n",
    "            },\n",
    "            \"baseline\": {\n",
    "              \"value\": \"middle\"\n",
    "            }\n",
    "          }\n",
    "        }\n",
    "      }\n",
    "    },\n",
    "    {\n",
    "      \"orient\": \"left\",\n",
    "      \"labelOverlap\":\"parity\",\n",
    "      \"scale\": \"y\",\n",
    "      \"format\": \"2s\"\n",
    "    }\n",
    "  ],\n",
    "  \"marks\": [\n",
    "    {\n",
    "      \"type\": \"rect\",\n",
    "      \"from\": {\n",
    "        \"data\": \"table\"\n",
    "      },\n",
    "      \"encode\": {\n",
    "        \"enter\": {\n",
    "          \"x\": {\n",
    "            \"scale\": \"x\",\n",
    "            \"field\": \"x\"\n",
    "          },\n",
    "          \"width\": {\n",
    "            \"scale\": \"x\",\n",
    "            \"band\": 1\n",
    "          },\n",
    "          \"y\": {\n",
    "            \"scale\": \"y\",\n",
    "            \"field\": \"y\"\n",
    "          },\n",
    "          \"y2\": {\n",
    "            \"scale\": \"y\",\n",
    "            \"value\": 0\n",
    "          }\n",
    "        },\n",
    "        \"update\": {\n",
    "          \"opacity\": {\n",
    "            \"value\": 1\n",
    "          }\n",
    "        },\n",
    "        \"hover\": {\n",
    "          \"opacity\": {\n",
    "            \"value\": 0.8\n",
    "          }\n",
    "        }\n",
    "      }\n",
    "    }\n",
    "  ]\n",
    "}"]))
                  
vega4view = dict(specV3)
vega4view['config']=json.loads(Config)
vega4view['width']=300
vega4view['height']=200
Vega(vega4view)