# Import libraries

In [1]:
import pandas as pd
pd.options.display.max_columns = 200

import boto3
import io

import requests as req
import json
from datetime import datetime

import sys
import logging
import os
logging.basicConfig(stream=sys.stderr, level=logging.INFO)

import ast

# Authenticate to S3

In [2]:
# Read the credentials from the environment so that secrets never have to be
# written into (or blanked out of) the notebook. os.environ.get returns None
# for an unset variable; boto3 documents these arguments as optional, so
# None leaves credential lookup to its normal configuration.
aws_access_key_id = os.environ.get('aws_access_key_id')
aws_secret_access_key = os.environ.get('aws_secret_access_key')

# Bucket and key prefix under which the RW API backups are stored.
s3_bucket = "wri-projects"
s3_folder = "resourcewatch/rw_api_backups/"

# Low-level client: used for listing and downloading objects.
s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)
# Resource API: used for uploading objects.
s3_resource = boto3.resource(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)

#https://alexwlchan.net/2017/07/listing-s3-keys/
def get_matching_s3_keys(bucket, prefix='', suffix='', client=None):
    """
    Generate ``(key, size)`` pairs for the objects in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
        May be a string or a tuple of strings.
    :param suffix: Only fetch keys that end with this suffix (optional).
    :param client: Object providing ``list_objects_v2`` (optional).
        Defaults to the module-level ``s3_client``.
    :return: Generator yielding a ``(key, size)`` tuple per matching object.
    """
    if client is None:
        client = s3_client

    kwargs = {'Bucket': bucket}

    # If the prefix is a single string (not a tuple of strings), we can
    # do the filtering directly in the S3 API.
    if isinstance(prefix, str):
        kwargs['Prefix'] = prefix

    while True:
        # The S3 API response is a large blob of metadata.
        # 'Contents' holds the listed objects; it is absent when nothing
        # matches, so fall back to an empty page instead of a KeyError.
        resp = client.list_objects_v2(**kwargs)
        for obj in resp.get('Contents', []):
            key = obj['Key']
            if key.startswith(prefix) and key.endswith(suffix):
                yield key, obj['Size']

        # The S3 API is paginated, returning up to 1000 keys at a time.
        # Pass the continuation token into the next request, until we
        # reach the final page (when this field is missing).
        token = resp.get('NextContinuationToken')
        if token is None:
            break
        kwargs['ContinuationToken'] = token
            
# Functions for reading data from, and uploading data to, S3
def read_from_S3(bucket, key, index_col=0):
    """
    Download a CSV object from S3 and parse it into a DataFrame.

    :param bucket: Name of the S3 bucket.
    :param key: Key of the CSV object inside the bucket.
    :param index_col: Position of the column to use as the index.
    :return: The parsed DataFrame.
    """
    response = s3_client.get_object(Bucket=bucket, Key=key)
    raw_bytes = response['Body'].read()
    return pd.read_csv(
        io.BytesIO(raw_bytes),
        index_col=[index_col],
        encoding="utf8",
    )

def write_to_S3(df, bucket, key):
    """
    Serialise a DataFrame as CSV and upload it to S3.

    :param df: DataFrame to upload.
    :param bucket: Name of the destination S3 bucket.
    :param key: Key to store the CSV under.
    """
    buffer = io.StringIO()
    # utf-8 is requested explicitly: under Python 2 the 'ascii' default fails
    df.to_csv(buffer, encoding='utf-8')
    target = s3_resource.Object(bucket, key)
    target.put(Body=buffer.getvalue())

# List available API dumps

In [3]:
# Build the list of backup CSV locations found under s3_folder.
# NOTE(review): the URL below names the 'wri-public-data' bucket while the
# keys are listed from s3_bucket -- confirm that this is intentional.
all_backups = []
try:
    for _key, _size in get_matching_s3_keys(bucket=s3_bucket, prefix=s3_folder, suffix='.csv'):
        all_backups.append('s3://wri-public-data/{}'.format(_key))
except Exception:
    # Keep the best-effort behaviour, but record why the listing failed
    # (e.g. bad credentials) instead of silently hiding the error.
    logging.exception('Could not list backups under %s/%s', s3_bucket, s3_folder)
    print('No backups available')

all_backups

No backups available


[]

# View fields from dump

In [218]:
# Load the first listed backup, using its first column as the index.
api = pd.read_csv(all_backups[0], index_col=0)

# Ids of the datasets whose layer names and descriptions get printed below.
layers_to_revert = [
    'a0aecb8d-07ee-42e6-be3d-e5cabf12b0a9',
    'ea208a8b-4559-434b-82ee-95e041596a3a',
    '1616a329-1bf0-4a45-992f-3087b76c232e',
    '20cc5eca-8c63-4c41-8e8e-134dcf1e6d76',
    'a9e33aad-eece-4453-8279-31c4b4e0583f',
    'acf42a1b-104b-4f81-acd0-549f805873fb',
    'c667617a-44e8-4181-b96d-f99bbe73c331',
    '8746e75d-2749-405e-8f3b-0c12097860a1',
    '8ee88f34-db15-4711-a76d-bf82dbfcffed',
]

layer_dumps = api.loc[layers_to_revert, 'layers']
for dset, layers in layer_dumps.items():
    print('~~')
    print('dataset id:', dset)
    print()
    # The 'layers' cell is a string; literal_eval turns it back into a
    # Python object whose items are then printed one by one.
    for layer in ast.literal_eval(layers):
        attributes = layer['attributes']
        print('name:', attributes['name'])
        print('description:', attributes['description'])
        print()


~~
dataset id: a0aecb8d-07ee-42e6-be3d-e5cabf12b0a9

name: 2014-2018 Number of Migrant Deaths
description: Deaths along migratory routes worldwide from 2014 to the present.

name: Migrant Deaths (Past 2 Months)
description: Migrant deaths reported in the past 2 months.

~~
dataset id: ea208a8b-4559-434b-82ee-95e041596a3a

name: 2017 Conflict and Protest Events in African and Asian States
description: 

name: Conflict and Protest Events in African and Asian States (Past 30 Days)
description: Records of violence and protests in Africa and 10 countries in South and Southeast Asia.

name: ACLED Country Coverage
description: Countries included in the Conflict and Protest Events in Asian and African States data produced by ACLED.

~~
dataset id: 1616a329-1bf0-4a45-992f-3087b76c232e

name: Major Floods (Past Month)
description: Major flood events that ended within the past month.

~~
dataset id: 20cc5eca-8c63-4c41-8e8e-134dcf1e6d76

name: Active Fires (Past Week)
description: VIIRS-derived ac

# Download current info from RW API

In [6]:
# Base URL for getting dataset metadata from RW API
url = "https://api.resourcewatch.org/v1/dataset?sort=slug,-provider,userId&status=saved&includes=metadata,vocabulary,widget,layer"

# page[size] tells the API the maximum number of results to send back.
# Only a single page is requested, so it must stay above the number of
# datasets on the RW API (a few hundred when this was written).
payload = {"application": "rw", "page[size]": 1000, "language": "en"}

# Request all datasets, and extract the data from the response.
# The timeout stops the cell from hanging forever on a stalled connection,
# and raise_for_status surfaces HTTP errors here rather than as a confusing
# KeyError on "data" below.
res = req.get(url, params=payload, timeout=120)
res.raise_for_status()
data = res.json()["data"]

### Convert the json object returned by the API into a pandas DataFrame
# Another option: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html
datasets_on_api = {}
for dset in data:
    atts = dset["attributes"]
    metadata = atts["metadata"]
    layers = atts["layer"]
    widgets = atts["widget"]
    tags = atts["vocabulary"]
    # One row per dataset id: headline attributes plus the raw included
    # entities and how many of each were returned.
    datasets_on_api[dset["id"]] = {
        "name": atts["name"],
        "table_name": atts["tableName"],
        "provider": atts["provider"],
        "published": atts["published"],
        "date_updated": atts["updatedAt"],
        "num_metadata": len(metadata),
        "metadata": metadata,
        "num_layers": len(layers),
        "layers": layers,
        "num_widgets": len(widgets),
        "widgets": widgets,
        "num_tags": len(tags),
        "tags": tags
    }

# Create the DataFrame, name the index, and sort by date_updated
# More recently updated datasets at the top
current_datasets_on_api = pd.DataFrame.from_dict(datasets_on_api, orient='index')
current_datasets_on_api.index.rename("Dataset", inplace=True)
current_datasets_on_api.sort_values(by=["date_updated"], inplace=True, ascending=False)

logging.info("Number of datasets on RW API: %s", current_datasets_on_api.shape[0])

INFO:root:Number of datasets on RW API: 375


In [228]:
# Inspect the raw 'metadata' entry stored for a single dataset id.
current_datasets_on_api.loc['995ec4fe-b3cc-4cf4-bd48-b89d4e3ea072', 'metadata']

[{'attributes': {'application': 'rw',
   'columns': {'countryname': {'alias': ''},
    'datetime': {'alias': 'Year'},
    'georegion': {'alias': 'Georegion'},
    'geosubregion': {'alias': 'Geosubregion'},
    'incomegroup': {'alias': 'Income Group',
     'description': 'GNI per capita (low, lower middle, upper middle, high)'},
    'iso3v10': {'alias': 'ISO Code'},
    'lendingcategory': {'alias': 'Lending Category',
     'description': 'IDA, IBRD, blend, not classified'},
    'rural_area': {'alias': 'Rural Area (km²)'},
    'rural_area_pct': {'alias': 'Rural Areas as % of Country Area'},
    'rural_population1990': {'alias': '1990 Rural Population'},
    'rural_population1990pct': {'alias': 'Percentage 1990 Rural Population in Coastal Zones'},
    'rural_population2000': {'alias': '2000 Rural Population'},
    'rural_population2000pct': {'alias': 'Percentage 2000 Rural Population in Coastal Zones'},
    'rural_population2010': {'alias': '2010 Rural Population'},
    'rural_population2

# Upload to S3

In [7]:
# Timestamp for the backup's key; '_' separates date and time so the key
# contains no space.
dt = datetime.now().isoformat(sep='_')
backup_key = '{}mondayBeforeLaunch_{}.csv'.format(s3_folder, dt)
write_to_S3(current_datasets_on_api, s3_bucket, backup_key)

# Reformat to Metadata CSV

In [126]:
# Take the raw metadata column and show every (index label, metadata) pair.
metadatas_all = current_datasets_on_api['metadata']
list(metadatas_all.items())

[('6cfd6255-609e-4922-8709-5aaad0db9fae', []),
 ('995ec4fe-b3cc-4cf4-bd48-b89d4e3ea072',
  [{'attributes': {'application': 'rw',
     'columns': {'countryname': {'alias': ''},
      'datetime': {'alias': 'Year'},
      'georegion': {'alias': 'Georegion'},
      'geosubregion': {'alias': 'Geosubregion'},
      'incomegroup': {'alias': 'Income Group',
       'description': 'GNI per capita (low, lower middle, upper middle, high)'},
      'iso3v10': {'alias': 'ISO Code'},
      'lendingcategory': {'alias': 'Lending Category',
       'description': 'IDA, IBRD, blend, not classified'},
      'rural_area': {'alias': 'Rural Area (km²)'},
      'rural_area_pct': {'alias': 'Rural Areas as % of Country Area'},
      'rural_population1990': {'alias': '1990 Rural Population'},
      'rural_population1990pct': {'alias': 'Percentage 1990 Rural Population in Coastal Zones'},
      'rural_population2000': {'alias': '2000 Rural Population'},
      'rural_population2000pct': {'alias': 'Percentage 2000 Ru

In [221]:
# Metadata column for every row, regardless of the 'published' flag.
metadatas_all = current_datasets_on_api['metadata']
# Alternative restricted to rows where 'published' is true:
#metadatas_published = current_datasets_on_api.loc[current_datasets_on_api['published'],'metadata']

def try_pull(d, k):
    """
    Return ``d[k]``, or ``None`` when the value cannot be retrieved.

    A note is printed for every miss so that absent fields are visible:
    when ``d`` is a dict with a ``'dataset'`` entry, the note names that
    dataset; otherwise a generic message is printed.

    :param d: Container to read from (normally a dict; may be ``None``).
    :param k: Key to look up.
    :return: The stored value, or ``None`` on a miss.
    """
    try:
        return d[k]
    except (KeyError, IndexError, TypeError):
        # TypeError covers d being None or otherwise not subscriptable; the
        # isinstance guard keeps the reporting below from raising in turn.
        if isinstance(d, dict) and 'dataset' in d:
            print('Missed', k, 'in', d['dataset'])
        else:
            print('No metadata available')
        return None

def try_reformat(source_list):
    """
    Collapse a list of source records into one '/'-separated string.

    :param source_list: Sequence of dicts with a 'source-name' entry; falsy
        entries are ignored.
    :return: The joined source names, or ``None`` (after printing a notice)
        when ``source_list`` itself is empty or ``None``.
    """
    if not source_list:
        print('Couldnt reformat source list')
        return None
    names = (entry['source-name'] for entry in source_list if entry)
    return '/'.join(names)
    
# Can use dict.get(key, default_value) instead of try_pull... try_pull has benefit of calling out which fields are missed
def create_row(info):
    """
    Flatten one dataset's metadata into a row for the metadata sheet.

    :param info: Pair of ``(dataset id, list of metadata records)``. Only
        the first record's ``'attributes'`` are used.
    :return: Dict keyed by sheet column name. When the dataset has no usable
        metadata record, a placeholder ``{'Unique ID': <id>, 'Public Title':
        'skip'}`` is returned instead.
    """
    ds, metadata_list = info[0], info[1]
    try:
        metadata = metadata_list[0]['attributes']
    except (IndexError, KeyError, TypeError):
        # Empty list, record without 'attributes', or no list at all.
        return {'Unique ID': ds, 'Public Title': 'skip'}

    # A record without an 'info' section still produces a row: each
    # info-derived field is then reported as missed and left empty.
    details = metadata.get('info') or {}

    return {
        'Public Title': try_pull(metadata, 'name'),
        'Description': try_pull(metadata, 'description'),
        'Subtitle': try_pull(metadata, 'source'),
        'Source Organizations': try_reformat(try_pull(details, 'sources')),
        'Function': try_pull(details, 'functions'),
        'Unique ID': try_pull(metadata, 'dataset'),
        'WRI ID': try_pull(details, 'wri_rw_id'),
        'Data Type': try_pull(details, 'data_type'),
        'Formal Name': try_pull(details, 'technical_title'),
        'Cautions': try_pull(details, 'cautions'),
        'Citation': try_pull(details, 'citation'),
        'License': try_pull(details, 'license'),
        'License Link': try_pull(details, 'license_link'),
        'Geographic Coverage': try_pull(details, 'geographic_coverage'),
        'Spatial Resolution': try_pull(details, 'spatial_resolution'),
        'Date of Content': try_pull(details, 'date_of_content'),
        'Frequency of Updates': try_pull(details, 'frequency_of_updates'),
        'Learn More Link': try_pull(details, 'learn_more_link'),
        'Download from S3': try_pull(details, 'data_download_link'),
        'Download from Source': try_pull(details, 'data_download_original_link'),
        # Serialised to a JSON string so the mapping fits in one CSV cell.
        'Columns and Aliases': json.dumps(try_pull(metadata, 'columns'))
    }
                            
# Order of the columns in the exported sheet.
column_order = [
    'WRI ID', 'Unique ID', 'Public Title', 'Formal Name',
    'Source Organizations', 'Subtitle',
    'Learn More Link', 'Download from S3', 'Download from Source',
    'Function', 'Description', 'Cautions', 'Data Type', 'Date of Content',
    'Frequency of Updates', 'Geographic Coverage', 'Spatial Resolution',
    'Citation', 'License', 'License Link', 'Columns and Aliases',
]

# One row per dataset, with the columns arranged as listed above.
rows = [create_row(item) for item in metadatas_all.items()]
df = pd.DataFrame(rows)[column_order]

# Rows titled 'skip' are placeholders for datasets without metadata.
drop_ix = df.index[df['Public Title'] == 'skip']
df = df.drop(index=drop_ix)
df.to_csv('Drop_missing_rows.csv')
#pd.DataFrame(list(map(create_row,metadatas_published.items())))[column_order].to_csv('Published_metadata.csv')

Missed columns in ed7862df-242a-4576-a4f4-619e2a3d347a
No metadata available
Couldnt reformat source list
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
Couldnt reformat source list
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
Couldnt reformat source list
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available
No metadata available


In [200]:
# Display the current contents of df.
df

Unnamed: 0,WRI ID,Unique ID,Public Title,Formal Name,Source Organizations,Subtitle,Learn More Link,Download from S3,Download from Source,Function,Description,Cautions,Data Type,Date of Content,Frequency of Updates,Geographic Coverage,Spatial Resolution,Citation,License,License Link,Columns and Aliases
1,cli.022,995ec4fe-b3cc-4cf4-bd48-b89d4e3ea072,Populations in Coastal Zones,"Population, Landscape, And Climate Estimates (...",Columbia University Earth Institute Center for...,CIESIN/UNEP GPA Coordination Office,http://sedac.ciesin.columbia.edu/data/set/nagd...,https://wri-public-data.s3.amazonaws.com/resou...,http://sedac.ciesin.columbia.edu/data/set/nagd...,Percentage of total population living within 1...,The percentage of the total population living ...,Users should be cautioned that because land co...,Tabular,"1990, 1995, 2000",,Global,National,Center for International Earth Science Informa...,Attribution required,http://sedac.ciesin.columbia.edu/data/set/nagd...,"{""iso3v10"": {""alias"": ""ISO Code""}, ""countrynam..."
10,foo.015,4338471d-881a-475f-8bd9-60c4d48b8e12,Global Hunger Index,2017 Global Hunger Index,International Food Policy Research Institute (...,IFPRI/Concern Worldwide/Welthungerhilfe,http://www.globalhungerindex.org/about/,https://wri-public-data.s3.amazonaws.com/resou...,https://dataverse.harvard.edu/dataset.xhtml?pe...,Measure of hunger globally and by country and ...,The International Food Policy Research Institu...,GHI scores for 2017 could not be calculated fo...,Tabular,"1992, 2000, 2008, and 2017",8 years,Select Countries,30 arc second,International Food Policy Research Institute (...,Creative Commons Attribution-NonCommercial-NoD...,https://creativecommons.org/licenses/by-nc-nd/...,"{""country"": {""alias"": ""Country""}, ""ghi_1992"": ..."
11,ene.028,c665f519-eef9-4f67-a8bf-7e3e6dc8bfcd,Access to Clean Cooking Fuels,World Development Indicators: Access to Clean ...,World Bank Group (WBG),WBG,http://databank.worldbank.org/data/reports.asp...,http://databank.worldbank.org/data/reports.asp...,http://databank.worldbank.org/data/reports.asp...,Proportion of total population primarily using...,Access to clean fuels and technologies for coo...,"Under WHO guidelines, kerosene is excluded fro...",Tabular,2000-2014,Annual,Global,1000 m (also available from www.croplands.org ...,"World Bank. 2015. ""World Development Indicator...",Creative Commons Attribution 4.0 International,https://datacatalog.worldbank.org/public-licen...,"{""country_name"": {""alias"": ""Country""}, ""countr..."
25,cit.029,10337db6-8321-445e-a60b-28fc1e114f29,Municipal Waste,Municipal Waste Generated per Capita,Organisation for Economic Co-operation and Dev...,OECD,https://stats.oecd.org/Index.aspx?DataSetCode=...,https://wri-public-data.s3.amazonaws.com/resou...,https://stats.oecd.org/Index.aspx?DataSetCode=...,Amount of municipal (including household) wast...,The Municipal Waste Generated per Capita data ...,"In many countries, systematic collection of en...",Tabular,1990-2015,Annual,OCED countries,National,"OECD. 2015. ""Municipal Waste Generated per Cap...",Attribution required,http://www.oecd.org/termsandconditions/,"{""country"": {""alias"": ""Country""}, ""yr_1990"": {..."
26,cit.013,5d269c36-6ccf-4620-838d-431f86c30f69,Reduction in Life Expectancy Attributable to t...,Age-standardized DALYs attributable to the env...,United Nations World Health Organization (WHO)...,WHO/IHME,http://apps.who.int/gho/data/node.wrapper.imr?...,https://wri-public-data.s3.amazonaws.com/resou...,http://apps.who.int/gho/data/view.main.ENVDALY...,"The disability-adjusted life years (DALYs), or...",This data set of disability-adjusted life year...,The data set may not be complete due to inform...,Tabular,2012,,Global,National,"World Health Organization. 2016. ""Global Healt...",Restrictions Apply,http://www.who.int/about/copyright/en/,"{""country"": {""alias"": ""Country""}, ""year"": {""al..."
27,,60d3b365-6c0b-4f1c-9b7f-f3f00f2a05d7,,Smithsonian/USGS Weekly Volcanic Activity Report,,,,,,Volcanic activity as a weekly report,"The Weekly Volcano Report, produced by the Smi...",The Weekly Volcanic Activity Report is intende...,,Geologic time to present,Weekly (Wednesdays),Global,,Smithsonian Institution Global Volcanism Progr...,Restrictions apply,http://www.si.edu/termsofuse/,"{""description"": {""alias"": ""Description""}, ""vol..."
28,wat.008,b8307c16-fd77-4e35-9b68-8726a025f401,Annual Surface Water Extent,JRC Yearly Water Classification History v1.0 (...,European Commission Joint Research Centre (EC ...,EC JRC/Google,https://global-surface-water.appspot.com,,https://global-surface-water.appspot.com/download,Annual location and temporal distribution of s...,The Annual Surface Water Extent data set conta...,Any bodies of water smaller than 30 m resoluti...,Raster,1984-2015,Varies,78N-60S,0.5°,"Pekel, Jean-François, Andrew Cottam, Noel Gore...",Source open licence,https://global-surface-water.appspot.com/download,{}
29,for.005,d472171e-e515-4c1f-939e-8d5691e55c2a,Mangrove Forests,Global Distribution of Mangroves USGS,United Nations Environment Programme World Con...,UNEP-WCMC,http://data.unep-wcmc.org/datasets/4,,http://data.unep-wcmc.org/datasets/4,Global extent of mangrove forests in 2010,This data set was generated by Aberystwyth Uni...,The Landsat-7 ETM+ scanline error affects the ...,Vector,2010,,Global,375 m,"Giri C, Ochieng E, Tieszen LL, Zhu Z, Singh A,...",Restrictions Apply,https://www.unep-wcmc.org/policies,{}
30,soc.072,595bcf6f-0343-4146-ba0d-c54b1c928510,"Population (Grid, 250 m)",GHS Population Grid (LDS),European Commission Joint Research Centre (EC ...,EC JRC/CIESIN,http://ghsl.jrc.ec.europa.eu/ghs_pop.php,,http://cidportal.jrc.ec.europa.eu/ftp/jrc-open...,"Distribution and density of population, expres...",The European Commission Joint Research Centre ...,"For 1975, greater uncertainties in estimating ...",Raster,"1975, 1990, 2000, 2015",,Global,,"European Commission, Joint Research Centre (JR...",Attribution required,http://ec.europa.eu/info/legal-notice_en,{}
31,dis.009,e94f0e2d-2b5f-41ed-967f-d97e54dd81ea,Tsunamis,NGDC/WDS Global Historical Tsunami Database,National Oceanic and Atmospheric Administratio...,NOAA,https://ngdc.noaa.gov/hazard/tsu_db.shtml,https://wri-public-data.s3.amazonaws.com/resou...,https://www.ngdc.noaa.gov/hazard/tsu_db.shtml,A record of all known tsunamis and their chara...,The Global Historical Tsunami Database release...,The data quality is limited in that it relies ...,Vector,2100 BC-present,Weekly,Global,,National Geophysical Data Center / World Data ...,Public domain,Public domain,


# Comparing Launch Metadata with Legacy Metadata Sheet

In [190]:
#### Download Google Spreadsheets ####
# Legacy Metadata sheet
!curl "https://docs.google.com/spreadsheets/d/1UkABgMlBIinJjITa6WepFAL-8VBkulS0LCbKojRXjVY/export?format=tsv" > old_metadata.tsv
old_mdata = pd.read_csv(open("old_metadata.tsv", "r"), sep="\t", index_col=[2])
os.remove("old_metadata.tsv")

# New metadata sheet
!curl "https://docs.google.com/spreadsheets/d/1laymLZAbNsto9Pj4iAHCdyaqZo2OYedKuyXaG48ZuLU/export?format=tsv" > new_metadata.tsv
new_mdata = pd.read_csv(open("new_metadata.tsv", "r"), sep="\t", index_col=[1])
os.remove("new_metadata.tsv")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  830k    0  830k    0     0  2031k      0 --:--:-- --:--:-- --:--:-- 2031k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  952k    0  952k    0     0  1520k      0 --:--:-- --:--:-- --:--:-- 1521k


In [198]:
print(old_mdata.shape)

# Columns of the legacy sheet that are removed before the comparison.
unused_columns = [
    'Published Language', 'Published Title (if not English)',
    'Layer Name 1', 'Layer Definition 1',
    'Layer Name 2', 'Layer Definition 2',
    'Layer Name 3', 'Layer Definition 3',
    'Layer Name 4', 'Layer Definition 4',
    'Original Data Name 1', 'Original Data Link 1',
    'Original Data Name 2', 'Original Data Link 2',
    'Original Data Name 3', 'Original Data Link 3',
    'Original Data Name 4', 'Original Data Link 4',
    'Unnamed: 37', 'API_ID',
]

# Columns that are kept, in the order they should appear.
kept_columns = [
    'Unique ID', 'Learn More Link',
    'Download from Source', 'Download Data (S3)',
    'Distribution Restriction', 'Shared API - Do Not Touch These!',
    'Public Title', 'Technical Title', 'Subtitle', 'Source Organizations',
    'Function', 'Description', 'Cautions', 'Geographic Coverage',
    'Data Type', 'Spatial Resolution', 'Date of Content',
    'Frequency of Updates', 'Summary of Licence', 'Link to License',
    'Citation',
]

old_mdata2 = old_mdata.drop(columns=unused_columns)
old_mdata2 = old_mdata2.loc[:, kept_columns]
# The only header that changes: 'Technical Title' becomes 'Formal Name'.
old_mdata2 = old_mdata2.rename(columns={'Technical Title': 'Formal Name'})

old_mdata2

(295, 42)


Unnamed: 0_level_0,Unique ID,Learn More Link,Download from Source,Download Data (S3),Distribution Restriction,Shared API - Do Not Touch These!,Public Title,Formal Name,Subtitle,Source Organizations,Function,Description,Cautions,Geographic Coverage,Data Type,Spatial Resolution,Date of Content,Frequency of Updates,Summary of Licence,Link to License,Citation
final_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5b5a21ac-0835-43fb-86b9-64b93d472e10,bio.001,http://www.biodiversitya-z.org/content/allianc...,http://www.biodiversitya-z.org/content/allianc...,,"X emailed that it is okay, but affiliated with...",,Endangered Species Critical Habitats,Alliance for Zero Extinction Sites (AZE),AZE,Alliance for Zero Extinction (AZE),Global map of critical sites for conservation ...,Created by the Alliance for Zero Extinction (A...,,Global,Vector,,2010,5 years,Restrictions Apply,https://www.arcgis.com/home/item.html?id=4ecca...,"Alliance for Zero Extinction. 2010. ""2010 AZE ..."
4458eb12-8572-45d1-bf07-d5a3ee097021,bio.002,http://www.cepf.net/resources/hotspots/Pages/d...,http://www.cepf.net/resources/hotspots/Pages/d...,https://wri-public-data.s3.amazonaws.com/resou...,,GFW,Biodiversity Hotspots,"Hotspots Revisited, 2011",CI/CEPF,Conservation International Foundation (CI)/Cri...,Conservation International’s biodiversity hots...,First defined in 1988 by scientist Norman Myer...,This layer only displays the land-based portio...,Global,Vector,,2016,,Creative Commons Attribution-ShareAlike 4.0 In...,https://creativecommons.org/licenses/by-sa/4.0/,Conservation International and Critical Ecosys...
16df8ada-87cc-4907-adce-a98bc4e91856,bio.003,http://advances.sciencemag.org/content/3/2/e16...,http://advances.sciencemag.org/content/3/2/e16...,,,,Marine Species Richness,Climate impacts on global hot spots of marine ...,EBD-CSIC/Phillip Island Nature Parks/Otago,Estación Biológica de Doñana–Consejo Superior ...,On the basis of the worldwide distribution (th...,To calculate the impact of climate change on m...,Fishing data are released with very low resolu...,Global,Raster,0.5°,Varies,,Creative Commons Attribution-NonCommercial lic...,https://creativecommons.org/licenses/by-nc/4.0/,"Ramirez, Francisco, Isabel Afan, Lloyd S. Davi..."
3624554e-b240-4edb-9110-1f010642c3f3,bio.004,http://data.unep-wcmc.org/datasets/1,http://data.unep-wcmc.org/datasets/1,,"X (No commercial use, no redistributing data)",,Coral Reef Locations,Global Distribution of Coral Reefs (2010),UNEP-WCMC/WorldFish Centre/WRI/TNC,United Nations Environment Programme World Con...,Global coral reef locations,The United Nations Environment Programme World...,These data may be outdated.,Global,Raster,500 m,1954-2009,,Restrictions Apply,https://www.unep-wcmc.org/policies/general-dat...,"UNEP-WCMC, WorldFish Centre, WRI, and TNC. 201..."
ad790c87-fe9e-4405-891d-de7c2ddfda79,bio.005,https://coralreefwatch.noaa.gov/satellite/blea...,https://coralreefwatch.noaa.gov/satellite/blea...,,,,Coral Reef Bleaching Alerts,Coral Reef Watch Bleaching Alerts,NOAA CRW,National Oceanic and Atmospheric Administratio...,Displays the maximum level of coral bleaching ...,The Coral Reef Watch (CRW) program of the Nati...,These data layers do not contain any informati...,Global,Raster,5 km,2013-present,Daily (13:30 EST),Public domain,https://wiki.creativecommons.org/wiki/Public_d...,NOAA Coral Reef Watch. 2000 (updated twice wee...
7d3465f8-5959-4531-aaf2-c9a8a03183b3,bio.006,http://datazone.birdlife.org/eba,http://datazone.birdlife.org/site/requestgis,,X,GFW,Endemic Bird Areas,Endemic Bird Areas of the World: Priorities fo...,BirdLife International,BirdLife International,Areas where the geographic range of two or mor...,"While many bird species are widespread, over 2...",The number of EBAs having a combination of sev...,Global,Vector,,2014,Annual,Restrictions Apply,http://datazone.birdlife.org/info/dataterms,"Stattersfield, A.J., M.J. Crosby, A.J. Long, a..."
de452a4c-a55c-464d-9037-8c3e9fe48365,bio.007,https://www.protectedplanet.net/,https://protectedplanet.net/,,X,"GFW, dataforsdg",World Database on Protected Areas,,UNEP-WCMC/IUCN,United Nations Environment Programme World Con...,Legally protected areas according to various d...,The World Database on Protected Areas (WDPA) i...,Protected area boundaries come from a variety ...,Global,Vector,,,Monthly,Restrictions Apply,https://www.protectedplanet.net/c/terms-and-co...,"IUCN and UNEP-WCMC. 2017. ""The World Database ..."
3c82c421-8964-444e-86f2-df800174d8b9,bio.008,http://digital.csic.es/handle/10261/142056,http://digital.csic.es/handle/10261/142056,,,,Cumulative Climate Impacts on Marine Ecosystems,Global Distribution of Cumulative Environmenta...,EBD-CSIC/Phillip Island Nature Parks/Otago,Estación Biológica de Doñana–Consejo Superior ...,"Index of cumulative, equally weighted impacts ...",This cumulative impact index ranging from rang...,,Global,Raster,1°,"Varies based on data: SST (1980–2014), CHL (19...",,Creative Commons Attribution-NonCommercial lic...,https://creativecommons.org/licenses/by-nc/4.0/,"Ramirez, Francisco, Isabel Afan, Lloyd S. Davi..."
3c12072d-611b-413f-b314-4df0834523ab,bio.009,http://maps.tnc.org/gis_data.html,http://maps.tnc.org/gis_data.html,https://wri-public-data.s3.amazonaws.com/resou...,,,Ecoregions Prioritized for Conservation,Ecoregional Portfolio,TNC,The Nature Conservancy (TNC),Locations that The Nature Conservatory has pri...,The Ecoregions Prioritized for Conservation re...,Ecoregion types do not reflect the average hab...,U.S. and parts of East Asia,Vector,,2011,,Restrictions Apply,http://maps.tnc.org/gis_data.html,"The Nature Conservatory. 2011. ""Priority Conse..."
33bed1fb-9261-41bf-8b50-127a4d0c80c5,bio.011,http://maps.tnc.org/files/metadata/FEOW.xml,http://www.feow.org/downloadlist,https://wri-projects.s3.amazonaws.com/resource...,(can show in map services),,Freshwater Ecoregions,Freshwater Ecoregions of the World,TNC/WWF,The Nature Conservancy (TNC)/World Wildlife Fu...,A classification of 11 freshwater ecoregions o...,"The Freshwater Ecoregions of the World (FEOW),...","Because of the large scale of ecoregions, all ...",Global,Vector,,2008,,Restrictions Apply,http://maps.tnc.org/gis_data.html,"The Nature Conservancy, ""Freshwater Ecoregions..."


In [None]:
# Print the shape of the new metadata sheet, then display it.
print(new_mdata.shape)
new_mdata

In [140]:
# Merge in any changes from the API

# Identify any new datasets (i.e. Jasmine's new nrt ones)


(380, 20)


Unnamed: 0_level_0,WRI ID,Public Title,Formal Name,Source Organizations,Subtitle,Learn More Link,Download from S3,Download from Source,Function,Description,Cautions,Data Type,Date of Content,Frequency of Updates,Geographic Coverage,Spatial Resolution,Citation,License,License Link,Columns and Aliases
Unique ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
6cfd6255-609e-4922-8709-5aaad0db9fae,,,,,,,,,,,,,,,,,,,,
995ec4fe-b3cc-4cf4-bd48-b89d4e3ea072,cli.022,Populations in Coastal Zones,"Population, Landscape, And Climate Estimates (...",Columbia University Earth Institute Center for...,CIESIN/UNEP GPA Coordination Office,http://sedac.ciesin.columbia.edu/data/set/nagd...,https://wri-public-data.s3.amazonaws.com/resou...,,Percentage of total population living within 1...,The percentage of the total population living ...,Users should be cautioned that because land co...,Tabular,"1990, 1995, 2000",,Global,National,Center for International Earth Science Informa...,Attribution required,http://sedac.ciesin.columbia.edu/data/set/nagd...,"{""iso3v10"": {""alias"": ""ISO Code""}, ""countrynam..."
18c08acb-68fc-4d9e-a908-9b91b6bee5ad,,,,,,,,,,,,,,,,,,,,
f1dd117a-c722-4d24-b659-86309a72b9f8,,,,,,,,,,,,,,,,,,,,
ff73f84b-81b5-4b92-b0b5-c7ea6b2ef6df,,,,,,,,,,,,,,,,,,,,
f53afc98-ce85-4ac6-afd8-266d1483e41b,,,,,,,,,,,,,,,,,,,,
8281cb9e-14a9-41a4-bfc2-654c83a94728,,,,,,,,,,,,,,,,,,,,
f5fe4dfc-5ad1-4008-bc94-96ddbaa0d474,,,,,,,,,,,,,,,,,,,,
80f951a7-edd5-472a-8b46-744eb93fd7ef,,,,,,,,,,,,,,,,,,,,
3ebc33cf-c174-4159-a07f-3ee0dc0d6ba9,,,,,,,,,,,,,,,,,,,,


In [191]:
# Index labels of the legacy sheet that do not appear in the new sheet's
# index; labels whose string form is 'nan' are left out.
new_ids = new_mdata.index
missed = []
for old_id in old_mdata.index:
    if old_id in new_ids:
        continue
    if str(old_id) == 'nan':
        continue
    missed.append(old_id)

In [192]:
# Display the contents of missed.
missed

['0303127a-70b0-4164-9251-d8162615d058',
 'bbfcf170-5c47-40b0-880d-b04648eba354',
 'b0f859ce-f13b-462e-9063-ebc68ed88420',
 '973ba39a-0a68-4481-849d-faddc374c6e1',
 '92e6446e-f60f-497d-9164-5b8d5126b8e5',
 '134caa0a-21f7-451d-a7fe-30db31a424aa',
 'dd95d1e6-b811-4907-867b-78857ad87ec6',
 '66d1bba4-ccf4-415e-a2d0-f607c6304994']

In [193]:
# Show the 'Unique ID' and 'Public Title' of every entry in missed.
id_and_title = ['Unique ID', 'Public Title']
[list(old_mdata.loc[label, id_and_title].items()) for label in missed]

[[('Unique ID', 'cit.017'), ('Public Title', 'Travel Time to Major Cities')],
 [('Unique ID', 'dis.005'), ('Public Title', 'Current Floods')],
 [('Unique ID', 'foo.003.1'), ('Public Title', 'Observed Food Insecurity')],
 [('Unique ID', 'for.023'),
  ('Public Title', 'Annual tree cover gain (provisional)')],
 [('Unique ID', 'for.024'),
  ('Public Title', 'Tree cover height (provisional)')],
 [('Unique ID', 'soc.065'), ('Public Title', 'National Political Boundaries')],
 [('Unique ID', 'soc.074'), ('Public Title', 'Employment in Agriculture')],
 [('Unique ID', 'foo.052'), ('Public Title', 'Soil Moisture')]]

In [187]:
def find_phrase(val, phrase):
    """
    Tell whether a text value contains a given phrase.

    Values that are not strings never match, so the function can be mapped
    over a column holding a mix of text and missing/non-text entries
    without raising.

    :param val: Value to search in; only ``str`` values can match.
    :param phrase: Substring to look for inside ``val``.
    :return: True if ``val`` is a string containing ``phrase``, else False.
    """
    # The original only rejected floats (e.g. NaN for an empty cell); None
    # or integer values would reach the `in` test and raise TypeError.
    # Treat every non-string value as "no match" instead.
    if not isinstance(val, str):
        return False
    return phrase in val

# List every 'WRI ID' value in new_mdata that contains 'soc.074'.
[wri_id for wri_id in new_mdata['WRI ID'] if find_phrase(wri_id, 'soc.074')]

[]

In [185]:
# Re-index new_mdata by its 'WRI ID' column and show the entry for 'foo.003'.
# NOTE(review): presumably checks whether 'foo.003.1' from the old sheet now
# lives under 'foo.003' in the new one — confirm.
new_mdata.set_index('WRI ID').loc['foo.003']

Public Title                        Current and Projected Food Insecurity
Source Organizations    United States Agency for International Develop...
Subtitle                    USAID/NASA/NOAA/USDA/USGS/Chemonics/Kimetrica
Learn More Link                         http://www.fews.net/fews-data/333
Download from S3                                                      NaN
Download from Source                                                  NaN
Function                5-level scale for classifying food insecurity ...
Cautions                FEWS NET uses a scenario development method to...
Data Type                                                          Vector
Date of Content                                              2009-Present
Frequency of Updates                                             4 months
Geographic Coverage     El Salvador, Guatemala, Haiti, Honduras, Nicar...
Spatial Resolution                                            Subnational
Citation                FEWS NET. 2013

DISCREPANCIES IN ALL METADATA

'0303127a-70b0-4164-9251-d8162615d058': cit.017 Travel Time to Major Cities
'bbfcf170-5c47-40b0-880d-b04648eba354': dis.005 GDACS flood detection
'b0f859ce-f13b-462e-9063-ebc68ed88420': foo.003.1 Observed Food Insecurity
'973ba39a-0a68-4481-849d-faddc374c6e1': for.023 Tree Cover Gain
'92e6446e-f60f-497d-9164-5b8d5126b8e5': for.024 Tree Cover Height
 "Global Gridded Geographically Based Economic Data (G-Econ), Version 4 provides a measure of global cell product (GCP): estimates for the amount of gross domestic product (GDP) produced in each grid cell of a 1-degree grid of the world using 2 metrics: GCP measured in the market exchange rate (MER) from local currencies into USD, and GCP measured in purchasing power parity (PPP), which adjusts country currency GCP according to the equivalent amount of USD needed to purchase a standardized collection of goods in the country using the country's currency. GCP is defined as the gross value added by activities in a grid cell. Gross value is calculated as the total value of the outputs of local businesses providing goods and services in the grid cell, minus the value of inputs bought by those businesses. The data and methods used to calculate local added value are different for different countries, as the data available are different for different countries. State and county level gross value data are used whenever possible.  Resource Watch shows only a subset of the dataset. For access to the full dataset and additional information, see the Learn More link.": 
'134caa0a-21f7-451d-a7fe-30db31a424aa': soc.065 wri-bounds
'Add from GFW': 
'dd95d1e6-b811-4907-867b-78857ad87ec6': soc.074 Employment in Agriculture

In [156]:
# Show the full old_mdata row stored under this index label.
old_mdata.loc['dd95d1e6-b811-4907-867b-78857ad87ec6']

udpated since 3/21                                                                NaN
Unique ID                                                                     soc.074
Learn More Link                     https://data.worldbank.org/indicator/SL.AGR.EM...
Download from Source                https://data.worldbank.org/indicator/SL.AGR.EM...
Download Data (S3)                                                                NaN
Distribution Restriction                                                          NaN
Shared API - Do Not Touch These!                                                  NaN
Public Title                                                Employment in Agriculture
Technical Title                     Employment in agriculture (% of total employment)
Subtitle                                                                     WBG/ILO 
Source Organizations                World Bank Group (WBG) /International Labour O...
Function                            National employmen