# Import Libraries

In [122]:
import json
import logging
import os
import sys

import pandas as pd
import requests as req

pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

logging.basicConfig(stream=sys.stderr, level=logging.INFO)

# Load datasets from API

In [240]:
# Base URL for getting dataset metadata from RW API
url = "https://api.resourcewatch.org/v1/dataset?sort=slug,-provider,userId&status=saved&published=true&includes=metadata,vocabulary,widget,layer"

# page[size] tells the API the maximum number of results to send back
# There are currently between 200 and 300 datasets on the RW API, so a page size
# of 1000 is expected to return all of them in a single response.
payload = {"application": "rw", "page[size]": 1000, "language": "en"}

# Request all datasets, and extract the data from the response.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP error here instead of letting it surface
# as a KeyError on "data" below.
res = req.get(url, params=payload, timeout=60)
res.raise_for_status()
data = res.json()["data"]

def try_for_id(metadata):
    """Return the ``rwId`` found at ``metadata[0]["info"]["rwId"]``, or None.

    None is returned when ``metadata`` is empty, is not subscriptable, or is
    missing the ``"info"`` / ``"rwId"`` keys.

    NOTE(review): ``create_row`` in this notebook reads the same metadata through
    ``metadata[0]['attributes']['info']`` and the key ``'wri_rw_id'`` -- confirm
    that the path used here still matches the API response.
    """
    try:
        return metadata[0]["info"]["rwId"]
    except (IndexError, KeyError, TypeError):
        # Only "the value is not there" errors are swallowed; anything else
        # (for example KeyboardInterrupt) propagates to the caller.
        return None

### Convert the json object returned by the API into a pandas DataFrame
# Another option: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html
datasets_on_api = {}
for dset in data:
    atts = dset["attributes"]
    metadata = atts["metadata"]
    # One entry per dataset, keyed by the dataset's API id
    datasets_on_api[dset["id"]] = {
        "name": atts["name"],
        "wri_id": try_for_id(metadata),
        "metadata": metadata,
    }

# Create the DataFrame (one row per dataset) and name the index.
# No sorting is applied here.
api = pd.DataFrame.from_dict(datasets_on_api, orient='index')
api.index.name = "API_ID"

# Lazy %-style arguments: the message is only formatted if the record is emitted
logging.info("Number of datasets on RW API: %s", api.shape[0])

INFO:root:Number of datasets on RW API: 226


# Download tracking sheet, old metadata, new metadata sheets

In [241]:
#### Download Google Spreadsheets ####
# Legacy Metadata sheet
!curl "https://docs.google.com/spreadsheets/d/1UkABgMlBIinJjITa6WepFAL-8VBkulS0LCbKojRXjVY/export?format=tsv" > old_metadata.tsv
old_mdata = pd.read_csv(open("old_metadata.tsv", "r"), sep="\t", index_col=[2])
os.remove("old_metadata.tsv")

# New metadata sheet
!curl "https://docs.google.com/spreadsheets/d/1laymLZAbNsto9Pj4iAHCdyaqZo2OYedKuyXaG48ZuLU/export?format=tsv" > new_metadata.tsv
new_mdata = pd.read_csv(open("new_metadata.tsv", "r"), sep="\t", index_col=[2])
os.remove("new_metadata.tsv")

# Tracking Sheet
!curl "https://docs.google.com/spreadsheets/d/1viPOGYIk6RGu7YMoM3BHNVbkWaCZ0JFBOMSNncWvHYk/export?format=tsv" > tracking_sheet.tsv
tracking_sheet = pd.read_csv("tracking_sheet.tsv", sep="\t", index_col=[0])
os.remove("tracking_sheet.tsv")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  829k    0  829k    0     0  1773k      0 --:--:-- --:--:-- --:--:-- 1776k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  949k    0  949k    0     0  1200k      0 --:--:-- --:--:-- --:--:-- 1199k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 83404    0 83404    0     0   231k      0 --:--:-- --:--:-- --:--:--  231k


# Goal 1: Match Tracking sheet ids with Backoffice ids, identify API IDs not linked to tracking

In [242]:
# Re-key the tracking sheet by API_ID; its previous index is kept as an ordinary column.
tracking_ids = (
    tracking_sheet
    .reset_index()
    .set_index('API_ID')
)

In [243]:
# Tracking-sheet rows whose API_ID is not among the datasets returned by the API.
# Rows are picked with a boolean mask rather than a list of labels: a label list returns
# every matching row once per requested label, so an API_ID that occurs more than once
# (for example several blank ones) would be displayed repeatedly.
not_on_api = ~tracking_ids.index.isin(api.index)
ids = list(tracking_ids.index[not_on_api])  # the unmatched labels, as a plain list
tracking_ids.loc[not_on_api, 'WRI_ID']  # append .to_csv('tracked_and_unpublished.csv') to export

API_ID
136aab69-c625-4347-b16a-c2296ee5e99e    cli.047.nrt
NaN                                     cli.060.nrt
75d5e90e-2baa-4664-a3fb-9e136c1c5535        soc.073
85345c7d-b608-4698-a44d-f5554885b99e        soc.075
Name: WRI_ID, dtype: object

In [244]:
# Names of API datasets whose id does not appear on the tracking sheet
api.loc[~api.index.isin(tracking_ids.index), 'name']  # .to_csv('Not on tracking sheet.csv')

Series([], Name: name, dtype: object)

# Goal 2: Match Metadata ids with Tracking ids

In [245]:
# Rows of the new metadata sheet whose id is not on the tracking sheet.
# A boolean mask is used instead of a list of labels so that an id occurring more than
# once in the sheet is not returned once per requested label.
# NOTE(review): another cell of this notebook reads this column from new_mdata as
# 'WRI ID' (with a space) -- confirm that 'WRI_ID' is the intended label here.
not_tracked = ~new_mdata.index.isin(tracking_ids.index)
ids = list(new_mdata.index[not_tracked])  # the unmatched labels, as a plain list
new_mdata.loc[not_tracked, ['WRI_ID', 'Public Title']]  # append .to_csv('metadata_and_untracked.csv') to export

Unnamed: 0_level_0,WRI_ID,Public Title
Unique ID,Unnamed: 1_level_1,Unnamed: 2_level_1
e245c1ee-70f3-4855-9903-071a167f39a5,,MODIS Surface Water
1417c29a-ca73-4f82-aa99-01c6f3c3b101,,MODIS Flood Water


In [246]:
# Tracking-sheet rows whose API_ID has no entry in the new metadata sheet.
# A boolean mask is used instead of a list of labels so that an API_ID occurring more
# than once is not returned once per requested label.
no_metadata = ~tracking_ids.index.isin(new_mdata.index)
ids = list(tracking_ids.index[no_metadata])  # the unmatched labels, as a plain list
tracking_ids.loc[no_metadata, ['WRI_ID', 'Public Title']]  # append .to_csv('tracked_and_nometadata.csv') to export

Unnamed: 0_level_0,WRI_ID,Public Title
API_ID,Unnamed: 1_level_1,Unnamed: 2_level_1


co2 concentrations --- post launch

In [139]:
# Legacy-metadata rows whose id is NOT on the tracking sheet.
# A boolean mask is used instead of a list of labels so that an id occurring more than
# once in the legacy sheet is not returned once per requested label.
legacy_untracked = ~old_mdata.index.isin(tracking_ids.index)
old_mdata.loc[legacy_untracked, ['Unique ID', 'Public Title']]  # append .to_csv('Old Metadata not on tracking sheet.csv') to export

Unnamed: 0_level_0,Unique ID,Public Title
final_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
16df8ada-87cc-4907-adce-a98bc4e91856,bio.003,Marine Species Richness
ad790c87-fe9e-4405-891d-de7c2ddfda79,bio.005,Coral Reef Bleaching Alerts
96ce9416-7a34-4c67-a21f-4f9b914d0d45,bio.034,Percent of Land Area Protected by Country
815eaa09-d626-495e-91e2-523cb07de475,cit.003,Air Quality Measurements
0303127a-70b0-4164-9251-d8162615d058,cit.017,Travel Time to Major Cities
aa9e9e43-a0bc-4835-a06d-d67af82bfd7c,cit.018,NO₂ Concentrations (Monthly)
7d9c0d09-e833-4a74-811b-0af78da9c731,cit.032,
9e7dc020-5a93-4df8-b81e-ee3e7bf32764,cli.002,Air Temperature Anomalies
c8040a7a-a40f-48bd-b003-625c33beff5e,cli.003,Sea Surface Temperature Anomalies
c9c9cb2f-9655-4f40-8736-9b407ee43514,cli.013,Methane


In [158]:
# Ids in the new metadata sheet that have no WRI ID filled in
missing_wri_ids = new_mdata.loc[pd.isnull(new_mdata['WRI ID'])].index

# Look those ids up in the legacy sheet. reindex() produces an all-NaN row for an id the
# legacy sheet does not contain, whereas .loc with a list of labels raises KeyError on
# current pandas as soon as one label is missing.
# reindex() requires old_mdata's index to be free of duplicate labels.
old_mdata.reindex(list(missing_wri_ids))

Unnamed: 0_level_0,udpated since 3/21,Unique ID,Learn More Link,Download from Source,Download Data (S3),Distribution Restriction,Shared API - Do Not Touch These!,Public Title,Technical Title,Subtitle,Source Organizations,Function,Description,Cautions,Geographic Coverage,Data Type,Spatial Resolution,Date of Content,Frequency of Updates,Summary of Licence,Link to License,Citation,Published Language,Published Title (if not English),Layer Name 1,Layer Definition 1,Layer Name 2,Layer Definition 2,Layer Name 3,Layer Definition 3,Layer Name 4,Layer Definition 4,Original Data Name 1,Original Data Link 1,Original Data Name 2,Original Data Link 2,Original Data Name 3,Original Data Link 3,Original Data Name 4,Original Data Link 4,Unnamed: 37,API_ID
final_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
6e10074a-a368-4afd-8564-db59814cdb74,,ene.029,,,,,,Energy Intensity by Country,Energy Efficiency Indicator Results,GTF,Global Tracking Framework,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9b6e6bce-efce-49a5-b603-385b8dae29e0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
b21d07fa-fdb9-4451-9297-4a0e132e7d0a,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
449c0650-4481-443e-813f-0776e20ef7bf,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
60d3b365-6c0b-4f1c-9b7f-f3f00f2a05d7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8ee88f34-db15-4711-a76d-bf82dbfcffed,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
a0aecb8d-07ee-42e6-be3d-e5cabf12b0a9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
de24a492-acee-4345-9073-bbbe991f6ede,,soc.038,http://popstats.unhcr.org/en/asylum_seekers_mo...,http://popstats.unhcr.org/en/asylum_seekers_mo...,,X,,Monthly Asylum Request,Monthly Asylum Requests,UNHCR,Office of the United Nations High Commissioner...,Volume of monthly asylum requests made to 45 c...,UNHCR Monthly Asylum Requests are sourced from...,Attempts are made to exclude repeate or re-ope...,Global,Vector,Country level,1999-Present,Monthly,Unknown,Unknown,Pending convo with UNHCR,,,,,,,,,,,,,,,,,,,,
aeb0afc3-b5f2-4018-98fa-127ccb29e139,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5c434a8b-71cc-4841-a80e-49161fb222d3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Goal 3: Create master "Launch Metadata" material

In [189]:
# Values of the legacy sheet's 'Unique ID' column whose rows are exported separately
add_these = [
    "bio.005.nrt",
    "foo.008",
    "for.023",
    "soc.048",
]

# Re-key the legacy sheet by 'Unique ID', select the rows above, and write them to CSV
(
    old_mdata
    .reset_index()
    .set_index('Unique ID')
    .loc[add_these]
    .to_csv('additional_metadata.csv')
)

## Structure metadata from API

In [None]:
# `api` is the DataFrame built from the RW API response earlier in this notebook; its
# 'metadata' column holds the raw metadata list of each dataset, indexed by API_ID.
# (current_datasets_on_api is not defined anywhere in this notebook.)
metadatas_all = api.loc[:, 'metadata']
# No separate "published" subset is taken: the API request already filters on published=true.

def try_pull(d, k):
    """Return ``d[k]``, or None (after printing a notice) when it cannot be read.

    Parameters
    ----------
    d : container to read from (a metadata dict in this notebook)
    k : key to look up in ``d``

    When the lookup fails and ``d`` is a dict with a ``'dataset'`` entry, the
    notice names that dataset so the missing field can be traced; otherwise a
    generic notice is printed.
    """
    try:
        return d[k]
    except (KeyError, IndexError, TypeError):
        # TypeError covers d being None or otherwise not subscriptable; the
        # isinstance guard keeps the membership test from raising in that case.
        if isinstance(d, dict) and 'dataset' in d:
            print('Missed', k, 'in', d['dataset'])
        else:
            print('No metadata available')
        return None

def try_reformat(source_list):
    """Join the ``'source-name'`` of every truthy entry of ``source_list`` with '/'.

    Returns None when ``source_list`` is empty or None.
    """
    if not source_list:
        return None
    names = (entry['source-name'] for entry in source_list if entry)
    return '/'.join(names)
    
    
# Can use dict.get(key, default_value) instead of try_pull... try_pull has benefit of calling out which fields are missed
def create_row(info):
    """Flatten one ``(dataset id, metadata list)`` pair into a dict of sheet columns.

    Parameters
    ----------
    info : pair
        ``info[0]`` is the dataset id and ``info[1]`` is that dataset's metadata
        list (the pairs yielded by ``metadatas_all.items()``).

    Returns
    -------
    dict
        One value per metadata column; fields that cannot be read are None.
        When the first metadata entry or its ``'attributes'`` cannot be read,
        the placeholder ``{'Unique ID': <id>, 'Public Title': 'skip'}`` is
        returned instead.
    """
    ds = info[0]
    metadata_list = info[1]
    try:
        metadata = metadata_list[0]['attributes']
    except (IndexError, KeyError, TypeError):
        # Empty list, missing 'attributes', or no metadata at all: mark the row to be skipped
        return {'Unique ID':ds, 'Public Title':'skip'}
    # A record without an 'info' section still yields a row (its info fields become None)
    # instead of aborting the whole export with a KeyError.
    details = try_pull(metadata, 'info') or {}
    return {
        'Public Title':try_pull(metadata,'name'),
        'Description':try_pull(metadata,'description'),
        'Subtitle':try_pull(metadata,'source'),
        'Source Organizations':try_reformat(try_pull(details,'sources')),
        'Function':try_pull(details,'functions'),
        'Unique ID':try_pull(metadata,'dataset'),
        'WRI ID':try_pull(details,'wri_rw_id'),
        'Data Type':try_pull(details,'data_type'),
        'Formal Name':try_pull(details,'technical_title'),
        'Cautions':try_pull(details,'cautions'),
        'Citation':try_pull(details,'citation'),
        'License':try_pull(details,'license'),
        'License Link':try_pull(details,'license_link'),
        'Geographic Coverage':try_pull(details,'geographic_coverage'),
        'Spatial Resolution':try_pull(details,'spatial_resolution'),
        'Date of Content':try_pull(details,'date_of_content'),
        'Frequency of Updates':try_pull(details,'frequency_of_updates'),
        'Learn More Link':try_pull(details,'learn_more_link'),
        'Download from S3':try_pull(details,'data_download_link'),
        'Download from Source':try_pull(details,'data_download_original_link'),
        # Serialised to a JSON string so the value fits in a single CSV cell
        'Columns and Aliases':json.dumps(try_pull(metadata,'columns'))
    }
                            
# Column layout of the exported sheet
column_order = [
    'WRI ID', 'Unique ID', 'Public Title', 'Formal Name',
    'Source Organizations', 'Subtitle',
    'Learn More Link', 'Download from S3', 'Download from Source',
    'Function', 'Description', 'Cautions', 'Data Type', 'Date of Content',
    'Frequency of Updates', 'Geographic Coverage', 'Spatial Resolution',
    'Citation', 'License', 'License Link', 'Columns and Aliases',
]

# One record per dataset, arranged in the column layout above
df = pd.DataFrame([create_row(item) for item in metadatas_all.items()])[column_order]

# Rows that create_row marked with the placeholder title 'skip' are removed before export
drop_ix = df.index[(df['Public Title'] == 'skip').values]
df = df.drop(drop_ix)
df.to_csv('Drop_missing_rows.csv')
# A published-only export ('Published_metadata.csv') was previously sketched here and left disabled.

## Use old_mdata as template, overwrite with new_mdata