# Import libraries

In [3]:
import pandas as pd
pd.options.display.max_columns = 200

import requests as req
import json
import boto3
import io

import sys
import logging
import os
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

import itertools

# Constants

In [None]:
# Bearer token sent in the Authorization header of the PATCH requests below.
# Read it from the environment when available so the secret does not have to be
# typed into (and saved with) the notebook; falls back to an empty string.
RW_API_TOKEN = os.environ.get("RW_API_TOKEN", "")

# Grab website metadata

In [12]:
# Base URL for getting dataset metadata from RW API
url = "https://api.resourcewatch.org/v1/dataset?sort=slug,-provider,userId&status=saved&published=true&includes=metadata,vocabulary,widget,layer"

# page[size] tells the API the maximum number of results to send back
# There are currently between 200 and 300 datasets on the RW API
payload = { "application":"rw", "page[size]": 1000, "language":"en"}

# Request all datasets, and extract the data from the response.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# KeyError on "data" further down.
res = req.get(url, params=payload, timeout=60)
res.raise_for_status()
data = res.json()["data"]

### Convert the json object returned by the API into a pandas DataFrame
# Another option: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html
# One entry per dataset id; the raw metadata/layer/widget/vocabulary lists are
# kept next to their counts so later cells can iterate over them.
datasets_on_api = {}
for dset in data:
    atts = dset["attributes"]
    metadata = atts["metadata"]
    layers = atts["layer"]
    widgets = atts["widget"]
    tags = atts["vocabulary"]
    datasets_on_api[dset["id"]] = {
        "name":atts["name"],
        "table_name":atts["tableName"],
        "provider":atts["provider"],
        "date_updated":atts["updatedAt"],
        "num_metadata":len(metadata),
        "metadata": metadata,
        "num_layers":len(layers),
        "layers": layers,
        "num_widgets":len(widgets),
        "widgets": widgets,
        "num_tags":len(tags),
        "tags":tags
    }

# Create the DataFrame, name the index, and sort by date_updated
# More recently updated datasets at the top
current_datasets_on_api = pd.DataFrame.from_dict(datasets_on_api, orient='index')
current_datasets_on_api.index.rename("Dataset", inplace=True)
current_datasets_on_api.sort_values(by=["date_updated"], inplace=True, ascending = False)

logging.info("Number of datasets: %s", current_datasets_on_api.shape[0])

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org
DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/dataset?sort=slug,-provider,userId&status=saved&published=true&includes=metadata,vocabulary,widget,layer&application=rw&page%5Bsize%5D=1000&language=en HTTP/1.1" 200 499170
INFO:root:Number of datasets: 208


In [None]:
# Display the first row of the DataFrame; after the descending sort on
# date_updated this is the most recently updated dataset.
current_datasets_on_api.iloc[0]

# Pull metadata, layer, widget info

In [13]:
def pick_layer_info(layer, f):
    """Write one layer record to ``f`` in the starred, editable text format.

    The record is the layer id, then the ``name`` and ``description``
    attributes (each only when present on the layer), followed by a
    ``~end_layer~`` terminator line and a blank line.

    layer: mapping with an ``'id'`` and an ``'attributes'`` mapping.
    f: writable text file-like object.
    """
    attributes = layer['attributes']
    f.write('*layer id*: {}\n'.format(layer['id']))
    # Emit only the attributes this layer actually carries, in a fixed order
    present = ((key, attributes[key]) for key in ('name', 'description') if key in attributes)
    for key, value in present:
        f.write('*{}*: {}\n'.format(key, value))
    f.write('~end_layer~\n\n')

def pick_widget_info(widget, f):
    """Write one widget record to ``f`` in the starred, editable text format.

    The record is the widget id, then the ``name`` and ``description``
    attributes (each only when present on the widget), followed by a
    ``~end_widget~`` terminator line and a blank line.

    widget: mapping with an ``'id'`` and an ``'attributes'`` mapping.
    f: writable text file-like object.
    """
    attributes = widget['attributes']
    f.write('*widget id*: {}\n'.format(widget['id']))
    # Emit only the attributes this widget actually carries, in a fixed order
    present = ((key, attributes[key]) for key in ('name', 'description') if key in attributes)
    for key, value in present:
        f.write('*{}*: {}\n'.format(key, value))
    f.write('~end_widget~\n\n')
    
def pick_metadata_info(metadata, f):
    """Write one metadata record to ``f`` in the starred, editable text format.

    The record is the metadata id, the top-level ``description`` attribute,
    then ``cautions``, ``functions`` and ``citation`` taken from the nested
    ``info`` mapping (each only when present), followed by a
    ``~end_metadata~`` terminator line and a blank line.

    metadata: mapping with an ``'id'`` and an ``'attributes'`` mapping; an
        empty or ``None`` value writes nothing.
    f: writable text file-like object.
    """
    if not metadata:
        return

    s_atts = metadata['attributes']
    # 'info' can be missing or null; treating that as "no info fields" avoids
    # a KeyError that would leave a half-written record with no terminator.
    info = s_atts.get('info') or {}

    f.write('*metadata id*: {}\n'.format(metadata['id']))

    if 'description' in s_atts:
        f.write('*{}*: {}\n'.format('description', s_atts['description']))

    for att in ('cautions', 'functions', 'citation'):
        if att in info:
            f.write('*{}*: {}\n'.format(att, info[att]))
    f.write('~end_metadata~\n\n')
    
def record_dataset_info(info, f):
    '''Write one dataset section (header, metadata, layers, widgets) to ``f``.

    info: a ``(dataset id, record)`` pair as produced by
        ``DataFrame.iterrows()``; the record must provide ``'name'``,
        ``'metadata'``, ``'layers'`` and ``'widgets'``.
    f: writable text file-like object.

    Tags are not written to the file.
    '''
    ds, record = info[0], info[1]

    # Look everything up before writing so a missing key cannot leave a
    # partially written section behind.
    name = record['name']
    metadata = record['metadata']
    layers = record['layers']
    widgets = record['widgets']

    f.write('** NEW DATASET **\n')
    f.write('ds name: {}\n'.format(name))
    f.write('ds id: {}\n'.format(ds))
    f.write('ds on site: https://staging.resourcewatch.org/data/explore/{}\n\n'.format(ds))

    f.write('~ metadata ~\n\n')
    for item in metadata:
        pick_metadata_info(item, f)

    f.write('~ layers ~\n\n')
    for item in layers:
        pick_layer_info(item, f)

    f.write('~ widgets ~\n\n')
    for item in widgets:
        pick_widget_info(item, f)


In [14]:
# Remove any previous export so this run starts from an empty file
try:
    os.unlink('all_info.txt')
except FileNotFoundError:
    # Nothing to remove on the first run
    pass

# The context manager closes the file even if a record fails to serialize
with open('all_info.txt', 'a') as f:
    for row in current_datasets_on_api.iterrows():
        record_dataset_info(row, f)

# Prove inverse works

In [None]:
def isplit(iterable,splitters):
    return [list(g) for k,g in itertools.groupby(iterable,lambda x:x in splitters) if not k]

def patch_layers(info, ds):
    """Send edited layer names/descriptions for dataset ``ds`` to the RW API.

    info: the non-empty text lines of a ``~ layers ~`` section (header line
        excluded). Each record is a ``layer id`` line followed by optional
        ``name`` / ``description`` lines and a ``~end_layer~`` terminator.
        Field labels may be wrapped in ``*``.
    ds: dataset id used to build the API URLs.

    Only layers that exist on the API and appear in ``info`` are patched, and
    only the fields present in the text are changed. Returns ``None``.
    """
    field_keys = ('layer id', 'name', 'description')
    edit_layer_info = {}
    fields, last_key = {}, None
    # The trailing sentinel flushes a final record that lacks its terminator.
    for line in list(info) + ['~end_layer~']:
        if line.strip() == '~end_layer~':
            if 'layer id' in fields:
                edit_layer_info[fields.pop('layer id')] = fields
            fields, last_key = {}, None
            continue
        # Split on the FIRST colon only so values containing ':' (URLs, times)
        # are kept whole.
        label, sep, value = line.partition(':')
        label = label.strip().strip('*')
        if sep and label in field_keys:
            fields[label] = value.strip()
            last_key = label
        elif last_key is not None:
            # A line that does not start a known field continues the previous
            # value (multi-line descriptions).
            fields[last_key] = (fields[last_key] + '\n' + line).strip()

    if not edit_layer_info:
        # Nothing to patch: skip the network round trip entirely
        return

    # NOTE(review): the export writes a null attribute as the text "None";
    # such a value is sent back as that literal string. Confirm before bulk use.
    rw_api_url = 'https://api.resourcewatch.org/v1/dataset/{}/layer'.format(ds)
    old_layers = req.get(rw_api_url).json()['data']

    headers = {
        'content-type': "application/json",
        'authorization': "Bearer {}".format(RW_API_TOKEN)
    }

    for old_layer in old_layers:
        layer_id = old_layer['id']
        if layer_id not in edit_layer_info:
            continue
        logging.info('Layer ID: {}'.format(layer_id))

        new_atts = old_layer['attributes'].copy()
        new_atts.update(edit_layer_info[layer_id])
        # new_layer is built only so the log shows the full resulting record
        new_layer = old_layer.copy()
        new_layer.update(attributes = new_atts)
        logging.info('New layer: {}'.format(new_layer))

        patch_url = 'https://api.resourcewatch.org/v1/dataset/{}/layer/{}'.format(ds, layer_id)
        logging.debug('URL: {}'.format(patch_url))
        # Tried json.dumps(layer) and that didn't work
        res = req.request("PATCH", patch_url, data=json.dumps(new_atts), headers = headers)
        logging.info('Layer load Response: {}'.format(res.text))
    
def patch_widgets(info, ds):
    """Send edited widget names/descriptions for dataset ``ds`` to the RW API.

    info: the non-empty text lines of a ``~ widgets ~`` section (header line
        excluded). Each record is a ``widget id`` line followed by optional
        ``name`` / ``description`` lines and a ``~end_widget~`` terminator.
        Field labels may be wrapped in ``*``.
    ds: dataset id used to build the API URLs.

    Only widgets that exist on the API and appear in ``info`` are patched, and
    only the fields present in the text are changed. Returns ``None``.
    """
    field_keys = ('widget id', 'name', 'description')
    edit_widget_info = {}
    fields, last_key = {}, None
    # The trailing sentinel flushes a final record that lacks its terminator.
    for line in list(info) + ['~end_widget~']:
        if line.strip() == '~end_widget~':
            if 'widget id' in fields:
                edit_widget_info[fields.pop('widget id')] = fields
            fields, last_key = {}, None
            continue
        # Split on the FIRST colon only so values containing ':' (URLs, times)
        # are kept whole.
        label, sep, value = line.partition(':')
        label = label.strip().strip('*')
        if sep and label in field_keys:
            fields[label] = value.strip()
            last_key = label
        elif last_key is not None:
            # A line that does not start a known field continues the previous
            # value (multi-line descriptions).
            fields[last_key] = (fields[last_key] + '\n' + line).strip()

    if not edit_widget_info:
        # Nothing to patch: skip the network round trip entirely
        return

    # NOTE(review): the export writes a null attribute as the text "None";
    # such a value is sent back as that literal string. Confirm before bulk use.
    rw_api_url = 'https://api.resourcewatch.org/v1/dataset/{}/widget'.format(ds)
    old_widgets = req.get(rw_api_url).json()['data']

    headers = {
        'content-type': "application/json",
        'authorization': "Bearer {}".format(RW_API_TOKEN)
    }

    for old_widget in old_widgets:
        widget_id = old_widget['id']
        if widget_id not in edit_widget_info:
            continue
        logging.info('Widget ID: {}'.format(widget_id))

        new_atts = old_widget['attributes'].copy()
        new_atts.update(edit_widget_info[widget_id])
        # new_widget is built only so the log shows the full resulting record
        new_widget = old_widget.copy()
        new_widget.update(attributes = new_atts)
        logging.info('New widget: {}'.format(new_widget))

        patch_url = 'https://api.resourcewatch.org/v1/dataset/{}/widget/{}'.format(ds, widget_id)
        logging.debug('URL: {}'.format(patch_url))
        # Tried json.dumps(widget) and that didn't work
        res = req.request("PATCH", patch_url, data=json.dumps(new_atts), headers = headers)
        logging.info('Widget load Response: {}'.format(res.text))
    
def patch_metadata(info, ds):
    """Send edited metadata text for dataset ``ds`` to the RW API.

    info: the non-empty text lines of a ``~ metadata ~`` section (header line
        excluded). Each record is a ``metadata id`` line followed by optional
        ``description`` / ``cautions`` / ``functions`` / ``citation`` lines
        and a ``~end_metadata~`` terminator. Field labels may be wrapped in
        ``*``.
    ds: dataset id used to build the API URLs.

    ``description`` is applied to the top-level attributes; ``cautions`` and
    ``functions`` are applied to the nested ``info`` mapping. ``citation`` is
    parsed but not sent. Only fields present in the text are changed.
    Returns ``None``.
    """
    field_keys = ('metadata id', 'description', 'cautions', 'functions', 'citation')
    edit_metadata_info = {}
    fields, last_key = {}, None
    # The trailing sentinel flushes a final record that lacks its terminator.
    for line in list(info) + ['~end_metadata~']:
        if line.strip() == '~end_metadata~':
            if 'metadata id' in fields:
                edit_metadata_info[fields.pop('metadata id')] = fields
            fields, last_key = {}, None
            continue
        # Split on the FIRST colon only so values containing ':' (URLs, times)
        # are kept whole.
        label, sep, value = line.partition(':')
        label = label.strip().strip('*')
        if sep and label in field_keys:
            fields[label] = value.strip()
            last_key = label
        elif last_key is not None:
            # A line that does not start a known field continues the previous
            # value (multi-line descriptions).
            fields[last_key] = (fields[last_key] + '\n' + line).strip()

    if not edit_metadata_info:
        # Nothing to patch: skip the network round trip entirely
        return

    # NOTE(review): the export writes a null attribute as the text "None";
    # such a value is sent back as that literal string. Confirm before bulk use.
    rw_api_url = 'https://api.resourcewatch.org/v1/dataset/{}/metadata'.format(ds)
    old_metadatas = req.get(rw_api_url).json()['data']

    headers = {
        'content-type': "application/json",
        'authorization': "Bearer {}".format(RW_API_TOKEN)
    }

    for old_metadata in old_metadatas:
        metadata_id = old_metadata['id']
        if metadata_id not in edit_metadata_info:
            continue
        logging.info('Metadata ID: {}'.format(metadata_id))

        edit_atts = edit_metadata_info[metadata_id]
        new_atts = old_metadata['attributes'].copy()
        # 'info' can be missing or null on the API record
        new_info = dict(new_atts.get('info') or {})

        if 'description' in edit_atts:
            new_atts['description'] = edit_atts['description']
        for key in ('cautions', 'functions'):
            if key in edit_atts:
                new_info[key] = edit_atts[key]
        new_atts['info'] = new_info

        # new_metadata is built only so the log shows the full resulting record
        new_metadata = old_metadata.copy()
        new_metadata.update(attributes = new_atts)
        logging.info('New metadata: {}'.format(new_metadata))

        patch_url = 'https://api.resourcewatch.org/v1/dataset/{}/metadata/{}'.format(ds, metadata_id)
        logging.debug('URL: {}'.format(patch_url))
        # Tried json.dumps(metadata) and that didn't work
        res = req.request("PATCH", patch_url, data=json.dumps(new_atts), headers = headers)
        logging.info('Metadata load Response: {}'.format(res.text))
    

def structure_patchs(lst):
    """Split one dataset's text lines into sections and patch each via the API.

    lst: the non-empty lines of one ``** NEW DATASET **`` block. The second
        line must be the ``ds id: ...`` line, and the ``~ metadata ~``,
        ``~ layers ~`` and ``~ widgets ~`` marker lines must all be present
        (``list.index`` raises ``ValueError`` otherwise).

    Returns ``None``; an empty ``lst`` is ignored.
    """
    if not lst:
        return None

    # Debug output goes through logging, like the rest of the file, instead
    # of printing every block to stdout.
    logging.debug('Dataset block: %s', lst)
    ds = lst[1].split(':')[1].strip()
    logging.debug('Dataset ID: %s', ds)

    metadata_start = lst.index('~ metadata ~')
    layers_start = lst.index('~ layers ~')
    widgets_start = lst.index('~ widgets ~')

    metadata = lst[metadata_start:layers_start]
    layers = lst[layers_start:widgets_start]
    widgets = lst[widgets_start:]

    # [1:] drops each section's marker line
    patch_metadata(metadata[1:], ds)
    patch_layers(layers[1:], ds)
    patch_widgets(widgets[1:], ds)

def send_patches(txt):
    """Split exported text into per-dataset blocks and patch each one.

    The text is cut on the ``** NEW DATASET **`` marker, each piece is split
    into lines, empty lines are dropped, and the remaining lines are handed to
    ``structure_patchs``. Returns ``None``.
    """
    for section in txt.split('** NEW DATASET **'):
        lines = [line for line in section.split('\n') if len(line) > 0]
        structure_patchs(lines)

In [None]:
# Read the exported text file back and push every dataset section through
# send_patches. NOTE: send_patches has no return statement, so load_info is None.
with open('all_info.txt', 'r') as src:
    load_info = send_patches(src.read())

# Experimentation below

In [None]:
load_info_sample = '** NEW DATASET **\nds name: soc.069 Political Violence Risk Assessment\nds id: 25eebe25-aaf2-48fc-ab7b-186d7498f393\nds on site: https://staging.resourcewatch.org/data/explore/25eebe25-aaf2-48fc-ab7b-186d7498f393\n\n~ metadata ~\n\nmetadata id: 59debf249d46b60010c48aad\ndescription: CHANGE ME The Political Violence Risk Assessment data set was originally published by the Hague Centre for Strategic Studies. Its purpose is to forcast the onset of large-scale political violence. Countries are evaluated to determine their "risk" of experiencing a large-scale political violence event in the next month using structural data (infant mortality, regime type, conflicts in neighboring countries, and state-led discrimination) and automated event data (protests, assaults, and expressions of disapproval) from the previous month, which are are combined in a logistic regression model. The score is normalized between 0 and 100 and indicates the percentile rank of probability of onset. Infant mortality data are from World Development Indicators; other structural data are from the Center for Systemic Peace; event data are from the Global Database of Events, Language, and Tone; and normalization and aggregation were done by the HCSS. Resource Watch shows only a subset of the dataset. For access to the full dataset and additional information, see the Learn More link.\ncautions: The data page provides no formal cautions.\nfunctions: Countries most at risk for political violence\n~end_metadata~\n\nmetadata id: 59e125369817d60012ba095f\ndescription: Both structural data (infant mortality, regime type, conflicts in neighboring countries, and state-led discrimination) is combined with automated event data (protests, assaults, and expressions of disapproval) of the previous month are combined in a logistic regression model to assess the risk of civil war onset in the next month. 
The score is normalized between 0 and 100 and indicates the percentile rank of probability of onset. Infant mortality data is from WDI, other structural data is from Center for Systemic Peace, event data is from GDELT, normalization and aggregation were done by HCSS.\ncautions: The data page provides no formal cautions. \nfunctions: Countries most at risk for political violence\n~end_metadata~\n\n~ layers ~\n\nlayer id: 1f8090c0-6594-45e2-b1df-1cbed74f12a8\nname: Political Violence Risk Assessment\ndescription: Risk score on a country level indicating the percentile rank of the likelihood of civil war onset during the next 30 days, updated daily. Both structural indicators and dynamic, event-based variables are used in regression.  The higher the value, the more at risk of civil war.\n~end_layer~\n\n~ widgets ~\n\nwidget id: 54160dca-ecb3-4d62-b80e-95970e8e834d\nname: Political Violence Risk Assessment\ndescription: \n~end_widget~\n\n'
# The function defined above is named send_patches; the misspelled
# send_patchs raised a NameError.
send_patches(load_info_sample)

In [None]:
# Number of metadata records attached to each dataset, in DataFrame row order
list(map(len, current_datasets_on_api['metadata']))

# Pull out all layer names and definitions
* Link to dataset detail page
* For each layer:
  * Layer name
  * Layer description


In [None]:
# Remove any previous export so this run starts from an empty file
try:
    os.unlink('layer_info.txt')
except FileNotFoundError:
    # Nothing to remove on the first run
    pass


### NOTE: use option 'a' to append to a file. 'w' would overwrite
f = open('layer_info.txt', 'a')

def pick_layer_info(layer, f):
    """Write a layer's id, name and description to ``f`` as plain ``key: value`` lines.

    ``name`` and ``description`` are written only when present in the layer's
    attributes; the record ends with one blank line.

    layer: mapping with an ``'id'`` and an ``'attributes'`` mapping.
    f: writable text file-like object.
    """
    attributes = layer['attributes']
    f.write('layer id: {}\n'.format(layer['id']))
    for key in ('name', 'description'):
        if key not in attributes:
            continue
        f.write('{}: {}\n'.format(key, attributes[key]))
    f.write('\n')
            
            
def record_layers(info, f):
    """Write one dataset's header and all of its layers to ``f``.

    info: a ``(dataset id, layers)`` pair, e.g. one item of
        ``Series.items()`` over the ``layers`` column.
    f: writable text file-like object.
    """
    ds, layers = info[0], info[1]
    header = (
        '** NEW DATASET **\n',
        'ds id: {}\n'.format(ds),
        'ds on site: https://staging.resourcewatch.org/data/explore/{}\n\n'.format(ds),
        '~ layers ~\n\n',
    )
    for chunk in header:
        f.write(chunk)
    for layer in layers:
        pick_layer_info(layer, f)
    f.write('\n')

# Write every dataset's layers; the finally clause closes the file even if a
# record fails to serialize.
try:
    for item in current_datasets_on_api['layers'].items():
        record_layers(item, f)
finally:
    f.close()

# Pull out all widget names and definitions
* Link to dataset detail page
* For each widget:
  * Widget name
  * Widget description

In [None]:
# Remove any previous export so this run starts from an empty file
try:
    os.unlink('widget_info.txt')
except FileNotFoundError:
    # Nothing to remove on the first run
    pass

f = open('widget_info.txt', 'a')

def pick_widget_info(widget, f):
    """Write a widget's id, name and description to ``f`` as plain ``key: value`` lines.

    ``name`` and ``description`` are written only when present in the widget's
    attributes; the record ends with one blank line.

    widget: mapping with an ``'id'`` and an ``'attributes'`` mapping.
    f: writable text file-like object.
    """
    attributes = widget['attributes']
    f.write('widget id: {}\n'.format(widget['id']))
    for key in ('name', 'description'):
        if key not in attributes:
            continue
        f.write('{}: {}\n'.format(key, attributes[key]))
    f.write('\n')
            
            
def record_widgets(ds, info, f):
    """Write one dataset's header and all of its widgets to ``f``.

    ds: ignored; the dataset id is taken from ``info[0]`` instead.
    info: a ``(dataset id, widgets)`` pair.
    f: writable text file-like object.
    """
    ds, widgets = info[0], info[1]
    header = (
        '** NEW DATASET **\n',
        'ds id: {}\n'.format(ds),
        'ds on site: https://staging.resourcewatch.org/data/explore/{}\n\n'.format(ds),
        '~ widgets ~\n\n',
    )
    for chunk in header:
        f.write(chunk)
    for widget in widgets:
        pick_widget_info(widget, f)
    f.write('\n')

# Write every dataset's widgets; the finally clause closes the file even if a
# record fails to serialize.
try:
    for item in current_datasets_on_api['widgets'].items():
        # record_widgets takes (ds, info, f): pass the dataset id and the
        # (id, widgets) pair separately instead of two arguments, which
        # raised a TypeError.
        record_widgets(item[0], item, f)
finally:
    f.close()