# Import libraries

In [2]:
import pandas as pd
pd.options.display.max_columns = 200

import requests as req
import json
import boto3
import io

import sys
import logging
import os
logging.basicConfig(stream=sys.stderr, level=logging.INFO)

# Grab website metadata

In [6]:
# Base URL for getting dataset metadata from RW API
url = "https://api.resourcewatch.org/v1/dataset?sort=slug,-provider,userId&status=saved&includes=metadata,vocabulary,widget,layer"

# page[size] tells the API the maximum number of results to send back.
# It has to stay above the number of datasets on the RW API (328 when this
# notebook was last run), otherwise only part of the catalogue is returned.
payload = { "application":"rw", "page[size]": 1000}

# Request all datasets, and extract the data from the response.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an exception here instead of a
# confusing KeyError on "data" further down.
res = req.get(url, params=payload, timeout=60)
res.raise_for_status()
data = res.json()["data"]

# Hitting the page-size cap most likely means some datasets were left out.
if len(data) >= payload["page[size]"]:
    logging.warning(
        "Received %s datasets, which reaches the page[size] limit; results may be truncated",
        len(data),
    )

### Convert the json object returned by the API into a pandas DataFrame
# Another option: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html
# One record per dataset, keyed by dataset id. The nested metadata / layer /
# widget / vocabulary lists are kept as-is next to their lengths.
datasets_on_api = {}
for dset in data:
    atts = dset["attributes"]
    metadata = atts["metadata"]
    layers = atts["layer"]
    widgets = atts["widget"]
    tags = atts["vocabulary"]
    datasets_on_api[dset["id"]] = {
        "name":atts["name"],
        "table_name":atts["tableName"],
        "provider":atts["provider"],
        "date_updated":atts["updatedAt"],
        "num_metadata":len(metadata),
        "metadata": metadata,
        "num_layers":len(layers),
        "layers": layers,
        "num_widgets":len(widgets),
        "widgets": widgets,
        "num_tags":len(tags),
        "tags":tags
    }

# Create the DataFrame, name the index, and sort by date_updated
# More recently updated datasets at the top.
# Built as a single chain (no inplace mutation) so re-running the cell always
# rebuilds the frame from scratch.
current_datasets_on_api = (
    pd.DataFrame.from_dict(datasets_on_api, orient='index')
    .rename_axis("Dataset")
    .sort_values(by=["date_updated"], ascending=False)
)

logging.info("Number of datasets: %s", current_datasets_on_api.shape[0])

INFO:root:Number of datasets: 328


# Pull out all layer names and descriptions
* Link to dataset detail page
* For each layer:
  * Layer name
  * Layer description


In [32]:
# Peek at the 'layers' column: one list of layer records per dataset id
# (an empty list means the dataset has no layers).
current_datasets_on_api['layers']

Dataset
f6d8caef-578f-4afe-a8de-f5e972c43f0c                                                   []
d3f50041-2fbd-4749-b8c5-73d14f35193a    [{'id': '5b7a036c-eadc-42ad-a0db-130fbf66449b'...
acf42a1b-104b-4f81-acd0-549f805873fb    [{'id': '3a52f7ed-0c20-4691-981b-3a3521b069aa'...
57156462-e47b-4e25-88bc-610ad55c35bc                                                   []
6b8442f5-4766-4444-94b4-d6676277fd80    [{'id': '3885375e-7bac-4fcd-807f-8e12c6c0e5f1'...
917f1945-fff9-4b6f-8290-4f4b9417079e                                                   []
b8a6a6ea-7d2f-4d59-bb5e-7143a2ddc1fe                                                   []
a290675c-9528-4a51-8201-f6c2d7848744    [{'id': '09aa80be-d7c8-4008-98fb-8adbd3a2e05b'...
fa6443ff-eb95-4d0f-84d2-f0c91682efdf    [{'id': '07b6e469-d1c4-4ab8-a8a7-cf37f344ae4c'...
26fe7429-9087-434b-b9d1-856881a34f4a                                                   []
66d1bba4-ccf4-415e-a2d0-f607c6304994    [{'id': '7701ed4c-95f6-4c7f-a170-f1cac0909cc1'...
01

In [25]:
# Remove any output left over from a previous run so entries are not duplicated.
# Only "file does not exist" is expected and ignored; any other failure
# (e.g. a permissions problem) is allowed to surface instead of being hidden.
try:
    os.unlink('layer_info.txt')
except FileNotFoundError:
    pass


### NOTE: use option 'a' to append to a file. 'w' would overwrite
# The encoding is fixed to UTF-8 so names/descriptions with non-ASCII
# characters are written the same way regardless of the platform default.
f = open('layer_info.txt', 'a', encoding='utf-8')

def pick_layer_info(layer, f):
    """Write one layer's id, name and description to ``f``.

    Args:
        layer: mapping with an ``'id'`` entry and an ``'attributes'`` mapping.
        f: writable text file object.

    The ``name`` and ``description`` attributes are written (in that order)
    only when present; the entry always ends with a blank line.
    """
    attributes = layer['attributes']
    entry = ['layer id: {}\n'.format(layer['id'])]
    entry.extend(
        '{}: {}\n'.format(key, attributes[key])
        for key in ('name', 'description')
        if key in attributes
    )
    entry.append('\n')
    f.writelines(entry)
            
            
def record_layers(info, f):
    """Write the section for one dataset and all of its layers to ``f``.

    Args:
        info: ``(dataset_id, layers)`` pair, e.g. one item yielded by
            ``current_datasets_on_api['layers'].items()``.
        f: writable text file object.

    The section consists of a dataset header, a link to the dataset page on
    the staging site, one entry per layer and a trailing blank line.
    """
    ds = info[0]
    layers = info[1]
    f.write('** NEW DATASET **\n')
    f.write('ds id: {}\n'.format(ds))
    f.write('ds on site: https://staging.resourcewatch.org/data/explore/{}\n\n'.format(ds))
    f.write('~ layers ~\n\n')
    # Plain loop: the calls are made purely for their side effect on ``f``,
    # so there is no point collecting their None results into a list.
    for layer in layers:
        pick_layer_info(layer, f)
    f.write('\n')

# Write one section per dataset. The finally clause guarantees the file is
# flushed and closed even if a malformed record raises part-way through.
try:
    for dataset_layers in current_datasets_on_api['layers'].items():
        record_layers(dataset_layers, f)
finally:
    f.close()

# Pull out all widget names and descriptions
* Link to dataset detail page
* For each widget:
  * Widget name
  * Widget description

In [33]:
# Peek at the 'widgets' column: one list of widget records per dataset id
# (an empty list means the dataset has no widgets).
current_datasets_on_api['widgets']

Dataset
f6d8caef-578f-4afe-a8de-f5e972c43f0c                                                   []
d3f50041-2fbd-4749-b8c5-73d14f35193a    [{'id': 'fb5218b2-3073-4b8d-a770-55bc16b770cc'...
acf42a1b-104b-4f81-acd0-549f805873fb                                                   []
57156462-e47b-4e25-88bc-610ad55c35bc                                                   []
6b8442f5-4766-4444-94b4-d6676277fd80    [{'id': '0e81e05e-c506-4994-91f0-4f79acbe454d'...
917f1945-fff9-4b6f-8290-4f4b9417079e    [{'id': 'c8a973d1-7600-4950-aded-9ef74cfe3080'...
b8a6a6ea-7d2f-4d59-bb5e-7143a2ddc1fe    [{'id': '940e2242-c7b8-40bd-9c55-2156976a63bb'...
a290675c-9528-4a51-8201-f6c2d7848744    [{'id': 'ed53f98b-990d-49ab-aa73-cdd71d0bdd75'...
fa6443ff-eb95-4d0f-84d2-f0c91682efdf    [{'id': '8c44a854-63e7-4ce6-b864-4858c7394852'...
26fe7429-9087-434b-b9d1-856881a34f4a                                                   []
66d1bba4-ccf4-415e-a2d0-f607c6304994                                                   []
01

In [29]:
# Remove any output left over from a previous run so entries are not duplicated.
# Only "file does not exist" is expected and ignored; any other failure
# (e.g. a permissions problem) is allowed to surface instead of being hidden.
try:
    os.unlink('widget_info.txt')
except FileNotFoundError:
    pass

# Opened in append mode on a freshly removed file; the encoding is fixed to
# UTF-8 so names/descriptions with non-ASCII characters are written the same
# way regardless of the platform default.
f = open('widget_info.txt', 'a', encoding='utf-8')

def pick_widget_info(widget, f):
    """Write one widget's id, name and description to ``f``.

    Args:
        widget: mapping with an ``'id'`` entry and an ``'attributes'`` mapping.
        f: writable text file object.

    The ``name`` and ``description`` attributes are written (in that order)
    only when present; the entry always ends with a blank line.
    """
    attributes = widget['attributes']
    entry = ['widget id: {}\n'.format(widget['id'])]
    entry.extend(
        '{}: {}\n'.format(key, attributes[key])
        for key in ('name', 'description')
        if key in attributes
    )
    entry.append('\n')
    f.writelines(entry)
            
            
def record_widgets(info, f):
    """Write the section for one dataset and all of its widgets to ``f``.

    Args:
        info: ``(dataset_id, widgets)`` pair, e.g. one item yielded by
            ``current_datasets_on_api['widgets'].items()``.
        f: writable text file object.

    The section consists of a dataset header, a link to the dataset page on
    the staging site, one entry per widget and a trailing blank line.
    """
    ds = info[0]
    widgets = info[1]
    f.write('** NEW DATASET **\n')
    f.write('ds id: {}\n'.format(ds))
    f.write('ds on site: https://staging.resourcewatch.org/data/explore/{}\n\n'.format(ds))
    f.write('~ widgets ~\n\n')
    # Plain loop: the calls are made purely for their side effect on ``f``,
    # so there is no point collecting their None results into a list.
    for widget in widgets:
        pick_widget_info(widget, f)
    f.write('\n')

# Write one section per dataset. The finally clause guarantees the file is
# flushed and closed even if a malformed record raises part-way through.
try:
    for dataset_widgets in current_datasets_on_api['widgets'].items():
        record_widgets(dataset_widgets, f)
finally:
    f.close()