In [None]:
import requests as req
import os
import json
from pprint import pprint

import numpy as np
import pandas as pd
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000


In [None]:
#### Download Google Spreadsheets ####
# Exports two Google Sheets as TSV into a scratch directory, loads them into
# pandas, then removes the scratch directory again.
# Produces: july_data_upload and current_mdata (both DataFrames).

# Make a temporary directory, and move into it
# NOTE(review): if a previous run failed part-way, "temp" may still exist and the
# working directory may still be inside it — confirm cwd before re-running.
!mkdir temp
os.chdir("temp")
dest = os.getcwd()

# July Data Sheet
# curl's output is redirected into a TSV file, which is then read with the first
# column as the index.
!curl "https://docs.google.com/spreadsheets/d/1viPOGYIk6RGu7YMoM3BHNVbkWaCZ0JFBOMSNncWvHYk/export?format=tsv" > july_data_upload.tsv
july_data_upload = pd.read_csv(dest+"/july_data_upload.tsv", sep="\t", index_col=[0])

# NOTE(review): the commented-out lines below duplicate the live download that
# follows them; they look like dead code that can be deleted.
# # Metadata to Upload
# !curl "https://docs.google.com/spreadsheets/d/1UkABgMlBIinJjITa6WepFAL-8VBkulS0LCbKojRXjVY/export?format=tsv" > current_metadata.tsv
# current_mdata = pd.read_csv(dest+"/current_metadata.tsv", sep="\t", index_col=[0])
# current_mdata = current_mdata.transpose()

# Metadata sheet: read with the first column as the index, then transposed so
# that every column of the sheet becomes a row of current_mdata.
!curl "https://docs.google.com/spreadsheets/d/1UkABgMlBIinJjITa6WepFAL-8VBkulS0LCbKojRXjVY/export?format=tsv" > current_metadata.tsv
current_mdata = pd.read_csv(dest+"/current_metadata.tsv", sep="\t", index_col=[0])
current_mdata = current_mdata.transpose()

# Delete temporary files
# Step back out of the scratch directory before removing it.
os.chdir("..")
!rm -r temp

#### Merge info from the Tracking sheet and Metadata sheet

# Add RW API key to metadata_to_upload
def fetch_info(row, match_col, dest_df, target_column):
    """Look up one value in ``dest_df`` for the record described by ``row``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Record whose ``match_col`` entry is used as the row label in ``dest_df``.
    match_col : hashable
        Key in ``row`` holding the label to look up.
    dest_df : pandas.DataFrame
        Frame that is indexed by the values found under ``match_col``.
    target_column : hashable
        Column of ``dest_df`` to read.

    Returns
    -------
    The value at ``dest_df.loc[row[match_col], target_column]``, or ``None``
    when the lookup fails (missing key, label or column).
    """
    try:
        return dest_df.loc[row[match_col], target_column]
    except Exception:
        # Best-effort lookup: any ordinary failure means "no match". Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate so
        # a long ``DataFrame.apply`` can still be interrupted.
        return None

# Column names shared between the tracking sheet and the metadata sheet.
old_id_col = "VIZZ - RW API (bulk upload)"
new_id_col = "API-ID (PERFECT DATASET)"
dl_from_src_col = "Download from Source"
dl_from_s3_col = "Download Data (S3)"

# Column of current_mdata whose value is the row label in july_data_upload.
match_col = "Unique ID"

# Copy the API ids and the download links over from the tracking sheet,
# one column at a time (None where no matching tracking row is found).
for tracking_col in (old_id_col, new_id_col, dl_from_src_col, dl_from_s3_col):
    current_mdata[tracking_col] = current_mdata.apply(
        lambda row, col=tracking_col: fetch_info(row, match_col, july_data_upload, col),
        axis=1,
    )

# Keep only those datasets with rw_ids already
valid_old_ids = current_mdata[old_id_col].notnull()
valid_new_ids = current_mdata[new_id_col].notnull()

def choose_new_id(df, valid_old_ids,old_id_col, valid_new_ids,new_id_col):
    """Pick one id per row of ``df``, preferring the new id over the old one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both id columns.
    valid_old_ids, valid_new_ids : sequence of truthy/falsy values
        Per-row flags (list or pandas Series, in row order) saying whether the
        old / new id of that row is usable. Both must have the same length.
    old_id_col, new_id_col : hashable
        Names of the old and new id columns in ``df``.

    Returns
    -------
    list
        For each row: the new id if flagged valid, else the old id if flagged
        valid, else ``None``.

    Raises
    ------
    AssertionError
        If the two flag sequences differ in length.
    """
    # Materialise the flags as plain lists so they are always read by POSITION.
    # Indexing a pandas Series with ``mask[i]`` is label-based when the index
    # holds integers (wrong row for any non-default index) and relies on a
    # deprecated positional fallback otherwise.
    old_flags = list(valid_old_ids)
    new_flags = list(valid_new_ids)
    assert(len(old_flags)==len(new_flags))

    # Select each column once instead of building a row Series per iteration.
    new_values = df[new_id_col]
    old_values = df[old_id_col]

    final_ids = []
    for i, (has_new, has_old) in enumerate(zip(new_flags, old_flags)):
        if has_new:
            final_ids.append(new_values.iloc[i])
        elif has_old:
            final_ids.append(old_values.iloc[i])
        else:
            final_ids.append(None)
    return(final_ids)

# Resolve one API id per dataset (new id preferred, old id as fallback).
current_mdata["final_ids"] = choose_new_id(
    current_mdata,
    valid_old_ids, old_id_col,
    valid_new_ids, new_id_col,
)
keep_matched_ids = current_mdata["final_ids"].notnull()

# Drop datasets without any API id and index the remainder by that id.
current_mdata = current_mdata.loc[keep_matched_ids].set_index("final_ids")

In [None]:
# Quick manual check of choose_new_id: the second list takes precedence,
# the first list is the fallback, and None is returned when both are empty.
list1 = [None, "a", None, "b"]
list2 = ["c", None, None, "d"]
df = pd.DataFrame(dict(l1=list1, l2=list2))
choose_new_id(df, list1, "l1", list2, "l2")

In [None]:
print(current_mdata.shape)
print(july_data_upload.shape)

In [None]:
sum(valid_new_ids)

In [None]:
current_mdata.head(1)

In [None]:
july_data_upload.head(1)

In [None]:
current_mdata.columns

In [None]:
## FOR EACH DATASET IN BACKOFFICE THAT HAS METADATA, UPLOAD IT
from configparser import ConfigParser

# Read the API token from the [auth] section of ../.env (INI format).
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail loudly here instead of surfacing a
# confusing NoSectionError on the next line.
if not config.read("../.env"):
    raise FileNotFoundError("Could not read '../.env' (expected an [auth] section with api_token)")
api_token = config.get("auth", "api_token")

auth_token = api_token # <Insert Auth Token Here>
def clean_nulls(val):
    """Return ``None`` for a NaN value and ``val`` unchanged otherwise.

    Used to turn pandas/numpy missing-value markers into ``None`` before the
    value is placed in a JSON payload.

    Parameters
    ----------
    val : any
        Value to inspect; non-numeric values (str, None, lists, ...) are
        returned as they are.

    Returns
    -------
    ``None`` if ``np.isnan(val)`` is true, otherwise ``val``.
    """
    try:
        if np.isnan(val):
            return(None)
        else:
            return(val)
    except (TypeError, ValueError):
        # TypeError: np.isnan does not support the type (str, None, ...).
        # ValueError: val is array-like, so the truth value is ambiguous.
        # In both cases the value is not a scalar NaN and is passed through.
        return(val)

#test = ["3624554e-b240-4edb-9110-1f010642c3f3"]



# Dataset ids for which a full-metadata upload was attempted in this cell.
processed1 = []


### THIS ADDS ALL DATASETS FOR WHICH WE HAVE METADATA in METADATA FOR UPLOAD ###

# The request headers are identical for every dataset, so build them once.
headers = {
    'content-type': "application/json",
    'authorization': "Bearer " + auth_token,
}

small_batch = ["5e69cfac-1f68-4864-a19a-3c1bdb180100"]
#for rw_id in current_mdata.index:
for rw_id in small_batch:
    url = "https://api.resourcewatch.org/v1/dataset/"+str(rw_id)+"/metadata"
    print(url)
    # Everything from current_mdata
    metadata = current_mdata.loc[rw_id]
    # A 2-D result means several rows share this id; show them for inspection.
    if len(metadata.shape) > 1:
        print(metadata)

    row_payload = {
        "language": "en",

        "name": clean_nulls(metadata["Public Title"]),
        "description": clean_nulls(metadata["Description"]),
        "source": clean_nulls(metadata["Subtitle"]),
        "functions": clean_nulls(metadata["Function"]),

        "application":"rw",
        "dataset":rw_id,

        "info": {

            "wri_rw_id": clean_nulls(metadata["Unique ID"]),
            "data_type": clean_nulls(metadata["Data Type"]),

            "name": clean_nulls(metadata["Public Title"]),
            "source_organization": clean_nulls(metadata["Source Organizations"]),
            "technical_title":clean_nulls(metadata["Technical Title"]),

            "function": clean_nulls(metadata["Function"]),
            "cautions": clean_nulls(metadata["Cautions"]),

            "citation": clean_nulls(metadata["Citation"]),
            "summary_of_license": clean_nulls(metadata["Summary of Licence"]),
            "link_to_license": clean_nulls(metadata["Link to License"]),

            "geographic_coverage": clean_nulls(metadata["Geographic Coverage"]),
            "spatial_resolution": clean_nulls(metadata["Spatial Resolution"]),
            "date_of_content": clean_nulls(metadata["Date of Content"]),
            "frequency_of_updates": clean_nulls(metadata["Frequency of Updates"]),

            "learn_more_link": clean_nulls(metadata["Learn More Link"]),
            "data_download_link": clean_nulls(metadata["Download Data (S3)"]),
            "data_download_original_link":clean_nulls(metadata["Download from Source"])

        }
    }

    try:
        processed1.append(rw_id)
        # Create the metadata record; fall back to PATCH when one already exists.
        res = req.request("POST", url, data=json.dumps(row_payload), headers = headers)
        print(res)
        if("already exists" in res.text):
            res = req.request("PATCH", url, data=json.dumps(row_payload), headers = headers)
            print(res)
        # Report failures via the HTTP status. The previous substring test
        # ("errors:") cannot match a JSON body, where the key is written
        # as "errors": with a closing quote before the colon.
        if not res.ok:
            print(res.text)
    except TypeError as e:
        # json.dumps raises TypeError when the payload holds a value that is
        # not JSON-serialisable; show which dataset caused it.
        print(e.args)
        print(metadata[["Unique ID", "Public Title"]])

In [None]:
#### Merge subtitles with Tracking sheet

#### UPLOADS TITLE, SUBTITLE, AND DOWNLOAD LINKS, if there is no METADATA IN METADATA FOR UPLOAD AVAILABLE ####

# Keep only those datasets from tracking sheet with rw_ids already
tracking_valid_old_ids = pd.notnull(july_data_upload[old_id_col])
tracking_valid_new_ids = pd.notnull(july_data_upload[new_id_col])

july_data_upload["final_ids"] = choose_new_id(july_data_upload, tracking_valid_old_ids,old_id_col, tracking_valid_new_ids,new_id_col)

# Ids present on the tracking sheet but absent from the metadata sheet.
# pd.notnull rejects both None and NaN, whichever pandas stored for "no id".
missed_ids = [
    rw_id
    for rw_id in july_data_upload["final_ids"].values
    if pd.notnull(rw_id) and (rw_id not in current_mdata.index)
]

missed_data = july_data_upload.reset_index().set_index("final_ids")
missed_data = missed_data.loc[missed_ids]


### THIS ADDS ALL DATASETS FOR WHICH WE HAVE ENTRIES IN TRACKING SHEET and NOTHING IN METADATA FOR UPLOAD###
print("True if below print empty list []")
print([ind for ind in missed_data.index if ind in current_mdata.index])

# Dataset ids for which a reduced-metadata upload was attempted in this cell.
processed2 = []

# The request headers are identical for every dataset, so build them once.
headers = {
    'content-type': "application/json",
    'authorization': "Bearer " + auth_token,
}

for rw_id in missed_data.index:
    url = "https://api.resourcewatch.org/v1/dataset/"+str(rw_id)+"/metadata"
    # Everything from the tracking sheet
    metadata = missed_data.loc[rw_id]
    print(metadata["WRI Unique ID"])
    print(metadata["Public Title"])
    print(url)

    row_payload = {
        "language": "en",

        "name": clean_nulls(metadata["Public Title"]),
        "source": clean_nulls(metadata["Subtitle"]),

        "application":"rw",
        "dataset":rw_id,

        "info": {

            "wri_rw_id": clean_nulls(metadata["WRI Unique ID"]),

            "name": clean_nulls(metadata["Public Title"]),
            "technical_title":clean_nulls(metadata["Technical Title"]),

            "data_download_link": clean_nulls(metadata["Download Data (S3)"]),
            "data_download_original_link": clean_nulls(metadata["Download from Source"])

        }
    }

    try:
        processed2.append(rw_id)
        # Create the metadata record; fall back to PATCH when one already exists.
        res = req.request("POST", url, data=json.dumps(row_payload), headers = headers)
        if("already exists" in res.text):
            res = req.request("PATCH", url, data=json.dumps(row_payload), headers = headers)
        # Report failures via the HTTP status. The previous substring test
        # ("errors:") cannot match a JSON body, where the key is written
        # as "errors": with a closing quote before the colon.
        if not res.ok:
            print(res.text)
    except TypeError as e:
        # json.dumps raises TypeError when the payload holds a value that is
        # not JSON-serialisable; show which dataset caused it. The tracking
        # sheet names its id column "WRI Unique ID" (read above), not
        # "Unique ID" as on the metadata sheet.
        print(e.args)
        print(metadata[["WRI Unique ID", "Public Title"]])



In [None]:
# Save the tracking-only datasets to the current user's Desktop. Building the
# path from the home directory keeps the notebook runnable on other machines.
missed_data_csv = os.path.join(
    os.path.expanduser("~"),
    "Desktop",
    "datasets_on_july_sheet_with_rw_id_no_metadata.csv",
)
missed_data.to_csv(missed_data_csv)
missed_data

In [None]:
"9ea634db-53af-445e-a767-60ec9efc321e" in processed2

In [None]:
#### Inspect metadata on backoffice

# Base URL for getting dataset metadata from RW API
# Metadata = Data that describes Data
url = "https://api.resourcewatch.org/v1/dataset?sort=slug,-provider,userId&status=saved&includes=metadata,vocabulary,widget,layer"

# page[size] tells the API the maximum number of results to send back
# There are currently between 200 and 300 datasets on the RW API
payload = { "application":"rw", "page[size]": 1000}

# Request all datasets, and extract the data from the response
res = req.get(url, params=payload)
# Stop here on an HTTP error instead of failing later on a missing "data" key.
res.raise_for_status()
data = res.json()["data"]

#############################################################

### Convert the json object returned by the API into a pandas DataFrame
# Another option: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html
# Records are keyed by dataset id: keying by name would silently overwrite
# datasets that share a name, and the frame is re-indexed by rw_id afterwards anyway.
datasets_on_api = {}
for dset in data:
    atts = dset["attributes"]
    metadata = atts["metadata"]
    layers = atts["layer"]
    widgets = atts["widget"]
    tags = atts["vocabulary"]
    datasets_on_api[dset["id"]] = {
        "rw_id":dset["id"],
        "upload_name":atts["name"],
        "table_name":atts["tableName"],
        "provider":atts["provider"],
        "date_updated":atts["updatedAt"],
        "num_metadata_keys":len(metadata),
        "metadata": metadata,
        "num_layers":len(layers),
        "layers": layers,
        "num_widgets":len(widgets),
        "widgets": widgets,
        "num_tags":len(tags),
        "tags":tags
    }

# Create the DataFrame: one row per dataset, one column per key above
current_datasets_on_api = pd.DataFrame.from_dict(datasets_on_api, orient='index')

def check_public_title(metadata):
    """Return the public title stored in a dataset's first metadata entry.

    The title is read from ``metadata[0]["attributes"]["info"]["name"]``.
    ``None`` is returned when ``metadata`` is empty or any of those keys is
    missing.
    """
    if len(metadata) == 0:
        return None

    first_entry = metadata[0]
    if "attributes" not in first_entry:
        return None

    attributes = first_entry["attributes"]
    if "info" not in attributes:
        return None

    info = attributes["info"]
    return info["name"] if "name" in info else None

# Grab public title, if it exists in metadata
current_datasets_on_api["public_title"] = current_datasets_on_api.apply(
    lambda row: check_public_title(row["metadata"]),
    axis=1,
)

# Index by dataset id (index named "Dataset"), most recently updated first.
current_datasets_on_api = (
    current_datasets_on_api
    .set_index("rw_id")
    .rename_axis("Dataset")
    .sort_values(by=["date_updated"], ascending=False)
)

In [None]:
### THIS COVERS ALL DATASETS WHICH ARE ON THE BACKOFFICE but HAVE NO WRI_ID / RW_ID IN TRACKING SHEET ###
### Occasionally this is because the data has been moved to after launch

investigate_mdata = current_datasets_on_api[["upload_name", "public_title", "metadata"]]

# Collect every dataset id on the API that neither upload loop handled.
missed_ids = []
for rw_id in investigate_mdata.index:
    if rw_id in processed1 or rw_id in processed2:
        continue
    missed_ids.append(rw_id)

investigate_mdata = investigate_mdata.loc[missed_ids]

investigate_mdata

Many of these are datasets for which the Unique ID changed:

soc.003 Distribution of Infant Mortality
soc.016 Conflict and Protest Events in African...
dis_007 Landslide Susceptibility Map
bio.035 Coral Bleaching Frequency Prediction
dis.001 Earthquakes Over the Past 30 days
Foo_046a Food Footprint in Protein
wat.033 Agriculture Water Demand and Depletion
soc.062 Internal Displacement
soc.061 Rural Poverty
soc.042 Percentage of Urban Population with Ac
soc.020 GINI Index
soc.008 Gross Domestic Product Per Capita (PPP
soc.006 Multidimensional Poverty Index
soc.004 Human Development Index
soc.002 Gender Development Index
foo.002 GLDAS Land Water Content from NOAH Lan..
com.028 Effect of Agricultural Policies on Com...
cit.029 Municipal Waste

In [None]:
# DANGER Bug - able to update metadata for a dataset that no longer exists on the API
#test upload cit.029:
#    broken, old id: 8f14a33e-5a61-47af-b26e-c1fc036932a5
#    working, new id: 00abb46f-34e2-4bf7-be30-1fb0b1de022f
# NOTE(review): url2 below uses a different id than the "working, new id"
# listed above — confirm which dataset is intended.
# Re-sends the row_payload/headers left over from the upload cells above.

url1="https://api.resourcewatch.org/v1/dataset/8f14a33e-5a61-47af-b26e-c1fc036932a5/metadata"
url2="https://api.resourcewatch.org/v1/dataset/10337db6-8321-445e-a60b-28fc1e114f29/metadata"

# POST first; fall back to PATCH when the API reports existing metadata.
res1a = req.request("POST", url1, data=json.dumps(row_payload), headers = headers)
res1b = None
if("already exists" in res1a.text):
    res1b = req.request("PATCH", url1, data=json.dumps(row_payload), headers = headers)

res2a = req.request("POST", url2, data=json.dumps(row_payload), headers = headers)
res2b = None
if("already exists" in res2a.text):
    # Was PATCHing url1 a second time (copy-paste slip); the fallback for the
    # second dataset must target url2.
    res2b = req.request("PATCH", url2, data=json.dumps(row_payload), headers = headers)

# res1b only exists when the PATCH fallback ran; otherwise show the POST reply
# instead of raising NameError.
print(res1b.text if res1b is not None else res1a.text)