Overview:
* Load metadata and tracking sheet details

Import libraries

In [None]:
import requests as req
import os
import json

import numpy as np
import pandas as pd
# Raise pandas display limits to 1000 rows / 1000 columns / 1000 characters per cell
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
pd.options.display.max_colwidth = 1000

Load Authentication Credentials

In [None]:
from configparser import ConfigParser

# Location of the INI-style credentials file. Setting the RW_CRED_FILE
# environment variable overrides the original hard-coded path, so the notebook
# can run on another machine without editing this cell.
cred_path = os.environ.get(
    "RW_CRED_FILE",
    "/Users/nathansuberi/Desktop/WRI_Programming/cred/.env",
)

config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; an empty list means nothing was loaded.
if not config.read(cred_path):
    raise FileNotFoundError("Could not read credentials file: " + cred_path)
api_token = config.get("auth", "rw_api_token")

auth_token = api_token # <Insert Auth Token Here>

In [None]:
#### Download Google Spreadsheets ####
# QC Ready Metadata
!curl "https://docs.google.com/spreadsheets/d/1UkABgMlBIinJjITa6WepFAL-8VBkulS0LCbKojRXjVY/export?format=tsv" > current_metadata.tsv
current_mdata = pd.read_csv(open("current_metadata.tsv", "r"), sep="\t", index_col=[0])
os.remove("current_metadata.tsv")

# Continue with the metadata that matches elements in the tracking sheet
ids_on_backoffice = pd.notnull(current_mdata["final_ids"])
mdata_for_ids_on_backoffice = current_mdata.loc[ids_on_backoffice]

# Should have used this:
mdata_for_ids_on_backoffice = mdata_for_ids_on_backoffice.reset_index().set_index("final_ids")

In [None]:
# Preview the first rows of the final_ids-indexed metadata
mdata_for_ids_on_backoffice.head()

In [None]:
def clean_nulls(val):
    """Replace a NaN value with None so it can be sent to the RW API.

    Args:
        val: Any value taken from the metadata tables.

    Returns:
        None when ``np.isnan(val)`` is true; otherwise ``val`` unchanged.
        Values that ``np.isnan`` cannot evaluate to a single boolean
        (strings, None, multi-element containers) are returned unchanged.
    """
    try:
        is_nan = bool(np.isnan(val))
    except (TypeError, ValueError):
        # TypeError: np.isnan does not support this type (str, None, ...).
        # ValueError: the result has several elements and no single truth value.
        return val
    return None if is_nan else val

def create_source_object(sources):
    """Build the list of source entries expected by the API.

    Args:
        sources: "/"-separated string of source names, or a falsy value
            (None, "") when there are none.

    Returns:
        A list with one dict per name, numbered from 0 in order of appearance
        and carrying an empty description, or None when ``sources`` is falsy.
    """
    if not sources:
        return None
    return [
        {
            "source-name": name,
            "id": position,
            "source-description": ""
        }
        for position, name in enumerate(sources.split("/"))
    ]

In [None]:
## FOR EACH DATASET IN BACKOFFICE THAT HAS METADATA, UPLOAD IT

### THIS ADDS ALL DATASETS FOR WHICH WE HAVE METADATA in METADATA FOR UPLOAD ###

# Ids for which an upload was attempted (recorded just before the request is sent)
processed1 = []

# The request headers are the same for every dataset, so build them once
headers = {
    'content-type': "application/json",
    'authorization': "Bearer " + auth_token,
}

for rw_id in mdata_for_ids_on_backoffice.index:
    url = "https://api.resourcewatch.org/v1/dataset/"+str(rw_id)+"/metadata"
    metadata = mdata_for_ids_on_backoffice.loc[rw_id]

    # .loc returns a 2-D DataFrame when several rows share this rw_id. The
    # scalar checks below cannot handle that (they raise ValueError and would
    # abort the whole loop), so print the rows for trouble-shooting and move on.
    if len(metadata.shape) > 1:
        print(metadata)
        print("Skipping " + str(rw_id) + ": multiple metadata rows share this id")
        continue

    # If the data is of type raster, don't include the Download Data (S3) link
    data_type = clean_nulls(metadata["Data Type"])
    if data_type is not None and data_type.lower() != "raster":
        data_dl_link = clean_nulls(metadata["Download Data (S3)"])
    else:
        data_dl_link = None

    # If there is no download from source, default to the learn more link
    data_dl_orig_link = clean_nulls(metadata["Download from Source"])
    if data_dl_orig_link is None:
        data_dl_orig_link = clean_nulls(metadata["Learn More Link"])

    # If there is no technical title, default to the public title
    tech_title = clean_nulls(metadata["Technical Title"])
    if tech_title is None:
        tech_title = clean_nulls(metadata["Public Title"])

    print(clean_nulls(metadata["Unique ID"]))
    row_payload = {
        "language": "en",

        "name": clean_nulls(metadata["Public Title"]),
        "description": clean_nulls(metadata["Description"]),
        "subtitle": clean_nulls(metadata["Subtitle"]),
        # NOTE(review): "source" is filled from the Subtitle column - confirm this is intended
        "source": clean_nulls(metadata["Subtitle"]),
        "functions": clean_nulls(metadata["Function"]),

        "application":"rw",
        "dataset":rw_id,

        "info": {
            "wri_rw_id": clean_nulls(metadata["Unique ID"]),
            "rwId": clean_nulls(metadata["Unique ID"]),

            "data_type": data_type,

            "name": clean_nulls(metadata["Public Title"]),
            "sources": create_source_object(clean_nulls(metadata["Source Organizations"])),

            "technical_title":tech_title,

            "functions": clean_nulls(metadata["Function"]),
            "cautions": clean_nulls(metadata["Cautions"]),

            "citation": clean_nulls(metadata["Citation"]),

            "license": clean_nulls(metadata["Summary of Licence"]),
            "license_link": clean_nulls(metadata["Link to License"]),

            "geographic_coverage": clean_nulls(metadata["Geographic Coverage"]),
            "spatial_resolution": clean_nulls(metadata["Spatial Resolution"]),

            "date_of_content": clean_nulls(metadata["Date of Content"]),
            "frequency_of_updates": clean_nulls(metadata["Frequency of Updates"]),

            "learn_more_link": clean_nulls(metadata["Learn More Link"]),

            "data_download_link": data_dl_link,
            "data_download_original_link":data_dl_orig_link
        }
    }

    try:
        processed1.append(rw_id)
        # Try to create the metadata first; on any non-OK response fall back to updating it
        res = req.request("POST", url, data=json.dumps(row_payload), headers = headers)
        if res.ok:
            print('New metadata uploaded')
        else:
            print('Whoops, already exists! Updating metadata.')
            res = req.request("PATCH", url, data=json.dumps(row_payload), headers = headers)
            print('Ok now?:', res.ok)
            if not res.ok:
                print(res.text)
                print(mdata_for_ids_on_backoffice.loc[rw_id])
    except TypeError as e:
        # json.dumps raises TypeError when the payload holds a non-serializable value
        print(e.args)
        print(metadata[["Unique ID", "Public Title"]])

Experimentation below

In [None]:
# July Data Sheet
!curl "https://docs.google.com/spreadsheets/d/1viPOGYIk6RGu7YMoM3BHNVbkWaCZ0JFBOMSNncWvHYk/export?format=tsv" > tracking_sheet.tsv
tracking_sheet = pd.read_csv("tracking_sheet.tsv", sep="\t", index_col=[0])
os.remove("tracking_sheet.tsv")

# Metadata for Upload
!curl "https://docs.google.com/spreadsheets/d/1UkABgMlBIinJjITa6WepFAL-8VBkulS0LCbKojRXjVY/export?format=tsv" > current_metadata.tsv
current_mdata = pd.read_csv(open("current_metadata.tsv", "r"), sep="\t", index_col=[0])
os.remove("current_metadata.tsv")

Helper Functions

In [None]:
# From General Utilities

def transfer_columns_between_tables(src_df, dst_df, columns_to_xfer=None):
    """
    Copy columns from src_df onto a copy of dst_df, aligned on dst_df's index.

    Inputs:
    * src_df and dst_df with same index
    * list of columns (or a single column name) from src_df to transfer to dst_df

    Outputs:
    * a copy of dst_df with the new columns from the src_df; dst_df itself is
      not modified
    * None if the transfer fails - the error is printed rather than raised
    """

    dst_df = dst_df.copy()
    try:
        # Select the src rows in dst order, then attach them as new columns
        dst_df[columns_to_xfer] = src_df.loc[dst_df.index, columns_to_xfer]
    except Exception as err:
        # Deliberately best-effort: report and return None instead of raising.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        print("ERROR")
        print("Possible source: There is an element in the metadata sheet that is not in the tracking sheet")
        print("Possible source: One of more of the given columns to transfer is not in the source df")
        print("Details: " + repr(err))
        return None
    return dst_df
    
# Specific to this notebook

def investigate_unmatched(src_df, dst_df):
    """Find the index labels of dst_df that are missing from src_df's index.

    This function assumes that the two dataframes share the same index of Unique IDs.

    Returns:
        A list of ``(position + 2, label)`` tuples, one per label of dst_df
        that is not in src_df's index, where ``position`` is the label's
        0-based position in dst_df. (The +2 presumably converts the position
        to a spreadsheet row number - header row plus 1-based counting -
        TODO confirm.)
    """
    src_index = src_df.index
    return [
        (position + 2, label)
        for position, label in enumerate(dst_df.index)
        if label not in src_index
    ]

def clean_nulls(val):
    """Replace a NaN value with None so it can be sent to the RW API.

    NOTE: this repeats the definition given earlier in the notebook; keep the
    two in sync (or delete one).

    Args:
        val: Any value taken from the metadata tables.

    Returns:
        None when ``np.isnan(val)`` is true; otherwise ``val`` unchanged.
        Values that ``np.isnan`` cannot evaluate to a single boolean
        (strings, None, multi-element containers) are returned unchanged.
    """
    try:
        is_nan = bool(np.isnan(val))
    except (TypeError, ValueError):
        # TypeError: np.isnan does not support this type (str, None, ...).
        # ValueError: the result has several elements and no single truth value.
        return val
    return None if is_nan else val

def create_source_object(sources):
    """Build the list of source entries expected by the API.

    NOTE: this repeats the definition given earlier in the notebook.

    Args:
        sources: "/"-separated string of source names, or a falsy value
            (None, "") when there are none.

    Returns:
        A list with one dict per name, numbered from 0 in order of appearance
        and carrying an empty description, or None when ``sources`` is falsy.
    """
    if not sources:
        return None
    return [
        {
            "source-name": name,
            "id": position,
            "source-description": ""
        }
        for position, name in enumerate(sources.split("/"))
    ]

def choose_new_id(df, old_id_col, new_id_col):
    """
    Pick one id per row, preferring the new id over the old one.

    Inputs:
    * A dataframe with two columns - old_ids, and new_ids. Could be "Perfect IDs" vs. old ids on RW backoffice
    * Names of the columns with the old_ids and new_ids
    Outputs:
    * A list of final_ids that can be appended to df: the new id where it is
      not null, else the old id where it is not null, else None
    """
    final_ids = []
    # Walk the two columns in parallel instead of building a full row Series
    # with df.iloc[i] twice per row; this also keeps each id's own type rather
    # than the row-wide common dtype.
    for old_id, new_id in zip(df[old_id_col], df[new_id_col]):
        if pd.notnull(new_id):
            final_ids.append(new_id)
        elif pd.notnull(old_id):
            final_ids.append(old_id)
        else:
            final_ids.append(None)
    return final_ids

# Sample usage
# Row by row: new id wins when present, else the old id, else None
# -> expected result: ['c', 'a', None, 'd']
old_list = [None, "a", None, "b"]
new_list = ["c", None, None, "d"]
df = pd.DataFrame({"old":old_list, "new":new_list})
choose_new_id(df, "old", "new")

#old_id_col = "old_ids"
#new_id_col = "new_ids"
#tracking_valid_old_ids = pd.notnull(tracking_sheet[old_id_col])
#tracking_valid_new_ids = pd.notnull(tracking_sheet[new_id_col])
#tracking_sheet["final_ids"] = choose_new_id(tracking_sheet, tracking_valid_old_ids,old_id_col, tracking_valid_new_ids,new_id_col)

In [None]:
# Investigating dropped metadata

# Load three local CSV exports. qc is re-indexed by its "Unique ID" column;
# later and dropped keep the first CSV column as their index.
qc = pd.read_csv("/Users/nathansuberi/Desktop/RW_Data/Metadata for Upload - QC Ready Metadata.csv", index_col=[0]).reset_index().set_index("Unique ID")
later = pd.read_csv("/Users/nathansuberi/Desktop/RW_Data/Metadata for Upload - Metadata for later.csv", index_col=[0])
dropped = pd.read_csv("/Users/nathansuberi/Desktop/RW_Data/Metadata for Upload - Updated metadata 12-07.csv", index_col=[0])

dropped_ix = dropped.index
later_ix = later.index
qc_ix = qc.index

# Index labels of qc / later that also appear in dropped's index
qc_in_dropped = [ix for ix in qc_ix if ix in dropped_ix]
later_in_dropped = [ix for ix in later_ix if ix in dropped_ix]

In [None]:
# Row counts of dropped, later and qc, then the sizes of the overlaps of
# later and qc with dropped
print(len(dropped_ix))
print(len(later_ix))
print(len(qc_ix))
print(len(later_in_dropped))
print(len(qc_in_dropped))

In [None]:
# List the column names of the dropped-metadata sheet
dropped.columns

In [None]:
# Rows of `dropped` whose index label appears in neither qc nor later
data_to_add_to_qc = dropped.drop(qc_in_dropped+later_in_dropped)
# Columns that qc has but the rows being added back do not
missing_cols = [col for col in qc.columns if col not in data_to_add_to_qc.columns]
missing_cols

#for col in missing_cols:
#    data_to_add_to_qc[col] = None

#### Transfer info from the Tracking sheet to the Metadata sheet
id_col = "API_ID"
dl_from_src_col = "Download from Source"
dl_from_s3_col = "Download Data (S3)"
public_title = "Public Title"
technical_title = "Technical Title"
distribution_restriction = "Distribution Restriction"
shared_api = "Shared API - Do Not Touch These!"

columns_to_xfer = [id_col, 
                   dl_from_src_col, dl_from_s3_col, 
                   public_title, technical_title, 
                   distribution_restriction, shared_api]

# NOTE(review): the source here is `dropped`, of which data_to_add_to_qc is a
# row subset - confirm the tracking sheet was not the intended source.
# transfer_columns_between_tables returns None on failure, in which case the
# next line raises.
current_mdata = transfer_columns_between_tables(dropped, data_to_add_to_qc, columns_to_xfer)
current_mdata["final_ids"] = current_mdata[id_col]
# Keep only qc's columns, in qc's column order
current_mdata = current_mdata[qc.columns]

current_mdata.to_csv("/Users/nathansuberi/Desktop/RW_Data/Adding_back_in_missed_data.csv")

In [None]:
#### Download Google Spreadsheets ####

# Make a temporary directory, and download files into it
# This makes it easy to remove them later
!mkdir temp
os.chdir("temp")
dest = os.getcwd()

# July Data Sheet
!curl "https://docs.google.com/spreadsheets/d/1viPOGYIk6RGu7YMoM3BHNVbkWaCZ0JFBOMSNncWvHYk/export?format=tsv" > tracking_sheet.tsv
tracking_sheet = pd.read_csv(dest+"/tracking_sheet.tsv", sep="\t", index_col=[0])

# # Metadata to Upload
!curl "https://docs.google.com/spreadsheets/d/1UkABgMlBIinJjITa6WepFAL-8VBkulS0LCbKojRXjVY/export?format=tsv" > current_metadata.tsv
current_mdata = pd.read_csv(open(dest+"/current_metadata.tsv", "r"), sep="\t", index_col=[0])

# Delete temporary files
os.chdir("..")
!rm -r temp


#### Check for entries that will break the transfer
unmatched = investigate_unmatched(tracking_sheet, current_mdata)
print("These cause an error: " + str(unmatched))
try:
    _, drop_ixs = zip(*unmatched)
except:
    drop_ixs=[]

# Remove the metadata that is not in the tracking sheet, set aside
# Need to cast as a list, or else assumes the second item in the list is a column identifier
metadata_for_later = current_mdata.loc[list(drop_ixs)]
metadata_for_later.to_csv("/Users/nathansuberi/Desktop/RW_Data/metadata_for_later.csv")

# Continue with the metadata that matches elements in the tracking sheet
current_mdata = current_mdata.drop(list(drop_ixs), axis=0)

#### Transfer info from the Tracking sheet to the Metadata sheet
# id_col = "API_ID"
# dl_from_src_col = "Download from Source"
# dl_from_s3_col = "Download Data (S3)"
# public_title = "Public Title"
# technical_title = "Technical Title"
# distribution_restriction = "Distribution Restriction"
# shared_api = "Shared API - Do Not Touch These!"

# columns_to_xfer = [id_col, 
#                    dl_from_src_col, dl_from_s3_col, 
#                    public_title, technical_title, 
#                    distribution_restriction, shared_api]

# current_mdata = transfer_columns_between_tables(tracking_sheet, current_mdata, columns_to_xfer)

# Set the index of the current_mdata df to be the final id's, 
# Only move forward with metadata 
#current_mdata["final_ids"] = current_mdata[id_col]
ids_on_backoffice = pd.notnull(current_mdata["final_ids"])
mdata_for_ids_on_backoffice = current_mdata.loc[ids_on_backoffice]

# Should have used this:
mdata_for_ids_on_backoffice = mdata_for_ids_on_backoffice.reset_index().set_index("final_ids")

# Save the updated metadata
#mdata_for_ids_on_backoffice.to_csv("/Users/nathansuberi/Desktop/RW_Data/mdata_for_ids_on_backoffice.csv")

In [None]:
# Show the final_ids that will be used as dataset ids in the upload URLs
mdata_for_ids_on_backoffice.index

In [None]:
# Transfer the subtitles from the metadata sheet to the tracking sheet
# A single column name (string) is passed here rather than a list.
# NOTE: transfer_columns_between_tables returns None on failure (e.g. a
# tracking-sheet id missing from current_mdata), in which case .to_csv raises.
tracking_sheet_with_subtitles = transfer_columns_between_tables(current_mdata, tracking_sheet, "Subtitle")
tracking_sheet_with_subtitles.to_csv("/Users/nathansuberi/Desktop/RW_Data/tracking_sheet_with_subtitles.csv")

In [None]:
# Compare (rows, columns) of the id-matched metadata and the tracking sheet
print(mdata_for_ids_on_backoffice.shape)
print(tracking_sheet.shape)

In [None]:
# Column names and the first row of the id-matched metadata
print(mdata_for_ids_on_backoffice.columns)
print(mdata_for_ids_on_backoffice.head(1))

In [None]:
## FOR EACH DATASET IN BACKOFFICE THAT HAS METADATA, UPLOAD IT

### THIS ADDS ALL DATASETS FOR WHICH WE HAVE METADATA in METADATA FOR UPLOAD ###

# Ids for which an upload was attempted (recorded just before the request is sent)
processed1 = []

# The request headers are the same for every dataset, so build them once
headers = {
    'content-type': "application/json",
    'authorization': "Bearer " + auth_token,
}

for rw_id in mdata_for_ids_on_backoffice.index:
    url = "https://api.resourcewatch.org/v1/dataset/"+str(rw_id)+"/metadata"
    print(url)
    metadata = mdata_for_ids_on_backoffice.loc[rw_id]

    # .loc returns a 2-D DataFrame when several rows share this rw_id. The
    # scalar checks below cannot handle that (they raise ValueError and would
    # abort the whole loop), so print the rows for trouble-shooting and move on.
    if len(metadata.shape) > 1:
        print(metadata)
        print("Skipping " + str(rw_id) + ": multiple metadata rows share this id")
        continue

    # If the data is of type raster, don't include the Download Data (S3) link
    data_type = clean_nulls(metadata["Data Type"])
    if data_type is not None and data_type.lower() != "raster":
        data_dl_link = clean_nulls(metadata["Download Data (S3)"])
    else:
        data_dl_link = None

    # If there is no download from source, default to the learn more link
    data_dl_orig_link = clean_nulls(metadata["Download from Source"])
    if data_dl_orig_link is None:
        data_dl_orig_link = clean_nulls(metadata["Learn More Link"])

    # If there is no technical title, default to the public title
    tech_title = clean_nulls(metadata["Technical Title"])
    if tech_title is None:
        tech_title = clean_nulls(metadata["Public Title"])

    print(clean_nulls(metadata["Unique ID"]))

    row_payload = {
        "language": "en",

        "name": clean_nulls(metadata["Public Title"]),
        "description": clean_nulls(metadata["Description"]),
        "subtitle": clean_nulls(metadata["Subtitle"]),
        # NOTE(review): "source" is filled from the Subtitle column - confirm this is intended
        "source": clean_nulls(metadata["Subtitle"]),
        "functions": clean_nulls(metadata["Function"]),

        "application":"rw",
        "dataset":rw_id,

        "info": {

            # One of these a duplicate, test how shows up in front-end
            # or should rwId be dataset, above?
            "wri_rw_id": clean_nulls(metadata["Unique ID"]),
            "rwId": clean_nulls(metadata["Unique ID"]),

            "data_type": data_type,

            "name": clean_nulls(metadata["Public Title"]),
            "sources": create_source_object(clean_nulls(metadata["Source Organizations"])),

            "technical_title":tech_title,

            "functions": clean_nulls(metadata["Function"]),
            "cautions": clean_nulls(metadata["Cautions"]),

            "citation": clean_nulls(metadata["Citation"]),

            "license": clean_nulls(metadata["Summary of Licence"]),
            "license_link": clean_nulls(metadata["Link to License"]),

            "geographic_coverage": clean_nulls(metadata["Geographic Coverage"]),
            "spatial_resolution": clean_nulls(metadata["Spatial Resolution"]),

            "date_of_content": clean_nulls(metadata["Date of Content"]),
            "frequency_of_updates": clean_nulls(metadata["Frequency of Updates"]),

            "learn_more_link": clean_nulls(metadata["Learn More Link"]),

            "data_download_link": data_dl_link,
            "data_download_original_link":data_dl_orig_link

        }
    }

    try:
        processed1.append(rw_id)
        # Create the metadata; if the API reports it already exists, update it instead
        res = req.request("POST", url, data=json.dumps(row_payload), headers = headers)
        print(res)
        if("already exists" in res.text):
            res = req.request("PATCH", url, data=json.dumps(row_payload), headers = headers)
            print(res)
        # Report failures by HTTP status. The former substring test for
        # 'errors:' cannot match a JSON body, where the key is written "errors":
        if not res.ok:
            print(res.text)

    except TypeError as e:
        # json.dumps raises TypeError when the payload holds a non-serializable value
        print(e.args)
        print(metadata[["Unique ID", "Public Title"]])

In [None]:
#### Merge subtitles with Tracking sheet

#### UPLOADS TITLE, SUBTITLE, AND DOWNLOAD LINKS, if there is no METADATA IN METADATA FOR UPLOAD AVAILABLE (RW_UNIQUE ID EXISTS) ####

# Keep only those datasets from tracking sheet with rw_ids already

# Missing ids come out of pandas as NaN, which `!= None` does not filter out,
# so test with pd.notnull. A set makes the "already processed" lookup cheap.
already_processed = set(processed1)
missed_ids = [rw_id for rw_id in tracking_sheet["final_ids"].values
              if pd.notnull(rw_id) and rw_id not in already_processed]

missed_data = tracking_sheet.reset_index().set_index("final_ids")
missed_data = missed_data.loc[missed_ids]

### THIS ADDS ALL DATASETS FOR WHICH WE HAVE ENTRIES IN TRACKING SHEET and NOTHING IN METADATA FOR UPLOAD###
print("True if below print empty list []")
print([ind for ind in missed_data.index if ind in mdata_for_ids_on_backoffice.index])

# Ids for which an upload was attempted (recorded just before the request is sent)
processed2 = []

# The request headers are the same for every dataset, so build them once
headers = {
    'content-type': "application/json",
    'authorization': "Bearer " + auth_token,
}

for rw_id in missed_data.index:
    url = "https://api.resourcewatch.org/v1/dataset/"+str(rw_id)+"/metadata"
    # Everything from the tracking sheet row for this id
    metadata = missed_data.loc[rw_id]
    print(metadata["WRI Unique ID"])
    print(metadata["Public Title"])
    print(url)
    row_payload = {
        "language": "en",

        "name": clean_nulls(metadata["Public Title"]),
        "subtitle": clean_nulls(metadata["Subtitle"]),

        "application":"rw",
        "dataset":rw_id,

        "info": {

            "wri_rw_id": clean_nulls(metadata["WRI Unique ID"]),

            "name": clean_nulls(metadata["Public Title"]),
            "technical_title":clean_nulls(metadata["Technical Title"]),

            "data_download_link": clean_nulls(metadata["Download Data (S3)"]),
            "data_download_original_link": clean_nulls(metadata["Download from Source"])

        }
    }

    try:
        processed2.append(rw_id)
        # Create the metadata; if the API reports it already exists, update it instead
        res = req.request("POST", url, data=json.dumps(row_payload), headers = headers)
        if("already exists" in res.text):
            res = req.request("PATCH", url, data=json.dumps(row_payload), headers = headers)
        # Report failures by HTTP status. The former substring test for
        # 'errors:' cannot match a JSON body, where the key is written "errors":
        if not res.ok:
            print(res.text)
    except TypeError as e:
        # json.dumps raises TypeError when the payload holds a non-serializable value
        print(e.args)
        # This frame's id column is "WRI Unique ID" (as used above), not "Unique ID"
        print(metadata[["WRI Unique ID", "Public Title"]])



In [None]:
# Save, then display, the tracking-sheet rows that were uploaded without full metadata
missed_data.to_csv("/Users/nathansuberi/Desktop/datasets_on_july_sheet_with_rw_id_no_metadata.csv")
missed_data

In [None]:
#### Which datasets do we have metadata for, but are not on tracking sheet? ####
# processed1 stores datasets with metadata that were uploaded
# current_mdata is all mdata
# matched_mdata is all mdata with a final_id
# NOTE(review): current_mdata was read with index_col=[0]; if "Unique ID" is
# that first column it is the index here, not a column, and the lookups below
# would raise KeyError - confirm against the sheet layout.
print(mdata_for_ids_on_backoffice["Unique ID"].head())
print(current_mdata["Unique ID"].head())

# "Unique ID" values present in current_mdata but absent from the id-matched metadata
unmatched_ids = [wri_id for wri_id in current_mdata["Unique ID"].values if wri_id not in mdata_for_ids_on_backoffice["Unique ID"].values]
unmatched_mdata = current_mdata.set_index('Unique ID').loc[unmatched_ids]
unmatched_mdata

Access all metadata on backoffice

In [None]:
# pprint is used below; no import of it is visible elsewhere in this notebook
from pprint import pprint

# Base URL for getting dataset metadata from RW API
# Metadata = Data that describes Data 
url = "https://api.resourcewatch.org/v1/dataset?sort=slug,-provider,userId&status=saved&includes=metadata,vocabulary,widget,layer"

# page[size] tells the API the maximum number of results to send back
# There are currently between 200 and 300 datasets on the RW API
payload = { "application":"rw", "page[size]": 1000}

# Request all datasets, and extract the data from the response
res = req.get(url, params=payload)
# Fail here with the HTTP status rather than with a KeyError on "data" below
res.raise_for_status()
api_response = res.json()["data"]

pprint(api_response[0], depth=2)

#############################################################

### Convert the json object returned by the API into a pandas DataFrame
# Another option: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html
datasets_on_api = {}

for dset in api_response:

    atts = dset["attributes"]
    metadata = atts["metadata"]
    layers = atts["layer"]
    widgets = atts["widget"]
    tags = atts["vocabulary"]
    # Key each row by dataset id: keyed by name, two datasets sharing a name
    # would overwrite each other in this dict and one would be lost.
    datasets_on_api[dset["id"]] = {
        "rw_id":dset["id"],
        "upload_name":atts["name"],
        "table_name":atts["tableName"],
        "provider":atts["provider"],
        "date_updated":atts["updatedAt"],
        "num_metadata_keys":len(metadata),
        "metadata": metadata,
        "num_layers":len(layers),
        "layers": layers,
        "num_widgets":len(widgets),
        "widgets": widgets,
        "num_tags":len(tags),
        "tags":tags
    }

# Create the DataFrame: one row per dataset, one column per key above
datasets_on_api = pd.DataFrame.from_dict(datasets_on_api, orient='index')

def check_public_title(metadata):
    """Return the public title stored in a dataset's first metadata entry.

    Looks up ``metadata[0]["attributes"]["info"]["name"]`` and returns it.
    Returns None when ``metadata`` is empty or when any of those keys is
    absent along the way.
    """
    if len(metadata) == 0:
        return None

    first_entry = metadata[0]
    if "attributes" not in first_entry:
        return None

    attributes = first_entry["attributes"]
    if "info" not in attributes:
        return None

    info = attributes["info"]
    return info["name"] if "name" in info else None

# Grab public title, if it exists in metadata
datasets_on_api["public_title"] = datasets_on_api.apply(
    lambda record: check_public_title(record["metadata"]),
    axis=1,
)

# Index by dataset id, name that index "Dataset", and put the most recently
# updated datasets at the top
datasets_on_api = (
    datasets_on_api
    .set_index("rw_id")
    .rename_axis("Dataset")
    .sort_values(by=["date_updated"], ascending=False)
)

Pull down all the table names and use them to check that the tracking docs are up to date

In [None]:
table_names_new = {"final_ids":[], "table_name":[], "provider":[]}

# Iterate the raw API records. datasets_on_api is a DataFrame by this point,
# and iterating a DataFrame yields its column names (strings), so dset["id"]
# would raise instead of returning the dataset id.
for dset in api_response:
    table_names_new["final_ids"].append(dset["id"])
    table_names_new["table_name"].append(dset["attributes"]["tableName"])
    table_names_new["provider"].append(dset["attributes"]["provider"])

# One row per dataset, indexed by dataset id
dataset_table_names = pd.DataFrame.from_dict(table_names_new).set_index("final_ids")
dataset_table_names

In [None]:
# Two methods of merging:
# NOTE(review): july_data_upload and valid_new_ids_tracking are not defined in
# any cell visible in this notebook - they must already exist in the kernel.

# Method 1
# Left-join the table names onto the tracking data by matching its
# "final_ids" column against dataset_table_names' index
matching_with_tracking = pd.merge(july_data_upload, dataset_table_names, 
                                  left_on="final_ids",
                                  right_index=True,
                                  how="left")
# "table_name_y" is the merge-suffixed column, i.e. the table_name taken from dataset_table_names
matching_with_tracking = matching_with_tracking[["final_ids", "provider", "table_name_y"]]
matching_with_tracking["Perfect Dataset?"] = valid_new_ids_tracking
matching_with_tracking.to_csv("/Users/nathansuberi/Desktop/RW_Data/update_final_ids.csv")

# Method 2
# NOTE(review): this joins on a "Dataset on Backoffice" key, but
# dataset_table_names as built in this notebook is indexed by "final_ids" -
# confirm that key exists on both sides before relying on this.
df = july_data_upload.reset_index().merge(dataset_table_names, how="left", on="Dataset on Backoffice").set_index("WRI Unique ID")
df.to_csv("/Users/nathansuberi/Desktop/RW_Data/tracking_sheet_w_table_names.csv")

Experimentation

In [None]:
### THIS COVERS ALL DATASETS WHICH ARE ON THE BACKOFFICE but HAVE NO WRI_ID / RW_ID IN TRACKING SHEET ###
### Occasionally this is because the data has been moved to after launch

### Check if any metadata are not updating as expected ###
### Indicating that their unique IDs are wrong in the tracking sheet ###

# NOTE(review): this previously read from `current_datasets_on_api`, which no
# visible cell defines. `datasets_on_api` has exactly these three columns and
# is indexed by dataset id, so it is presumably the intended frame - confirm.
investigate_mdata = datasets_on_api[["upload_name", "public_title", "metadata"]]

# Dataset ids that neither upload loop attempted
already_processed = set(processed1) | set(processed2)
missed_ids = [rw_id for rw_id in investigate_mdata.index if rw_id not in already_processed]

investigate_mdata = investigate_mdata.loc[missed_ids]

investigate_mdata.to_csv("Datasets_on_backoffice_with_no_WRIID.csv")

In [None]:
# Display the backoffice datasets that neither upload loop touched
investigate_mdata

Many of these are datasets for which the Unique ID changed

soc.003 Distribution of Infant Mortality
soc.016 Conflict and Protest Events in African...
dis_007 Landslide Susceptibility Map
bio.035 Coral Bleaching Frequency Prediction
dis.001 Earthquakes Over the Past 30 days
Foo_046a Food Footprint in Protein
wat.033 Agriculture Water Demand and Depletion
soc.062 Internal Displacement
soc.061 Rural Poverty
soc.042 Percentage of Urban Population with Ac
soc.020 GINI Index
soc.008 Gross Domestic Product Per Capita (PPP
soc.006 Multidimensional Poverty Index
soc.004 Human Development Index
soc.002 Gender Development Index
foo.002 GLDAS Land Water Content from NOAH Lan..
com.028 Effect of Agricultural Policies on Com...
cit.029 Municipal Waste

In [None]:
# DANGER Bug - able to update metadata for a dataset that no longer exists on the API
#test upload cit.029:
#    broken, old id: 8f14a33e-5a61-47af-b26e-c1fc036932a5
#    working, new id: 00abb46f-34e2-4bf7-be30-1fb0b1de022f
# NOTE(review): url2 below does not use the "working, new id" listed above - confirm which id is intended.

url1="https://api.resourcewatch.org/v1/dataset/8f14a33e-5a61-47af-b26e-c1fc036932a5/metadata"
url2="https://api.resourcewatch.org/v1/dataset/10337db6-8321-445e-a60b-28fc1e114f29/metadata"

# row_payload and headers are whatever the last upload loop left in the kernel
res1a = req.request("POST", url1, data=json.dumps(row_payload), headers = headers)
res1b = None
if("already exists" in res1a.text):
    res1b = req.request("PATCH", url1, data=json.dumps(row_payload), headers = headers)

res2a = req.request("POST", url2, data=json.dumps(row_payload), headers = headers)
res2b = None
if("already exists" in res2a.text):
    # PATCH the dataset that was just POSTed to (this used url1 before - a copy/paste slip)
    res2b = req.request("PATCH", url2, data=json.dumps(row_payload), headers = headers)

# Show the PATCH result when one was made; otherwise show the POST response,
# instead of failing on an unassigned res1b
print(res1b.text if res1b is not None else res1a.text)