In [143]:
import pandas as pd
import requests as req
import os

pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

Fetching Backoffice Info

In [214]:
url = "https://api.resourcewatch.org/v1/layer"

# page[size] tells the API the maximum number of results to send back
# There are currently between 200 and 300 datasets on the RW API
payload = { "application":"rw", "page[size]": 1000}

# Request all datasets, and extract the data from the response
res = req.get(url, params=payload)
configs = {}
count = 0
for ds in res.json()["data"]:
    try:
        if ds['attributes']['dataset'] in configs:
            configs[ds['attributes']['dataset']].append(ds['attributes']['provider'])
        else:
            configs[ds['attributes']['dataset']] = [ds['attributes']['provider']]
        count += 1
    except:
        pass
ds_in_api_layers = pd.DataFrame.from_dict(configs, orient='index')
print('Total number of layers in api:', count)
print('Total number of unique datasets in layers in api:', len(configs))
print()

print('layers on the api')
print(ds_in_api_layers.head(1))
print(ds_in_api_layers.shape)
print()

print(api_layers.shape[0], 'number of unique dataset ids represented in the layers')
print('Some of these may be attached to datasets which no longer exist')

configs = {}
for ds in res.json()["data"]:
    try:
        configs[ds['attributes']['dataset']] = ds['attributes']['layerConfig']['body']['layers'][0]['options']['sql']
    except:
        pass
    
api_sql = pd.DataFrame.from_dict(configs, orient='index')
api_sql.columns = ['SQL']
print('layers with SQL on the api')
print(api_sql.head(1))
print(api_sql.shape)

Total number of layers in api: 442
Total number of unique datasets in layers in api: 259

layers on the api
                                           0     1     2     3     4     5   \
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  cartodb  None  None  None  None  None   

                                        6     7     8     9     10    11  \
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  None  None  None  None  None  None   

                                        12    13    14    15  
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  None  None  None  None  
(259, 16)

0 number of unique dataset ids represented in the layers
Some of these may be attached to datasets which no longer exist
layers with SQL on the api
                                                                 SQL
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  SELECT * FROM global_mangroves
(194, 1)


In [192]:
url = "https://api.resourcewatch.org/v1/dataset"

# page[size] tells the API the maximum number of results to send back
# There are currently between 200 and 300 datasets on the RW API
payload = { "application":"rw", "page[size]": 1000}

# Request all datasets, and extract the data from the response
res = req.get(url, params=payload)
api_datasets = pd.DataFrame(pd.DataFrame(res.json()["data"])['id']).set_index('id')
print('datasets on the api')
print(api_datasets.head(1))
print(api_datasets.shape)

datasets on the api
Empty DataFrame
Columns: []
Index: [098b33df-6871-4e53-a5ff-b56a7d989f9a]
(291, 0)


Fetching data from Google Spreadsheets

In [91]:
# Tracking Sheet
!curl "https://docs.google.com/spreadsheets/d/1viPOGYIk6RGu7YMoM3BHNVbkWaCZ0JFBOMSNncWvHYk/export?format=tsv" > tracking_sheet.tsv
tracking_sheet = pd.read_csv("tracking_sheet.tsv", sep="\t", index_col=[0])
os.remove("tracking_sheet.tsv")

# Continue with the metadata that matches elements in the tracking sheet
ids_on_tracking = pd.notnull(tracking_sheet["API_ID"])
tracking = tracking_sheet.loc[ids_on_tracking]
tracking = tracking.reset_index().set_index("API_ID")

# QC Ready Metadata
!curl "https://docs.google.com/spreadsheets/d/1UkABgMlBIinJjITa6WepFAL-8VBkulS0LCbKojRXjVY/export?format=tsv" > metadata_sheet.tsv
metadata_sheet = pd.read_csv("metadata_sheet.tsv", sep="\t", index_col=[0])
os.remove("metadata_sheet.tsv")

# Continue with the metadata that matches elements in the tracking sheet
ids_on_metadata = pd.notnull(metadata_sheet["final_ids"])
metadata = metadata_sheet.loc[ids_on_metadata]

# Should have used this:
metadata = metadata.reset_index().set_index("final_ids")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   97k    0   97k    0     0   239k      0 --:--:-- --:--:-- --:--:--  239k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  763k    0  763k    0     0  1546k      0 --:--:-- --:--:-- --:--:-- 1546k


In [92]:
tracking.head(1)

Unnamed: 0_level_0,WRI_ID,Perfect Dataset? C = extra QC needed,Published on RW,Public Title,Technical Title,Subtitle,Theme_1,Theme_2,Theme_3,Format,...,Uploaded to S3,Download Data (S3),Dataset Processed for Upload,Distribution Restriction,Data Upload Responsibility,Alias defined on Backoffice,Layer Definition/Description/Name,Editable Widget (Chart/Map),Tags,Category
API_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5b5a21ac-0835-43fb-86b9-64b93d472e10,bio.001,X,X,Endangered Species Sites,Alliance for Zero Extinction Sites (AZE),AZE,Biodiversity,,,Vector,...,X,https://wri-public-data.s3.amazonaws.com/resou...,omitted spreadsheet forward from AZE along wit...,,Peter,,X,Map,,BIO


Comparing amongst backoffice and google spreadsheet data

In [196]:
api_datasets.loc['9c5ce780-41e4-4004-be24-683a865b235e']

Series([], Name: 9c5ce780-41e4-4004-be24-683a865b235e, dtype: float64)

In [194]:
datasets_missing_layers = [ix for ix in api_datasets.index if ix not in api_layers.index]
layers_missing_datasets = [ix for ix in api_layers.index if ix not in api_datasets.index]
print('Number datasets missing layers:', len(datasets_missing_layers))
print('Number layers missing datasets:', len(layers_missing_datasets))


dml_ids = tracking.loc[datasets_missing_layers]['WRI_ID']
dlm_ids = tracking.loc[layers_missing_datasets]['WRI_ID']

print('datasets on api without layers')
print(dml_ids)
print(api_datasets.loc[datasets_missing_layers[0]])
print()

print('layers on api without datasets')
print(dlm_ids)
print(api_layers.loc[layers_missing_datasets[0]])
print()

098b33df-6871-4e53-a5ff-b56a7d989f9a
0b9f0100-ce5b-430f-ad8f-3363efa05481
0a59f415-ee0b-4d19-96f7-c7304c152e1b
16df8ada-87cc-4907-adce-a98bc4e91856
12510410-1eb3-4af0-844f-8a05be50b1c1
42de3f98-ba1c-4572-a227-2e18d45239a5
4458eb12-8572-45d1-bf07-d5a3ee097021
9e7dc020-5a93-4df8-b81e-ee3e7bf32764
a5e50799-fbd1-4aca-b3fa-22a6f6fa7aad
d8a45b34-4cc0-42f4-957d-e13b37e9182e
e63bb157-4b98-4ecb-81d6-c1b15e79895a
2b569ae2-9452-44f8-9c81-3c3afb6c3c25
9e9a5c50-b825-4f12-838f-1650943c2be1
bb80312e-b514-48ad-9252-336408603591
a9e33aad-eece-4453-8279-31c4b4e0583f
5edefab9-c707-447e-96f9-6115149e3a87
8d84f6c6-8e7d-410e-a408-cbfb2555b35d
7d3465f8-5959-4531-aaf2-c9a8a03183b3
9d9b48d3-152d-48c3-8c2a-2957ddb601a1
e8f5b4e8-454e-488d-8b4a-b60ad02bce36
a07f1bed-ca16-4fbf-b14b-d3a0344cab74
3624554e-b240-4edb-9110-1f010642c3f3
3f35ccf5-a104-4edf-b1ca-4e0bd423615b
86777822-d995-49cd-b9c3-d4ea4f82c0a3
415dd499-8385-4c23-bd18-8ac49803cf9e
5e156d22-7f84-4cd2-9724-c1a519a83e0a
c9c9cb2f-9655-4f40-8736-9b407ee43514
d

KeyError: 'None of [[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]] are in the [index]'

In [93]:
metadata.head(1)

Unnamed: 0_level_0,Unique ID,Learn More Link,Download from Source,Download Data (S3),Distribution Restriction,Shared API - Do Not Touch These!,Public Title,Technical Title,Subtitle,Source Organizations,...,Original Data Name 1,Original Data Link 1,Original Data Name 2,Original Data Link 2,Original Data Name 3,Original Data Link 3,Original Data Name 4,Original Data Link 4,Unnamed: 37,API_ID
final_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5b5a21ac-0835-43fb-86b9-64b93d472e10,bio.001,http://www.biodiversitya-z.org/content/allianc...,http://www.biodiversitya-z.org/content/allianc...,,"X emailed that it is okay, but affiliated with...",,Endangered Species Sites,Alliance for Zero Extinction Sites (AZE),AZE,Alliance for Zero Extinction (AZE),...,,,,,,,,,,5b5a21ac-0835-43fb-86b9-64b93d472e10


In [109]:
#https://stackoverflow.com/questions/22676081/pandas-the-difference-between-join-and-merge
# Join attaches data from left onto indices of right

df = pd.DataFrame(tracking['WRI_ID'])

table_joined = api_layers.join(df, how='left')
table_merged = api_layers.merge(df, left_index=True, right_index=True)
pd_table_merged = pd.merge(api_layers, df, left_index=True, right_index=True)
#table = table.reset_index().set_index('WRI_ID')

print(table_joined.head(1))
print(table_merged.head(1))
print(pd_table_merged.head(1))

                                                                 SQL   WRI_ID
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  SELECT * FROM global_mangroves  for.005
                                                                 SQL   WRI_ID
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  SELECT * FROM global_mangroves  for.005
                                                                 SQL   WRI_ID
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  SELECT * FROM global_mangroves  for.005


In [135]:
m = pd.DataFrame(metadata['Unique ID'])
t = pd.DataFrame(tracking['WRI_ID'])
print('m')
print(m.head(1))
print(m.shape)
print()
print('t')
print(t.head(1))
print(t.shape)
print()

mgmt = pd.merge(m, t, left_index=True, right_index=True)
print('merged t to m')
print(mgmt.head(1))
print(mgmt.shape)
print()
mgtm = pd.merge(t, m, left_index=True, right_index=True)
print('merged m to t')
print(mgtm.head(1))
print(mgtm.shape)
print()

jtm = t.join(m, how='left')
print('joined m to t')
print(jtm.head(1))
print(jtm.shape)
print()
jmt = m.join(t, how='left')
print('joined t to m')
print(jmt.head(1))
print(jmt.shape)
print()

print('api response')
print(api_layers.head(1))
print(api_layers.shape)
print()

m
                                     Unique ID
final_ids                                     
5b5a21ac-0835-43fb-86b9-64b93d472e10   bio.001
(213, 1)

t
                                       WRI_ID
API_ID                                       
5b5a21ac-0835-43fb-86b9-64b93d472e10  bio.001
(219, 1)

merged t to m
                                     Unique ID   WRI_ID
5b5a21ac-0835-43fb-86b9-64b93d472e10   bio.001  bio.001
(210, 2)

merged m to t
                                       WRI_ID Unique ID
5b5a21ac-0835-43fb-86b9-64b93d472e10  bio.001   bio.001
(210, 2)

joined m to t
                                       WRI_ID Unique ID
API_ID                                                 
5b5a21ac-0835-43fb-86b9-64b93d472e10  bio.001   bio.001
(219, 2)

joined t to m
                                     Unique ID   WRI_ID
final_ids                                              
5b5a21ac-0835-43fb-86b9-64b93d472e10   bio.001  bio.001
(213, 2)

api response
                            

In [134]:
print('These data sets have a final id in the tracking sheet, but not the metadata sheet')
jtm[pd.isnull(jtm['Unique ID'])]['WRI_ID']

API_ID
7d9c0d09-e833-4a74-811b-0af78da9c731    cit.032
37d5db2e-8ab8-4870-bfcc-534cdd5bdb76    ene.025
33a6b7d0-a495-4231-880b-9f2d071a8ce2    ene.026
5d077888-5f04-41f5-9e70-e121dda35d27    ene.027
db7a2b6b-726f-429b-8833-3f81be0ac286    foo.013
8acb9918-e064-47a2-bcc1-502b31f502fb    soc.030
42b575a2-f1f4-485c-9f81-ee60d4e76340    soc.031
e1fe8457-8bbe-4a9e-8632-5e9681d1d0c1    soc.058
20662342-dcdd-4a42-9f58-bcc80217de71    soc.071
Name: WRI_ID, dtype: object

In [151]:
print(sum(pd.isnull(api_layers['SQL'])))
print(api_layers.shape)

0
(194, 1)


In [150]:
jtma = jtm.join(api_layers, how='left')
print('nested join')
print(jtma.head(1))
print(jtma.shape)
print()

keep_vals = pd.isnull(jtma['SQL'])
jtmanb = jtma[keep_vals]
print('datasets confirmed whose API ids are not confirmed in the backoffice')
print(jtmanb.head(1))
print(jtmanb.shape)
print()

nested join
                                       WRI_ID Unique ID  \
API_ID                                                    
5b5a21ac-0835-43fb-86b9-64b93d472e10  bio.001   bio.001   

                                                                                    SQL  
API_ID                                                                                   
5b5a21ac-0835-43fb-86b9-64b93d472e10  SELECT * FROM alliance_for_zero_extinction_sit...  
(219, 3)

datasets confirmed whose API ids are not confirmed in the backoffice
                                       WRI_ID Unique ID  SQL
API_ID                                                      
3c82c421-8964-444e-86f2-df800174d8b9  bio.008   bio.008  NaN
(69, 3)



In [145]:
jtmanb[['WRI_ID']]

Unnamed: 0_level_0,WRI_ID
API_ID,Unnamed: 1_level_1
3c82c421-8964-444e-86f2-df800174d8b9,bio.008
85d5d550-b946-4dbd-bf77-875017c8e7ec,bio.012
0e565ddf-74fd-4f90-a6b8-c89d747a89ab,bio.014
048b2140-9d4b-433e-a2dd-8d4122eb157b,bio.015
1ef55baf-bbbe-480d-85e9-7132c742f196,bio.035
28dd2700-6de7-4345-830a-a5ffa0716bb2,cit.005
565c3cb1-8015-4029-b497-e9245608580f,cit.014
0303127a-70b0-4164-9251-d8162615d058,cit.017
aa9e9e43-a0bc-4835-a06d-d67af82bfd7c,cit.018
6e6750da-50c8-4b52-914f-b0d663a7ab59,cit.031


In [None]:
table.to_csv('ids_and_sql.csv')

Checking API IDs