Goal 1: Make sure tracking sheet and qc sheet are in sync with each other
* Both have same datasets

Goal 2: Make sure tracking sheet and qc sheet are in sync with backoffice
* Flag datasets on backoffice that don't correspond to anything in tracking/qc

Goal 3: Make sure we have layers for all datasets we have metadata for

In [1]:
import pandas as pd
import requests as req
import os

# Let pandas show up to 500 columns and 500 rows before it truncates a
# displayed frame; the sheets loaded below are wide.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

Pull down layer info from back office

In [301]:
url = "https://api.resourcewatch.org/v1/layer"

# page[size] tells the API the maximum number of results to send back.
# 1000 is comfortably above the 449 layers the API returned when this cell
# was last run, so one request is expected to cover every layer.
payload = { "application":"rw", "page[size]": 1000}

# Request all layers, fail loudly on an HTTP error, and parse the body once
# so both passes below work on the same list of layer records.
res = req.get(url, params=payload)
res.raise_for_status()
layers = res.json()["data"]

# Pass 1: group layer ids under the dataset id each layer points to.
configs = {}
count = 0
for ds in layers:
    try:
        configs.setdefault(ds['attributes']['dataset'], []).append(ds['id'])
    except (KeyError, TypeError):
        # Best effort: skip (and do not count) a record that lacks an id or
        # a dataset reference instead of aborting the whole cell.
        continue
    count += 1
# One row per dataset; columns 0..n hold its layer ids, padded with None.
layer_ids_by_ds = pd.DataFrame.from_dict(configs, orient='index')
print('Total number of layers in api:', count)
print('Total number of unique datasets in layers in api:', len(configs))
print()

print(layer_ids_by_ds.shape[0], 'datasets w/ layers on the api')
print(layer_ids_by_ds.head(1))
print()

# Pass 2: pull the SQL out of each layer's layerConfig. Layers without that
# nested structure are skipped. When a dataset has several layers with SQL,
# the last one seen overwrites the earlier ones.
configs = {}
for ds in layers:
    try:
        configs[ds['attributes']['dataset']] = ds['attributes']['layerConfig']['body']['layers'][0]['options']['sql']
    except (KeyError, IndexError, TypeError):
        continue

# Building from a named Series yields the single 'SQL' column directly and
# also works when no layer carried SQL (empty frame instead of an error).
api_sql = pd.Series(configs, name='SQL', dtype=object).to_frame()
print(api_sql.shape[0],'layers with SQL on the api')
print(api_sql.head(1))
print()

Total number of layers in api: 449
Total number of unique datasets in layers in api: 260

260 datasets w/ layers on the api
                                                                        0   \
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  255cf15f-39c1-46d2-ba82-59540133e4d1   

                                        1     2     3     4     5     6   \
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  None  None  None  None  None  None   

                                        7     8     9     10    11    12  \
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  None  None  None  None  None  None   

                                        13    14    15  
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  None  None  None  

194 layers with SQL on the api
                                                                 SQL
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  SELECT * FROM global_mangroves



Pull down dataset ids from backoffice

In [2]:
url = "https://api.resourcewatch.org/v1/dataset"

# page[size] tells the API the maximum number of results to send back.
# 1000 is well above the 385 datasets the API returned when this cell was
# last run, so one request is expected to cover every dataset.
payload = { "application":"rw", "page[size]": 1000}

# Request all datasets, fail loudly on an HTTP error, and load the records
# (columns: attributes, id, type) into a frame.
res = req.get(url, params=payload)
res.raise_for_status()
df = pd.DataFrame(res.json()["data"])
print(df.columns)

# Lift name and slug out of the nested 'attributes' dict in one pass per
# column instead of writing cell-by-cell with iterrows() + .loc.
df['Public Title'] = df['attributes'].map(lambda attrs: attrs['name'])
df['Slug'] = df['attributes'].map(lambda attrs: attrs['slug'])

# Lookup table: dataset id -> public title and slug.
api_datasets = df.set_index('id')[['Public Title', 'Slug']]
print(api_datasets.shape[0], 'datasets on the api')
print(api_datasets.head(1))

Index(['attributes', 'id', 'type'], dtype='object')
385 datasets on the api
                                                          Public Title  \
id                                                                       
098b33df-6871-4e53-a5ff-b56a7d989f9a  Subnational Political Boundaries   

                                                                                   Slug  
id                                                                                       
098b33df-6871-4e53-a5ff-b56a7d989f9a  Politcial-Boundaries-GADM-adminitrative-level-...  


In [18]:
# Export the dataset id -> title/slug table for sharing. The target is built
# from the current user's home directory so the cell runs on any machine; for
# the original author this resolves to the same Desktop file as before.
backoffice_names_csv = os.path.join(
    os.path.expanduser('~'), 'Desktop', 'backoffice_names_for_elise.csv'
)
api_datasets.to_csv(backoffice_names_csv)

Fetching data from Google Spreadsheets

In [3]:
# Both sheets are read straight from Google's TSV export URLs. pandas fetches
# the URL itself, so the cell no longer needs a `curl` binary or a temporary
# file in the working directory (which used to be left behind if parsing failed).
TRACKING_SHEET_URL = "https://docs.google.com/spreadsheets/d/1viPOGYIk6RGu7YMoM3BHNVbkWaCZ0JFBOMSNncWvHYk/export?format=tsv"
METADATA_SHEET_URL = "https://docs.google.com/spreadsheets/d/1UkABgMlBIinJjITa6WepFAL-8VBkulS0LCbKojRXjVY/export?format=tsv"

# Tracking Sheet
tracking_sheet = pd.read_csv(TRACKING_SHEET_URL, sep="\t", index_col=[0])

# Keep only the tracking rows that carry an API_ID, then index them by it
ids_on_tracking = pd.notnull(tracking_sheet["API_ID"])
tracking = tracking_sheet.loc[ids_on_tracking]
tracking = tracking.reset_index().set_index("API_ID")

# QC Ready Metadata
metadata_sheet = pd.read_csv(METADATA_SHEET_URL, sep="\t", index_col=[0])

# Keep only the metadata rows that carry a final_ids value
ids_on_metadata = pd.notnull(metadata_sheet["final_ids"])
metadata = metadata_sheet.loc[ids_on_metadata]

# Index by final_ids so metadata can be joined to `tracking` (indexed by
# API_ID) on the index.
metadata = metadata.reset_index().set_index("final_ids")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 90108    0 90108    0     0  56743      0 --:--:--  0:00:01 --:--:-- 56743
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  830k    0  830k    0     0  1007k      0 --:--:-- --:--:-- --:--:-- 1007k


In [4]:
# Peek at the first tracking row to check the columns and the API_ID index.
tracking.iloc[:1]

Unnamed: 0_level_0,WRI_ID,On Staging,Perfect Dataset? C = extra QC needed,Published on RW,Public Title,Technical Title,Subtitle,Theme_1,Theme_2,Theme_3,Format,Real Time,Time Lag,Time Period,NRT not working,NRT has layer,Selected for Pulse,"Pulse Contextual Layer, Widget",Subscription,signals check,Fancy Datasets & Visualization Needs,Shared API - Do Not Touch These!,From WRI Platform (we need viz to get/replicate them),Problem Solving,Metadata Completed,Download from Source,table/asset_name on server,"Server Location (and account wri-rw, insights, or wri-01)",Uploaded to S3,Download Data (S3),Dataset Processed for Upload,Distribution Restriction,Data Upload Responsibility,Layer Definition/Description/Name,Editable Widget (Chart/Map),SDGs,Unnamed: 37,Unnamed: 38
API_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
94328fdf-67ab-4c3f-b88c-d90d1ccf2462,soc.073,,,,Gridded GDP,Gridded global datasets for Gross Domestic Pro...,,,,,,,,,,,,,,,,,,,Copyedit,,,,,,,,,,,,6.6,


Comparing the backoffice data with the Google Spreadsheet data

In [5]:
# Peek at the first metadata row to check the columns and the final_ids index.
metadata.iloc[:1]

Unnamed: 0_level_0,udpated since 3/21,Unique ID,Learn More Link,Download from Source,Download Data (S3),Distribution Restriction,Shared API - Do Not Touch These!,Public Title,Technical Title,Subtitle,Source Organizations,Function,Description,Cautions,Geographic Coverage,Data Type,Spatial Resolution,Date of Content,Frequency of Updates,Summary of Licence,Link to License,Citation,Published Language,Published Title (if not English),Layer Name 1,Layer Definition 1,Layer Name 2,Layer Definition 2,Layer Name 3,Layer Definition 3,Layer Name 4,Layer Definition 4,Original Data Name 1,Original Data Link 1,Original Data Name 2,Original Data Link 2,Original Data Name 3,Original Data Link 3,Original Data Name 4,Original Data Link 4,Unnamed: 37,API_ID
final_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
5b5a21ac-0835-43fb-86b9-64b93d472e10,,bio.001,http://www.biodiversitya-z.org/content/allianc...,http://www.biodiversitya-z.org/content/allianc...,,"X emailed that it is okay, but affiliated with...",,Endangered Species Critical Habitats,Alliance for Zero Extinction Sites (AZE),AZE,Alliance for Zero Extinction (AZE),Global map of critical sites for conservation ...,Created by the Alliance for Zero Extinction (A...,,Global,Vector,,2010,5 years,Restrictions Apply,https://www.arcgis.com/home/item.html?id=4ecca...,"Alliance for Zero Extinction. 2010. ""2010 AZE ...",English,,,,,,,,,,,,,,,,,,,5b5a21ac-0835-43fb-86b9-64b93d472e10


Ensure metadata and tracking sheets agree

In [7]:
# One-column frames holding each sheet's WRI identifier. They keep the index
# of their source frame (metadata: final_ids, tracking: API_ID), which is what
# the index-based merges and joins below line up on.
m = pd.DataFrame(metadata['Unique ID'])
t = pd.DataFrame(tracking['WRI_ID'])
for label, frame in (('m', m), ('t', t)):
    print(label)
    print(frame.head(1))
    print(frame.shape)
    print()

print('These should both be empty if the metadata and tracking sheets all share the same final_ids')
# isin() hashes the lookup values once instead of rescanning the other column
# for every id, and avoids shadowing the builtin `id` in a comprehension.
# Nulls are dropped from the lookup side so a missing identifier never counts
# as a match and is still reported, as before.
t_not_in_m = t.loc[~t['WRI_ID'].isin(m['Unique ID'].dropna()), 'WRI_ID'].tolist()
m_not_in_t = m.loc[~m['Unique ID'].isin(t['WRI_ID'].dropna()), 'Unique ID'].tolist()
print('t not in m:', t_not_in_m)
print()
print('m not in t:', m_not_in_t)
print()

# Inner merges on the index keep only ids present in both sheets; left joins
# keep every row of the calling frame and leave NaN where the other sheet has
# no row for that id. `jtm` is reused by the next cell.
mgmt = pd.merge(m, t, left_index=True, right_index=True)
mgtm = pd.merge(t, m, left_index=True, right_index=True)
jtm = t.join(m, how='left')
jmt = m.join(t, how='left')
for label, frame in (
    ('merged t to m', mgmt),
    ('merged m to t', mgtm),
    ('joined m to t', jtm),
    ('joined t to m', jmt),
):
    print(label)
    print(frame.head(1))
    print(frame.shape)
    print()

m
                                     Unique ID
final_ids                                     
5b5a21ac-0835-43fb-86b9-64b93d472e10   bio.001
(232, 1)

t
                                       WRI_ID
API_ID                                       
94328fdf-67ab-4c3f-b88c-d90d1ccf2462  soc.073
(236, 1)

These should both be empty if the metadata and tracking sheets all share the same final_ids
t not in m: ['soc.073', 'cli.039.nrt', 'cli.006', 'foo.024.nrt', 'cli.047.nrt', 'cli.050', 'cli.051', 'cli.052', 'cli.053', 'cli.054', 'cli.055', 'cli.056', 'cli.057', 'cli.058', 'cli.059', 'foo.051.nrt', 'wat.038a.nrt', 'wat.038b.nrt', 'foo.054', 'cit.003a.nrt', 'cit.003b.nrt', 'cli.037.nrt', 'dis.001.nrt', 'cit.034', 'soc.075', 'dis.003.nrt', 'dis.009.nrt', 'dis.012.nrt', 'cli.049', 'cli.012.nrt', 'soc.076', 'cli.024.f', 'ene.008.nrt', 'bio.005.nrt', 'cli.005a.nrt', 'cli.005b.nrt', 'cli.035.nrt', 'cli.040.nrt', 'cli.041.nrt', 'cli.042.nrt', 'cli.043.nrt', 'cli.044.nrt', 'cli.045.nrt', 'for.003.nr

In [8]:
print('These data sets have a final id in the tracking sheet, but not the metadata sheet')
# Rows of the tracking-left join where the metadata side found no match
# ('Unique ID' is null); select rows and the WRI_ID column in one .loc call.
jtm.loc[jtm['Unique ID'].isnull(), 'WRI_ID']

These data sets have a final id in the tracking sheet, but not the metadata sheet


API_ID
94328fdf-67ab-4c3f-b88c-d90d1ccf2462         soc.073
d68f14f6-1a1e-41e8-aa46-033dc845c464     cli.039.nrt
b1ebea96-5963-4c2c-9273-7d08536ac07d         cli.006
b5458f7f-b5e6-4b29-8524-5040e4bf5a27     foo.024.nrt
136aab69-c625-4347-b16a-c2296ee5e99e     cli.047.nrt
b21d07fa-fdb9-4451-9297-4a0e132e7d0a         cli.050
045755ce-7b88-4d36-979d-0056cae40783         cli.051
f0d65ee4-b5c5-4eed-ad26-645090bb6d35         cli.052
e989752c-5a9f-4407-9e21-6a40ae4e2259         cli.053
ecc9f301-42fb-4ba9-8d98-09c29f9415f4         cli.054
02a5046a-428e-43df-b0d5-cf602ef1cb2f         cli.055
c4e0474f-45a7-49b1-a1da-e8f041114e9d         cli.056
4eadc0de-f1cc-4eaa-900f-54a6e11478f0         cli.057
5c434a8b-71cc-4841-a80e-49161fb222d3         cli.058
aeb0afc3-b5f2-4018-98fa-127ccb29e139         cli.059
cf8218cf-4002-4d6f-80fb-9b34c85b008a     foo.051.nrt
e245c1ee-70f3-4855-9903-071a167f39a5    wat.038a.nrt
1417c29a-ca73-4f82-aa99-01c6f3c3b101    wat.038b.nrt
e3e3f7ad-443f-4505-a605-f0cbc892daca   

Compare between metadata/tracking and backoffice

In [9]:
def _metadata_ids_for(dataset_ids):
    """Return the metadata sheet's 'Unique ID' for every id in `dataset_ids`.

    The result is a Series indexed by `dataset_ids`. Ids that are absent from
    `metadata` come back as NaN instead of raising: a left join is used because
    `metadata.loc[list_of_ids]` raises KeyError on current pandas when any
    requested label is missing.
    """
    wanted = pd.DataFrame(index=pd.Index(dataset_ids, dtype=object))
    return wanted.join(metadata[['Unique ID']], how='left')['Unique ID']


# Dataset ids known to the dataset endpoint but absent from the layer
# endpoint, and the reverse. Order follows the source index.
datasets_missing_layers = api_datasets.index[~api_datasets.index.isin(layer_ids_by_ds.index)].tolist()
layers_missing_datasets = layer_ids_by_ds.index[~layer_ids_by_ds.index.isin(api_datasets.index)].tolist()
print('Number datasets missing layers:', len(datasets_missing_layers))
print('Number layers missing datasets:', len(layers_missing_datasets))
print()

dml_ids = _metadata_ids_for(datasets_missing_layers)
# Subset of the above that also has a row in the metadata sheet.
dml_with_mdata_ids = dml_ids[dml_ids.notnull()]
# Previously commented out although the report below reads it (NameError).
lmd_ids = _metadata_ids_for(layers_missing_datasets)

print('datasets on api without layers:', dml_ids.shape[0])
for ds in dml_ids.index:
    print('https://api.resourcewatch.org/v1/dataset/{}'.format(ds))
    
print('datasets on api without layers that have metadata:', dml_with_mdata_ids.shape[0])
for ds in dml_with_mdata_ids.index:
    print('https://api.resourcewatch.org/v1/dataset/{}'.format(ds))

print('layers on api without datasets:', lmd_ids.shape[0])
for ly in lmd_ids.index:
    # Column 0 holds the first layer id recorded for that dataset.
    print('https://api.resourcewatch.org/v1/layer/{}'.format(layer_ids_by_ds.loc[ly,0]))

NameError: name 'layer_ids_by_ds' is not defined

In [305]:
#https://stackoverflow.com/questions/22676081/pandas-the-difference-between-join-and-merge
# A.join(B, how='left') keeps every row of A and attaches B's columns wherever
# the index values match; rows of A without a match get NaN in B's columns.

# NOTE(review): `ds_in_api_layers` is not assigned in any cell above, so this
# cell failed on a fresh kernel. Its saved output has the same rows and
# columns as `layer_ids_by_ds`, so it is bound to that frame here - confirm.
ds_in_api_layers = layer_ids_by_ds

dft = pd.DataFrame(tracking['WRI_ID'])
dfm = pd.DataFrame(metadata['Unique ID'])

table_joined_t = ds_in_api_layers.join(dft, how='left')
table_joined_m = ds_in_api_layers.join(dfm, how='left')

print(table_joined_t.head(1))
print(table_joined_m.head(1))

# Datasets that own layers but found no identifier in the respective sheet.
layers_without_wriid = ds_in_api_layers[pd.isnull(table_joined_t['WRI_ID'])]
layers_without_uniqueid = ds_in_api_layers[pd.isnull(table_joined_m['Unique ID'])]

# Same report for both sheets: a count followed by one API URL per dataset.
for sheet_name, orphans in (
    ('tracking', layers_without_wriid),
    ('metadata', layers_without_uniqueid),
):
    print()
    print('These data sets appear in layers, but are not in the {} sheet:'.format(sheet_name), orphans.shape[0])
    print('They may be old data sets who were deleted and the layers are hanging out, or they are test data sets')
    print()
    for ds in orphans.index:
        print('https://api.resourcewatch.org/v1/dataset/{}'.format(ds))

                                                                         0  \
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  255cf15f-39c1-46d2-ba82-59540133e4d1   

                                         1     2     3     4     5     6  \
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  None  None  None  None  None  None   

                                         7     8     9    10    11    12  \
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  None  None  None  None  None  None   

                                        13    14    15   WRI_ID  
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  None  None  None  for.005  
                                                                         0  \
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  255cf15f-39c1-46d2-ba82-59540133e4d1   

                                         1     2     3     4     5     6  \
8ab2606d-1b1c-4b71-ab29-c6e0a687e9fd  None  None  None  None  None  None   

                                         7     8     9    10    11    12  \
8ab2606d-1b1c-4b71-

In [10]:
# Single-column frames with each sheet's identifier, indexed like their
# source frame so they can be joined onto the backoffice dataset ids.
dft = tracking['WRI_ID'].to_frame()
dfm = metadata['Unique ID'].to_frame()

# Left joins keep every backoffice dataset; the sheet column is null where
# the sheet has no row for that dataset id.
table_joined_t = api_datasets.join(dft, how='left')
table_joined_m = api_datasets.join(dfm, how='left')

for joined in (table_joined_t, table_joined_m):
    print(joined.head(1))

datasets_without_wriid = api_datasets[table_joined_t['WRI_ID'].isnull()]
datasets_without_uniqueid = api_datasets[table_joined_m['Unique ID'].isnull()]

# One section per sheet: headline with the count, then one API URL per dataset.
report = (
    ('These data sets appear in backoffice, but are not in the tracking sheet:', datasets_without_wriid),
    ('These data sets appear in backoffice, but are not in the metadata sheet:', datasets_without_uniqueid),
)
for headline, orphaned in report:
    print()
    print(headline, orphaned.shape[0])
    print()
    for ds in orphaned.index:
        print('https://api.resourcewatch.org/v1/dataset/{}'.format(ds))

                                                          Public Title  \
id                                                                       
098b33df-6871-4e53-a5ff-b56a7d989f9a  Subnational Political Boundaries   

                                                                                   Slug  \
id                                                                                        
098b33df-6871-4e53-a5ff-b56a7d989f9a  Politcial-Boundaries-GADM-adminitrative-level-...   

                                       WRI_ID  
id                                             
098b33df-6871-4e53-a5ff-b56a7d989f9a  soc.064  
                                                          Public Title  \
id                                                                       
098b33df-6871-4e53-a5ff-b56a7d989f9a  Subnational Political Boundaries   

                                                                                   Slug  \
id                                           