# Import Libraries

In [1]:
import cartoframes
import pandas as pd
pd.options.display.max_columns = 200

import requests as req
import json
import boto3
import io

import sys
#import logging
import os
#logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

# Constants

In [3]:
# All three settings can be overridden through environment variables of the
# same name; the defaults are the values that were previously hard-coded, so
# the notebook behaves as before when nothing is set.
# Keeping the Carto API key in the environment avoids committing it to the notebook.
ADDITIONAL_ALIASES = os.environ.get(
    'ADDITIONAL_ALIASES',
    '/Users/nathansuberi/Documents/GitHub/ResourceWatchCode/Metadata Management/aliases_for_longform.csv'
)
CARTO_USER = os.environ.get('CARTO_USER', 'wri-rw')
CARTO_KEY = os.environ.get('CARTO_KEY', '')

# Configure connection to S3

In [None]:
# AWS credentials are read from the environment. The original lines had the
# lookup commented out, leaving `name = ` with no value, which is a syntax error.
# When a variable is unset the value is None, and boto3 then falls back to its
# own default credential lookup.
aws_access_key_id = os.environ.get('aws_access_key_id')
aws_secret_access_key = os.environ.get('aws_secret_access_key')

# Destination bucket and key prefix used by the upload cell
s3_bucket = "wri-public-data"
s3_folder = "resourcewatch/wide_to_long/"

# The low-level client (used for reads) and the resource (used for writes)
# share the same credentials.
_aws_credentials = {
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key,
}
s3_client = boto3.client('s3', **_aws_credentials)
s3_resource = boto3.resource('s3', **_aws_credentials)

# Functions for reading and uploading data to/from S3
def read_from_S3(bucket, key, index_col=0):
    """Fetch a CSV object from S3 and parse it into a DataFrame.

    Parameters
    ----------
    bucket : name of the S3 bucket holding the object.
    key : key of the CSV object inside the bucket.
    index_col : position of the column to use as the index (default 0).

    Returns
    -------
    The parsed pandas DataFrame, decoded as UTF-8.
    """
    response = s3_client.get_object(Bucket=bucket, Key=key)
    # Read the whole body into memory so pandas can parse it as a file-like object
    raw_bytes = response['Body'].read()
    return pd.read_csv(io.BytesIO(raw_bytes), index_col=[index_col], encoding="utf8")

def write_to_S3(df, bucket, key):
    """Serialise ``df`` to CSV text and store it as object ``key`` in ``bucket``.

    The DataFrame index is written as the first CSV column. Returns None.
    """
    with io.StringIO() as text_buffer:
        # The encoding is passed explicitly because the Python 2 default
        # ('ascii') fails on non-ASCII data.
        df.to_csv(text_buffer, encoding='utf-8')
        csv_text = text_buffer.getvalue()
    target = s3_resource.Object(bucket, key)
    target.put(Body=csv_text)

# Configure Carto connection and load georef tables

In [4]:
# Open the Carto connection for the configured account
carto_base_url = 'https://{}.carto.com/'.format(CARTO_USER)
cc = cartoframes.CartoContext(base_url=carto_base_url, api_key=CARTO_KEY)

# Reference tables used for georeferencing:
#   'geometry' - the `wri_countries_a` Carto table
#   'aliases'  - the `country_aliases_extended` Carto table
georef = dict(
    geometry=cc.read('wri_countries_a'),
    aliases=cc.read('country_aliases_extended'),
)

In [5]:
# Read in data sets info from config file.
# The location can be overridden with the GEOREF_CONFIG environment variable,
# so the notebook is not tied to one machine. The default is the original
# local path.
georef_config = pd.read_csv(
    os.environ.get(
        'GEOREF_CONFIG',
        '/Users/nathansuberi/Desktop/RW_Data/georeferencing_tasks/georef_these.csv'
    )
)
georef_config

Unnamed: 0,wri_id,rw_id,country_name,country_code
0,com.009,c61c364b-1d68-4dd9-ae3d-76c2a0022280,,isoalpha3
1,cit.013,5d269c36-6ccf-4620-838d-431f86c30f69,country,
2,cit.020,6d3163f5-4e08-4830-84f1-2c5d76570a82,country_name,country_code
3,cli.022,995ec4fe-b3cc-4cf4-bd48-b89d4e3ea072,countryname,iso3v10
4,ene.012,d446a52e-c4c1-4e74-ae30-3204620a0365,country_name,country_code
5,for.020,03bfb30e-829f-4299-bab9-b2be1b66b5d4,country,
6,soc.001,0b9f0100-ce5b-430f-ad8f-3363efa05481,country,
7,soc.002,d4ca3cc4-c162-469c-b341-b52284a73eaa,country,
8,soc.012,f48541d3-a622-4908-9400-5ef26257ac96,country,
9,soc.021,e7582657-9c16-4eb1-89e8-0211d94015c6,country,


In [None]:
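# Load data sets into memory for processing
# TODO: this cell is empty, but the georeferencing and upload cells below read
# `tables` and `table_info`; both need to be defined before those cells can run.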


# Load data from RW API

In [7]:
# Base URL for getting dataset metadata from RW API
url = "https://api.resourcewatch.org/v1/dataset?sort=slug,-provider,userId&status=saved&includes=metadata,vocabulary,widget,layer"

# page[size] tells the API the maximum number of results to send back
# There are currently between 200 and 300 datasets on the RW API
# NOTE(review): presumably results beyond page[size] are not returned; raise it if the catalogue grows
payload = { "application":"rw", "page[size]": 1000}

# Request all datasets, and extract the data from the response
res = req.get(url, params=payload)
# Stop on an HTTP error here, rather than with a confusing KeyError on "data" below
res.raise_for_status()
data = res.json()["data"]

### Convert the json object returned by the API into a pandas DataFrame
# Another option: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html
# One summary record per dataset, keyed by the dataset id
datasets_on_api = {}
for dset in data:
    atts = dset["attributes"]
    metadata = atts["metadata"]
    layers = atts["layer"]
    widgets = atts["widget"]
    tags = atts["vocabulary"]
    datasets_on_api[dset["id"]] = {
        "name": atts["name"],
        "table_name": atts["tableName"],
        "provider": atts["provider"],
        "date_updated": atts["updatedAt"],
        "num_metadata": len(metadata),
        "metadata": metadata,
        "num_layers": len(layers),
        "layers": layers,
        "num_widgets": len(widgets),
        "widgets": widgets,
        "num_tags": len(tags),
        "tags": tags
    }

# Create the DataFrame, name the index, and sort by date_updated
# More recently updated datasets at the top
# Built as one chain, without inplace mutation, so re-running the cell is safe
current_datasets_on_api = (
    pd.DataFrame.from_dict(datasets_on_api, orient='index')
    .rename_axis("Dataset")
    .sort_values(by=["date_updated"], ascending=False)
)

# Select all Carto datasets on the API:
provider = "cartodb"
carto_ids = (current_datasets_on_api["provider"]==provider)
carto_data = current_datasets_on_api.loc[carto_ids]

print("Number of Carto datasets: " + str(carto_data.shape[0]))

Number of Carto datasets: 237


# Configuring the alias table

In [9]:
df = georef['aliases']
print(df.columns)

# Drop the columns that are not part of the alias mapping.
# The presence check replaces a bare `except:` whose handler called
# `logging.info`, although `logging` is never imported in this notebook.
for unwanted_col in ('the_geom', 'index'):
    if unwanted_col in df.columns:
        df = df.drop(unwanted_col, axis=1)
    else:
        print('could not delete column `{}`'.format(unwanted_col))

print(df.columns)

## Adding all countries from our wri-bounds shapefile to the alias table
# Every country name also acts as its own alias; `iso_a3` is renamed to `iso`
# so the columns line up with the alias table.
new_aliases = (
    georef['geometry'][['iso_a3', 'name']]
    .copy()
    .rename(columns={'iso_a3': 'iso'})
)
new_aliases['alias'] = new_aliases['name']

print(df.columns)
print(df.shape)
print(new_aliases.columns)
print(new_aliases.shape)

## Adding in new aliases identified by Peter
# header=1 makes the second line of the file the header row (the first line is
# skipped); the names are then replaced with the alias-table column names.
# NOTE(review): confirm the CSV really has a leading line to skip and that its
# columns are ordered alias, name, iso.
peters_new_aliases = pd.read_csv(ADDITIONAL_ALIASES, header=1)
peters_new_aliases.columns = ['alias', 'name', 'iso']

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# ignore_index gives the combined table a clean, unique index.
df = pd.concat([df, new_aliases, peters_new_aliases], ignore_index=True)

# Make all aliases lower case, remove spacing
# The vectorised .str methods leave missing aliases as NaN instead of raising
df['alias'] = df['alias'].str.strip().str.lower().str.replace(' ', '')

## check / remove duplicates
# The counts are printed; previously they were computed and silently discarded
for checked_col in ('alias', 'name', 'iso'):
    print('duplicated `{}` values: {}'.format(
        checked_col, df.duplicated(subset=[checked_col]).sum()))

# Only rows identical in every column are removed
df = df.drop_duplicates()

georef['aliases'] = df

print('Size of current aliasing table: ' + str(georef['aliases'].shape))

Index(['alias', 'index', 'iso', 'name', 'the_geom'], dtype='object')
Index(['alias', 'iso', 'name'], dtype='object')
Index(['alias', 'iso', 'name'], dtype='object')
(305, 3)
Index(['iso', 'name', 'alias'], dtype='object')
(193, 3)
Size of current aliasing table: (305, 3)


# Perform georeferencing

In [None]:
# Tracking all mis-matched names
missed_names = []
missed_isos = []

alias_info = georef['aliases']

# NOTE(review): `tables` and `table_info` are not defined in the visible part of
# this notebook; they must exist before this cell runs. From their use below,
# `tables[name]` holds a 'data' (and optionally 'long_data') DataFrame, and
# `table_info[name]['config_options']` holds the 'country_code' / 'country_name'
# column names.

# Lookups that do not change between tables are built once.
known_isos = alias_info['iso'].dropna()
# Exactly one row per alias. With duplicated aliases the previous left merge
# produced extra rows, and copying its columns back by index then attached
# codes to the wrong rows of `data`.
alias_lookup = (
    alias_info
    .dropna(subset=['alias'])
    .drop_duplicates(subset=['alias'])
    .set_index('alias')
)

for name, info in tables.items():

    print('Processing table ' + name)
    print('Table head:')

    ### WARNING: non standardized indices in the data cause problems later,
    ### so the working copy always gets a fresh 0..n-1 index
    source_key = 'long_data' if 'long_data' in info else 'data'
    data = info[source_key].copy().reset_index(drop=True)

    print(data.head(5))

    config_options = table_info[name]['config_options']
    c_code = config_options.get('country_code', False)
    c_name = config_options.get('country_name', False)
    # A missing value (NaN/None, e.g. a blank CSV cell) is truthy when it is
    # NaN; treat it as "column not supplied".
    if pd.isnull(c_code):
        c_code = False
    if pd.isnull(c_name):
        c_name = False

    print('c_code: ***' + str(c_code) + '***')
    print('c_name: ***' + str(c_name) + '***')

    # Check if isos match our table
    if c_code:
        print('already has an iso3 code, in column ' + c_code)

        # Codes (including missing ones) that do not occur in the alias table
        unmatched_isos = data.loc[~data[c_code].isin(known_isos), c_code].unique()
        print('no match for these isos in the data being processed: ')
        print(unmatched_isos)
        missed_isos.extend(unmatched_isos)
        # NOTE(review): a table with only an iso column gets no
        # rw_country_code / rw_country_name columns - confirm this is intended.

    # If country name is supplied, check how many match up with alias/name in country_aliases
    if c_name:
        # Normalise one known badly-encoded spelling before matching
        data[c_name] = data[c_name].replace('Korea, Dem. People\x92s Rep.', 'North Korea')

        # Same normalisation as the alias table: trimmed, lower case, no spaces,
        # so leading/trailing spaces or casing cannot break the match.
        # Missing names stay NaN and simply do not match.
        join_key = data[c_name].str.strip().str.lower().str.replace(' ', '')

        has_alias = join_key.isin(alias_lookup.index)
        if not has_alias.all():
            unmatched_names = data.loc[~has_alias, c_name].unique()
            print('missed aliases, matching on column "alias" of country_aliases')
            print(unmatched_names)
            missed_names.extend(unmatched_names)

        ### data IS ALTERED HERE
        # A direct lookup keeps one result per data row and cannot collide with
        # existing `iso` / `name` columns, so no `_x` / `_y` suffix guessing is needed.
        data['rw_country_code'] = join_key.map(alias_lookup['iso'])
        data['rw_country_name'] = join_key.map(alias_lookup['name'])

    ### SUCCESS
    print('Final head:')
    print(data.head(5))
    tables[name]['geo_data'] = data.copy()

# Checking for missed names

In [None]:
def _squash(text):
    """Lower-case ``text`` and remove spaces so comparisons ignore case and spacing."""
    return text.lower().replace(' ', '')


def _print_partial_matches(missed_values, known_values):
    """Print every missed value that contains one of the known values.

    Comparison is a substring test on the squashed forms. Non-string entries
    (for example NaN) and empty known values are skipped. Each missed value is
    printed at most once.
    """
    known_squashed = [
        _squash(known) for known in known_values
        if isinstance(known, str) and _squash(known)
    ]
    for missed in missed_values:
        if not isinstance(missed, str):
            continue
        squashed = _squash(missed)
        if any(known in squashed for known in known_squashed):
            print(missed)


# De-duplicate what the georeferencing cell collected
missed_names = list(set(missed_names))
print(len(missed_names))
print(missed_names)


missed_isos = list(set(missed_isos))
print(len(missed_isos))
print(missed_isos)

# NOTE(review): this cell reads the reference frames from `tables`, while the
# earlier cells keep them in `georef` and use the geometry column `iso_a3`.
# Confirm these keys and the `adm0_a3` column exist.

# Missed names that contain a known alias
_print_partial_matches(missed_names, tables['country_aliases']['data']['alias'])

df = tables['country_aliases']['data']
# Ad-hoc lookup of alias rows whose name contains 'cur'. It is printed because
# a bare expression in the middle of a cell is not displayed.
print(df.loc[df['name'].str.lower().str.contains('cur', na=False)])

# Missed isos that contain a known iso code. The previous loops reused the
# leftover `alias` / `name` variables from the name check, so they never
# compared any iso codes.
_print_partial_matches(missed_isos, tables['geometry']['data']['adm0_a3'])
_print_partial_matches(missed_isos, tables['country_aliases']['data']['iso'])

# Uploading finished files to Carto and S3

In [None]:
# Upload every georeferenced table to S3 and to Carto.
# The 'geo_data' entry is looked up directly rather than by scanning every key,
# which also avoids overwriting the notebook-level `data` variable.
for name, info in tables.items():

    print(name)
    if 'geo_data' not in info:
        # Nothing was georeferenced for this table
        continue

    geo_data = info['geo_data']
    print(geo_data.head())

    write_to_S3(geo_data, s3_bucket, s3_folder + name + '_long')
    print('saved ' + name + ' long data to s3')

    # NOTE(review): the table name contains '&' - confirm how Carto stores it
    cc.write(geo_data, name + '_georefed_&_longform', overwrite=True)
    print('saved ' + name + ' long data to Carto')

# Updating Layers on Backoffice

In [None]:
# TO DO