# Import Libraries

In [None]:
# Handling carto data
import cartoframes
import pandas as pd
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

# Requesting data from the web
import requests as req
import json

# Getting data on s3
import boto3
from io import BytesIO, StringIO
from gzip import GzipFile
import gzip
import boto3

# Logging
import sys
import logging
import os
logging.basicConfig(stream=sys.stderr, level=logging.INFO)

# Creating ColorBrewer palettes for quick visualization
import palettable

# Often useful tools
from datetime import timedelta, datetime
from functools import reduce
from collections import defaultdict
import random
from hurry.filesize import size, si, verbose

# Helper script - set environment variables locally

# Authenticate to RW API

In [None]:
# Resource Watch API token, read from the environment (None when the variable is unset).
AUTH_TOKEN = os.getenv("rw_api_token")

# Authenticating to Carto

In [None]:
# Carto account name and API key are taken from the environment.
CARTO_USER = os.getenv('CARTO_WRI_RW_USER')
CARTO_KEY = os.getenv('CARTO_WRI_RW_KEY')

# Context object used below for reading from and writing to Carto.
cc = cartoframes.CartoContext(
    api_key=CARTO_KEY,
    base_url='https://{}.carto.com/'.format(CARTO_USER),
)

# Authenticating to S3

In [None]:
# AWS credentials are read from the environment (None when unset).
S3_KEY_ID = os.getenv('aws_access_key_id')
S3_KEY = os.getenv('aws_secret_access_key')

# The low-level client and the resource API are built with the same credentials.
_s3_credentials = {
    'aws_access_key_id': S3_KEY_ID,
    'aws_secret_access_key': S3_KEY,
}
s3_client = boto3.client('s3', **_s3_credentials)
s3_resource = boto3.resource('s3', **_s3_credentials)

# Functions for reading and uploading data to/from S3
def read_from_S3(bucket, key, index_col=0):
    """Fetch a CSV object from S3 and parse it into a DataFrame.

    :param bucket: Name of the S3 bucket.
    :param key: Key of the CSV object inside the bucket.
    :param index_col: Position of the column to use as the index (default 0).
    :return: DataFrame parsed from the object's bytes, read as UTF-8.
    """
    response = s3_client.get_object(Bucket=bucket, Key=key)
    payload = response['Body'].read()
    return pd.read_csv(BytesIO(payload), index_col=[index_col], encoding="utf8")

# client: https://gist.github.com/veselosky/9427faa38cee75cd8e27
# resource: https://codereview.stackexchange.com/questions/107412/convert-zip-to-gzip-and-upload-to-s3-bucket
# bucket: https://tobywf.com/2017/06/gzip-compression-for-boto3/
def write_to_S3(df, bucket, key):
    """Serialise ``df`` to CSV in memory and upload it to ``bucket``/``key``.

    :param df: DataFrame to upload (its index is written as well).
    :param bucket: Name of the destination S3 bucket.
    :param key: Key of the destination object.
    """
    text_buffer = StringIO()
    # Explicit encoding kept from the original: Python 2's 'ascii' default fails.
    df.to_csv(text_buffer, encoding='utf-8')
    csv_text = text_buffer.getvalue()
    target = s3_resource.Object(bucket, key)
    target.put(Body=csv_text)
    

#https://alexwlchan.net/2017/07/listing-s3-keys/

def get_matching_s3_keys(bucket, prefix='', suffix='', client=None):
    """
    Generate ``(key, size)`` pairs for the objects in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
        May also be a tuple of prefixes, in which case the filtering is
        done locally instead of by the S3 API.
    :param suffix: Only fetch keys that end with this suffix (optional).
    :param client: Object exposing ``list_objects_v2`` (optional). Defaults
        to a fresh ``boto3.client('s3')``, as before.
    :return: Generator of ``(key, size)`` tuples, where ``size`` is the
        ``'Size'`` field of the listing entry.
    """
    s3 = boto3.client('s3') if client is None else client
    kwargs = {'Bucket': bucket}

    # If the prefix is a single string (not a tuple of strings), we can
    # do the filtering directly in the S3 API.
    if isinstance(prefix, str):
        kwargs['Prefix'] = prefix

    while True:
        resp = s3.list_objects_v2(**kwargs)

        # 'Contents' is absent when a page lists no objects (e.g. nothing
        # matches the prefix); treat that as an empty page, not an error.
        for obj in resp.get('Contents', []):
            key = obj['Key']
            if key.startswith(prefix) and key.endswith(suffix):
                yield key, obj['Size']

        # The S3 API is paginated. Pass the continuation token into the
        # next request until we reach the final page (token missing).
        token = resp.get('NextContinuationToken')
        if token is None:
            break
        kwargs['ContinuationToken'] = token

In [None]:
# Names of every bucket returned by list_buckets() for these credentials.
bucket_list = s3_client.list_buckets()
buckets = [entry["Name"] for entry in bucket_list["Buckets"]]
#print("Bucket List:", buckets)

In [None]:
# (key, size) pairs for every .zip object under resourcewatch/ in the public bucket.
all_vector_objects = list(get_matching_s3_keys(bucket='wri-public-data', prefix='resourcewatch/', suffix='.zip'))

# Name the columns at construction time: assigning .columns afterwards raises
# a length-mismatch ValueError when no keys matched (the frame has no columns).
vector_summary = pd.DataFrame(all_vector_objects, columns=['Key', 'Size'])
vector_summary = vector_summary.sort_values(by='Size', axis=0, ascending=False)

# Sort on the raw sizes first, then replace them with human-readable labels.
vector_summary['Size'] = vector_summary['Size'].apply(lambda n_bytes: size(n_bytes, system=verbose))

# Access data

In [None]:
# Absolute path to a local copy of the workbook; adjust when running elsewhere.
aiddata_workbook = '/Users/nathansuberi/Desktop/RW_Data/GlobalChineseOfficialFinanceDataset_v1.0/GlobalChineseOfficialFinanceDataset_v1.0.xlsx'
chinese_aiddata = pd.read_excel(aiddata_workbook)

In [None]:
# Display the column labels of the loaded workbook (notebook cell output only).
chinese_aiddata.columns

# Georeference data

In [None]:
# Country lookup read from the Carto table 'country_aliases_extended', with the
# alias / index / geometry columns removed and the remaining rows de-duplicated.
ISO_ALIAS_INFO = (
    cc.read('country_aliases_extended')
      .drop(['alias', 'index', 'the_geom'], axis=1)
      .drop_duplicates()
)

In [None]:
def georef_by_ccode(df, ccode, alias_info=None):
    """Attach Resource Watch country code and name columns to ``df``.

    The values in ``df[ccode]`` are looked up against the ``iso`` column of
    the alias table. Matching rows receive that table's ``iso`` and ``name``
    in the new columns ``rw_country_code`` and ``rw_country_name``;
    non-matching rows receive nulls and their codes are logged.

    :param df: DataFrame to georeference. It is modified in place: its index
        is reset to 0..n-1 and the two ``rw_*`` columns are added/overwritten.
    :param ccode: Name of the column in ``df`` holding the country codes.
    :param alias_info: DataFrame with ``iso`` and ``name`` columns (optional).
        Defaults to the module-level ``ISO_ALIAS_INFO``.
    :return: The same ``df`` object, for convenient reassignment.
    """
    if alias_info is None:
        alias_info = ISO_ALIAS_INFO

    # Give the lookup columns private names and merge against the code column
    # only, so nothing in df can collide with them and be renamed with the
    # _x/_y merge suffixes. One row per iso keeps the left join from adding
    # rows (the first name listed for an iso wins).
    lookup = (alias_info[['iso', 'name']]
              .drop_duplicates(subset='iso')
              .rename(columns={'iso': '_rw_iso', 'name': '_rw_name'}))

    df.index = list(range(df.shape[0]))
    data_with_alias = df[[ccode]].merge(lookup,
                                        left_on=ccode,
                                        right_on='_rw_iso',
                                        how='left')

    # The right-hand key is null exactly where the left join found no match.
    null_isos = data_with_alias['_rw_iso'].isnull()
    if null_isos.any():
        logging.info('no match for these isos in the data being processed: ')
        logging.info(data_with_alias.loc[null_isos, ccode].unique())

    logging.info('df shape: {}'.format(df.shape))
    logging.info('data_with_alias shape: {}'.format(data_with_alias.shape))

    # Assign by position; the left join preserves the row order of df.
    df['rw_country_code'] = data_with_alias['_rw_iso'].values
    df['rw_country_name'] = data_with_alias['_rw_name'].values

    return df

In [None]:
# Add rw_country_code / rw_country_name, matching on the recipient_iso3 column.
chinese_aiddata = georef_by_ccode(chinese_aiddata, 'recipient_iso3')

# Clean text values (e.g. spaces) out of the usd_current column

In [None]:
# Text values in usd_current made sum(usd_current) fail in the layer SQL
# ("couldn't sum a text field"), so every string entry is nulled out.
inspect = [isinstance(val, str) for val in chinese_aiddata['usd_current']]
chinese_aiddata.loc[inspect, 'usd_current'] = None

# Nulling alone was not enough: the column still had to be switched to numeric
# by hand in Carto. Presumably the pandas column keeps dtype object and is
# uploaded as text, so coerce it to a numeric dtype here (nulls become NaN).
chinese_aiddata['usd_current'] = pd.to_numeric(chinese_aiddata['usd_current'], errors='coerce')

# Upload to Carto and S3

In [None]:
# Shared name: the Carto table, the S3 object stem and the Resource Watch dataset name.
table_name = 'com_032_chinese_investments_abroad'

In [None]:
# Upload the DataFrame to Carto under table_name; overwrite=True and
# privacy='public' are passed, so an existing table of that name is replaced.
cc.write(chinese_aiddata, table_name, overwrite=True, privacy='public')

In [None]:
# Store the same data as CSV at resourcewatch/<table_name>.csv in the wri-public-data bucket.
write_to_S3(chinese_aiddata, 'wri-public-data', 'resourcewatch/{}.csv'.format(table_name))

# Connect to Back Office

In [None]:
def createHeaders():
    """Return the HTTP headers for authenticated JSON requests to the RW API.

    :return: dict with a JSON content type and a Bearer authorization header
        built from the module-level ``AUTH_TOKEN``.
    """
    headers = {}
    headers['content-type'] = "application/json"
    headers['authorization'] = "Bearer {}".format(AUTH_TOKEN)
    return headers

def connect_to_rw_backoffice(cloud_name):
    """Register a Carto table as a dataset on the Resource Watch API.

    POSTs a dataset specification pointing at the ``wri-rw`` Carto table
    named ``cloud_name`` and logs the response body.

    :param cloud_name: Carto table name; also used as the dataset name.
    :return: The ``id`` of the created dataset, taken from the response JSON.
    :raises requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    ds_specs = {
        "connectorType": "rest",
        "provider": "cartodb",
        "connectorUrl": "https://wri-rw.carto.com/tables/{}".format(cloud_name),
        "application": ["rw"],
        "name": cloud_name
    }

    create_res = req.request("POST",
                             'https://api.resourcewatch.org/v1/dataset',
                             data=json.dumps(ds_specs),
                             headers=createHeaders())

    # Log first so the error payload is still visible when the call failed.
    logging.info(create_res.text)

    # Fail with the HTTP error itself rather than an opaque KeyError / JSON
    # decoding error from the lookup below.
    create_res.raise_for_status()

    return create_res.json()['data']['id']

In [None]:
# Register the Carto table on the RW API and keep the id of the new dataset.
rw_id = connect_to_rw_backoffice(table_name)

In [None]:
# Display the dataset id (notebook cell output only).
rw_id

# Create visualization w/ interaction

In [None]:
def gen_basesql(table_name):
    """Build the per-year layer SQL template for ``table_name``.

    The query sums ``usd_current`` per country and year and joins the result
    to the ``wri_countries_a`` geometries.

    :param table_name: Carto table to aggregate.
    :return: SQL string ending in ``WHERE data.year={}``; the remaining
        placeholder is meant to be filled with a year via ``str.format``.
    """
    fragments = [
        'SELECT wri.cartodb_id, ST_Transform(wri.the_geom, 3857) AS the_geom_webmercator,',
        ' data.rw_country_name, data.rw_country_code, data.year, data.sum_val FROM ',
        ' (SELECT rw_country_name, rw_country_code, year, sum(usd_current) as sum_val FROM {}',
        ' GROUP BY recipient_iso3, rw_country_name, rw_country_code, year) data',
        ' LEFT OUTER JOIN wri_countries_a wri',
        ' ON data.rw_country_code = wri.iso_a3',
        ' WHERE data.year=',
    ]
    query = ''.join(fragments).format(table_name)
    # The year placeholder is appended after formatting so it survives intact.
    return query + '{}'

def setup_interaction_config(obj, col, ds):
    """Append an interaction-config entry for column ``col`` to ``obj``.

    The entry's ``type`` is derived from the dtype of ``ds[col]``: ``object``
    becomes ``'string'``, ``int64``/``float64`` become ``'numeric'``, and any
    other dtype name is passed through unchanged.

    :param obj: List of entries built so far; it is extended in place.
    :param col: Column name to describe.
    :param ds: DataFrame (or similar) providing ``ds[col].dtype``.
    :return: The same ``obj`` list, so the function can drive ``reduce``.
    """
    dtype_name = str(ds[col].dtype)
    type_labels = {'object': 'string', 'int64': 'numeric', 'float64': 'numeric'}
    entry = {
        'column': col,
        'format': None,
        'prefix': '',
        'property': col,
        'suffix': '',
        'type': type_labels.get(dtype_name, dtype_name),
    }
    obj.append(entry)
    return obj
    
def pick_ramp(len_ramp, ramp_colors = None):
    '''Return the hex colours of a sequential ColorBrewer ramp.

    Possibilities for ramp_colors: ['Blues', 'BuGn', 'BuPu', 'GnBu', 'Greens',
            'Greys', 'OrRd', 'Oranges', 'PuBu', 'PuBuGn',
            'PuRd', 'Purples', 'RdPu', 'Reds', 'YlGn',
           'YlGnBu', 'YlOrBr', 'YlOrRd']

    :param len_ramp: number of colours wanted in the ramp.
    :param ramp_colors: ramp family name; when omitted (or falsy) a random
        ramp whose name ends in str(len_ramp) is chosen.
    :return: the ramp's hex_colors list.
    '''
    sequential_ramps = vars(palettable.colorbrewer.sequential)

    if ramp_colors:
        ramp = '{}_{}'.format(ramp_colors, len_ramp)
    else:
        wanted_suffix = str(len_ramp)
        # Matching on the last character skips the reversed ramps
        # (per the original comment: "Don't accept the reverse ramps").
        candidates = [name for name in list(sequential_ramps.keys())
                      if name[-1] == wanted_suffix]
        ramp = random.choice(candidates)

    return sequential_ramps[ramp].hex_colors

def gen_cartocss_legend(col, breaks, colors):
    """Build choropleth CartoCSS and the matching legend for a numeric column.

    Every consecutive pair of ``breaks`` defines one class that is filled
    with the colour at the same position in ``colors``. The lower bound is
    inclusive and the upper bound exclusive, so a value that is exactly equal
    to an interior break is styled (it belongs to the class starting there).

    :param col: Column name used in the CartoCSS filters.
    :param breaks: Class boundaries, expected in ascending order;
        ``len(breaks) - 1`` classes are produced.
    :param colors: Fill colours; at least ``len(breaks) - 1`` entries are
        needed, any extra entries are unused.
    :return: ``(cartocss, legend)`` where ``legend`` is a list of
        ``{'color': ..., 'name': '<upper'}`` dicts, one per class.
    """
    rules = ['#table {polygon-opacity: 1; line-width: 0.5; line-color: #FFF; line-opacity: 1;}']
    legend = []
    for i, (lower, upper) in enumerate(zip(breaks, breaks[1:])):
        fill = colors[i]
        # Doubled braces are literal braces in str.format.
        rules.append(' [{0} >= {1}][{0} < {2}]{{polygon-fill:{3} ;}}'.format(col, lower, upper, fill))
        legend.append({'color': fill, 'name': '<{}'.format(upper)})

    return ''.join(rules), legend
    
def autogen_layer_def(year, min_year, rw_id, cloud_name, cartocss, legend, interaction, basesql):
    """Build the layer definition for one year of a timeline dataset.

    :param year: Year for this layer; its first four characters (as text)
        give the label and the timeline order.
    :param min_year: Earliest year; the layer for that year is the default.
    :param rw_id: Dataset id the layer belongs to.
    :param cloud_name: Table name; the display name is derived from it.
    :param cartocss: CartoCSS for the layer.
    :param legend: Legend items for the choropleth legend.
    :param interaction: Accepted for interface compatibility; not used, the
        interaction output is always an empty dict.
    :param basesql: SQL template with one placeholder, filled with ``year``.
    :return: dict describing the layer.
    """
    year_label = str(year)[:4]
    first_year_label = str(min_year)[:4]

    # "<cloud_name>_<year>" minus its first two underscore-separated tokens,
    # title-cased and joined with spaces.
    name_tokens = '{}_{}'.format(cloud_name, year_label).split('_')
    display_name = ' '.join(name_tokens[2:]).title()

    mapnik_layer = {
        'options': {
            'cartocss': cartocss,
            'cartocss_version': '2.3.0',
            'sql': basesql.format(year),
        },
        'type': 'mapnik',
    }
    layer_config = {
        'account': 'wri-rw',
        'body': {'layers': [mapnik_layer], 'maxzoom': 18, 'minzoom': 3},
        'timeline': True,
        'order': int(year_label),
        'timelineLabel': year_label,
    }

    return {
        'application': ['rw'],
        'language': 'en',
        'applicationConfig': {},
        'dataset': rw_id,
        'default': year_label == first_year_label,
        'description': '',
        'env': 'production',
        'interactionConfig': {'output': {}},
        'geoInfo': True,
        'type': 'tabular',
        'iso': [],
        'layerConfig': layer_config,
        'legendConfig': {'items': legend, 'type': 'choropleth'},
        'name': display_name,
        'protected': False,
        'provider': 'cartodb',
    }
        
def upload_layer_def_to_backoffice(layer_def, rw_id):
    """POST one layer definition to the RW API under dataset ``rw_id``.

    :param layer_def: JSON-serialisable layer definition.
    :param rw_id: Id of the dataset the layer is attached to.
    :return: The response body as text.
    """
    endpoint = "https://api.resourcewatch.org/v1/dataset/{}/layer".format(rw_id)
    payload = json.dumps(layer_def)
    response = req.request("POST", endpoint, data=payload, headers=createHeaders())
    return response.text

In [None]:
# Class boundaries for the choropleth and the colour ramp used to fill them.
breaks = [0, 100000, 1000000, 1000000000, 10000000000, 100000000000]
colors = pick_ramp(len(breaks)+1, 'Greens')
TABLE_NAME = table_name
DF = chinese_aiddata
VAL_COL = 'sum_val'
YEAR_COL = 'year'

basesql = gen_basesql(TABLE_NAME)

cartocss, legend = gen_cartocss_legend(VAL_COL, breaks, colors)

# autogen_layer_def() takes an interaction config but does not use it. The name
# must still exist, otherwise building the layers fails with a NameError.
# To generate a real config, build it with setup_interaction_config over DF.columns.
interaction = []

min_year = DF[YEAR_COL].min()
# One layer per distinct year; missing years are skipped because they cannot
# be turned into the integer timeline order.
layer_defs = [
    autogen_layer_def(year, min_year, rw_id, TABLE_NAME, cartocss, legend, interaction, basesql)
    for year in DF[YEAR_COL].dropna().unique()
]
logging.debug(layer_defs)

layer_defs_on_backoffice = [upload_layer_def_to_backoffice(ldef, rw_id) for ldef in layer_defs]
logging.debug(layer_defs_on_backoffice)

# Start over

In [None]:
def remove_dataset_and_layers(wri_id):
    """Delete every layer of a dataset on the RW API, then the dataset itself.

    :param wri_id: Id of the dataset to remove.
    :return: ``(dataset_response_text, [layer_response_text, ...])`` - the
        bodies of the dataset DELETE and of each layer DELETE.
    """
    listing = req.request("GET",
                          'https://api.resourcewatch.org/v1/dataset/{}/layer'.format(wri_id))
    # Collect all ids before issuing any DELETE.
    layer_ids = [layer['id'] for layer in listing.json()['data']]

    deleted_layers = []
    for layer_id in layer_ids:
        layer_url = 'https://api.resourcewatch.org/v1/dataset/{}/layer/{}'.format(wri_id, layer_id)
        response = req.request("DELETE", layer_url, headers=createHeaders())
        deleted_layers.append(response.text)

    dataset_url = 'https://api.resourcewatch.org/v1/dataset/{}'.format(wri_id)
    deleted_dataset = req.request("DELETE", dataset_url, headers=createHeaders())

    return deleted_dataset.text, deleted_layers

# NOTE: running this cell issues DELETE requests for the dataset rw_id and all
# of its layers - only run it to start over.
remove_dataset_and_layers(rw_id)