# Import libraries

In [1]:
import cartoframes
import pandas as pd
pd.options.display.max_columns = 200

import requests as req
import json
import boto3
import io
import datetime
from datetime import datetime
from collections import defaultdict
from dateutil import parser

import sys
import logging
import os
logging.basicConfig(stream=sys.stderr, level=logging.INFO)

from functools import reduce

# Assumptions

* id columns (`id_cols`) and data-column prefixes (`prefixes`) are provided as `;`-separated lists with no spaces, e.g. `rw_country_name;rw_country_code;commodity_name;category`
* All data columns have a prefix, followed by a 4-digit year

# Authenticating to Carto

In [None]:
# Carto credentials are read from the environment so that no API key has to be
# typed into (and accidentally committed with) the notebook. The fallbacks are
# the values that used to be hard-coded here: the shared 'wri-rw' account name
# and an empty API key.
CARTO_USER = os.environ.get('CARTO_USER', 'wri-rw')
CARTO_KEY = os.environ.get('CARTO_KEY', '')

# Context object used by later cells to read and write Carto tables.
cc = cartoframes.CartoContext(base_url='https://{}.carto.com/'.format(CARTO_USER),
                              api_key=CARTO_KEY)

# Authenticating to S3

In [None]:
# AWS credentials are read from the environment instead of being pasted into the
# notebook. An unset or empty variable becomes None so that boto3 falls back to
# its default credential chain rather than signing requests with an empty key.
aws_access_key_id = os.environ.get('aws_access_key_id') or None
aws_secret_access_key = os.environ.get('aws_secret_access_key') or None

# Destination bucket and key prefix for the wide/long CSV backups.
s3_bucket = "wri-public-data"
s3_folder = "resourcewatch/wide_to_long/"

# Low-level client (used for reads) and resource (used for writes).
s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)
s3_resource = boto3.resource(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)

# Functions for reading and uploading data to/from S3
def read_from_S3(bucket, key, index_col=0):
    """Download a CSV object from S3 and parse it into a DataFrame.

    bucket    -- name of the S3 bucket holding the object
    key       -- object key inside the bucket
    index_col -- position of the column to use as the index (default 0)

    Returns the parsed pandas DataFrame. Uses the module-level `s3_client`.
    """
    response = s3_client.get_object(Bucket=bucket, Key=key)
    raw_bytes = response['Body'].read()
    return pd.read_csv(io.BytesIO(raw_bytes), index_col=[index_col], encoding="utf8")

def write_to_S3(df, bucket, key):
    """Serialise a DataFrame to CSV text and upload it as an S3 object.

    df     -- DataFrame to upload (its index is written as the first column)
    bucket -- name of the destination S3 bucket
    key    -- destination object key

    Returns None. Uses the module-level `s3_resource`.
    """
    text_buffer = io.StringIO()
    # Explicit encoding: the Python 2 default of 'ascii' fails on non-ASCII data
    df.to_csv(text_buffer, encoding='utf-8')
    csv_text = text_buffer.getvalue()
    target = s3_resource.Object(bucket, key)
    target.put(Body=csv_text)

# Load data from RW API

In [2]:
# Base URL for getting dataset metadata from RW API
url = "https://api.resourcewatch.org/v1/dataset?sort=slug,-provider,userId&status=saved&includes=metadata,vocabulary,widget,layer"

# page[size] tells the API the maximum number of results to send back
# There are currently between 200 and 300 datasets on the RW API
payload = { "application":"rw", "page[size]": 1000}

# Request all datasets, and extract the data from the response.
# Fail on an HTTP error status first: otherwise a 4xx/5xx error page is handed
# to res.json() and only surfaces as an opaque JSONDecodeError.
res = req.get(url, params=payload)
res.raise_for_status()
data = res.json()["data"]

### Convert the json object returned by the API into a pandas DataFrame
# Another option: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html
# One record per dataset, keyed by the dataset id.
datasets_on_api = {}
for dset in data:
    atts = dset["attributes"]
    metadata = atts["metadata"]
    layers = atts["layer"]
    widgets = atts["widget"]
    tags = atts["vocabulary"]
    datasets_on_api[dset["id"]] = {
        "name":atts["name"],
        "table_name":atts["tableName"],
        "provider":atts["provider"],
        "date_updated":atts["updatedAt"],
        "num_metadata":len(metadata),
        "metadata": metadata,
        "num_layers":len(layers),
        "layers": layers,
        "num_widgets":len(widgets),
        "widgets": widgets,
        "num_tags":len(tags),
        "tags":tags
    }

# Create the DataFrame, name the index, and sort by date_updated
# More recently updated datasets at the top.
# Built as one chain (no inplace mutation) so re-running the cell is idempotent.
current_datasets_on_api = (
    pd.DataFrame.from_dict(datasets_on_api, orient='index')
    .rename_axis("Dataset")
    .sort_values(by=["date_updated"], ascending=False)
)

# Select all Carto datasets on the API:
provider = "cartodb"
carto_ids = (current_datasets_on_api["provider"]==provider)
carto_data = current_datasets_on_api.loc[carto_ids]

logging.info("Number of Carto datasets: " + str(carto_data.shape[0]))

JSONDecodeError: Expecting value: line 2 column 1 (char 1)

# Load longforming config & data

In [None]:
# Read in data sets info from the longforming config Google Sheet.
# The sheet's TSV export is read straight from its URL, so the cell needs no
# shell `curl`, leaves no temporary file behind and leaks no open file handle.
# The first column becomes the index; later cells use it as the wri_id.
LONGFORM_CONFIG_URL = "https://docs.google.com/spreadsheets/d/1OjLN9yDbAyuh51uWezOIei5hWkTMU3yTJys_S7miUpU/export?format=tsv"
longform_config = pd.read_csv(LONGFORM_CONFIG_URL, sep="\t", index_col=[0])

In [None]:
# Display the loaded config table for a visual check before processing
longform_config

In [None]:
# Load data sets into memory for processing
def load_data(obj, elem):
    """Read one configured dataset from Carto and record it in `obj`.

    Written as a reducer: it takes the accumulator dict and one
    (wri_id, rw_id) pair, and returns the same dict after updating it.

    obj  -- accumulator dict keyed by wri_id
    elem -- (wri_id, rw_id) pair; both are stripped of surrounding whitespace

    On success obj[wri_id] is {'name': <Carto table name>, 'wide': <DataFrame>}.
    If the table cannot be resolved or read, obj[wri_id] is the string
    'Unavailable' and the reason is logged as a warning.
    """
    print(elem)
    wri_id = elem[0].strip()
    try:
        # Kept inside the try: a blank rw_id cell arrives as NaN, which has no
        # .strip(), and must mark this dataset unavailable rather than abort
        # the whole load.
        rw_id = elem[1].strip()
        table_name = carto_data.loc[rw_id, 'table_name']
        obj[wri_id] = {
            'name':table_name,
            'wide':cc.read(table_name)
        }
        logging.info('Table name: {}'.format(obj[wri_id]['name']))
        logging.info('Table shape: {}'.format(obj[wri_id]['wide'].shape))
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate;
        # the traceback is logged so the failure cause is not silently lost.
        logging.warning('Could not load {}'.format(wri_id), exc_info=True)
        obj[wri_id] = 'Unavailable'
        print('Unavailable')
    return obj

# Fold every (wri_id, rw_id) pair from the config into one dict of loaded tables
data_tables = {}
for config_pair in zip(longform_config.index, longform_config['rw_id']):
    data_tables = load_data(data_tables, config_pair)

# Helper Functions

In [None]:
# Use known prefixes to reformat tables
def pick_value_col(col, pfx):
    """Return True if `col` is a value column for prefix `pfx`.

    A value column is the prefix immediately followed by a 4-digit year,
    e.g. 'yr_2010' for the prefix 'yr_'.

    col -- column name to test
    pfx -- data-column prefix from the config

    The prefix must sit at the start of the name (a substring match elsewhere
    is rejected) and the remaining four characters must all be digits.
    """
    if not col.startswith(pfx):
        return False
    year_part = col[len(pfx):]
    return len(year_part) == 4 and year_part.isdigit()

def prepare_date(date, pfx, parse_date):
    """Extract the part of a column name that follows its prefix.

    date       -- column name containing `pfx`
    pfx        -- prefix to strip; everything after its first occurrence is kept
    parse_date -- if truthy, parse the remainder and return a datetime pinned
                  to January 1st; otherwise return the remainder as a string

    Raises ValueError if `pfx` does not occur in `date`.
    """
    suffix = date[date.index(pfx) + len(pfx):]
    if not parse_date:
        return suffix
    parsed = parser.parse(suffix)
    # Pin to the first day of the year, month first and then day
    return parsed.replace(month=1).replace(day=1)

# Perform Longforming

In [None]:
# Melt every loaded wide table into long form: one row per id-column
# combination and year, with one '<prefix>_data' column per configured prefix.
for wri_id, info in data_tables.items():
    # load_data stores the string 'Unavailable' for tables it could not read.
    # Those entries have no 'wide' frame, so skip them instead of crashing.
    if not isinstance(info, dict):
        logging.warning('skipping ' + wri_id + ': no wide table was loaded')
        continue

    wide = info['wide']
    #logging.debug(wide.head())

    # NOTE(review): wri_id was stripped by load_data, but the config index is
    # not, so this lookup assumes the sheet has no stray whitespace - confirm.
    prefixes = longform_config.loc[wri_id, 'prefixes'].split(';')
    id_cols = longform_config.loc[wri_id, 'id_cols'].split(';')
    # Explicit comparison on purpose: only a real True enables parsing; a blank
    # cell (NaN) or a string value compares unequal and disables it.
    parse_date = bool(longform_config.loc[wri_id, 'parse_date'] == True)
    logging.debug("Parsing date? {}".format(parse_date))

    logging.info('initial shape: ' + str(wide.shape))

    # Empty frame holding only the join keys; each prefix is outer-merged in.
    df = pd.DataFrame(columns = id_cols + ['variable'])

    for pfx in prefixes:
        logging.info('working on pfx ' + pfx)

        value_cols = [col for col in wide.columns if pick_value_col(col, pfx)]
        logging.info('columns pulled: ' + str(value_cols))

        _df = pd.melt(wide, id_vars=id_cols, value_vars=value_cols)
        # Whole-series dump is kept at debug level to avoid flooding the output
        logging.debug(_df['variable'])
        # Replace each melted column name with its year (string or datetime)
        _df['variable'] = [prepare_date(date, pfx, parse_date) for date in _df['variable']]

        # Rename the melted 'value' column so each prefix gets its own column
        col_names = [pfx+'_data' if col=='value' else col for col in _df.columns]
        _df.columns = col_names

        df = df.merge(_df, on=id_cols  + ['variable'], how='outer')
        logging.debug('intermediate df shape: ' + str(df.shape))

    logging.info('final shape of ' + wri_id + ': ' + str(df.shape))

    new_cols = ['datetime' if col=='variable' else col for col in df.columns]
    df.columns = new_cols
    logging.info('final columns: ' + str(df.columns))

    data_tables[wri_id]['long'] = df

In [None]:
# Scratch inspection cell. The two disabled lines below are one-off experiments
# (cast_as_num is not defined anywhere in the visible notebook).
#data_tables['ene.029']['long']['yr__data'] = list(map(cast_as_num, data_tables['ene.029']['long']['yr__data'] ))
#data_tables['cit.020'].keys()
# Show the wri_ids currently held in data_tables
data_tables.keys()

# Upload to S3 and Carto

In [None]:
# Back up both forms of every processed table to S3, then publish them to Carto.
for ds, info in data_tables.items():
    # Entries that failed to load are the string 'Unavailable', and entries
    # that were never longformed have no 'long' frame; there is nothing to
    # upload for either, so skip them instead of crashing mid-upload.
    if not isinstance(info, dict) or 'long' not in info:
        logging.warning('skipping upload of ' + ds + ': no long table available')
        continue

    name = info['name']

    wide = info['wide']
    #print(wide.head())
    long = info['long']
    print(long.head())

    # S3 backups are written first, before anything on Carto is overwritten.
    write_to_S3(wide,s3_bucket,s3_folder+name+'_wide.csv')
    print('saved ' + name + ' wide data to s3')
    write_to_S3(long,s3_bucket,s3_folder+name+'.csv')
    print('saved ' + name + ' long data to s3')

    # The long table replaces the original Carto table under its existing name;
    # the original wide data is kept alongside it as '<name>_wide'.
    cc.write(long, name, overwrite=True)
    cc.write(wide, name+'_wide', overwrite=True)