# Import Libraries

In [None]:
# Handling carto data
import cartoframes
import pandas as pd
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

# Requesting data from the web
import requests as req
import json

# Getting data on s3
import boto3
from io import BytesIO, StringIO
from gzip import GzipFile
import gzip
import boto3

# Logging
import sys
import logging
import os
logging.basicConfig(stream=sys.stderr, level=logging.INFO)

# Creating ColorBrewer palettes for quick visualization
import palettable

# Often useful tools
from datetime import timedelta, datetime
from functools import reduce
from collections import defaultdict
import random
from hurry.filesize import size, si, verbose

# Helper script - set environment variables locally

# Authenticate to RW API

In [None]:
# Resource Watch API token, read from the environment (None when the variable is unset).
AUTH_TOKEN = os.getenv("rw_api_token")

# Authenticating to Carto

In [None]:
# Carto account name and API key are read from the environment
# (each is None when its variable is unset).
CARTO_USER = os.getenv('CARTO_WRI_RW_USER')
CARTO_KEY = os.getenv('CARTO_WRI_RW_KEY')

# The account name is substituted into the Carto base URL.
cc = cartoframes.CartoContext(
    base_url='https://{}.carto.com/'.format(CARTO_USER),
    api_key=CARTO_KEY,
)

# Authenticating to S3

In [None]:
# Bucket and folder referenced by this notebook.
s3_bucket = "wri-public-data"
s3_folder = "resourcewatch/georeffed/"

# AWS credentials are read from the environment (None when unset).
S3_KEY_ID = os.getenv('aws_access_key_id')
S3_KEY = os.getenv('aws_secret_access_key')

# Build both the low-level client and the resource interface with the same
# explicit credentials; the helper functions below use one or the other.
s3_client = boto3.client('s3', aws_access_key_id=S3_KEY_ID, aws_secret_access_key=S3_KEY)
s3_resource = boto3.resource('s3', aws_access_key_id=S3_KEY_ID, aws_secret_access_key=S3_KEY)

# Functions for reading and uploading data to/from S3
def read_from_S3(bucket, key, index_col=0):
    """
    Download a CSV object from S3 and parse it into a pandas DataFrame.

    :param bucket: Name of the S3 bucket holding the object.
    :param key: Key of the CSV object inside the bucket.
    :param index_col: Column passed (wrapped in a list) to ``pd.read_csv``
        as ``index_col``; defaults to the first column.
    :return: The parsed DataFrame.
    """
    response = s3_client.get_object(Bucket=bucket, Key=key)
    # Read the whole object body into memory and hand it to pandas as a
    # file-like object.
    payload = BytesIO(response['Body'].read())
    return pd.read_csv(payload, index_col=[index_col], encoding="utf8")

# client: https://gist.github.com/veselosky/9427faa38cee75cd8e27
# resource: https://codereview.stackexchange.com/questions/107412/convert-zip-to-gzip-and-upload-to-s3-bucket
# bucket: https://tobywf.com/2017/06/gzip-compression-for-boto3/
def write_to_S3(df, bucket, key):
    """
    Serialize a DataFrame to CSV in memory and upload it to S3.

    :param df: DataFrame to upload (written with ``DataFrame.to_csv``).
    :param bucket: Name of the destination S3 bucket.
    :param key: Key under which the CSV text is stored.
    """
    text_buffer = StringIO()
    # Need to set encoding in Python2... default of 'ascii' fails
    df.to_csv(text_buffer, encoding='utf-8')
    csv_text = text_buffer.getvalue()
    destination = s3_resource.Object(bucket, key)
    destination.put(Body=csv_text)
    

# Adapted from https://alexwlchan.net/2017/07/listing-s3-keys/

def get_matching_s3_keys(bucket, prefix='', suffix=''):
    """
    Generate ``(key, size)`` pairs for the objects in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
        May be a string or a tuple of strings.
    :param suffix: Only fetch keys that end with this suffix (optional).
        May be a string or a tuple of strings.
    :return: Generator yielding ``(key, size)`` tuples, where ``size`` is the
        ``Size`` field reported by the listing for that object.
    """
    # NOTE(review): this builds its own client from boto3's default
    # credential lookup rather than using the explicitly-authenticated
    # module-level client - confirm that is intended.
    s3 = boto3.client('s3')
    kwargs = {'Bucket': bucket}

    # If the prefix is a single string (not a tuple of strings), we can
    # do the filtering directly in the S3 API.
    if isinstance(prefix, str):
        kwargs['Prefix'] = prefix

    while True:

        # The S3 API response is a large blob of metadata.
        # 'Contents' contains information about the listed objects; it is
        # absent when nothing matched, so fall back to an empty list
        # instead of raising KeyError.
        resp = s3.list_objects_v2(**kwargs)
        for obj in resp.get('Contents', []):
            key = obj['Key']
            # Re-check the prefix here because a tuple of prefixes is not
            # filtered server-side.
            if key.startswith(prefix) and key.endswith(suffix):
                yield key, obj['Size']

        # The S3 API is paginated, returning up to 1000 keys at a time.
        # Pass the continuation token into the next response, until we
        # reach the final page (when this field is missing).
        try:
            kwargs['ContinuationToken'] = resp['NextContinuationToken']
        except KeyError:
            break

In [None]:
# Ask S3 for the buckets visible to these credentials and keep only their names.
bucket_list = s3_client.list_buckets()
buckets = [entry["Name"] for entry in bucket_list["Buckets"]]
print("Bucket List:", buckets)

In [None]:
# Collect (key, size) pairs for every .zip object under the resourcewatch/ prefix.
all_vector_objects = list(get_matching_s3_keys(bucket='wri-public-data', prefix='resourcewatch/', suffix='.zip'))

# Name the columns in the constructor: assigning `.columns` afterwards raises
# a length-mismatch ValueError when no objects matched (empty list -> 0 columns).
vector_summary = pd.DataFrame(all_vector_objects, columns=['Key', 'Size'])
vector_summary = vector_summary.sort_values(by='Size', axis=0, ascending=False)

# Sort on the raw sizes first, then replace them with human-readable strings.
# A column-wise map also behaves correctly on an empty frame.
vector_summary['Size'] = vector_summary['Size'].map(lambda raw_size: size(raw_size, system=verbose))

In [None]:
# Bare expression: shows the summary table as this notebook cell's output.
vector_summary