# Import Libraries

In [None]:
import cartoframes
import pandas as pd
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

import requests as req
import json
import boto3
from io import BytesIO, StringIO
from gzip import GzipFile
import gzip
import boto3

import sys
import logging
import os
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
import random

from functools import reduce
from collections import defaultdict

# Authenticating to Carto

In [None]:
# Carto credentials are taken from the environment so that no API key has to
# be pasted into the notebook. The fallbacks are the previously hard-coded
# values, so behaviour is unchanged when the variables are not set.
CARTO_USER = os.environ.get('CARTO_USER', 'wri-rw')
CARTO_KEY = os.environ.get('CARTO_KEY', '')

# Carto connection used by the cells below for reading and writing tables.
cc = cartoframes.CartoContext(
    base_url='https://{}.carto.com/'.format(CARTO_USER),
    api_key=CARTO_KEY,
)

# Authenticating to S3

In [None]:
# AWS credentials are looked up in the environment (variables named exactly
# like the Python names below). The empty-string fallbacks are the previously
# hard-coded values.
aws_access_key_id = os.environ.get('aws_access_key_id', '')
aws_secret_access_key = os.environ.get('aws_secret_access_key', '')

# Destination for the georeferenced CSV files
s3_bucket = "wri-public-data"
s3_folder = "resourcewatch/georeffed/"

# The client and the resource are built from the same credentials.
_s3_credentials = {
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key,
}
s3_client = boto3.client('s3', **_s3_credentials)
s3_resource = boto3.resource('s3', **_s3_credentials)

# Functions for reading and uploading data to/from S3
def read_from_S3(bucket, key, index_col=0):
    """Download a CSV object from S3 and parse it into a DataFrame.

    Args:
        bucket: name of the S3 bucket holding the object.
        key: key of the CSV object inside ``bucket``.
        index_col: position of the column to use as the DataFrame index.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    response = s3_client.get_object(Bucket=bucket, Key=key)
    raw_bytes = response['Body'].read()
    return pd.read_csv(BytesIO(raw_bytes), index_col=[index_col], encoding="utf8")

# client: https://gist.github.com/veselosky/9427faa38cee75cd8e27
# resource: https://codereview.stackexchange.com/questions/107412/convert-zip-to-gzip-and-upload-to-s3-bucket
# bucket: https://tobywf.com/2017/06/gzip-compression-for-boto3/
def write_to_S3(df, bucket, key):
    """Serialise ``df`` to CSV and upload it, uncompressed, to ``bucket``/``key``.

    The CSV (including the DataFrame index) is rendered into an in-memory text
    buffer and stored through ``s3_resource``. Nothing is returned.
    """
    text_buffer = StringIO()
    # Need to set encoding in Python2... default of 'ascii' fails
    df.to_csv(text_buffer, encoding='utf-8')
    csv_text = text_buffer.getvalue()
    s3_resource.Object(bucket, key).put(Body=csv_text)

    # Gzipped uploads were explored (see the links above this function) but
    # are not enabled. Two sketches that were tried:
    #
    #   1. Via a temporary file on disk:
    #        with gzip.open('tmp.zip', 'w') as gz:
    #            gz.write(df.to_csv().encode("utf-8"))  # unicode -> bytes
    #        s3_resource.Object(bucket, key).put(Body=open('tmp.zip', 'rb'))
    #
    #   2. Fully in memory (a GzipFile must wrap a file-like object, so a
    #      BytesIO serves as the buffer):
    #        gz_body = BytesIO()
    #        gz = GzipFile(None, 'wb', 9, gz_body)
    #        gz.write(text_body)
    #        gz.close()
    #        s3_client.put_object(
    #            Bucket=bucket,
    #            Key=key,                 # Note: NO .gz extension!
    #            ContentType='text/csv',  # the original type
    #            ContentEncoding='gzip',  # MUST have or browsers will error
    #            Body=gz_body.getvalue()
    #        )
    #
    # Files posted that way can be read back with:
    #     with gzip.open('soc_074_employment_in_agriculture.zip', 'rb') as f:
    #         data = f.read()


# Load data from RW API

In [None]:
# Base URL for getting dataset metadata from RW API
url = "https://api.resourcewatch.org/v1/dataset?sort=slug,-provider,userId&status=saved&includes=metadata,vocabulary,widget,layer"

# page[size] tells the API the maximum number of results to send back
# There are currently between 200 and 300 datasets on the RW API
payload = { "application":"rw", "page[size]": 1000}

# Request all datasets, and extract the data from the response
res = req.get(url, params=payload)
# Surface HTTP errors here rather than as a confusing KeyError on "data" below
res.raise_for_status()
data = res.json()["data"]

### Convert the json object returned by the API into a pandas DataFrame
# Another option: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html
# One summary record per dataset, keyed by the dataset id
datasets_on_api = {}
for dset in data:
    atts = dset["attributes"]
    metadata = atts["metadata"]
    layers = atts["layer"]
    widgets = atts["widget"]
    tags = atts["vocabulary"]
    datasets_on_api[dset["id"]] = {
        "name": atts["name"],
        "table_name": atts["tableName"],
        "provider": atts["provider"],
        "date_updated": atts["updatedAt"],
        "num_metadata": len(metadata),
        "metadata": metadata,
        "num_layers": len(layers),
        "layers": layers,
        "num_widgets": len(widgets),
        "widgets": widgets,
        "num_tags": len(tags),
        "tags": tags,
    }

# Create the DataFrame, name the index, and sort by date_updated
# More recently updated datasets at the top
current_datasets_on_api = pd.DataFrame.from_dict(datasets_on_api, orient='index')
current_datasets_on_api.index.rename("Dataset", inplace=True)
current_datasets_on_api.sort_values(by=["date_updated"], inplace=True, ascending=False)

# Select all Carto datasets on the API:
provider = "cartodb"
carto_ids = (current_datasets_on_api["provider"] == provider)
carto_data = current_datasets_on_api.loc[carto_ids]

logging.info("Number of Carto datasets: {}".format(carto_data.shape[0]))

# Load georeferencing config & data

In [None]:
# Reference tables read from Carto, collected under one dict:
#   geometry          - the wri_countries_a table, as read
#   aliases           - country_aliases_extended without 'index' / 'the_geom'
#   known_non_un_isos - known_non_un_isos without 'index' / 'the_geom'
#   iso_aliases       - aliases without the 'alias' column, de-duplicated
_dropped_cols = ['index', 'the_geom']

georef = {}
georef['geometry'] = cc.read('wri_countries_a')
georef['aliases'] = cc.read('country_aliases_extended').drop(_dropped_cols, axis=1)
georef['known_non_un_isos'] = cc.read('known_non_un_isos').drop(_dropped_cols, axis=1)

alias_free = georef['aliases'].drop('alias', axis=1)
georef['iso_aliases'] = alias_free.drop_duplicates()

In [None]:
# Read in data sets info from config file
#georef_config = pd.read_csv('/Users/nathansuberi/Desktop/RW_Data/georeferencing_tasks/georef_these.csv')
#georef_config = georef_config.set_index('wri_id')
#georef_config

#### Download Google Spreadsheets ####
# Georeference Config
!curl "https://docs.google.com/spreadsheets/d/1S4Zh8V_keiDhqfxlATyC8veb3LtZ5W6uM1dyogOL7f0/export?format=tsv" > georef_config.tsv
georef_config = pd.read_csv(open("georef_config.tsv", "r"), sep="\t", index_col=[0])
os.remove("georef_config.tsv")

In [None]:
# Bare expression: shows the config table when it is the last statement of a notebook cell
georef_config

In [None]:
# Load data sets into memory for processing
def load_data(obj, elem):
    """Read one configured Carto table and record it in ``obj``.

    Written as a reducer: it takes the accumulator, updates it in place and
    returns it.

    Args:
        obj: dict accumulator keyed by WRI id.
        elem: ``(wri_id, rw_id)`` pair of strings; surrounding whitespace is
            stripped from both before use.

    Returns:
        ``obj``, with ``obj[wri_id]`` set to ``{'name': table_name,
        'data': DataFrame}`` on success, or to the string ``'Unavailable'``
        when the table could not be looked up or read.
    """
    logging.info('Input: {}'.format(elem))
    wri_id = elem[0].strip()
    rw_id = elem[1].strip()
    try:
        table_name = carto_data.loc[rw_id]['table_name']
        obj[wri_id] = {
            'name': table_name,
            'data': cc.read(table_name)
        }
        logging.info('Table name: {}'.format(obj[wri_id]['name']))
        logging.info('Table shape: {}'.format(obj[wri_id]['data'].shape))
    except Exception:
        # Best effort: any lookup or read failure marks the table as
        # unavailable. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        obj[wri_id] = 'Unavailable'
        logging.info('Unavailable')
        logging.debug('Reason %s is unavailable:', wri_id, exc_info=True)
    return obj

# Build {wri_id: table info} by feeding every (index, rw_id) pair of the
# config to load_data, which fills the dict in place.
data_tables = {}
for config_pair in zip(georef_config.index, georef_config['rw_id']):
    data_tables = load_data(data_tables, config_pair)

# Configuring the alias table

In [None]:
df = georef['aliases']

## Adding all countries from our wri-bounds shapefile to the alias table

new_aliases = georef['geometry'][['iso_a3', 'name']].copy()
new_aliases['alias'] = new_aliases['name']
cols = ['iso' if col=='iso_a3' else col for col in new_aliases.columns]
cols = [col.strip() for col in cols]
new_aliases.columns = cols

logging.info('Existing aliases')
logging.info(df.columns)
logging.info(df.shape)
logging.info('Adding aliases from country table')
logging.info(new_aliases.columns)
logging.info(new_aliases.shape)

df = df.append(new_aliases)

# ## Adding in new aliases identified by team
# logging.info('Adding aliases from csv')
# new_aliases = pd.read_csv(ADDITIONAL_ALIASES, header=0)
# new_aliases.columns = ['alias', 'name', 'iso']
# logging.info(new_aliases.head(5))

#### Download Google Spreadsheets ####
# Additional Alias List
!curl "https://docs.google.com/spreadsheets/d/11k_6GbFgtF6eAQ3iAjPzt2KWc2n0SsP5P6g7kqILbkM/export?format=tsv" > additional_aliases.tsv
new_aliases = pd.read_csv(open("additional_aliases.tsv", "r"), sep="\t", index_col=None)
new_aliases.columns = ['alias', 'name', 'iso']
os.remove("additional_aliases.tsv")

df = df.append(new_aliases)

# Make all aliases lower case, remove spacing
df['alias'] = [str(alias).strip().lower().replace(' ','') for alias in df['alias']]

## check / remove duplicates
sum(df.duplicated(subset=['alias']))
sum(df.duplicated(subset=['name']))
sum(df.duplicated(subset=['iso']))

try:
    df = df.drop('the_geom', axis=1)
except:
    logging.info('unable to drop the_geom from country alias table')
try:
    df = df.drop('cartodb_georef_status', axis=1)
except:
    logging.info('unable to drop cartodb_georef_status from country alias table')
try:
    df = df.drop('index', axis=1)
except:
    logging.info('unable to drop index from country alias table')
try:
    df = df.drop('cartodb_id', axis=1)
except:
    logging.info('unable to drop cartodb_id from country alias table')

df = df.drop_duplicates()
    
georef['aliases'] = df

logging.info('Size of current aliasing table: ' + str(georef['aliases'].shape))

In [None]:
# Ad-hoc lookup: log the alias rows whose iso / name contains these fragments
LOOK_FOR_ISO='fsm'
LOOK_FOR_NAME='korea'

# Assign the filled columns back. fillna(inplace=True) on a column selection
# is chained assignment, which does not update the frame under pandas
# copy-on-write. Blanks are needed so that str.contains yields booleans.
georef['aliases']['iso'] = georef['aliases']['iso'].fillna('')
georef['aliases']['name'] = georef['aliases']['name'].fillna('')

iso_hits = georef['aliases']['iso'].str.lower().str.contains(LOOK_FOR_ISO)
logging.info(georef['aliases'].loc[iso_hits])
logging.info('')
name_hits = georef['aliases']['name'].str.lower().str.contains(LOOK_FOR_NAME)
logging.info(georef['aliases'].loc[name_hits])

# Perform georeferencing

In [None]:
### 
## THERE ARE MULTIPLE MATCHES FOR ISO - need to adjust this to only pick once
###

## NOTE: This can be replaced by making a separate table to join on ISO
# def accept_new(agg, nxt):
#     cur = agg['cur']
#     if not nxt:
#         agg['lastval'] = nxt
#         agg['indices'].append(cur)
#     else:
#         if agg['lastval'] != nxt:
#             agg['lastval'] = nxt
#             agg['indices'].append(cur)
#     agg['cur'] += 1
#     return agg
    
# def clean_repeats(isos):
#     vals = isos.values
#     seen = []
#     agg = reduce(accept_new, vals, {'lastval':None, 'indices':[], 'cur':0})
#     ixs = agg['indices']
#     logging.info('INDICES: {}'.format(ixs))
#     return ixs



# Tracking all mis-matched names
missed_names = {}
missed_isos = {}

name_alias_info = georef['aliases']
iso_alias_info = georef['iso_aliases']

for wri_id, info in data_tables.items():
    
    logging.info('Processing table ' + wri_id)
    if type(info) == str:
        logging.info('Unavailable, skipping')
        continue
   
    ### WARNING: non standardized indices in the data cause problems after the merge step
    name = info['name']
    data = info['data'].copy()
    data.index = list(range(data.shape[0]))
    logging.info('Table head: {}'.format(data.head(15)))

    c_code = georef_config.loc[wri_id, 'country_code']
    c_code = None if pd.isnull(c_code) else c_code
    c_name = georef_config.loc[wri_id, 'country_name']
    c_name = None if pd.isnull(c_name) else c_name
    
    logging.info('c_code: ***{}***'.format(c_code))
    logging.info('c_name: ***{}***'.format(c_name))
    
    # Check if isos match our table
    process_by_name = True
    if c_code:
        logging.info('already has an iso3 code, in column {}'.format(c_code))
        _data = data.copy()
        
        data_with_alias = _data.merge(iso_alias_info,
                           left_on=c_code,
                           right_on='iso', 
                           how='left')
        try:
            null_isos = pd.isnull(data_with_alias['iso'])
        except:
            null_isos = pd.isnull(data_with_alias['iso_y'])
            
        if sum(null_isos):
            no_iso_match = data_with_alias[null_isos]
            logging.info('no match for these isos in the data being processed: ')
            logging.info(no_iso_match[c_code].unique())
            try:
                logging.info(no_iso_match[c_code].unique())
                missed_isos[wri_id] = no_iso_match[c_code].unique()
            except:
                c_code = c_code+'_x'
                logging.info(no_iso_match[c_code].unique())
                missed_isos[wri_id] = no_iso_match[c_code].unique()
    
        ### data IS ALTERED HERE
        
        logging.info('OG data shape: {}'.format(data.shape))
        logging.info('Augmented data shape: {}'.format(data_with_alias.shape))
        
        try:
#             ixs = clean_repeats(data_with_alias['iso'])
#             data['rw_country_code'] = pd.Series([val for ix, val in enumerate(data_with_alias['iso'].values) if ix in ixs])
            data['rw_country_code'] = data_with_alias['iso'].values
        except:
#             ixs = clean_repeats(data_with_alias['iso_y'])
#             data['rw_country_code'] = pd.Series([val for ix, val in enumerate(data_with_alias['iso_y'].values) if ix in ixs])
            data['rw_country_code'] = data_with_alias['iso_y'].values
        try:
#            data['rw_country_name'] = pd.Series([val for ix, val in enumerate(data_with_alias['name'].values) if ix in ixs])
            data['rw_country_name'] = data_with_alias['name']
        except:
#            data['rw_country_name'] = pd.Series([val for ix, val in enumerate(data_with_alias['name_y'].values) if ix in ixs])
            data['rw_country_name'] = data_with_alias['name_y']
       
        try:
            data = data.drop('the_geom', axis=1)
        except:
            logging.info('unable to drop the_geom from {} data'.format(name))
            
        try:
            data = data.drop('cartodb_georef_status', axis=1)
        except:
            logging.info('unable to drop cartodb_georef_status from {} data'.format(name))

        process_by_name = False
    
    # If country name is supplied, check how many match up with alias/name in country_aliases
    if c_name and process_by_name:       
        # Ensure that leading or trailing spaces don't break the match
        #data[c_name] = ['North Korea' if name=='Korea, Dem. People\x92s Rep.' else name for name in data[c_name]]
        _data = data.copy()
        
        _data['join_col'] = data[c_name].apply(lambda item: item.strip().lower().replace(' ','').replace('’', '\''))
    
        data_with_alias = _data.merge(name_alias_info, 
                                         left_on = 'join_col',
                                         right_on = 'alias',
                                         how='left') 

        null_aliases = pd.isnull(data_with_alias['alias'])             
            
        logging.info('data with alias df:')
        logging.info(data_with_alias.shape)
        logging.info(data_with_alias.head(6))
        logging.info('raw data')
        logging.info(_data.shape)
        logging.info(_data.head(5))
    
        
        if sum(null_aliases):
            no_alias_match = data_with_alias.loc[null_aliases]
            logging.info('missed aliases, matching on column "alias" of country_aliases')
            logging.info(no_alias_match)
            try:
                logging.info(no_alias_match[c_name].unique())
                missed_names[wri_id] = no_alias_match[c_name].unique()
            except:
                c_name = c_name+'_x'
                logging.info(no_alias_match[c_name].unique())
                missed_names[wri_id] = no_alias_match[c_name].unique()
                
        ### data IS ALTERED HERE

        try:
            data['rw_country_code'] = data_with_alias['iso']
        except:
            data['rw_country_code'] = data_with_alias['iso_y']
            
        try:
            data['rw_country_name'] = data_with_alias['name']  
        except:
            data['rw_country_name'] = data_with_alias['name_y'] 
            
        try:
            data = data.drop('the_geom', axis=1)
        except:
            logging.info('unable to drop the_geom from {} data'.format(name))
            
        try:
            data = data.drop('cartodb_georef_status', axis=1)
        except:
            logging.info('unable to drop cartodb_georef_status from {} data'.format(name))

        
    ### SUCCESS
    logging.info('Final head:')
    logging.info(data.head(5))


    data_tables[wri_id]['data'] = data.copy()

In [None]:
# Ids whose entry holds a loaded table (anything but the string marker)
print('Processed')
for name, data in data_tables.items():
    if isinstance(data, str):
        continue
    print(name)

# Ids whose entry is the string marker, i.e. the table could not be loaded
print('Not Processed')
for name, data in data_tables.items():
    if not isinstance(data, str):
        continue
    print(name)

NOTE: Need to change layers for these

~ done ~ cit.013
~ longform ~ cit.020
~ longform ~ cli.022
~ remove column index? ~ ene.012
~ needs a year column ~ for.020
~ stored in insights ~ soc.001
~ no year col ~ soc.002
~ no year col; update table name to be soc_012_... instead of soc_12_... ~ soc.012
~ no year col, except for a 2016 epi score ~ soc.021
~ no year col ~ soc.022
~ data refers to multiple years; need to convert its year col to datetime ~ soc.024
~ no year col, 2016 data ~ soc.026
~ no year col ~ soc.045
~ convert year to datetime ~ soc.055
~ no year col ~ soc.067

~ shouldn't be georeffed b/c it can't lose its geometry col ~ com.022

# Checking for missed names

In [None]:
# Raw dictionaries first, then only the misses not already listed in the
# known_non_un_isos reference table.
print(missed_names)
print(missed_isos)

known_names = georef['known_non_un_isos']['name'].values
known_isos = georef['known_non_un_isos']['iso'].values

print('Newly missed names:')
for wri_id, names in missed_names.items():
    print('Missed names in data set {}'.format(wri_id))
    for name in names:
        if name in known_names:
            continue
        print(name)

print('Newly missed isos:')
for wri_id, isos in missed_isos.items():
    print('Missed isos in data set {}'.format(wri_id))
    for iso in isos:
        if iso in known_isos:
            continue
        print(iso)

In [None]:
## Process to investigate misses for a specific dataset
# Here, discovered an issue with the typographic apostrophe "’" being used instead of "'"
# Led to augmenting data prep for the georeferencing step above

# data_tables entries are {'name': ..., 'data': DataFrame}, so the frame has to
# be taken from 'data'. Work on a copy so that the scratch join column does
# not end up in the table that is uploaded later.
df = data_tables['for.020']['data'].copy()
# Deliberately the normalisation WITHOUT the apostrophe replacement, to
# reproduce the misses.
df['join_col'] = df['country'].apply(lambda item: item.strip().lower().replace(' ',''))

df_a = df.merge(georef['aliases'],
                left_on='join_col',
                right_on='alias',
                how='left')
# Rows that found no alias (printed: a bare expression mid-cell shows nothing)
print(df_a.loc[pd.isnull(df_a['alias']), ['country', 'join_col', 'alias', 'iso', 'name']])

# `in` on a Series tests the index labels; compare against the values instead
print('democraticpeople’srepublicofkorea' in georef['aliases']['alias'].values)

LOOK_FOR = 'people\'s'
print('Viewing aliases with a name that contains {}:'.format(LOOK_FOR))
df = georef['aliases']
print(df.loc[df['alias'].str.lower().str.contains(LOOK_FOR)])

# Compile known not-included country names

In [None]:
# Current contents of the known_non_un_isos Carto table, read as-is
# (a later cell appends to this frame and writes it back)
known_non_un_names_isos = cc.read('known_non_un_isos')

In [None]:
def flatten(obj, new_list):
    """Append every element of ``new_list`` to ``obj`` and return ``obj``.

    Reducer-style helper: ``obj`` is modified in place and the same object is
    handed back as the accumulator.
    """
    for element in new_list:
        obj.append(element)
    return obj

# Pool the per-dataset misses into two flat lists (repeats across data sets are kept)
other_isos = [missed for per_dataset in missed_isos.values() for missed in per_dataset]
other_names = [missed for per_dataset in missed_names.values() for missed in per_dataset]

def gather_names(iso):
    """Ask on the console for the official name of ``iso``; return ``(iso, name)``."""
    prompt = 'Official name of {}?'.format(iso)
    name = input(prompt)
    return iso, name

def gather_isos(name):
    """Ask on the console for the official iso of ``name``; return ``(iso, name)``."""
    prompt = 'Official iso of {}?'.format(name)
    iso = input(prompt)
    return iso, name

# Interactively resolve every missed value into an (iso, name) pair
isos_and_names = [gather_names(missed_iso) for missed_iso in other_isos]
names_and_isos = [gather_isos(missed_name) for missed_name in other_names]

In [None]:
# Build an (iso, name, reason) table of the unmatched iso codes, taking the
# country name straight from each source table.
_iso_name_frames = []
for ds, isos in missed_isos.items():
    codecol = georef_config.loc[ds, 'country_code']
    namecol = georef_config.loc[ds, 'country_name']
    pairs = data_tables[ds]['data'].set_index(codecol).loc[isos, namecol].reset_index()
    # Source tables use different column names; give each frame the common
    # names before stacking so that the result keeps exactly two columns.
    pairs.columns = ['iso', 'name']
    _iso_name_frames.append(pairs)

if _iso_name_frames:
    # One concat instead of repeated DataFrame.append (removed in pandas 2.0)
    isos_and_names = pd.concat(_iso_name_frames)
else:
    # Nothing was missed: keep the expected columns on an empty table
    isos_and_names = pd.DataFrame(columns=['iso', 'name'])
isos_and_names = isos_and_names.drop_duplicates()
isos_and_names['reason'] = 'not un single country'


In [None]:
# Bare expression: shows the table when it is the last statement of a notebook cell
isos_and_names

In [None]:
# Rebinds names_and_isos to an empty DataFrame (replacing whatever an earlier cell stored)
names_and_isos = pd.DataFrame()

In [None]:
# Print both collections for a visual check
print(isos_and_names)
print(names_and_isos)

In [None]:
# Deleted isos
# One "<code> <name>" entry per line
deleted_isos = '''AFI French Afar and Issas
ATB British Antarctic Territory
ATN Dronning Maud Land
CTE Canton and Enderbury Islands
DDR German Democratic Republic
DHY Dahomey
GEL Gilbert and Ellice Islands
HVO Upper Volta
JTN Johnston Island
MID Midway Islands
NHB New Hebrides
PCI Pacific Islands, Trust Territory of the
PCZ Panama Canal Zone
PHI Philippines – Code changed to PHL
PUS U.S. Miscellaneous Pacific Islands
RHO Southern Rhodesia
SKM Sikkim
VDR Viet-Nam, Democratic Republic of
WAK Wake Island
YMD Yemen, Democratic'''

# Split every line at its first space: code on the left, name on the right
by_line = deleted_isos.split('\n')
by_pair = [(code, name) for code, _, name in (line.partition(' ') for line in by_line)]
deleted_isos = pd.DataFrame(by_pair, columns=['iso', 'name'])


def _iso_name_frame(records):
    """Return a fresh two-column ('iso', 'name') DataFrame built from ``records``.

    ``records`` may be a list of (iso, name) pairs or a DataFrame whose first
    two columns hold iso and name; any further columns are dropped. Input
    with fewer than two columns (e.g. an empty DataFrame) yields an empty
    table.
    """
    frame = pd.DataFrame(records)
    if frame.shape[1] < 2:
        return pd.DataFrame(columns=['iso', 'name'])
    frame = frame.iloc[:, :2].copy()
    frame.columns = ['iso', 'name']
    return frame


# isos_and_names may already carry a 'reason' column and names_and_isos may be
# empty, so both are coerced to the common two-column layout first.
not_reported_isos = _iso_name_frame(isos_and_names)
not_reported_names = _iso_name_frame(names_and_isos)
# An earlier version additionally removed rows by hand here:
#not_reported_names = not_reported_names.drop([0, 7, 19, 32])

deleted_isos['reason'] = 'deleted'
not_reported_isos['reason'] = 'not un single country'
not_reported_names['reason'] = 'not un single country'

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
df = pd.concat([deleted_isos, not_reported_isos, not_reported_names])

In [None]:
# Add the newly identified (iso, name, reason) rows to the reference table and
# write it back to Carto, replacing the existing table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
known_non_un_names_isos = pd.concat([known_non_un_names_isos, isos_and_names])
cc.write(known_non_un_names_isos, 'known_non_un_isos', overwrite=True)

# Uploading finished files to Carto and S3

In [None]:
# Publish every processed table: first as a CSV on S3, then back to Carto
# under its original table name (the existing table is overwritten).
for ds, info in data_tables.items():
    # Tables that could not be loaded are stored as a plain string marker;
    # isinstance matches the check used in the 'Processed' report cell.
    if isinstance(info, str):
        logging.info('Unavailable, skipping')
        continue

    name = info['name']
    data = info['data']
    print(data.head())

    write_to_S3(data, s3_bucket, s3_folder + name + '.csv')
    print('saved ' + name + ' georeffed data to s3')

    cc.write(data, name, overwrite=True)
    print('saved ' + name + ' georeffed data to Carto')