# Import Libraries

In [94]:
import cartoframes
import pandas as pd
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

import requests as req
import json
import boto3
import io

import sys
import logging
import os
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
import random

from functools import reduce
from collections import defaultdict

# Authenticating to Carto

In [7]:
# Read the Carto credentials from the environment so that no API key has to be
# pasted into (and accidentally committed with) the notebook. The defaults
# reproduce the values that were previously hard-coded here.
CARTO_USER = os.environ.get('CARTO_USER', 'wri-rw')
CARTO_KEY = os.environ.get('CARTO_KEY', '')

# Authenticated context used by every cc.read / cc.write call in this notebook
cc = cartoframes.CartoContext(base_url='https://{}.carto.com/'.format(CARTO_USER),
                              api_key=CARTO_KEY)

# Authenticating to S3

In [None]:
# Credentials are taken from the environment. The previous lines had the lookup
# commented out, leaving `name = ` with no right-hand side (a SyntaxError).
aws_access_key_id = os.environ.get('aws_access_key_id')
aws_secret_access_key = os.environ.get('aws_secret_access_key')

s3_bucket = "wri-public-data"
s3_folder = "resourcewatch/wide_to_long/"

# Low-level client: used by read_from_S3 (get_object)
s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)
# Resource API: used by write_to_S3 (Object(...).put)
s3_resource = boto3.resource(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)

# Functions for reading and uploading data to/from S3
def read_from_S3(bucket, key, index_col=0):
    """Download a CSV object from S3 and parse it into a DataFrame.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket.
    key : str
        Object key of the CSV file inside ``bucket``.
    index_col : int, optional
        Position of the column to use as the frame's index (default 0).

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, decoded as UTF-8.
    """
    response = s3_client.get_object(Bucket=bucket, Key=key)
    raw_bytes = response['Body'].read()
    # Wrap the payload in a file-like object so pandas can parse it in memory
    return pd.read_csv(io.BytesIO(raw_bytes), index_col=[index_col], encoding="utf8")

def write_to_S3(df, bucket, key):
    """Serialise ``df`` to CSV in memory and upload it to S3.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to upload; written with its index.
    bucket : str
        Name of the destination S3 bucket.
    key : str
        Object key to store the CSV under.
    """
    text_buffer = io.StringIO()
    # Need to set encoding in Python2... default of 'ascii' fails
    df.to_csv(text_buffer, encoding='utf-8')
    destination = s3_resource.Object(bucket, key)
    destination.put(Body=text_buffer.getvalue())

# Load data from RW API

In [68]:
# Base URL for getting dataset metadata from RW API
url = "https://api.resourcewatch.org/v1/dataset?sort=slug,-provider,userId&status=saved&includes=metadata,vocabulary,widget,layer"

# page[size] tells the API the maximum number of results to send back
# There are currently between 200 and 300 datasets on the RW API
payload = { "application":"rw", "page[size]": 1000}

# Request all datasets, and extract the data from the response.
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
res = req.get(url, params=payload, timeout=120)
# Fail here with the HTTP status rather than with a KeyError on "data" below
res.raise_for_status()
data = res.json()["data"]

### Flatten the API response into one record per dataset id, ready for pandas
# Another option: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html

# (output column, attribute key in the API response) for each nested collection;
# every collection is stored both as-is and as a "num_<column>" count
related_fields = (
    ("metadata", "metadata"),
    ("layers", "layer"),
    ("widgets", "widget"),
    ("tags", "vocabulary"),
)

datasets_on_api = {}
for dset in data:
    atts = dset["attributes"]
    record = {
        "name": atts["name"],
        "table_name": atts["tableName"],
        "provider": atts["provider"],
        "date_updated": atts["updatedAt"],
    }
    # Insert the count before the collection itself to keep the column order
    for column, api_key in related_fields:
        items = atts[api_key]
        record["num_" + column] = len(items)
        record[column] = items
    datasets_on_api[dset["id"]] = record

# Build the frame in one chain: one row per dataset id, index named "Dataset",
# most recently updated datasets first
current_datasets_on_api = (
    pd.DataFrame.from_dict(datasets_on_api, orient='index')
    .rename_axis("Dataset")
    .sort_values(by=["date_updated"], ascending=False)
)

# Keep only the datasets served by Carto
provider = "cartodb"
carto_ids = current_datasets_on_api["provider"].eq(provider)
carto_data = current_datasets_on_api.loc[carto_ids]

logging.info("Number of Carto datasets: {}".format(len(carto_data)))

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.resourcewatch.org
DEBUG:urllib3.connectionpool:https://api.resourcewatch.org:443 "GET /v1/dataset?sort=slug,-provider,userId&status=saved&includes=metadata,vocabulary,widget,layer&application=rw&page%5Bsize%5D=1000 HTTP/1.1" 200 671040
INFO:root:Number of Carto datasets: 234


# Load georeferencing config & data

In [9]:
# Reference tables read from Carto through the authenticated context `cc`.
# Later cells look them up by key:
#   'geometry'          - supplies the iso_a3 / name pairs added to the alias table
#   'aliases'           - alias -> iso / name lookup used for the georeferencing joins
#   'known_non_un_isos' - names / isos already known to be unmatched, used to filter the miss report
georef = {
    'geometry':cc.read('wri_countries_a'),
    'aliases':cc.read('country_aliases_extended'),
    'known_non_un_isos':cc.read('known_non_un_isos')
}

#### Download Google Spreadsheets ####
# Additional Alias List
!curl "https://docs.google.com/spreadsheets/d/11k_6GbFgtF6eAQ3iAjPzt2KWc2n0SsP5P6g7kqILbkM/export?format=tsv" > additional_aliases.tsv
additional_aliases = pd.read_csv(open("additional_aliases.tsv", "r"), sep="\t", index_col=[0])
os.remove("additional_aliases.tsv")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2480    0  2480    0     0   3865      0 --:--:-- --:--:-- --:--:--  3862


In [14]:
# Read in data sets info from config file
#georef_config = pd.read_csv('/Users/nathansuberi/Desktop/RW_Data/georeferencing_tasks/georef_these.csv')
#georef_config = georef_config.set_index('wri_id')
#georef_config

#### Download Google Spreadsheets ####
# Georeference Config
!curl "https://docs.google.com/spreadsheets/d/1S4Zh8V_keiDhqfxlATyC8veb3LtZ5W6uM1dyogOL7f0/export?format=tsv" > georef_config.tsv
georef_config = pd.read_csv(open("georef_config.tsv", "r"), sep="\t", index_col=[0])
os.remove("georef_config.tsv")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   110    0   110    0     0    224      0 --:--:-- --:--:-- --:--:--   224


In [15]:
georef_config

Unnamed: 0_level_0,rw_id,country_name,country_code
wri_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
soc.076,f6d8caef-578f-4afe-a8de-f5e972c43f0c,country_name,country_code


In [58]:
# Load data sets into memory for processing
def load_data(obj, elem):
    """Reducer step: load one configured Carto table into ``obj``.

    Parameters
    ----------
    obj : dict
        Accumulator keyed by wri_id. Each value is either
        ``{'name': table_name, 'data': DataFrame}`` or the string
        ``'Unavailable'`` when the table could not be resolved or read.
    elem : tuple
        ``(wri_id, rw_id)`` pair; surrounding whitespace is stripped from both.

    Returns
    -------
    dict
        The same ``obj`` instance, updated in place.
    """
    logging.info('Input: {}'.format(elem))
    wri_id = elem[0].strip()
    rw_id = elem[1].strip()
    try:
        # Resolve the RW API dataset id to its Carto table name, then download it
        table_name = carto_data.loc[rw_id]['table_name']
        obj[wri_id] = {
            'name':table_name,
            'data':cc.read(table_name)
        }
        logging.info('Table name: {}'.format(obj[wri_id]['name']))
        logging.info('Table shape: {}'.format(obj[wri_id]['data'].shape))
    except Exception as err:
        # Catch Exception rather than everything, so that interrupting a long
        # download (KeyboardInterrupt) stops the run instead of silently
        # marking the table as unavailable.
        obj[wri_id] = 'Unavailable'
        logging.info('Unavailable')
        logging.debug('Could not load {}: {!r}'.format(rw_id, err))
    return obj

data_tables = reduce(load_data, zip(georef_config.index,georef_config['rw_id']), {})

Input: ('soc.076', 'f6d8caef-578f-4afe-a8de-f5e972c43f0c')
Table name: soc_076_country_population
Table shape: (253, 63)


# Configuring the alias table

In [18]:
# Build the final alias table: existing Carto aliases + every country in the
# wri-bounds table (name used as its own alias) + the team's additional aliases.
df = georef['aliases']

## Adding all countries from our wri-bounds shapefile to the alias table

new_aliases = georef['geometry'][['iso_a3', 'name']].copy()
new_aliases['alias'] = new_aliases['name']
cols = ['iso' if col=='iso_a3' else col for col in new_aliases.columns]
cols = [col.strip() for col in cols]
new_aliases.columns = cols

logging.info('Existing aliases')
logging.info(df.columns)
logging.info(df.shape)
logging.info('Adding aliases from country table')
logging.info(new_aliases.columns)
logging.info(new_aliases.shape)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
df = pd.concat([df, new_aliases])

## Adding in new aliases identified by team
logging.info('Adding aliases from csv')
# Use the "Additional Alias List" sheet downloaded earlier in this notebook.
# The previous code read from ADDITIONAL_ALIASES, a name that is never defined
# here, so the cell failed on a fresh kernel. The sheet was loaded with its
# first column as the index, so reset_index() restores it as a regular column.
new_aliases = additional_aliases.reset_index()
new_aliases.columns = ['alias', 'name', 'iso']
logging.info(new_aliases.head(5))

df = pd.concat([df, new_aliases])

# Make all aliases lower case, remove spacing
df['alias'] = [alias.strip().lower().replace(' ','') for alias in df['alias']]

## check / remove duplicates
# These counts used to be computed and thrown away; log them so they are visible
for col in ['alias', 'name', 'iso']:
    logging.info('Duplicated values in {}: {}'.format(col, df.duplicated(subset=[col]).sum()))

# Drop Carto bookkeeping columns when present; report (as before) when one is absent
for col in ['the_geom', 'cartodb_georef_status', 'index', 'cartodb_id']:
    if col in df.columns:
        df = df.drop(col, axis=1)
    else:
        logging.info('unable to drop {} from country alias table'.format(col))

df = df.drop_duplicates()

georef['aliases'] = df

logging.info('Size of current aliasing table: ' + str(georef['aliases'].shape))

Existing aliases
Index(['alias', 'index', 'iso', 'name', 'the_geom'], dtype='object')
(305, 5)
Adding aliases from country table
Index(['iso', 'name', 'alias'], dtype='object')
(193, 3)
Adding aliases from csv
                                   alias                              name  \
0      Venezuela, Bolivarian Republic of                         Venezuela   
1  Democratic People's Republic of Korea                       North Korea   
2       Micronesia (Federated States of)    Federated States of Micronesia   
3                        Dem. Rep. Congo  Democratic Republic of the Congo   
4       Bolivia (Plurinational State of)                           Bolivia   

   iso  
0  VEN  
1  PRK  
2  FSM  
3  COD  
4  BOL  
unable to drop cartodb_georef_status from country alias table
unable to drop cartodb_id from country alias table
Size of current aliasing table: (309, 3)


In [19]:
# Spot-check the alias table for one iso code and one name fragment
LOOK_FOR_ISO='fsm'
LOOK_FOR_NAME='korea'
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which may operate on a temporary and leave the table unchanged
georef['aliases']['iso'] = georef['aliases']['iso'].fillna('')
logging.info(georef['aliases'].loc[georef['aliases']['iso'].str.lower().str.contains(LOOK_FOR_ISO)])
# logging.info() requires a message argument; log an empty separator line
logging.info('')
# na=False keeps rows with a missing name from putting NaN into the boolean mask
logging.info(georef['aliases'].loc[georef['aliases']['name'].str.lower().str.contains(LOOK_FOR_NAME, na=False)])

                             alias  iso                            name
63                      micronesia  FSM  Federated States of Micronesia
182    federatedstatesofmicronesia  FSM  Federated States of Micronesia
260            micronesia,fed.sts.  FSM  Federated States of Micronesia
265   micronesia,federatedstatesof  FSM  Federated States of Micronesia
2    micronesia(federatedstatesof)  FSM  Federated States of Micronesia

                                  alias  iso               name
5                            koreanorth  PRK        North Korea
6                            koreasouth  KOR        South Korea
39                       korea,dem.rep.  PRK        North Korea
58                      republicofkorea  KOR        South Korea
61    democraticpeople'srepublicofkorea  PRK        North Korea
70                           korea,rep.  KOR        South Korea
85                                korea  KOR        South Korea
130                          southkorea  KOR        Sou

# Perform georeferencing

In [111]:
### 
## THERE ARE MULTIPLE MATCHES FOR ISO - need to adjust this to only pick once
###
def accept_new(agg, nxt):
    """Reducer step that records where a new run of values starts.

    ``agg`` is a dict with the keys ``'lastval'`` (previous value seen),
    ``'indices'`` (positions accepted so far) and ``'cur'`` (position of
    ``nxt``). The current position is accepted when ``nxt`` is falsy or differs
    from ``'lastval'``; ``'cur'`` is advanced on every call.

    Returns the same ``agg`` dict, mutated in place.
    """
    position = agg['cur']
    # Falsy values are always accepted; anything else only when it starts a new run
    starts_new_run = (not nxt) or (agg['lastval'] != nxt)
    if starts_new_run:
        agg['lastval'] = nxt
        agg['indices'].append(position)
    agg['cur'] = position + 1
    return agg
    
def clean_repeats(isos):
    """Return the positions that start each run of consecutive equal values.

    The values of ``isos`` are folded through ``accept_new``, which accepts a
    position when its value is falsy or differs from the previous value.

    Parameters
    ----------
    isos : pandas.Series
        Column whose ``.values`` are scanned in order.

    Returns
    -------
    list of int
        Zero-based positions accepted by ``accept_new``.
    """
    agg = reduce(accept_new, isos.values, {'lastval':None, 'indices':[], 'cur':0})
    ixs = agg['indices']
    logging.info('INDICES: {}'.format(ixs))
    return ixs


# Tracking all mis-matched names
# Both dicts map wri_id -> array of unique source values with no match in the alias table
missed_names = {}
missed_isos = {}

alias_info = georef['aliases']

# For every loaded table, add `rw_country_code` / `rw_country_name` columns by
# left-joining against the alias table: on the ISO column when the config names
# one, otherwise on a normalised copy of the country-name column. The augmented
# frame is written back to data_tables[wri_id]['data'] at the end of each pass.
for wri_id, info in data_tables.items():
    
    logging.info('Processing table ' + wri_id)
    # load_data stores the string 'Unavailable' for tables it could not read
    if type(info) == str:
        logging.info('Unavailable, skipping')
        continue
   
    ### WARNING: non standardized indices in the data cause problems after the merge step
    name = info['name']
    data = info['data'].copy()
    # Replace the index with 0..n-1 so the column assignments below align by position
    data.index = list(range(data.shape[0]))
    logging.info('Table head: {}'.format(data.head(5)))

    # Column names configured for this table; an empty config cell (NaN) becomes None
    c_code = georef_config.loc[wri_id, 'country_code']
    c_code = None if pd.isnull(c_code) else c_code
    c_name = georef_config.loc[wri_id, 'country_name']
    c_name = None if pd.isnull(c_name) else c_name
    
    logging.info('c_code: ***{}***'.format(c_code))
    logging.info('c_name: ***{}***'.format(c_name))
    
    # Check if isos match our table
    process_by_name = True
    if c_code:
        logging.info('already has an iso3 code, in column {}'.format(c_code))
        _data = data.copy()
        
        # Left join keeps every data row; rows with no alias match get a null 'iso'.
        # An iso can carry several aliases, so a data row may be repeated here.
        data_with_alias = _data.merge(alias_info,
                           left_on=c_code,
                           right_on='iso', 
                           how='left')
        
        null_isos = pd.isnull(data_with_alias['iso'])
        if sum(null_isos):
            no_iso_match = data_with_alias[null_isos]
            logging.info('no match for these isos in the data being processed: ')
            logging.info(no_iso_match[c_code].unique())
            # presumably the '_x' fallback covers pandas suffixing the data column
            # when the alias table has a column of the same name - TODO confirm
            # NOTE(review): the line above this try already indexes no_iso_match[c_code]
            # unguarded, and the fallback rebinds c_code, which is reused in the
            # final log statement of this loop.
            try:
                logging.info(no_iso_match[c_code].unique())
                missed_isos[wri_id] = no_iso_match[c_code].unique()
            except:
                c_code = c_code+'_x'
                logging.info(no_iso_match[c_code].unique())
                missed_isos[wri_id] = no_iso_match[c_code].unique()
    
        ### data IS ALTERED HERE

        # NOTE(review): return value is discarded; this call only logs the indices
        clean_repeats(data_with_alias['name'])
        
        # Keep one merged row per run of equal isos (positions from clean_repeats);
        # the filtered list is wrapped in a Series with a fresh 0..k-1 index.
        # The except branches retry with the '_y'-suffixed column names.
        try:
            ixs = clean_repeats(data_with_alias['iso'])
            data['rw_country_code'] = pd.Series([val for ix, val in enumerate(data_with_alias['iso'].values) if ix in ixs])
        except:
            ixs = clean_repeats(data_with_alias['iso_y'])
            data['rw_country_code'] = pd.Series([val for ix, val in enumerate(data_with_alias['iso_y'].values) if ix in ixs])
        
        try:
            data['rw_country_name'] = pd.Series([val for ix, val in enumerate(data_with_alias['name'].values) if ix in ixs])
        except:
            data['rw_country_name'] = pd.Series([val for ix, val in enumerate(data_with_alias['name_y'].values) if ix in ixs])
            
        # Remove Carto bookkeeping columns when present
        try:
            data = data.drop('the_geom', axis=1)
        except:
            logging.info('unable to drop the_geom from {} data'.format(name))
            
        try:
            data = data.drop('cartodb_georef_status', axis=1)
        except:
            logging.info('unable to drop cartodb_georef_status from {} data'.format(name))

        # ISO matching was used, so the name-based branch below is skipped
        process_by_name = False
    
    # If country name is supplied, check how many match up with alias/name in country_aliases
    if c_name and process_by_name:       
        # Ensure that leading or trailing spaces don't break the match
        #data[c_name] = ['North Korea' if name=='Korea, Dem. People\x92s Rep.' else name for name in data[c_name]]
        _data = data.copy()
        
        # Same normalisation as the alias column (strip, lower-case, no spaces),
        # plus curly apostrophes replaced by straight ones
        _data['join_col'] = data[c_name].apply(lambda item: item.strip().lower().replace(' ','').replace('’', '\''))
    
        data_with_alias = _data.merge(alias_info, 
                                         left_on = 'join_col',
                                         right_on = 'alias',
                                         how='left') 

        null_aliases = pd.isnull(data_with_alias['alias'])             
            
        logging.info('data with alias df:')
        logging.info(data_with_alias.shape)
        logging.info(data_with_alias.head(6))
        logging.info('raw data')
        logging.info(_data.shape)
        logging.info(_data.head(5))
    
        
        if sum(null_aliases):
            no_alias_match = data_with_alias.loc[null_aliases]
            logging.info('missed aliases, matching on column "alias" of country_aliases')
            logging.info(no_alias_match)
            # Same '_x' fallback as in the ISO branch; it rebinds c_name
            try:
                logging.info(no_alias_match[c_name].unique())
                missed_names[wri_id] = no_alias_match[c_name].unique()
            except:
                c_name = c_name+'_x'
                logging.info(no_alias_match[c_name].unique())
                missed_names[wri_id] = no_alias_match[c_name].unique()
                
        ### data IS ALTERED HERE

        # NOTE(review): these assignments align on index labels; if the merge returned
        # more rows than `data` (duplicate alias values) rows could shift - confirm
        # that aliases are unique after normalisation.
        try:
            data['rw_country_code'] = data_with_alias['iso']
        except:
            data['rw_country_code'] = data_with_alias['iso_y']
            
        try:
            data['rw_country_name'] = data_with_alias['name']  
        except:
            data['rw_country_name'] = data_with_alias['name_y'] 
            
        try:
            data = data.drop('the_geom', axis=1)
        except:
            logging.info('unable to drop the_geom from {} data'.format(name))
            
        try:
            data = data.drop('cartodb_georef_status', axis=1)
        except:
            logging.info('unable to drop cartodb_georef_status from {} data'.format(name))

        
    ### SUCCESS
    logging.info('Final head:')
    # NOTE(review): c_name or c_code can be None (or carry an '_x' suffix) at this
    # point; presumably this column selection then raises KeyError - confirm.
    logging.info(data.head(30)[[c_name, c_code, 'rw_country_name', 'rw_country_code']])
    # Store the georeferenced copy back on the table entry
    data_tables[wri_id]['data'] = data.copy()

INFO:root:Processing table soc.076
INFO:root:Table head:   country_code                                       country_name  \
0          KAZ                                         Kazakhstan   
1          LAO                                            Lao PDR   
2          LAC  Latin America & Caribbean (excluding high income)   
3          PAN                                             Panama   
4          PHL                                        Philippines   

  indicator_code     indicator_name      yr_1960      yr_1961      yr_1962  \
0    SP.POP.TOTL  Population, total    9714260.0   10129861.0   10532062.0   
1    SP.POP.TOTL  Population, total    2120896.0    2170343.0    2221122.0   
2    SP.POP.TOTL  Population, total  206288466.0  212141158.0  218187764.0   
3    SP.POP.TOTL  Population, total    1132921.0    1167035.0    1202373.0   
4    SP.POP.TOTL  Population, total   26273025.0   27164617.0   28081231.0   

       yr_1963      yr_1964      yr_1965      yr_1966      

In [43]:
# List the tables that were actually loaded
for name, data in data_tables.items():
    # Test the entry itself: the old check compared the literal 'data' with
    # 'Unavailable', which is always True, so unavailable tables were listed too
    if not (isinstance(data, str) and data == 'Unavailable'):
        print(name)

soc_076_country_population


In [112]:
data_tables['soc.076']['data']

Unnamed: 0,country_code,country_name,indicator_code,indicator_name,yr_1960,yr_1961,yr_1962,yr_1963,yr_1964,yr_1965,yr_1966,yr_1967,yr_1968,yr_1969,yr_1970,yr_1971,yr_1972,yr_1973,yr_1974,yr_1975,yr_1976,yr_1977,yr_1978,yr_1979,yr_1980,yr_1981,yr_1982,yr_1983,yr_1984,yr_1985,yr_1986,yr_1987,yr_1988,yr_1989,yr_1990,yr_1991,yr_1992,yr_1993,yr_1994,yr_1995,yr_1996,yr_1997,yr_1998,yr_1999,yr_2000,yr_2001,yr_2002,yr_2003,yr_2004,yr_2005,yr_2006,yr_2007,yr_2008,yr_2009,yr_2010,yr_2011,yr_2012,yr_2013,yr_2014,yr_2015,yr_2016,rw_country_code,rw_country_name
0,KAZ,Kazakhstan,SP.POP.TOTL,"Population, total",9.714260e+06,1.012986e+07,1.053206e+07,1.091355e+07,1.126733e+07,1.158887e+07,1.187294e+07,1.212050e+07,1.234141e+07,1.255012e+07,1.275724e+07,1.296692e+07,1.317658e+07,1.338221e+07,1.357705e+07,1.375679e+07,1.392010e+07,1.407068e+07,1.421511e+07,1.436242e+07,1.451892e+07,1.468379e+07,1.485399e+07,1.503050e+07,1.521405e+07,1.540301e+07,1.560093e+07,1.580175e+07,1.598251e+07,1.624950e+07,1.634800e+07,1.645050e+07,1.643910e+07,1.633042e+07,1.609520e+07,1.581563e+07,1.557789e+07,1.533370e+07,15071300,14928426,14883626,14858335,14858948,14909018,1.501298e+07,15147029,15308084,1.548419e+07,15674000,16092701,16321581,16556600,16791425,17035275,17289224,1.754413e+07,1.779703e+07,KAZ,Kazakhstan
1,LAO,Lao PDR,SP.POP.TOTL,"Population, total",2.120896e+06,2.170343e+06,2.221122e+06,2.273349e+06,2.327137e+06,2.382594e+06,2.439196e+06,2.496920e+06,2.556852e+06,2.620434e+06,2.688428e+06,2.762265e+06,2.840841e+06,2.919287e+06,2.990965e+06,3.051577e+06,3.098973e+06,3.135842e+06,3.168843e+06,3.207328e+06,3.258144e+06,3.323377e+06,3.401242e+06,3.489977e+06,3.586381e+06,3.687898e+06,3.794043e+06,3.905163e+06,4.020295e+06,4.138408e+06,4.258472e+06,4.380073e+06,4.502363e+06,4.623280e+06,4.740380e+06,4.851923e+06,4.957180e+06,5.056519e+06,5150763,5241284,5329304,5414568,5497273,5579656,5.664605e+06,5754026,5849356,5.949787e+06,6052190,6152036,6246274,6333487,6415169,6494557,6576397,6.663967e+06,6.758353e+06,LAO,Laos
2,LAC,Latin America & Caribbean (excluding high income),SP.POP.TOTL,"Population, total",2.062885e+08,2.121412e+08,2.181878e+08,2.243975e+08,2.307267e+08,2.371430e+08,2.436333e+08,2.502033e+08,2.568619e+08,2.636260e+08,2.705062e+08,2.775029e+08,2.846058e+08,2.918040e+08,2.990822e+08,3.064290e+08,3.138353e+08,3.213006e+08,3.288327e+08,3.364439e+08,3.441406e+08,3.519243e+08,3.597827e+08,3.676905e+08,3.756144e+08,3.835295e+08,3.914199e+08,3.992874e+08,4.071456e+08,4.150169e+08,4.229146e+08,4.308386e+08,4.387717e+08,4.466899e+08,4.545615e+08,4.623606e+08,4.700848e+08,4.777329e+08,485282781,492708788,499996505,507131154,514126613,521033040,5.279197e+08,534836570,541798268,5.487899e+08,555795034,562785589,569737872,576648286,583515410,590319145,597035363,6.036449e+08,6.101364e+08,,
3,PAN,Panama,SP.POP.TOTL,"Population, total",1.132921e+06,1.167035e+06,1.202373e+06,1.238823e+06,1.276276e+06,1.314626e+06,1.353804e+06,1.393799e+06,1.434657e+06,1.476479e+06,1.519299e+06,1.563115e+06,1.607834e+06,1.653256e+06,1.699113e+06,1.745205e+06,1.791453e+06,1.837890e+06,1.884515e+06,1.931389e+06,1.978578e+06,2.026065e+06,2.073844e+06,2.121939e+06,2.170409e+06,2.219276e+06,2.268574e+06,2.318332e+06,2.368618e+06,2.419491e+06,2.471009e+06,2.523181e+06,2.576018e+06,2.629644e+06,2.684183e+06,2.739730e+06,2.796344e+06,2.853941e+06,2912328,2971197,3030347,3089684,3149265,3209174,3.269541e+06,3330465,3391905,3.453807e+06,3516268,3579385,3643222,3707782,3772938,3838462,3903986,3.969249e+06,4.034119e+06,PAN,Panama
4,PHL,Philippines,SP.POP.TOTL,"Population, total",2.627302e+07,2.716462e+07,2.808123e+07,2.901677e+07,2.996288e+07,3.091393e+07,3.186756e+07,3.282660e+07,3.379704e+07,3.478759e+07,3.580473e+07,3.685106e+07,3.792540e+07,3.902608e+07,4.014996e+07,4.129512e+07,4.246119e+07,4.365033e+07,4.486627e+07,4.611400e+07,4.739697e+07,4.871559e+07,5.006849e+07,5.145503e+07,5.287397e+07,5.432365e+07,5.580407e+07,5.731331e+07,5.884520e+07,6.039187e+07,6.194735e+07,6.350846e+07,6.507549e+07,6.665025e+07,6.823623e+07,6.983572e+07,7.144611e+07,7.306476e+07,74693695,76335812,77991569,79665315,81352060,83031954,8.467849e+07,86274237,87809419,8.929349e+07,90751864,92220879,93726624,95277940,96866642,98481032,100102249,1.017164e+08,1.033202e+08,PHL,Philippines
5,SWZ,Swaziland,SP.POP.TOTL,"Population, total",3.491740e+05,3.574530e+05,3.656360e+05,3.738970e+05,3.824690e+05,3.915460e+05,4.011830e+05,4.113520e+05,4.221400e+05,4.335880e+05,4.457290e+05,4.586050e+05,4.722300e+05,4.865610e+05,5.015120e+05,5.170240e+05,5.332140e+05,5.501180e+05,5.675590e+05,5.853440e+05,6.033720e+05,6.212760e+05,6.392370e+05,6.583200e+05,6.799760e+05,7.050850e+05,7.342430e+05,7.667070e+05,8.004560e+05,8.326820e+05,8.613730e+05,8.856230e+05,9.060340e+05,9.240250e+05,9.417740e+05,9.607920e+05,9.817640e+05,1.003995e+06,1026009,1045629,1061468,1072927,1080930,1087392,1.095053e+06,1105873,1120514,1.138434e+06,1158897,1180675,1202843,1225258,1248158,1271456,1295097,1.319011e+06,1.343098e+06,SWZ,Swaziland
6,AUT,Austria,SP.POP.TOTL,"Population, total",7.047539e+06,7.086299e+06,7.129864e+06,7.175811e+06,7.223801e+06,7.270889e+06,7.322066e+06,7.376998e+06,7.415403e+06,7.441055e+06,7.467086e+06,7.500482e+06,7.544201e+06,7.586115e+06,7.599038e+06,7.578903e+06,7.565525e+06,7.568430e+06,7.562305e+06,7.549425e+06,7.549433e+06,7.568710e+06,7.574140e+06,7.561910e+06,7.561434e+06,7.564985e+06,7.569794e+06,7.574586e+06,7.585317e+06,7.619567e+06,7.677850e+06,7.754891e+06,7.840709e+06,7.905633e+06,7.936118e+06,7.948278e+06,7.959017e+06,7.968041e+06,7976789,7992324,8011566,8042293,8081957,8121423,8.171966e+06,8227829,8268641,8.295487e+06,8321496,8343323,8363404,8391643,8429991,8479375,8541575,8.633169e+06,8.747358e+06,AUT,Austria
7,ATG,Antigua and Barbuda,SP.POP.TOTL,"Population, total",5.533900e+04,5.614400e+04,5.714400e+04,5.829400e+04,5.952400e+04,6.078100e+04,6.205900e+04,6.336000e+04,6.465500e+04,6.591000e+04,6.709800e+04,6.818800e+04,6.917600e+04,7.006600e+04,7.087800e+04,7.160900e+04,7.228500e+04,7.287500e+04,7.332400e+04,7.352800e+04,7.344200e+04,7.306600e+04,7.244800e+04,7.163900e+04,7.072500e+04,6.978200e+04,6.880900e+04,6.784500e+04,6.705800e+04,6.662700e+04,6.669600e+04,6.730700e+04,6.842700e+04,6.993800e+04,7.171900e+04,7.361900e+04,7.562800e+04,7.773900e+04,79851,81831,83584,85057,86266,87293,8.825700e+04,89253,90301,9.138100e+04,92478,93581,94661,95719,96777,97824,98875,9.992300e+04,1.009630e+05,ATG,Antigua and Barbuda
8,BRN,Brunei Darussalam,SP.POP.TOTL,"Population, total",8.174500e+04,8.559600e+04,8.951600e+04,9.357600e+04,9.784800e+04,1.024250e+05,1.073160e+05,1.124940e+05,1.179500e+05,1.236530e+05,1.295830e+05,1.357260e+05,1.420730e+05,1.485600e+05,1.551090e+05,1.616710e+05,1.682240e+05,1.747730e+05,1.812570e+05,1.876560e+05,1.939490e+05,2.000850e+05,2.061280e+05,2.121360e+05,2.182270e+05,2.245120e+05,2.309720e+05,2.376220e+05,2.444580e+05,2.515140e+05,2.587850e+05,2.662740e+05,2.739630e+05,2.817510e+05,2.895250e+05,2.971920e+05,3.046990e+05,3.120380e+05,319222,326289,333241,340117,346867,353389,3.595230e+05,365158,370250,3.748640e+05,379252,383772,388662,394013,399748,405716,411704,4.175420e+05,4.231960e+05,BRN,Brunei
9,ETH,Ethiopia,SP.POP.TOTL,"Population, total",2.215128e+07,2.267119e+07,2.322139e+07,2.379843e+07,2.439702e+07,2.501363e+07,2.564138e+07,2.628121e+07,2.694608e+07,2.765416e+07,2.841508e+07,2.924521e+07,3.013258e+07,3.102512e+07,3.185171e+07,3.256682e+07,3.314689e+07,3.362239e+07,3.406832e+07,3.459023e+07,3.526490e+07,3.612029e+07,3.713685e+07,3.828588e+07,3.951880e+07,4.080034e+07,4.212073e+07,4.349328e+07,4.493206e+07,4.645891e+07,4.808652e+07,4.982108e+07,5.164777e+07,5.353296e+07,5.543112e+07,5.730988e+07,5.915515e+07,6.097645e+07,62794151,64640054,66537331,68492257,70497192,72545144,7.462440e+07,76727083,78850689,8.100041e+07,83184892,85416253,87702670,90046756,92444183,94887724,97366774,9.987303e+07,1.024032e+08,ETH,Ethiopia


# Checking for missed names

In [46]:
print(missed_names)
print(missed_isos)

# Report, per dataset, every unmatched value that is not already listed in the
# known_non_un_isos table: names are checked against its 'name' column, isos
# against its 'iso' column.
for label, missed, known_column in (('names', missed_names, 'name'),
                                    ('isos', missed_isos, 'iso')):
    print('Newly missed {}:'.format(label))
    for wri_id, values in missed.items():
        print('Missed {} in data set {}'.format(label, wri_id))
        for value in values:
            already_known = value in georef['known_non_un_isos'][known_column].values
            if not already_known:
                print(value)

{}
{}
Newly missed names:
Newly missed isos:


In [211]:
## Process to investigate misses for a specific dataset
# Here, discovered issue with using apostrophe "’" instead of "'"
# Led to augmenting data prep for the georeferencing step above

# data_tables entries are {'name': ..., 'data': DataFrame}; take the frame itself
# and work on a copy so the stored table does not gain a stray join_col column
df = data_tables['for.020']['data'].copy()
df['join_col'] = df['country'].apply(lambda item: item.strip().lower().replace(' ',''))

df_a = df.merge(georef['aliases'],
                           left_on='join_col',
                           right_on='alias', 
                           how='left')
# Rows with no alias match (printed explicitly: a bare expression in the middle
# of a cell is never displayed)
print(df_a.loc[pd.isnull(df_a['alias']),['country','join_col','alias', 'iso', 'name']])

# `in` on a Series tests the index labels; check the values instead
print('democraticpeople’srepublicofkorea' in georef['aliases']['alias'].values)

LOOK_FOR = 'people\'s'
print('Viewing aliases with a name that contains {}:'.format(LOOK_FOR))
df = georef['aliases']
print(df.loc[df['alias'].str.lower().str.contains(LOOK_FOR)])

False

# Compile known not-included country names

In [90]:
def flatten(obj, new_list):
    """Reducer step: add every item of ``new_list`` to the end of ``obj``.

    ``obj`` is extended in place and returned, so it can be used as the
    accumulator of ``reduce``.
    """
    obj += new_list
    return obj

other_isos = reduce(flatten, missed_isos.values(), [])
other_names = reduce(flatten, missed_names.values(), [])

def gather_names(iso):
    """Prompt interactively for the official name of ``iso``.

    Returns the pair ``(iso, name)``, where ``name`` is the text typed in.
    """
    prompt = 'Official name of {}?'.format(iso)
    return (iso, input(prompt))

def gather_isos(name):
    """Prompt interactively for the official iso code of ``name``.

    Returns the pair ``(iso, name)``, where ``iso`` is the text typed in.
    """
    prompt = 'Official iso of {}?'.format(name)
    return (input(prompt), name)

isos_and_names = list(map(gather_names, other_isos))
names_and_isos = list(map(gather_isos, other_names))

Official name of XKX?Kosovo
Official name of PSE?Palestinian Territory, Occupied
Official name of TWN?Taiwan, Province of China
Official name of SUN?Union of Soviet Socialist Republics
Official name of SJM?Svalbard and Jan Mayen
Official name of CSK?Czechoslovakia
Official name of GIB?Gibraltar
Official name of CHI?None
Official name of REU?Réunion
Official name of HKG?Hong Kong
Official name of CUW?Curaçao
Official name of MSR?Montserrat
Official name of PRI?Puerto Rico
Official name of VAT?Holy See
Official name of TCA?Turks and Caicos Islands
Official name of MNP?Northern Mariana Islands
Official name of FLK?Falkland Islands (Malvinas)
Official name of MAC?Macao
Official name of NFK?Norfolk Island
Official name of TKL?Tokelau
Official name of VIR?Virgin Islands, U.S.
Official name of ABW?Aruba
Official name of ASM?American Samoa
Official name of PYF?French Polynesia
Official name of SCG?Serbia and Montenegro
Official name of COK?Cook Islands
Official name of GGY?Guernsey
Official na

In [156]:
print(isos_and_names)
print(names_and_isos)

[('XKX', 'Kosovo'), ('PSE', 'Palestinian Territory, Occupied'), ('TWN', 'Taiwan, Province of China'), ('SUN', 'Union of Soviet Socialist Republics'), ('SJM', 'Svalbard and Jan Mayen'), ('CSK', 'Czechoslovakia'), ('GIB', 'Gibraltar'), ('CHI', 'None'), ('REU', 'Réunion'), ('HKG', 'Hong Kong'), ('CUW', 'Curaçao'), ('MSR', 'Montserrat'), ('PRI', 'Puerto Rico'), ('VAT', 'Holy See'), ('TCA', 'Turks and Caicos Islands'), ('MNP', 'Northern Mariana Islands'), ('FLK', 'Falkland Islands (Malvinas)'), ('MAC', 'Macao'), ('NFK', 'Norfolk Island'), ('TKL', 'Tokelau'), ('VIR', 'Virgin Islands, U.S.'), ('ABW', 'Aruba'), ('ASM', 'American Samoa'), ('PYF', 'French Polynesia'), ('SCG', 'Serbia and Montenegro'), ('COK', 'Cook Islands'), ('GGY', 'Guernsey'), ('BMU', 'Bermuda'), ('GUF', 'French Guiana'), ('NCL', 'New Caledonia'), ('SHN', 'Saint Helena, Ascension and Tristan da Cunha'), ('INX', 'None'), ('GUM', 'Guam'), ('SXM', 'Sint Maarten (Dutch part)'), ('IMN', 'Isle of Man'), ('MTQ', 'Martinique'), ('ANT

In [170]:
# Deleted isos
deleted_isos = '''AFI French Afar and Issas
ATB British Antarctic Territory
ATN Dronning Maud Land
CTE Canton and Enderbury Islands
DDR German Democratic Republic
DHY Dahomey
GEL Gilbert and Ellice Islands
HVO Upper Volta
JTN Johnston Island
MID Midway Islands
NHB New Hebrides
PCI Pacific Islands, Trust Territory of the
PCZ Panama Canal Zone
PHI Philippines – Code changed to PHL
PUS U.S. Miscellaneous Pacific Islands
RHO Southern Rhodesia
SKM Sikkim
VDR Viet-Nam, Democratic Republic of
WAK Wake Island
YMD Yemen, Democratic'''

# Each line of the text block is "<iso> <name>": split on the first space only
by_line = deleted_isos.split('\n')
by_pair = [(code, label)
           for code, sep, label in (line.partition(' ') for line in by_line)]
deleted_isos = pd.DataFrame(by_pair)
deleted_isos.columns = ['iso', 'name']
deleted_isos['reason'] = 'deleted'

# Answers collected interactively for isos that had no match
not_reported_isos = pd.DataFrame(isos_and_names)
not_reported_isos.columns = ['iso', 'name']
not_reported_isos['reason'] = 'not un'

# Answers collected interactively for names that had no match; the rows at
# positions 0, 7, 19 and 32 are removed
not_reported_names = pd.DataFrame(names_and_isos)
not_reported_names.columns = ['iso', 'name']
not_reported_names = not_reported_names.drop([0, 7, 19, 32])
not_reported_names['reason'] = 'not un'

In [171]:
# Stack the three frames (deleted codes, unmatched isos, unmatched names).
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame
# in a single call and returns a new object, so no explicit copy is needed.
df = pd.concat([deleted_isos, not_reported_isos, not_reported_names])

# Publish the combined table to Carto
cc.write(df, 'known_non_un_isos')

Table successfully written to CARTO: https://wri-rw.carto.com/dataset/known_non_un_isos


# Delete rows w/ no official name

In [None]:
# Drop rows that were not matched to an official country name. The cell was
# previously left unfinished (`data_tables[]` is a syntax error).
# NOTE(review): implemented from the section heading - confirm that removing
# unmatched rows (e.g. regional aggregates) is the intended behaviour.
for name, data in data_tables.items():
    # Unavailable tables are stored as a string; tables that were not
    # georeferenced have no rw_country_name column to filter on
    if isinstance(data, str) or 'rw_country_name' not in data['data'].columns:
        continue
    data_tables[name]['data'] = data['data'].dropna(subset=['rw_country_name'])

# Uploading finished files to Carto and S3

In [36]:
# Upload every successfully processed table to Carto
for name, data in data_tables.items():
    #print(data.head())
    #write_to_S3(data,s3_bucket,s3_folder+name+'_long')
    #print('saved ' + name + ' long data to s3')
    if isinstance(data, str):
        # load_data stores the string 'Unavailable' for tables it could not read
        print('skipped ' + name + ': data unavailable')
        continue
    # Each entry is {'name': table_name, 'data': DataFrame}: write the frame,
    # not the wrapping dict.
    # NOTE(review): the destination is still the wri_id key, as before - confirm
    # that this, rather than data['name'], is the intended Carto table name.
    cc.write(data['data'], name, overwrite=True)
    print('saved ' + name + ' long data to Carto')
    #print('failed to write table ' + name)

saved soc.076 long data to Carto
