In [13]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from pandas import Series
from datetime import datetime
from copy import deepcopy
import json
import matplotlib.pyplot as plt
import seaborn as sns
import os
import geopandas as gpd
import fsspec
import re

from time import sleep

# Mapbox
from mapbox import Geocoder


# Import dataset
aurora_cara = pd.read_csv("Aurora v2.1 data file - caracterización.csv")
aurora_feedback = pd.read_csv("Aurora v2.1 data file - ayudaHumanitaria.csv")
aurora_monitoreo = pd.read_csv("Aurora v2.1 data file - monitoreo.csv")

default_value = 999999

ParserError: Error tokenizing data. C error: Expected 1 fields in line 3, saw 4


In [2]:
# functions
def loadLocalJsonDoc(filepath, dataProp=''):
    """
    return deserialised json in dictionary

    Parameters
    ----------
    filepath: file location or buffer.
    dataProp: (optional) specified property to access required data
    """
    output = {}
    with open(file=filepath, mode='r', encoding='utf-8') as f:
        json_load = json.load(f)
        if (dataProp):
            output = json_load[dataProp]
        else:
            output = json_load
    return output


def changeCountriesByExpression(country, valueDict: dict[str, str]):
    output = ""
    for key, value in valueDict.items():
        match = re.match(r"^"+key+r".$", country)
        if (match):
            return value

    return output if len(output) else country


def processCountries(countries: list[str], valueDict: dict[str, str]):
    output = []
    for country in countries:
        try:
            new_country = changeCountriesByExpression(
                country=country, valueDict=valueDict)
            output.append(new_country)
        except Exception as e:
            output.append(default_value)
    return output


def getCountriesWithCoordinates(countries: list[str], geo_countries: gpd.GeoDataFrame):
    output = {}
    for country in countries:
        try:
            filtered_country = geo_countries[geo_countries["NAME"].str.lower(
            ) == country].reindex()
            centroidValue = (filtered_country.centroid).iloc[0]
            output[country] = {"x": centroidValue.x, "y": centroidValue.y}
        except Exception as e:
            print(e)
            output[country] = {"x": default_value, "y": default_value}

    return output


def toUnixTimestamp(time, format: str = "%d/%m/%Y"):
    start = datetime(1970, 1, 1)
    target = datetime.strptime(time, format)
    in_seconds = (target - start).total_seconds()
    in_milliseconds = int(in_seconds) * 1000
    return in_milliseconds


def getCoordinate(value: str, side: str, valueDict: dict[str, tuple[int, int]], expressionDict: dict[str, str]):
    try:
        country = changeCountriesByExpression(value, expressionDict)
        return valueDict[country][side]
    except Exception as e:
        return default_value


def processFieldCoordinates(df: pd.DataFrame, columnDict: dict[str, dict[str, str]], valueDict: dict[str, tuple[int, int]], expressionDict: dict[str, str]):
    local_df = deepcopy(df)
    for column in columnDict.keys():
        local_df[columnDict[column]["x"]] = local_df[column].str.lower().apply(
            lambda x: getCoordinate(x, "x", valueDict, expressionDict))
        local_df[columnDict[column]["y"]] = local_df[column].str.lower().apply(
            lambda x: getCoordinate(x, "y", valueDict, expressionDict))

    return local_df

def processGeocodeData(data):
    features = data['features']
    for feature in features:
        id: str = feature['id']
        match = id.startswith("country")
        if (match):
            return (feature['properties']['short_code'], feature["place_name"])

    return "zz"


def getMapboxGeocoder(token:str):
    if(token):
        return Geocoder(access_token=token)
    else: 
        raise Exception("Invalid Token")

def reverseGeocode(longitude: int, latitude: int, token: str):
    mb_geocoder = getMapboxGeocoder(token)
    response = mb_geocoder.reverse(lat=latitude, lon=longitude)
    if (response.status_code == 200):
        data = response.json()
        return data
    else:
        return None


def processReverseGeoding(data: list[tuple[int, int]], token:str):
    output = []
    for lon, lat in data:
        try:
            result = reverseGeocode(lon, lat, token)
            _decoded = processGeocodeData(result)
            output.append(_decoded)
            sleep(1)
        except:
            output.append(("zz", ""))

    return output

def addReverseGeocodedToDataFrame(df: DataFrame, token:str  ):
    local_df = deepcopy(df)
    coordinates = list(zip(list(local_df['longitude'].astype(float).to_list()), list(
    local_df['latitude'].astype(float).to_list())))
    reversed_geocoded_df = processReverseGeoding(coordinates,token)
    local_df["country_code"] = [x[0] for x in reversed_geocoded_df]
    local_df["country_name"] = [x[1] for x in reversed_geocoded_df]
    return local_df

In [3]:
aurora = pd.merge(aurora_cara, aurora_feedback)

In [4]:
# Drop observations of Aurora team phones, test registers and geographical atypical rows 


user_ids_to_remove = [311571598, 311398466, 311396734, 311361421, 311361350, 311361257, 311337494, 311325070,
                      311325038, 311272934, 310820267, 310543580, 310357249, 310191611, 308421831, 306028996,
                      310191611, 308421831, 306028996, 311725039, 311719001, 311718121, 311699383, 311696700,
                      312179120, 311965863, 311965863, 316773170, 311440316, 313260546, 316563135, 316734459,
                      317064115]

for user_id in user_ids_to_remove:
    aurora = aurora.drop(aurora[aurora.UserId == user_id].index)



In [6]:
aurora=aurora[aurora['Consentimiento'] != 'NO'] 
aurora=aurora[aurora['¿Cómo interactúa con el sistema?'] != 'QR-Enganche'] 
aurora=aurora[aurora['Latitud'] != "None"] 

In [7]:
#Rename variables 
newColumns = {  'UserId' : 'objectid',
                'Edad'	: 'e06_edad',
                'Género' :	'e07_gener',
                'Latitud' : 'lat',
                'Longitud' : 'lon',
                "¿En qué país naciste?" : 'e08_pais_',
                'Otro país de nacimiento'	: 'e09_otro_p',
                '¿En qué país iniciaste tu viaje actual?'	: 'e10_pais_',
                'Otro país de inicio'	: 'e11_otro_p',
                '¿En qué país vivías hace un año?'	 : 'e12_pais_',
                'Otro país'	: 'e13_otro_p',
                'Restringir una o más raciones de alimentos' :	'e15__has_',
                'Dormir a la intemperie' :	'e16__tu_',
                'Ha necesitado asistencia médica' : 'asistencia_medica',
                '¿Cuántas personas te acompañan en tu viaje?' :	'e17__cua',
                'Hay niños, niñas o adolescentes'	 : 'e18__entr',
                'Total NNA'	: 'e19_cu',
                'NNA de 0 a 5' :	'e20__cua',
                'NNA de 6 a 11' :	'e21__cua',
                'NNA de 12 a 17'	: 'e22__cua',
                'Lugar interacción'	: 'e24__me_c',
                'Mujer embarazo viajando' : 'm01__en_t',
                'Mujer lactando viajando' : 'lactante',
                'Tienes alguna enfermedad crónica'	: 'm02__en_t',
                'Tienes alguna condición de discapacidad'	: 'm03__dent',
                'Cuáles han sido tus 3 principales necesidades': 'necesidades', 
                '¿Recibiste ayuda humanitaria en el lugar actual?' : 'm09__acce',
                'Cual ayuda humanitaria' :	'm12__cua',       
                'Qué tan fácil fue acceder a la ayuda' :	'm14_respec',
                'Qué tan satisfecho te sientes respecto a la ayuda' : 'm15__que',
               'Recomendarías la ayuda ' : 'm16_de_acu',
               'Cual ayuda humanitaria NNA' :	'm18_me_con',      
               'NNA: Qué tan fácil fue acceder a la ayuda' : 'm19_respec',
               'NNA: Qué tan satisfecho te sientes respecto a la ayuda' :	'm20__que',
               'NNA: Recomendarías la ayuda' : 'm21_de_acu',

}

aurora_carto = aurora.rename(columns=newColumns)

In [8]:
available_countries = [x.lower() for x in list(set(list(aurora_carto["e08_pais_"].unique(
)) + list(aurora_carto["e10_pais_"].unique()) + list(aurora_carto["e12_pais_"].unique()))) if type(x) == str]

countries_dict = loadLocalJsonDoc("defaults/countries_dict.json")

available_countries = processCountries(available_countries, countries_dict)

# adding coordinates value
country_data_path = "simplecache::https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/110m/cultural/ne_110m_admin_0_countries.zip"

country_df = ""

with fsspec.open(country_data_path) as file:
    country_df = gpd.read_file(file)

countriesWithCoordinates = getCountriesWithCoordinates(
    available_countries, country_df)
country_column_dict = loadLocalJsonDoc("defaults/country_column_dict.json")
aurora_carto = processFieldCoordinates(
    aurora_carto, country_column_dict, countriesWithCoordinates, countries_dict)
aurora_carto['lon_eng'] = aurora_carto['lon']
aurora_carto['lat_eng'] = aurora_carto['lat']
aurora_carto['longitude'] = aurora_carto['lon']
aurora_carto['latitude'] = aurora_carto['lat']

single positional indexer is out-of-bounds



  centroidValue = (filtered_country.centroid).iloc[0]


In [9]:
#Create the variable time
# the format was missing other elements thus the timestring was not parsing
aurora_carto["timeunix"] = aurora_carto["Inicio interacción"].apply(lambda x: toUnixTimestamp(x, '%Y-%m-%d %H:%M:%S.%f+00:00'))

In [10]:
MAPBOX_TOKEN = os.environ.get("MAPBOX_TOKEN")
# This is heavy process that takes a while to finish
# should be used sparingly and closer to end processes.
aurora_carto = addReverseGeocodedToDataFrame(aurora_carto, MAPBOX_TOKEN)

In [11]:
#filling missing values
# should be done at the very end
aurora_carto = aurora_carto.fillna(default_value)

In [12]:
#database for Carto
aurora_carto.to_csv('aurora_round_2.csv', index=False)