In [168]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from pandas import Series
from copy import deepcopy
from datetime import datetime
import json

defaultMissingValue = 999999


def loadLocalJsonDoc(filepath, dataProp=''):
    """
    Read a UTF-8 JSON file and return its deserialised content.

    Parameters
    ----------
    filepath: location of the JSON file (anything accepted by `open`).
    dataProp: (optional) top-level key to return instead of the whole
        document; an empty value returns the whole document.

    Returns
    -------
    The parsed document, or `document[dataProp]` when `dataProp` is truthy.
    """
    with open(file=filepath, mode='r', encoding='utf-8') as handle:
        document = json.load(handle)
        # A falsy dataProp (the default '') means "give me everything".
        if not dataProp:
            return document
        return document[dataProp]

# Load the raw survey export (semicolon-separated; index_col=False keeps the
# first column as data instead of using it as the index).
services = pd.read_csv(
    'Aurora_OfficeRonda_2_-_latest_version_-_False_-_2023-10-04-16-15-49.csv', sep=';', index_col=False)

# Replace every missing value, in every column, with defaultMissingValue.
services = services.fillna(defaultMissingValue)

# Column renames come from a JSON document passed straight to DataFrame.rename
# (presumably an {old_name: new_name} mapping -- confirm against the file).
newColumns = loadLocalJsonDoc("defaults/rename_columns.json")

# `services` keeps the original column names; `services_carto` has the renamed ones.
services_carto = services.rename(columns=newColumns)

In [169]:
def toUnixTimestamp(time, format:str = "%d/%m/%Y"):
    """
    Convert a date/time string to whole milliseconds since 1970-01-01 00:00:00.

    Parameters
    ----------
    time: the date/time text to parse.
    format: `datetime.strptime` format describing `time` (default "%d/%m/%Y").

    Returns
    -------
    int: milliseconds between the parsed value and 1970-01-01 00:00:00.
    The parsed value is compared as-is with a naive epoch; no timezone
    conversion is applied.

    Raises
    ------
    ValueError: if `time` does not match `format`.
    TypeError: if `time` is not a string.
    """
    # Imported locally: elsewhere in this notebook `import datetime` rebinds the
    # global name `datetime` from the class to the module, which would break a
    # function relying on the global once that cell has run.
    from datetime import datetime as _datetime, timedelta as _timedelta

    epoch = _datetime(1970, 1, 1)
    target = _datetime.strptime(time, format)
    # Exact integer division of timedeltas keeps any sub-second part (e.g. from
    # a "%f" format) instead of truncating to whole seconds first.
    return (target - epoch) // _timedelta(milliseconds=1)

def codifyServices(value: str, values_dict: dict[str, int], otherValue:str):
    """
    Encode a space-separated multi-select answer as pipe-separated codes.

    Parameters
    ----------
    value: the raw cell content; expected to be a string of labels separated
        by single spaces. Any non-string (missing-value placeholder, NaN,
        None, numpy scalar, ...) is treated as "no answer".
    values_dict: mapping from label to code.
    otherValue: text used for a non-string `value` and for every label that
        is not a key of `values_dict`.

    Returns
    -------
    str: `otherValue` for non-string input, otherwise the codes (as text)
    joined with "|", in the order the labels appear.
    """
    # isinstance instead of exact type comparison: None, bools and numpy
    # numeric scalars have no .split and must also fall back to otherValue.
    if not isinstance(value, str):
        return otherValue

    coded = []
    # Split on a single space (not arbitrary whitespace) so an empty token
    # produced by consecutive spaces is still reported as otherValue.
    for label in value.split(" "):
        if label in values_dict:
            coded.append(str(values_dict[label]))
        else:
            coded.append(otherValue)
    return "|".join(coded)

def processColumn(dfColumn:Series, values_dict: dict[int, str], other_value: str):
    """
    Encode every cell of a column with `codifyServices`.

    `values_dict` maps code -> label; it is inverted to label -> code before
    being handed to `codifyServices` (if two codes share a label, the later
    one wins). Returns the Series produced by `dfColumn.apply`.
    """
    label_to_code = {label: code for code, label in values_dict.items()}

    def encode_cell(cell):
        return codifyServices(cell, label_to_code, other_value)

    return dfColumn.apply(encode_cell)

def processMultValueColumns(df: DataFrame, columnObjectsList: list[dict]):
    """
    Add one encoded column to `df` per entry of `columnObjectsList`.

    df: DataFrame object; it is modified in place.
    columnObjectsList: list of column objects, each a dictionary
        {"target_column": str, "output_column": str, "values_dict": dict, "other_value": str}
        where "target_column" is the column read and "output_column" the
        column written.

    Best effort: any exception raised while handling an entry is printed and
    that entry is skipped.

    return the same DataFrame object that was passed in
    """
    for spec in columnObjectsList:
        try:
            source_name = spec["target_column"]
            destination_name = spec["output_column"]
            code_table = spec["values_dict"]
            fallback = str(spec["other_value"])
            encoded = processColumn(df[source_name], code_table, fallback)
            df[destination_name] = encoded
        except Exception as error:
            # Report and move on to the next entry.
            print(error)
    return df

def exportToFile(df: DataFrame,fileType: str, exportName: str):
    """ 
    Write `df` to disk and print the name of the file written.

    df -> Pandas DataFrame object
    fileType -> "csv" writes `<exportName>.csv` with DataFrame.to_csv defaults;
                any other value writes `<exportName>.json` as a list of records
    exportName -> File location without extension
    """
    as_csv = fileType == "csv"
    name = f"{exportName}.csv" if as_csv else f"{exportName}.json"
    if as_csv:
        df.to_csv(name)
    else:
        df.to_json(name, orient="records")
    print (f"data export to {name}")
    

In [171]:
# Parse each `fecha` value as a "%Y-%m-%d" date and store the result of
# toUnixTimestamp (epoch milliseconds) in a new `timeunix` column.
services_carto["timeunix"] = services_carto["fecha"].apply(lambda x: toUnixTimestamp(time=x, format="%Y-%m-%d"))

In [172]:
def codifyServices(value: str, values_dict: dict[str, int], otherValue:str):
    """
    Encode a space-separated multi-select answer as pipe-separated codes.

    Parameters
    ----------
    value: the raw cell content; expected to be a string of labels separated
        by single spaces. Any non-string (missing-value placeholder, NaN,
        None, numpy scalar, ...) is treated as "no answer".
    values_dict: mapping from label to code.
    otherValue: text used for a non-string `value` and for every label that
        is not a key of `values_dict`.

    Returns
    -------
    str: `otherValue` for non-string input, otherwise the codes (as text)
    joined with "|", in the order the labels appear.
    """
    # NOTE(review): this name is also defined in an earlier cell; this later
    # definition is the one in effect after a top-to-bottom run.

    # isinstance instead of exact type comparison: None, bools and numpy
    # numeric scalars have no .split and must also fall back to otherValue.
    if not isinstance(value, str):
        return otherValue

    coded = []
    # Split on a single space (not arbitrary whitespace) so an empty token
    # produced by consecutive spaces is still reported as otherValue.
    for label in value.split(" "):
        if label in values_dict:
            coded.append(str(values_dict[label]))
        else:
            coded.append(otherValue)
    return "|".join(coded)

def processColumn(dfColumn:Series, values_dict: dict[int, str], other_value: str):
    """
    Encode every cell of a column with `codifyServices`.

    `values_dict` maps code -> label; it is inverted to label -> code before
    being handed to `codifyServices` (if two codes share a label, the later
    one wins). Returns the Series produced by `dfColumn.apply`.
    """
    # NOTE(review): this name is also defined in an earlier cell; this later
    # definition is the one in effect after a top-to-bottom run.
    label_to_code = {label: code for code, label in values_dict.items()}

    def encode_cell(cell):
        return codifyServices(cell, label_to_code, other_value)

    return dfColumn.apply(encode_cell)

def processMultValueColumns(df: DataFrame, columnObjectsList: list[dict]):
    """
    Add one encoded column to `df` per entry of `columnObjectsList`.

    df: DataFrame object; it is modified in place.
    columnObjectsList: list of column objects, each a dictionary
        {"target_column": str, "output_column": str, "values_dict": dict, "other_value": str}
        where "target_column" is the column read and "output_column" the
        column written.

    Best effort: any exception raised while handling an entry is printed and
    that entry is skipped.

    return the same DataFrame object that was passed in
    """
    # NOTE(review): this name is also defined in an earlier cell; this later
    # definition is the one in effect after a top-to-bottom run.
    for spec in columnObjectsList:
        try:
            source_name = spec["target_column"]
            destination_name = spec["output_column"]
            code_table = spec["values_dict"]
            fallback = str(spec["other_value"])
            encoded = processColumn(df[source_name], code_table, fallback)
            df[destination_name] = encoded
        except Exception as error:
            # Report and move on to the next entry.
            print(error)
    return df

def exportToFile(df: DataFrame,fileType: str, exportName: str):
    """ 
    Write `df` to disk and print the name of the file written.

    df -> Pandas DataFrame object
    fileType -> "csv" writes `<exportName>.csv` with DataFrame.to_csv defaults;
                any other value writes `<exportName>.json` as a list of records
    exportName -> File location without extension
    """
    # NOTE(review): this name is also defined in an earlier cell; this later
    # definition is the one in effect after a top-to-bottom run.
    as_csv = fileType == "csv"
    name = f"{exportName}.csv" if as_csv else f"{exportName}.json"
    if as_csv:
        df.to_csv(name)
    else:
        df.to_json(name, orient="records")
    print (f"data export to {name}")
    

In [153]:
# Code tables for the multi-select columns. Each answer is later re-encoded as
# pipe-separated codes (see processMultValueColumns).
codify_dict = loadLocalJsonDoc("defaults/codification_dict.json")

# service types (serv_tipo)
services_dict = codify_dict["services_dict"]

# facilities available (cuenta_con)
cuenta_con_dict = codify_dict["cuenta_con_dict"] 

# children services (cual_serv1)
cual_serv1_dict = codify_dict["cual_serv1_dict"] 

# women services (cual_ser_2)
cual_ser_2_dict = codify_dict["cual_ser_2_dict"] 

# data storage (almacenamientoregistros)
registro_dict = codify_dict["registro_dict"] 

# funding (financiamiento)
financ_dict = codify_dict["financ_dict"] 

# challenges (princ_reto)
reto_dict = codify_dict["reto_dict"] 

# languages (idioma_ent)
idio_dict = codify_dict["idio_dict"] 

# media (medios_bri)
medio_dict = codify_dict["medio_dict"] 

# One entry per column to encode: the column read ("target_column"), the
# column written ("output_column"), its code table, and the fallback code.
values = [
    {
        "target_column": "serv_tipo",
        "output_column": "serv_tipo1",
        "values_dict": services_dict,
        "other_value": defaultMissingValue
    },
    {
        "target_column": "cuenta_con",
        "output_column": "cuenta_c_1",
        "values_dict": cuenta_con_dict,
        "other_value": defaultMissingValue
    },
    {
        "target_column": "cual_serv1",
        "output_column": "cual_ser_1",
        "values_dict": cual_serv1_dict,
        "other_value": defaultMissingValue
    },
    {
        "target_column": "cual_ser_2",
        "output_column": "cual_ser_3",
        "values_dict": cual_ser_2_dict,
        "other_value": defaultMissingValue
    },
    {
        "target_column": "almacenamientoregistros",
        "output_column": "almacenamientoregistros_",
        # Fixed: this entry previously reused cual_ser_2_dict (copy-paste from
        # the entry above), leaving the data-storage table loaded but unused.
        "values_dict": registro_dict,
        "other_value": defaultMissingValue
    },
    {
        "target_column": "financiamiento",
        "output_column": "financb",
        "values_dict": financ_dict,
        "other_value": defaultMissingValue
    },
    {
        "target_column": "princ_reto",
        "output_column": "princ_re_1",
        "values_dict": reto_dict ,
        "other_value": defaultMissingValue
    },
    {
        "target_column": "idioma_ent",
        "output_column": "idioma_e_1",
        "values_dict": idio_dict ,
        "other_value": defaultMissingValue
    },
    {
        "target_column": "medios_bri",
        "output_column": "medios_b_1",
        "values_dict": medio_dict ,
        "other_value": defaultMissingValue
    },
]

In [154]:
# Encode every column described in `values`. The helper writes the new columns
# onto services_carto itself and returns that same frame, so output_df and
# services_carto refer to one object.
output_df = processMultValueColumns(services_carto, values)

In [155]:
# Write the encoded frame to "output.csv" in the working directory.
exportToFile(output_df, "csv", "output")

data export to output.csv


# daily report of surveys 

In [156]:
# NOTE(review): this rebinds the global name `datetime` -- imported in the first
# cell as the class (`from datetime import datetime`) -- to the module. Nothing
# in this cell uses it; code that relies on the global being the class will
# fail if it is (re-)run after this cell.
import datetime

# Parse the date column into pandas datetimes, overwriting it on `services`.
services.fecha = pd.to_datetime(services.fecha)

In [157]:
# Totals: number of rows with a non-null `organizacionprincipal` for each
# `encuesta` value, reported in a column named `total_encuestas`.
survey_counts = services.groupby('encuesta').organizacionprincipal.count().reset_index()
survey_counts = survey_counts.rename(columns={"organizacionprincipal": "total_encuestas"})
# NOTE(review): a bare expression that is not the last line of a cell is not
# displayed; only `pivoted` below is shown.
survey_counts

# Counts per (`encuesta`, `fecha`) pair -- non-null `pais` values -- reshaped to
# one row per `encuesta` and one column per `fecha`.
unpivoted = services.groupby(['encuesta', 'fecha'])['pais'].count().reset_index()
pivoted = unpivoted.pivot(
    columns='fecha',
    index= 'encuesta',
    values='pais')
pivoted

fecha,2023-10-03,2023-10-04
encuesta,Unnamed: 1_level_1,Unnamed: 2_level_1
joseeduardoherrera,1.0,
luisdavidcalderonpatino,1.0,
luisricardosolermadrid,2.0,
mariaalejandraolartedelgado,3.0,
otro,1.0,1.0


In [158]:
# Capacity vs. attentions: split the space-separated `servicios` text into a
# list, then explode so that each listed service gets its own row.
# NOTE(review): the split result is stored back on `services`, so re-running
# this cell applies `.str.split` to values that are already lists.
services['servicios'] = services['servicios'].str.split(' ')
prueba = services.explode('servicios')

In [159]:
# Keep only the location/organisation fields and the per-service count columns
# (prefixes `diario`, `ayer`, `prom`). reset_index() without drop=True moves the
# previous index into a new `index` column.
prueba= prueba[['_coordenadas_latitude', 
                '_coordenadas_longitude', 
                'servicios', 
                'organizacionprincipal', 
                'organizacionimplementadora',
                'pais',
                'punto_reporte', 
                '_uuid',
                'diarioalimentacionynutricion',
                'ayeralimentacionynutricion',
                'promalimentacionynutricion',
                'diarioalojamientotemporal',
                'ayeralojamientotemporal',
                'promalojamientotemporal',
                'diarioapoyooayudapsicosocial',
                'ayerapoyooayudapsicosocial',
                'promapoyooayudapsicosocial',
                'diarioasesorialegal',
                'ayerasesorialegal',
                'promasesorialegal',
                'diarioeducacion',
                'ayereducacion',
                'promeducacion',
                'diarioserviciosproteccion', 
                'ayerserviciosproteccion',
                'promserviciosproteccion',
                'diariorestablecimientodecontactofamiliar',
                'ayerrestablecimientodecontactofamiliar',
                'promrestablecimientodecontactofamiliar',
                'diariosaludprimerosauxiliosyatencionmedica',
                'ayersaludprimerosauxiliosyatencionmedica',
                'promsaludprimerosauxiliosyatencionmedica',
                'diarioaguapotable',
                'ayeraguapotable',
                'promaguapotable',
                'diariosaneamiento',
                'ayersaneamiento',
                'promsaneamiento',
                'diariohigiene',
                'ayerhigiene',
                'promhigiene',
                'diariotransportehumanitario',
                'ayertransportehumanitario',
                'promtransportehumanitario',
                'diariotransferenciasmonetarias',
                'ayertransferenciasmonetarias',
                'promtransferenciasmonetarias',
                'diariootro',
                'ayerotro',
                'promotro',
                'diariootro1',
                'ayerotro1',
                'promotro1', 
                ]].reset_index()


In [160]:
# Create the three result columns, initialised to a single space in every row;
# rows that no later lookup matches keep this placeholder.
prueba["capacidad"]=" "
prueba["ayer"]=" "
prueba["semana_pasada"]=" "


In [161]:
# Look up function

def fillColumnWithLookup(df: DataFrame, values: list[tuple[str, str]], lookup_column: str, target_column:str):
    """ 
    Return a copy of `df` in which `target_column` is filled from other columns,
    chosen row by row according to the value found in `lookup_column`.

    Parameters
    ----------
    df: DataFrame; it is not modified.
    values: list of (test value, source column) pairs. For every row whose
        `lookup_column` equals the test value, `target_column` receives that
        row's value from the source column. Pairs are applied in order.
    lookup_column: name of the column compared against each test value.
    target_column: name of the column that receives the looked-up values.
    """
    result = df.copy(deep=True)
    for wanted, source_column in values:
        matches = result[lookup_column] == wanted
        # The right-hand side is the whole source column; .loc aligns it on the
        # index, so only the matching rows are written.
        result.loc[matches, target_column] = result[source_column]
    return result

def processColumnFill(df: DataFrame, columnValues: list[dict]):
    """ 
    Apply `fillColumnWithLookup` once per entry of `columnValues` and return
    the resulting DataFrame; `df` itself is not modified.

    Parameters
    ----------
    df: DataFrame
    columnValues: list of dictionaries, each having:
        values: list of (test value, source column) pairs
        lookup_column: name of the column compared against each test value
        target_column: name of the column that receives the looked-up values
    """
    result = deepcopy(df)
    for spec in columnValues:
        lookup_name = spec["lookup_column"]
        target_name = spec["target_column"]
        pairs = spec["values"]
        # Each step works on the output of the previous one.
        result = fillColumnWithLookup(
            df=result,
            values=pairs,
            target_column=target_name,
            lookup_column=lookup_name,
        )
    return result

In [162]:
# (value found in `servicios`, suffix shared by that service's count columns).
# The count columns are named <prefix><suffix> with prefix `diario`, `ayer` or
# `prom`; note that for some services the suffix differs from the service value.
serviceColumnSuffixes = [
    ('alimentacionynutricion', 'alimentacionynutricion'),
    ('alojamientotemporal', 'alojamientotemporal'),
    ('acompañamientoayudapsicosocial', 'apoyooayudapsicosocial'),
    ('aguapotable', 'aguapotable'),
    ('asesorialegal', 'asesorialegal'),
    ('educacion', 'educacion'),
    ('higiene', 'higiene'),
    ('restablecimientodecontactofamiliar', 'restablecimientodecontactofamiliar'),
    ('saludprimerosauxiliosyatencionmedica', 'saludprimerosauxiliosyatencionmedica'),
    ('saneamiento', 'saneamiento'),
    ('serviciosdeproteccion', 'serviciosproteccion'),
    ('transferencias', 'transferenciasmonetarias'),
    ('transportehumanitario', 'transportehumanitario'),
    ('otro', 'otro'),
]

# (service value, source column) pairs for each of the three column families.
diarValues = [(service, 'diario' + suffix) for service, suffix in serviceColumnSuffixes]
ayerValues = [(service, 'ayer' + suffix) for service, suffix in serviceColumnSuffixes]
promValues = [(service, 'prom' + suffix) for service, suffix in serviceColumnSuffixes]

# One lookup specification per result column; all of them test `servicios`.
columnValues = [
    {
        "target_column": result_column,
        "lookup_column": "servicios",
        "values": pairs
    }
    for result_column, pairs in (
        ("capacidad", diarValues),
        ("ayer", ayerValues),
        ("semana_pasada", promValues),
    )
]

In [163]:
prueba = processColumnFill(df=prueba, columnValues=columnValues)

In [164]:
# Final column names for the exported table.
# NOTE(review): this reuses the name `newColumns`, which the first cell bound to
# the rename mapping loaded from JSON.
# NOTE(review): `semana_pasada` was filled from the `prom*` columns and `ayer`
# from the `ayer*` columns -- confirm that the labels 'semana+anterior' and
# 'promedio+ayer' are the intended ones for each.
newColumns = { '_uuid' : 'submission_id',
               'servicios' : 'servicio',
               'semana_pasada' : 'semana+anterior',
               'ayer' : 'promedio+ayer',
               '_coordenadas_latitude' : 'observation_lat',
               '_coordenadas_longitude' : 'observation_lon',
               'punto_reporte' : 'Región'
               }

prueba = prueba.rename(columns=newColumns)

In [165]:
# Keep only the columns that go into the export. reset_index() without
# drop=True also adds the previous index as an `index` column.
wide= prueba[[ 'submission_id',
                 'servicio',
                 'capacidad', 
                 'semana+anterior',
                 'promedio+ayer',
                 'observation_lat',
                 'observation_lon',
                 'Región',
                 'organizacionprincipal', 
                 'organizacionimplementadora',
                 
                ]].reset_index()

In [166]:
# Drop the rows that have no service value and renumber the index from 0.
wide1= wide.dropna(subset=['servicio']).reset_index(drop=True)



In [167]:
# NOTE(review): `wide1` holds none of the source columns named in `values`
# (serv_tipo, cuenta_con, ...), so every entry fails with a KeyError that the
# helper prints and skips -- see the recorded output. As written, this call
# leaves `wide1` unchanged and only rebinds output_df to it.
output_df = processMultValueColumns(wide1, values)
# Write the per-service table to "bd_serv_prem_wide.csv".
exportToFile(output_df, "csv", "bd_serv_prem_wide")

'serv_tipo'
'cuenta_con'
'cual_serv1'
'cual_ser_2'
'almacenamientoregistros'
'financiamiento'
'princ_reto'
'idioma_ent'
'medios_bri'
data export to bd_serv_prem_wide.csv
