<br>

# Introdução

In [None]:
# Optional one-time environment setup: uncomment the lines below to install or
# upgrade the packages this notebook relies on.
#!pip3 install dask --upgrade
#!pip3 install dask-labextension  --upgrade
#!pip3 install pyarrow --upgrade        # Required for Parquet support
#!pip3 install traquitanas --upgrade
#!jupyter labextension install dask-labextension

In [None]:
import os
import sys
import time
import ctypes
import datetime
import numpy as np
import pandas as pd
import pyarrow.parquet as pq

In [None]:
import dask.dataframe as dd
from dask import compute
from dask.delayed import delayed
from dask.distributed import Client, LocalCluster
from dask.distributed import wait, progress
from dask.diagnostics import ProgressBar

In [None]:
import dask
import dask.distributed  # populate config with distributed defaults
# Display the 'distributed.client' section of the active Dask configuration
# (shown as the cell output because it is the last expression).
dask.config.get('distributed.client')

In [None]:
# Make the project's local packages importable: '../src' is resolved relative
# to the directory the notebook is running from.
mod_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))

# Guard the append so that re-running this cell does not keep adding the same
# entry to sys.path.
if mod_path not in sys.path:
    sys.path.append(mod_path)

# NOTE(review): the star import hides which names come from sisagua.ibge;
# consider importing the helpers that are actually used by name.
from sisagua.ibge import *

In [None]:
from paths import *

<br>

## Client

In [None]:
import multiprocessing as mp
from dask.distributed import Client, LocalCluster

In [None]:
#import close_process
#close_process.process()

In [None]:
# Start a local Dask cluster made of separate worker processes.
cluster = LocalCluster(
    # Roughly 90% of the available CPUs, but never fewer than one worker:
    # int(0.9 * 1) would otherwise be 0 on a single-core machine and leave the
    # cluster with nothing to run tasks on.
    n_workers=max(1, int(0.9 * mp.cpu_count())),
    threads_per_worker=4,
    memory_limit='4GB',
    processes=True,
    # Environment variable handed to the worker processes.
    # NOTE(review): presumably set so glibc returns freed memory to the OS
    # sooner — confirm against the Dask worker-memory documentation.
    env={'MALLOC_TRIM_THRESHOLD_': '65536'},
)

# Connect a client to the cluster and display its summary as the cell output.
client = Client(cluster)
client

<br>

## Parameters

In [None]:
# Parameters
# Code of the municipality to export, kept as a string. To process another
# city, comment out the active line and uncomment one of the alternatives.
cod_ibge = '3505203' # Bariri
#cod_ibge = '3501608' # Americana
#cod_ibge = '3548906' # São Carlos
#cod_ibge = '3526902' # Limeira

In [None]:
# Look up the state record for the selected municipality and keep its
# abbreviation ('sigla'); it is used below as the value of the 'Uf' partition.
# NOTE(review): find_states is defined outside this notebook — confirm it
# returns a mapping that always contains 'sigla'.
estado_d = find_states(cod_ibge)
estado = estado_d['sigla']

# NOTE(review): adjust_id_ibge is also defined elsewhere; presumably it converts
# the code into the form used by the 'Código Ibge' partition — confirm.
cod_ibge_ajustado = adjust_id_ibge(cod_ibge)

<br>

## Dask Parquet Files

In [None]:
# Parameters
# Dataset directories to export, grouped by the sub-folder that holds them.
# Names ending in '_*' carry a wildcard suffix that is stripped again when the
# output file name is derived.
datasets_by_group = {
    # Registration
    'cadastro': [
        'cadastro_pontos_captacao',
        'cadastro_populacao_abastecida',
        'cadastro_tratamento_de_agua',
    ],
    # Control
    'controle': [
        'controle_mensal_amostras_fora_padrao',
        'controle_mensal_demais_parametros',
        'controle_mensal_infraestrutura_operacionais',
        'controle_mensal_parametros_basicos_*',
        'controle_semestral_*',
    ],
    # Surveillance
    'vigilancia': [
        'vigilancia_cianobacterias_cianotoxinas',
        'vigilancia_demais_parametros',
        'vigilancia_parametros_basicos_*',
    ],
}

# Flatten the mapping into full paths, preserving the order declared above.
filenames = [
    os.path.join(input_path_parquet_partitioned, group, dataset)
    for group, datasets in datasets_by_group.items()
    for dataset in datasets
]

In [None]:
# Export every dataset in `filenames`, restricted to the selected municipality,
# to one Excel workbook per dataset under
# <output_path>/<cod_ibge>/dados brutos/<subdir>/.

# The partition sub-path and the filters depend only on the chosen
# municipality, so they are built once rather than on every iteration.
filter_path = os.path.join(
    'Uf={}'.format(estado),
    'Código Ibge={}/*.parquet'.format(cod_ibge_ajustado),
)
filters = [[('Uf', '==', estado), ('Código Ibge', '==', int(cod_ibge_ajustado))]]

for filename in filenames:
    # Derive the output name (wildcard suffix removed) and the group folder.
    basename = os.path.basename(filename).replace('_*', '')
    subdir = os.path.basename(os.path.dirname(filename))
    print('Na pasta "{}", processando arquivo "{}"'.format(subdir, basename))

    # Set and create output paths
    output_path_city = os.path.join(
        output_path,
        '{}'.format(cod_ibge),
        'dados brutos',
        subdir,
    )
    os.makedirs(output_path_city, exist_ok=True)

    # Point directly at the municipality's partition files; a separate name is
    # used so the loop variable is not overwritten mid-iteration.
    parquet_glob = os.path.join(filename, filter_path)

    # Read lazily with Dask, then materialise the result as a pandas DataFrame.
    df = dd.read_parquet(
        parquet_glob,
        filters=filters,
    ).compute()

    # Write to Excel
    df.to_excel(
        os.path.join(output_path_city, '{}.xlsx'.format(basename)),
        index=False,
    )

<br>

## End

In [None]:
# Disconnect the client first, while its scheduler is still running, and only
# then shut the cluster down. Closing in this order removes the need to sleep
# between the two calls.
client.close()
cluster.close()