In [1]:
import aiohttp
import asyncio
import os
import json
import logging
from pathlib import Path
import logging

from minio import Minio
import shutil

In [2]:
# MinIO connection settings as a JSON string. Left empty here, which is not
# valid JSON, so the loading cell falls back to ../variables/minio_connection.json.
# NOTE(review): presumably overridden by the job runner as a notebook parameter — confirm.
minio_connection = ""

In [3]:
# Basic logging setup: INFO level, records rendered as "timestamp - level - message".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Logger named after the current module; used by the download/upload cells.
logger = logging.getLogger(__name__)

In [4]:
# Load the connection settings: use the JSON string in `minio_connection` when it
# parses; otherwise (e.g. it was left empty) read them from the local JSON file.
try:
    minio_conn = json.loads(minio_connection)
except json.JSONDecodeError:
    with open('../variables/minio_connection.json', "r") as conn_file:
        minio_conn = json.load(conn_file)

In [5]:
# Build the MinIO client from the loaded connection settings.
# On any failure the error is logged and `s3_client` stays None.
s3_client = None

try:
    endpoint_raw = minio_conn["endpoint"]
    access_key = minio_conn["access_key"]
    secret_key = minio_conn["key"]

    # Minio() takes a bare "host[:port]" plus a `secure` flag, so split off the
    # scheme. Only a real "http://" / "https://" prefix is recognised: a host
    # name that merely starts with "https" is not treated as TLS, and scheme
    # text is never stripped from the middle of the value.
    endpoint_lower = endpoint_raw.lower()
    if endpoint_lower.startswith("https://"):
        is_secure = True
        endpoint_sem_http = endpoint_raw[len("https://"):]
    elif endpoint_lower.startswith("http://"):
        is_secure = False
        endpoint_sem_http = endpoint_raw[len("http://"):]
    else:
        is_secure = False
        endpoint_sem_http = endpoint_raw

    s3_client = Minio(
        endpoint=endpoint_sem_http,
        access_key=access_key,
        secret_key=secret_key,
        secure=is_secure
    )

    # Use the notebook's module logger, like the other cells do.
    logger.info("Cliente MinIO criado com sucesso.")

except KeyError as e:
    # A required key ("endpoint", "access_key" or "key") is missing from the settings.
    logger.error(f"Erro de configuração: chave ausente - {e}")
except Exception as e:
    logger.error(f"Erro ao inicializar o cliente MinIO: {e}")

2025-08-29 14:23:20,629 - INFO - Cliente MinIO criado com sucesso.


In [6]:
# Source files to fetch; every entry is downloaded and then uploaded by main().
urls = [
    "https://balanca.economia.gov.br/balanca/bd/tabelas/NCM_SH.csv",
]

In [7]:
# Concurrency cap for simultaneous downloads (None = no limit).
MAX_CONCURRENT_DOWNLOADS = 3

# Shared semaphore enforcing the cap; None disables throttling in fetch().
if MAX_CONCURRENT_DOWNLOADS:
    sem = asyncio.Semaphore(MAX_CONCURRENT_DOWNLOADS)
else:
    sem = None

In [8]:
# Local scratch folder for downloaded files; main() removes it when finished.
download_dir = "download"

# Upload target: bucket name and the object-name prefix prepended to each file name.
# NOTE(review): the prefix repeats "produtos" ("produtos_produtos_sh") — confirm this is intended.
bucket = "landing"
caminho_destino = "mdic/produtos_produtos_sh/"

In [9]:
async def _download(session: aiohttp.ClientSession, url: str, retries: int):
    """Download ``url`` into ``download_dir``, retrying when an attempt raises.

    Args:
        session: Open aiohttp session used to issue the GET request.
        url: Address of the file; the text after the last "/" is used as the
            local file name.
        retries: Maximum number of attempts.

    Returns:
        The file name (relative to ``download_dir``) on success.
        ``None`` when the response is rejected — non-200 status, a Content-Type
        that is neither CSV nor octet-stream, or a missing/zero Content-Length.
        Rejected responses are not retried.
        ``False`` when every attempt raised an exception.
    """
    # Loop-invariant values: same 600 s budget and same target path on every attempt.
    timeout = aiohttp.ClientTimeout(total=600)
    filename = url.split("/")[-1]
    path_file = os.path.join(download_dir, filename)

    for tentativa in range(1, retries + 1):
        file_opened = False
        try:
            async with session.get(url, timeout=timeout) as response:
                status = response.status
                if status != 200:
                    logger.error(f"Url: {url}, Status: {status}")
                    return None

                content_type = response.headers.get("Content-Type", "")
                # Media types are case-insensitive, so compare in lower case.
                content_type_lower = content_type.lower()
                if (
                    "text/csv" not in content_type_lower
                    and "application/octet-stream" not in content_type_lower
                ):
                    logger.error(f"Url: {url}, Tipo inesperado: {content_type}")
                    return None

                content_length = response.headers.get("Content-Length")
                if content_length is None or int(content_length) == 0:
                    logger.error(f"Url: {url}, Tamanho indefinido ou zero")
                    return None

                # Stream the body to disk in 1 MiB chunks.
                with open(path_file, "wb") as f:
                    file_opened = True
                    async for chunk in response.content.iter_chunked(1024 * 1024):
                        f.write(chunk)

                logger.info(f"Baixado: {filename}")
                return filename

        except Exception as e:
            logger.warning(f"Tentativa {tentativa}/{retries} falhou para {url}: {e}")
            if file_opened:
                # Remove the truncated file so a failed attempt never leaves
                # partial data behind under the final name.
                try:
                    os.remove(path_file)
                except OSError:
                    pass
            # Back off only when another attempt follows; sleeping after the
            # last one would just delay the failure result.
            if tentativa < retries:
                await asyncio.sleep(5)
    return False

In [10]:
async def fetch(session: aiohttp.ClientSession, url: str):
    """Run one download, throttled by the shared semaphore when one is configured.

    Returns whatever ``_download`` returns for ``url`` (always called with
    ``retries=3``).
    """
    # No semaphore configured: download without any concurrency limit.
    if not sem:
        return await _download(session, url, retries=3)

    # Otherwise hold a semaphore slot for the duration of the download.
    async with sem:
        return await _download(session, url, retries=3)

In [11]:
async def main():
    """Download every entry of ``urls``, upload the files to MinIO, then clean up.

    Steps:
        1. Create ``download_dir`` and download all URLs concurrently via ``fetch``.
        2. Upload each successfully downloaded file to ``bucket`` under
           ``caminho_destino``.
        3. Remove ``download_dir`` — also when a download or upload raised.

    Raises:
        RuntimeError: if the MinIO client was never created (``s3_client`` is None).
    """
    if s3_client is None:
        # Fail fast with a clear message instead of an AttributeError raised
        # only after all the downloads have finished.
        raise RuntimeError(
            "Cliente MinIO não inicializado; verifique a configuração de conexão."
        )

    os.makedirs(download_dir, exist_ok=True)
    logger.info("Iniciando downloads...")

    try:
        async with aiohttp.ClientSession() as session:
            tasks = [fetch(session, url) for url in urls]
            results = await asyncio.gather(*tasks)

        # fetch() yields a file name on success and a falsy value on failure.
        arquivos_csv = [Path(download_dir) / r for r in results if r]
        logger.info(f"Arquivos baixados: {len(arquivos_csv)}")

        falhas = len(results) - len(arquivos_csv)
        if falhas:
            # Surface failures here so they are not dropped silently.
            logger.warning(f"Downloads com falha: {falhas}")

        # Upload to MinIO
        for arquivo in arquivos_csv:
            caminho_relativo = arquivo.relative_to(download_dir)
            destino = f"{caminho_destino}{caminho_relativo.as_posix()}"
            s3_client.fput_object(
                bucket_name=bucket,
                object_name=destino,
                file_path=str(arquivo)
            )
            logger.info(f"Arquivo enviado para MinIO: {destino}")
    finally:
        # Folder cleanup runs even when a download or upload failed, so
        # temporary files never accumulate between runs.
        try:
            shutil.rmtree(download_dir)
            logger.info(f"Pasta '{download_dir}' removida com sucesso.")
        except Exception as e:
            logger.error(f"Erro ao remover pasta '{download_dir}': {e}")

# Top-level `await` relies on the notebook kernel's already-running event loop;
# in a plain Python script this would be `asyncio.run(main())`.
await main()

2025-08-29 14:23:20,692 - INFO - Iniciando downloads...
2025-08-29 14:23:22,182 - INFO - Baixado: NCM_SH.csv
2025-08-29 14:23:22,184 - INFO - Arquivos baixados: 1
2025-08-29 14:23:22,260 - INFO - Arquivo enviado para MinIO: mdic/produtos_produtos_sh/NCM_SH.csv
2025-08-29 14:23:22,265 - INFO - Pasta 'download' removida com sucesso.
