In [1]:
import json
import logging
import shutil
import zipfile
from pathlib import Path
from typing import List
from urllib.parse import quote, unquote

import requests
from minio import Minio

In [2]:
# MinIO connection settings as a JSON string. It is left empty here, so the
# json.loads() call in the loading cell fails and the settings are read from
# ../variables/minio_connection.json instead.
# NOTE(review): presumably overridden by a notebook-parameter injector - confirm.
minio_connection = ""

In [3]:
# Basic logging configuration: INFO and above, each record rendered as
# "timestamp - level - message".
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)

# Module-level logger used by the rest of the notebook.
logger = logging.getLogger(__name__)

In [4]:
# Load the MinIO connection settings.
# The `minio_connection` string is tried first; when it is not valid JSON
# (e.g. the empty default) or is not a string at all (e.g. None), the settings
# are read from the local JSON file instead.
try:
    minio_conn = json.loads(minio_connection)
except (json.JSONDecodeError, TypeError):
    # JSON files are UTF-8 by specification, so do not rely on the locale default.
    with open("../variables/minio_connection.json", "r", encoding="utf-8") as minio_connection_file:
        minio_conn = json.load(minio_connection_file)

In [5]:
# Build the MinIO client from the loaded connection settings.
# On any failure the error is logged and `s3_client` stays None.
s3_client = None

try:
    endpoint_raw = minio_conn["endpoint"]
    access_key = minio_conn["access_key"]
    secret_key = minio_conn["key"]

    # The client receives the host and the TLS flag separately, so split an
    # optional "http://" / "https://" prefix off the configured endpoint.
    # Matching is anchored on "://" and case-insensitive, so a host that merely
    # starts with "https" is not mistaken for a secure endpoint.
    endpoint_scheme, endpoint_sep, endpoint_rest = endpoint_raw.partition("://")
    if endpoint_sep and endpoint_scheme.lower() in ("http", "https"):
        endpoint_sem_http = endpoint_rest
        is_secure = endpoint_scheme.lower() == "https"
    else:
        # No recognised scheme: pass the value through and default to plain HTTP.
        endpoint_sem_http = endpoint_raw
        is_secure = False

    s3_client = Minio(
        endpoint=endpoint_sem_http,
        access_key=access_key,
        secret_key=secret_key,
        secure=is_secure
    )

    logger.info("Cliente MinIO criado com sucesso.")

except KeyError as e:
    # A required key is missing from the connection settings.
    logger.error(f"Erro de configuração: chave ausente - {e}")
except Exception as e:
    logger.error(f"Erro ao inicializar o cliente MinIO: {e}")

2025-08-26 10:19:59,784 - INFO - Cliente MinIO criado com sucesso.


In [6]:
# Run configuration: source directory listing and target identifiers.
# NOTE(review): `schema` and `table` are not referenced by the cells visible in
# this notebook - confirm whether they are still needed.
url = "https://arquivos.receitafederal.gov.br/dados/cnpj/regime_tributario/"
bucket = "landing"
schema = "rfb"
table = "regime_tributario_lucro_presumido"

# Archive names, relative to `url`, to be downloaded.
archives = ["Lucro Presumido.zip"]

# Local staging folder for the downloaded and extracted files.
download_path = Path("download")
download_path.mkdir(exist_ok=True, parents=True)
logger.info(f"Pasta de download criada: {download_path}")

2025-08-26 10:19:59,797 - INFO - Pasta de download criada: download


In [7]:
def _extract_csv_members(zip_path: Path, dest_dir: Path) -> int:
    """Copy every ``.csv`` member of ``zip_path`` flat into ``dest_dir``.

    Each member is streamed straight to ``dest_dir/<base name>``, so no
    intermediate sub-folders are created and an existing file with the same
    name is overwritten.

    Returns:
        The number of members written.

    Raises:
        zipfile.BadZipFile: if ``zip_path`` is not a readable ZIP archive.
    """
    extracted = 0
    with zipfile.ZipFile(zip_path, "r") as zf:
        for member in zf.infolist():
            if member.is_dir() or not member.filename.endswith(".csv"):
                continue
            target_path = dest_dir / Path(member.filename).name
            with zf.open(member) as src, open(target_path, "wb") as dst:
                shutil.copyfileobj(src, dst)
            extracted += 1
    return extracted


def download_and_extract(url: str, download_path: Path, retries: int = 3) -> None:
    """Download the ZIP archive at ``url`` and extract its CSV members.

    The archive is saved into ``download_path`` and every ``.csv`` member is
    written flat into the same folder. Outcomes are reported through the
    notebook ``logger``; nothing is returned or raised to the caller.

    Args:
        url: Full (already percent-encoded) URL of the ZIP archive.
        download_path: Existing folder that receives the archive and the CSVs.
        retries: Maximum number of attempts. An attempt is retried when the
            request, the download or the extraction raises, including a
            corrupt archive. An unexpected status code, content type or
            content length is logged and aborts immediately without retrying.
    """
    # The last URL segment is percent-encoded (e.g. "%20"); decode it for the
    # local file name and keep only the base name so that an encoded "/" cannot
    # point outside `download_path`.
    filename = Path(unquote(url.split("/")[-1])).name
    path_file = download_path / filename

    for tentativa in range(1, retries + 1):
        try:
            # The context manager releases the streamed connection on every
            # exit path, including the early returns below.
            with requests.get(url, stream=True, timeout=600) as response:
                status = response.status_code

                if status != 200:
                    logger.error(f"Url: {url}, Status: {status}, Message: Status inesperado")
                    return

                content_type = response.headers.get("Content-Type")
                # Compare only the media type; parameters such as
                # "; charset=..." do not change what the payload is.
                media_type = (content_type or "").split(";", 1)[0].strip().lower()
                if media_type != "application/zip":
                    logger.error(f"Url: {url}, Status: {status}, Message: Tipo inesperado: {content_type}")
                    return

                content_length = response.headers.get("Content-Length")
                if content_length is None or int(content_length) == 0:
                    logger.error(f"Url: {url}, Status: {status}, Message: Tamanho indefinido ou inesperado")
                    return

                # Download in 1 MiB chunks to keep memory usage bounded.
                with open(path_file, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            logger.info(f"Baixado: {filename}")

            # Extraction
            try:
                _extract_csv_members(path_file, download_path)
            except zipfile.BadZipFile as bz:
                logger.error(f"Url: {url}, Error: {bz}, Message: Erro ao tentar descompactar o arquivo: {filename}")
                # A corrupt archive usually means a truncated download: retry.
                continue

            logger.info(f"Extraído: {filename}")
            return  # success

        except Exception as e:
            logger.warning(f"Tentativa {tentativa}/{retries} falhou para {url}: {e}")
    logger.error(f"Falha ao baixar {url} após {retries} tentativas")

In [8]:
# Download and extract each configured archive. The archive name is
# percent-encoded with quote() before being appended to the base URL
# (the space in "Lucro Presumido.zip" becomes "%20").
for arquivo in archives:
    file_url = url + quote(arquivo) 
    download_and_extract(file_url, download_path)

2025-08-26 10:20:24,625 - INFO - Baixado: Lucro%20Presumido.zip
2025-08-26 10:20:25,535 - INFO - Extraído: Lucro%20Presumido.zip


In [9]:
# Upload every extracted CSV to MinIO.
# The client cell leaves `s3_client` as None when its creation fails, so stop
# here with a clear message instead of an AttributeError on None.
if s3_client is None:
    raise RuntimeError(
        "Cliente MinIO não inicializado; verifique a configuração de conexão antes do upload."
    )

arquivos_para_uploud = list(download_path.rglob("*.csv"))

if not arquivos_para_uploud:
    # Nothing was downloaded/extracted; make that visible in the log.
    logger.warning(f"Nenhum arquivo CSV encontrado em '{download_path}' para upload.")

for arquivo in arquivos_para_uploud:
    # Keep the path relative to the staging folder as the object-name suffix.
    caminho_relativo = arquivo.relative_to(download_path)
    destino = f"rfb/regime_tributario/lucro_presumido/{caminho_relativo.as_posix()}"
    s3_client.fput_object(
        bucket_name=bucket,  # configured alongside `url` ("landing")
        object_name=destino,
        file_path=str(arquivo)
    )
    logger.info(f"Enviado: {arquivo} -> {bucket}/{destino}")

In [10]:
# Remove the local staging folder once the upload step has run.
# Any failure is logged rather than raised.
downloads_path = Path("download")
try:
    if not downloads_path.exists():
        logger.warning(f"Pasta '{downloads_path}' não encontrada para remoção.")
    else:
        shutil.rmtree(downloads_path)
        logger.info(f"Pasta '{downloads_path}' removida com sucesso após upload.")
except Exception as e:
    logger.error(f"Erro ao tentar remover '{downloads_path}': {e}")

2025-08-26 10:20:27,601 - INFO - Pasta 'download' removida com sucesso após upload.
