In [1]:
import json
import logging

from minio import Minio
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F

In [2]:
# JSON string holding the MinIO connection settings. It is left empty here; the
# loading cell parses it with json.loads and, when that raises JSONDecodeError
# (as an empty string does), reads ../variables/minio_connection.json instead.
# NOTE(review): presumably overridden by an external runner that injects
# notebook parameters — confirm.
minio_connection = ""

In [3]:
# Basic logging setup: INFO level, lines formatted as "timestamp - level - message".
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

In [4]:
# Resolve the MinIO connection settings: prefer the JSON supplied inline in
# `minio_connection`; when it is not valid JSON, read the local JSON file.
try:
    minio_conn = json.loads(minio_connection)
except json.JSONDecodeError:
    with open("../variables/minio_connection.json", "r") as fallback_file:
        minio_conn = json.load(fallback_file)

In [5]:
class LazySparkSession:
    """Factory for a SparkSession configured for Delta Lake over an S3A endpoint.

    The constructor only stores the S3A credentials and endpoint; the Spark
    builder is not touched until `start()` is called.
    """

    # Maven coordinates passed, comma-joined, to `spark.jars.packages`.
    packages = [
        "io.delta:delta-spark_2.13:4.0.0",
        "org.apache.hadoop:hadoop-aws:3.4.0",
        "com.amazonaws:aws-java-sdk-bundle:1.12.787",
    ]

    def __init__(self, access_key, secret_key, endpoint):
        """Keep the S3A access key, secret key and endpoint for `start()`."""
        self._access_key, self._secret_key, self._endpoint = access_key, secret_key, endpoint

    def start(
        self,
        app_name: str = "Airflow Spark Delta Minio App",
        executor_memory: str = "1g",
        driver_memory: str = "1g",
        driver_maxresultsize: str = "1g",
        master_url: str = "local[*]",
    ):
        """Configure a Spark builder and return the result of `getOrCreate()`.

        Args:
            app_name: Application name given to the builder.
            executor_memory: Value for `spark.executor.memory`.
            driver_memory: Value for `spark.driver.memory`.
            driver_maxresultsize: Value for `spark.driver.maxResultSize`.
            master_url: Master URL given to the builder.

        Returns:
            Whatever `SparkSession.Builder().getOrCreate()` returns for the
            assembled configuration.
        """
        settings = {
            # S3A storage access
            "spark.hadoop.fs.s3a.access.key": self._access_key,
            "spark.hadoop.fs.s3a.secret.key": self._secret_key,
            "spark.hadoop.fs.s3a.endpoint": self._endpoint,
            "spark.hadoop.delta.enableFastS3AListFrom": "true",
            # Memory limits
            "spark.executor.memory": executor_memory,
            "spark.driver.memory": driver_memory,
            "spark.driver.maxResultSize": driver_maxresultsize,
            # Delta Lake SQL extension and catalog
            "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
            "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
            # Extra jars resolved when the session is created
            "spark.jars.packages": ",".join(self.packages),
        }

        builder = SparkSession.Builder().appName(app_name)
        for key, value in settings.items():
            builder = builder.config(key, value)

        return builder.master(master_url).getOrCreate()

In [6]:
# Build the Spark session from the loaded MinIO connection settings.
spark_factory = LazySparkSession(
    access_key=minio_conn.get("access_key"),
    secret_key=minio_conn.get("key"),
    endpoint=minio_conn.get("endpoint"),
)
spark = spark_factory.start()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/26 10:55:08 WARN Utils: Your hostname, DESKTOP-EDEM2DH, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/08/26 10:55:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/edcarlos/projeto-lakehouse/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/edcarlos/.ivy2.5.2/cache
The jars for the packages stored in: /home/edcarlos/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-632a8fbe-bb7b-4177-adc1-a61fb6c3d20a;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central

In [7]:
# Build a MinIO client from the loaded connection settings. On failure the
# error is logged and `s3_client` stays None so later cells can still run.
s3_client = None

try:
    endpoint_raw = minio_conn["endpoint"]
    access_key = minio_conn["access_key"]
    secret_key = minio_conn["key"]

    # The Minio client takes a bare host[:port] plus a separate `secure` flag,
    # so split the URL scheme off the configured endpoint. Splitting on "://"
    # (instead of prefix matching on "https") keeps a scheme-less host whose
    # name merely starts with "https" from being treated as TLS, and handles
    # upper-case schemes.
    scheme, separator, host_and_port = endpoint_raw.partition("://")
    if separator:
        endpoint_sem_http = host_and_port
        is_secure = scheme.lower() == "https"
    else:
        # No scheme configured: use the value as-is over plain HTTP.
        endpoint_sem_http = endpoint_raw
        is_secure = False

    s3_client = Minio(
        endpoint=endpoint_sem_http,
        access_key=access_key,
        secret_key=secret_key,
        secure=is_secure
    )

    # Log through the notebook's configured logger, like the other cells.
    logger.info("Cliente MinIO criado com sucesso.")

except KeyError as e:
    logger.error(f"Erro de configuração: chave ausente - {e}")
except Exception as e:
    logger.error(f"Erro ao inicializar o cliente MinIO: {e}")

2025-08-26 10:55:17,594 - INFO - Cliente MinIO criado com sucesso.


In [8]:
# File parameters: where the source CSVs live in the landing bucket.
bucket = "landing"
pasta = "rfb/regime_tributario/lucro_real/"
# `pasta` already ends with "/"; strip it before appending the glob so the
# path does not contain an empty "//" segment.
input_path = f"s3a://{bucket}/{pasta.rstrip('/')}/*.csv"

# Destination of the Delta table in the bronze bucket.
bucket_output = "bronze"
schema = "rfb"
subcategoria = "regime_tributario"
tabela = "lucro_real"

pasta_saida = f"{subcategoria}_{tabela}"
output_path = f"s3a://{bucket_output}/{schema}/{pasta_saida}/"

logger.info(f"Lendo arquivo de entrada: {input_path}")
logger.info(f"Saída será salva em: {output_path}")

2025-08-26 10:55:17,610 - INFO - Lendo arquivo de entrada: s3a://landing/rfb/regime_tributario/lucro_real//*.csv
2025-08-26 10:55:17,611 - INFO - Saída será salva em: s3a://bronze/rfb/regime_tributario_lucro_real/
2025-08-26 10:55:17,611 - INFO - Saída será salva em: s3a://bronze/rfb/regime_tributario_lucro_real/


In [9]:
# Read every matching CSV, treating the first line of each file as the header.
csv_reader = spark.read.format("csv").option("header", "true")
df = csv_reader.load(input_path)

25/08/26 10:55:18 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
25/08/26 10:55:19 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: s3a://landing/rfb/regime_tributario/lucro_real//*.csv.
java.io.FileNotFoundException: No such file or directory: s3a://landing/rfb/regime_tributario/lucro_real/*.csv
	at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:4071)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3922)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$getFileStatus$26(S3AFileSystem.java:3899)
	at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.invokeTrackingDuration(IOStatist

In [10]:
def padronizar_texto(df: DataFrame) -> DataFrame:
    """Normalize every string column by trimming whitespace and lower-casing.

    Args:
        df: Input DataFrame; columns whose dtype is not 'string' are left as is.

    Returns:
        A DataFrame with the same columns, where each string column has been
        passed through trim() and lower().
    """
    # Build all replacements first and apply them in a single projection:
    # calling withColumn once per column in a loop grows the query plan.
    colunas_texto = {
        coluna: F.lower(F.trim(F.col(coluna)))
        for coluna, tipo in df.dtypes
        if tipo == "string"
    }
    if not colunas_texto:
        return df
    return df.withColumns(colunas_texto)

In [11]:
def padronizar_numeros_e_codigos(df: DataFrame) -> DataFrame:
    """Normalize numeric columns and CNPJ code columns.

    - 'int' / 'bigint' columns are cast to 'int'.
    - 'double' / 'float' columns are cast to 'double'.
    - The columns 'cnpj' and 'cnpj_da_scp', when present, become strings with
      every non-digit character removed, left-padded with '0' to 14 characters.

    Args:
        df: Input DataFrame.

    Returns:
        A DataFrame with the same columns and the conversions above applied.
    """
    colunas_cnpj = [c for c in ("cnpj", "cnpj_da_scp") if c in df.columns]

    conversoes = {}
    for coluna, tipo in df.dtypes:
        # CNPJ columns are handled as strings below. Casting them to a 32-bit
        # int first would overflow, since a 14-digit value does not fit.
        if coluna in colunas_cnpj:
            continue
        if tipo in ("int", "bigint"):
            # NOTE(review): bigint -> int narrows the type; values outside the
            # 32-bit range will not survive this cast — confirm this is intended.
            conversoes[coluna] = F.col(coluna).cast("int")
        elif tipo in ("double", "float"):
            conversoes[coluna] = F.col(coluna).cast("double")

    # Generic CNPJ normalization: keep digits only, then left-pad to 14.
    for c in colunas_cnpj:
        somente_digitos = F.regexp_replace(F.col(c).cast("string"), r"\D", "")
        conversoes[c] = F.lpad(somente_digitos, 14, "0")

    # Apply everything in one projection instead of one withColumn per column.
    if not conversoes:
        return df
    return df.withColumns(conversoes)

In [12]:
# Save as Delta in the bronze layer, overwriting existing data and schema.
(
    df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(output_path)
)
logger.info(f"Arquivo .csv processado e salvo em Delta no caminho: {output_path}")

# Verify the write by counting the rows of the Delta table that was just saved,
# instead of re-scanning the source CSV files through `df`.
count = spark.read.format("delta").load(output_path).count()
logger.info(f"Total de linhas: {count}")

25/08/26 10:55:23 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/08/26 10:55:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/08/26 10:55:29 WARN S3ABlockOutputStream: Application invoked the Syncable API against stream writing to rfb/regime_tributario_lucro_real/part-00002-1ef954af-042d-4028-9a73-8fe9eaa6a737-c000.snappy.parquet. This is Unsupported
2025-08-26 10:55:32,209 - INFO - Arquivo .csv processado e salvo em Delta no caminho: s3a://bronze/rfb/regime_tributario_lucro_real/
2025-08-26 10:55:32,690 - INFO - Total de linhas: 1722534


In [13]:
# Preview the first 20 rows without truncating cell values, then stop Spark.
df.show(n=20, truncate=False)
spark.stop()

+----+------------------+-----------+-------------------+---------------------------+
|ano |cnpj              |cnpj_da_scp|forma_de_tributacao|quantidade_de_escrituracoes|
+----+------------------+-----------+-------------------+---------------------------+
|2023|00.000.000/0001-91|0          |LUCRO REAL         |1                          |
|2023|00.000.028/0001-29|0          |LUCRO REAL         |1                          |
|2023|00.000.208/0001-00|0          |LUCRO REAL         |1                          |
|2023|00.000.410/0001-32|0          |LUCRO REAL         |1                          |
|2023|00.000.424/0001-56|0          |LUCRO REAL         |1                          |
|2023|00.000.776/0001-01|0          |LUCRO REAL         |1                          |
|2023|00.000.802/0001-00|0          |LUCRO REAL         |1                          |
|2023|00.000.885/0001-29|0          |LUCRO REAL         |1                          |
|2023|00.000.917/0001-96|0          |LUCRO REAL       