## Ingestão de Dados: Meios de Pagamentos


Neste notebook vamos desenvolver toda a lógica de extração dos dados (disponibilizados na camada Landing, no formato original) para um formato padrão, tendo como destino a camada bronze. O formato padrão utilizado será o Delta.



Ajustei a versão do docker compose para criar uma imagem personalizada que já instala automaticamente as libraries abaixo:

! pip install -q s3fs # Necessário para integração pyspark-minio(data storagge)

! pip install -q boto3 # Provê funções mais configurações para integração com o data storage

! pip install -q catppuccin-jupyterlab # opcional- traz temas mais elegantes para o Jupyter Lab


In [14]:
# Import and Libraries

import requests
from datetime import datetime
import delta
import json
import os

import boto3
import s3fs

import logging

# Configuração do logger
logger = logging.getLogger("minio_logger")
logger.setLevel(logging.INFO)

# Configurando o formato do log
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)



# PySpark Libraries

import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as F
from pyspark.sql.functions import col, explode


In [2]:
# Variáveis Globais e de ambiente para o projeto.

os.environ["MINIO_KEY"] = "developer"
os.environ["MINIO_SECRET"] = "developer01"
os.environ["MINIO_ENDPOINT"] = "http://minio:9000"


# Paths Data Storage

bucket_name = "bank-databr"


root_path_dir = f"{bucket_name}"

landing_path_dir = f"{root_path_dir}/landing/bacen"
bronze_path_dir = f"{root_path_dir}/bronze"
silver_path_dir = f"{root_path_dir}/silver"


# Data de referência - Partition da tabela delta final

dt_partition = datetime.now().strftime("%Y-%M-%d")


### 01. Testando conexão e integração como o Storage (MinIO)

#### 01.1 - Criando Client para conexão com o Storage MinIO

In [3]:



s3_client = boto3.client(
    's3',
    endpoint_url = os.environ.get("MINIO_ENDPOINT"),
    aws_access_key_id = os.environ.get("MINIO_KEY"),
    aws_secret_access_key = os.environ.get("MINIO_SECRET")
)


#### 01.2 - Gravando Arquivo de Exemplo

In [6]:

class DataModel:
    """ Simplifica e Padroniza objetos a serem gravados no Storage MinIO S3 """

    def __init__(self, bucket_name: str, destination_file_path: str, file_content: str):
        self.bucket_name = bucket_name
        self.destination_file_path = destination_file_path
        self.file_content = file_content
        


# Arquivo de Exemplo para testes no Client MinIO
teste_data = DataModel(    
    bucket_name = bucket_name,
    destination_file_path = f"/landing/bacen/teste.json",
    file_content = json.dumps({"message": "hello world"})    
)



def write_file(s3_client: boto3.client, data_params: DataModel) -> None:
    """ Grava arquivo no Storage 
        Args:
            s3_client: Client de conexão/integração com o storage
            data_params: Objeto a ser gravado no storage padrão DataModel
    """

    logger.info("Gravando Arquivo")
    
    try:
        

        s3_client.put_object(
            Bucket = data_params.bucket_name, 
            Key = data_params.destination_file_path,
            Body = data_params.file_content
        )
    
        logger.info(f"\t* Arquivo {data_params.destination_file_path} gravado com sucesso")
    except Exception as e:
        logger.erro(f"\t* Erro ao tentar gravar arquivo {data_params.destination_file_path,}/{data_params.file_content}, error: {e}")


        
# Gravando um novo Arquivo no Storage
write_file(s3_client, teste_data)




2025-01-18 04:41:02,947 - INFO - Gravando Arquivo
2025-01-18 04:41:03,080 - INFO - 	* Arquivo /landing/bacen/teste.json gravado com sucesso


#### 01.3 - Listando todos os arquivos no Bucket informado

In [7]:


def list_files_in_bucket(s3_client:boto3.client, destination_file_path:str) -> list:
    """ Lista todos os arquivos no caminho indicado dentro do Storage 
        Args:
            s3_client: Client de integração/conexão com o storage 
            path: caminho a ser listado dentro do bucket
    """

    logger.info(f"Listando todos os arquivos em {destination_file_path}")

    bucket_name, *prefix_parts = destination_file_path.replace("s3://", "").split("/", 1)
    prefix = prefix_parts[0] if prefix_parts else ""
   
    try:
        paginator = s3_client.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=bucket_name, Prefix = prefix):
    
            if "Contents" in page:
                for obj in page['Contents']:
                    yield obj['Key'] # retorna o nome do arquivo
            else:
                log.warn("\t* Nenhum arquivo encontrado")
                
    except Exception as e:
        logger.error(f"Erro ao lista arquivos: {e}")


# Listando todos os arquivos na pasta raiz do Storage
print('\nListando todos os arquivos na Landing')
for file in list_files_in_bucket(s3_client, landing_path_dir):
    print(f"\t* {file}")





2025-01-18 04:41:12,162 - INFO - Listando todos os arquivos em bank-databr/landing/bacen



Listando todos os arquivos na Landing
	* landing/bacen/cartoes_trimestral/data_11_01_2025_22_02_56.json
	* landing/bacen/cartoes_trimestral/teste.json
	* landing/bacen/meios_pagamentos_mensal/data_11_01_2025_22_02_56.json
	* landing/bacen/meios_pagamentos_trimestral/data_11_01_2025_22_02_56.json
	* landing/bacen/teste.json


#### 01.4 - Deletando arquivo de Testes

In [8]:



def remove_files_in_bucket(s3_client:boto3.client, file_path:str, file_name:str) -> None:
    """ Remove arquivo, se existir, no folder indicado
        Args:
            s3_client: Client de integração/conexão com o storage
            file_path: Caminho do arquivo a ser deletado no bucket/folder indicado
            file_name: Nome do arquivo a ser deletado no bucket/folder indicado
    
    """
    logger.info(f"Removendo arquivo: {file_path}/{file_name}")

    try:
        s3_client.delete_object(Bucket=file_path, Key=file_name)
        logger.info(f"\t* Aquivo {file_path}/{file_name} deletado com sucesso!")

    except Exception as e:
        logger.error(f"\t* Erro ao tentar deletar arquivo: {file_path}/{file_name}, error: {e}")

# Deletando arquivo no Storage
remove_files_in_bucket(s3_client, bucket_name, "teste.json")



2025-01-18 04:41:17,249 - INFO - Removendo arquivo: bank-databr/teste.json
2025-01-18 04:41:17,302 - INFO - 	* Aquivo bank-databr/teste.json deletado com sucesso!


### 02. Criando e configurando Spark Session

In [15]:

from delta import configure_spark_with_delta_pip 

spark = SparkSession.builder \
                    .appName("MeiosDePagamentoBancoCentral") \
                    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.11.1026") \
                    .config("spark.hadoop.fs.s3a.endpoint", os.environ["MINIO_ENDPOINT"]) \
                    .config("spark.hadoop.fs.s3a.access.key", os.environ["MINIO_KEY"]) \
                    .config("spark.hadoop.fs.s3a.secret.key", os.environ["MINIO_SECRET"]) \
                    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
                    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
                    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
                    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
                    .enableHiveSupport() \
                    .getOrCreate()

spark = configure_spark_with_delta_pip(spark).getOrCreate()


PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.

### 03. From Landing to Bronze

Explodindo o schema original dos arquivos e convertendo para o formato Delta na Bronze.

In [9]:


# Pagamentos com Cartões (Landing to Bronze)

df_cartoes = spark.read \
                  .option("inferSchema", True) \
                  .json(landing_path_dir + "/cartoes_trimestral/data_11_01_2025_22_02_56.json")

df_cartoes = df_cartoes.withColumn('value_struct', explode(col("value"))) \
                       .select("value_struct.*")     

df_cartoes.show(n=3, truncate=True, vertical=True)

NameError: name 'spark' is not defined

In [17]:
# Todos os Meios de Pagamentos visão Mensal (Landing to Bronze)

df_pagamentos_mensal_raw = spark.read \
                            .option("inferSchema", True) \
                            .json(landing_path_dir + "/meios_pagamentos_mensal/data_11_01_2025_22_02_56.json")

df_pagamentos_mensal_raw.printSchema()

# Explodindo o valor da coluna "value" em múltiplas colunas

df_pagamentos_mensal_transformed = df_pagamentos_mensal_raw.withColumn('value_struct', explode(col("value"))) \
                                                           .select("value_struct.*") 



df_pagamentos_mensal_transformed.show(n=3, vertical=True)


root
 |-- @odata.context: string (nullable = true)
 |-- value: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- AnoMes: string (nullable = true)
 |    |    |-- quantidadeBoleto: double (nullable = true)
 |    |    |-- quantidadeCheque: double (nullable = true)
 |    |    |-- quantidadeDOC: double (nullable = true)
 |    |    |-- quantidadePix: double (nullable = true)
 |    |    |-- quantidadeTEC: double (nullable = true)
 |    |    |-- quantidadeTED: double (nullable = true)
 |    |    |-- valorBoleto: double (nullable = true)
 |    |    |-- valorCheque: double (nullable = true)
 |    |    |-- valorDOC: double (nullable = true)
 |    |    |-- valorPix: double (nullable = true)
 |    |    |-- valorTEC: double (nullable = true)
 |    |    |-- valorTED: double (nullable = true)

-RECORD 0----------------------
 AnoMes           | 202411     
 quantidadeBoleto | 332501.15  
 quantidadeCheque | 9018.91    
 quantidadeDOC    | 0.0        
 quantidadePix

In [89]:

# Todos os Meios de Pagamentos visão Trimestral (Landing to Bronze)

data_source_file_path = "s3a://" + landing_path_dir + "/meios_pagamentos_trimestral/data_11_01_2025_22_02_56.json"
print(data_source_file_path)

df_pagamentos_trimestral_raw = spark.read \
                                    .option("inferSchema", True) \
                                   .json(data_source_file_path)

# Schema Original do arquivo origem

df_pagamentos_trimestral_raw.printSchema()


# Explodindo o valor da coluna "value" em múltiplas colunas

df_pagamentos_trimestral_transformed = df_pagamentos_trimestral_raw.withColumn('value_struct', explode(col("value"))) \
                                                                       .select("value_struct.*") 

destionation_table_path = f"{bronze_path_dir}/b_pagamentos_trimestrais_bc"

print(destionation_table_path)
# df_pagamentos_trimestral_transformed.write \
#                                     .format("delta") \
#                                     .mode("overwrite") \
#                                     .option("replaceWhere", f"partition = {dt_partition}") \
#                                     .partitionBy("dt_partition") \
#                                     .save(destionation_table_path)

                                 


df_pagamentos_trimestral_transformed.show(n=3, vertical=True)


s3a://bank-databr/landing/bacen/meios_pagamentos_trimestral/data_11_01_2025_22_02_56.json


Py4JJavaError: An error occurred while calling o89.json.
: org.apache.spark.SparkException: Cannot find catalog plugin class for catalog 'spark_catalog': org.apache.spark.sql.delta.catalog.DeltaCatalog.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.catalogPluginClassNotFoundForCatalogError(QueryExecutionErrors.scala:1925)
	at org.apache.spark.sql.connector.catalog.Catalogs$.load(Catalogs.scala:70)
	at org.apache.spark.sql.connector.catalog.CatalogManager.loadV2SessionCatalog(CatalogManager.scala:67)
	at org.apache.spark.sql.connector.catalog.CatalogManager.$anonfun$v2SessionCatalog$2(CatalogManager.scala:86)
	at scala.collection.mutable.HashMap.getOrElseUpdate(HashMap.scala:86)
	at org.apache.spark.sql.connector.catalog.CatalogManager.$anonfun$v2SessionCatalog$1(CatalogManager.scala:86)
	at scala.Option.map(Option.scala:230)
	at org.apache.spark.sql.connector.catalog.CatalogManager.v2SessionCatalog(CatalogManager.scala:85)
	at org.apache.spark.sql.connector.catalog.CatalogManager.catalog(CatalogManager.scala:51)
	at org.apache.spark.sql.connector.catalog.CatalogManager.currentCatalog(CatalogManager.scala:122)
	at org.apache.spark.sql.connector.catalog.CatalogManager.currentNamespace(CatalogManager.scala:93)
	at org.apache.spark.sql.catalyst.optimizer.ReplaceCurrentLike.apply(finishAnalysis.scala:110)
	at org.apache.spark.sql.catalyst.optimizer.ReplaceCurrentLike.apply(finishAnalysis.scala:107)
	at org.apache.spark.sql.catalyst.optimizer.Optimizer$FinishAnalysis$.$anonfun$apply$1(Optimizer.scala:293)
	at scala.collection.LinearSeqOptimized.foldLeft(LinearSeqOptimized.scala:126)
	at scala.collection.LinearSeqOptimized.foldLeft$(LinearSeqOptimized.scala:122)
	at scala.collection.immutable.List.foldLeft(List.scala:91)
	at org.apache.spark.sql.catalyst.optimizer.Optimizer$FinishAnalysis$.apply(Optimizer.scala:293)
	at org.apache.spark.sql.catalyst.optimizer.Optimizer$FinishAnalysis$.apply(Optimizer.scala:275)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:222)
	at scala.collection.IndexedSeqOptimized.foldLeft(IndexedSeqOptimized.scala:60)
	at scala.collection.IndexedSeqOptimized.foldLeft$(IndexedSeqOptimized.scala:68)
	at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:38)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:219)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:211)
	at scala.collection.immutable.List.foreach(List.scala:431)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:211)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:182)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:89)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:182)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$optimizedPlan$1(QueryExecution.scala:152)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:138)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:219)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:219)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:218)
	at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:148)
	at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:144)
	at org.apache.spark.sql.execution.QueryExecution.assertOptimized(QueryExecution.scala:162)
	at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:182)
	at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:179)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:207)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:206)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$.inferFromDataset(JsonDataSource.scala:103)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$.infer(JsonDataSource.scala:98)
	at org.apache.spark.sql.execution.datasources.json.JsonDataSource.inferSchema(JsonDataSource.scala:64)
	at org.apache.spark.sql.execution.datasources.json.JsonFileFormat.inferSchema(JsonFileFormat.scala:59)
	at org.apache.spark.sql.execution.datasources.DataSource.$anonfun$getOrInferFileFormatSchema$11(DataSource.scala:208)
	at scala.Option.orElse(Option.scala:447)
	at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:205)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:407)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:362)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.delta.catalog.DeltaCatalog
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:592)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
	at org.apache.spark.sql.connector.catalog.Catalogs$.load(Catalogs.scala:60)
	... 67 more
