## Ingestão de Dados: Meios de Pagamentos

Vamos desenvolver toda a lógica da extração e consumo dos dados disponibiizados na camada Landing


Vamos desenvolver toda a lógica da extração dos dados disponibilizados na camada Landing, manipulando e gravando no Delta Lake entre as camadas Bronze, Silver e Gold.

In [1]:

# IMPORTS AND LIBRARIES
import os
from datetime import datetime
import logging

# Configuração do logger
logger = logging.getLogger("minio_logger")
logger.setLevel(logging.INFO)


# Configurando o formato do log
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)


# PySpark Libraries
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as F
from pyspark.sql.functions import col, explode, lit


In [2]:
# Variáveis Globais e de ambiente para o projeto.

os.environ["MINIO_KEY"] = "developer"
os.environ["MINIO_SECRET"] = "developer01"
os.environ["MINIO_ENDPOINT"] = "http://minio:9000"


# Paths Data Storage

bucket_name = "bank-databr"


root_path_dir = f"{bucket_name}"

landing_path_dir = f"{root_path_dir}/landing/bacen"
bronze_path_dir = f"{root_path_dir}/bronze"
silver_path_dir = f"{root_path_dir}/silver"


# Data de referência - Partition da tabela delta final

dt_partition = datetime.now().strftime("%Y-%M-%d")


### 02. Criando e configurando Spark Session

In [3]:

spark = SparkSession.builder \
                    .appName("MeiosDePagamentoBancoCentral") \
                    .config("spark.hadoop.fs.s3a.endpoint", os.environ["MINIO_ENDPOINT"]) \
                    .config("spark.hadoop.fs.s3a.access.key", os.environ["MINIO_KEY"]) \
                    .config("spark.hadoop.fs.s3a.secret.key", os.environ["MINIO_SECRET"]) \
                    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
                    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
                    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
                    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
                    .getOrCreate()


/opt/spark/bin/load-spark-env.sh: line 68: ps: command not found


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
software.amazon.awssdk#s3 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d6e38cbb-3492-4bde-9b69-525a6ae4aea9;1.0
	confs: [default]
	found software.amazon.awssdk#s3;2.26.30 in central
	found software.amazon.awssdk#aws-xml-protocol;2.26.30 in central
	found software.amazon.awssdk#aws-query-protocol;2.26.30 in central
	found software.amazon.awssdk#protocol-core;2.26.30 in central
	found software.amazon.awssdk#sdk-core;2.26.30 in central
	found software.amazon.awssdk#annotations;2.26.30 in central
	found software.amazon.awssdk#http-client-spi;2.26.30 in central
	found software.amazon.awssdk#utils;2.26.30 in central
	found org.reactivestreams#reactive-streams;1.0.4 in central
	found org.slf4j#slf4j-api;1.7.36 in central
	found software.amazon.awssdk#metric

### 03. From Landing to Bronze

Explodindo o schema original dos arquivos e convertendo para o formato Delta na Bronze.

In [4]:

# Pagamentos com Cartões (Landing to Bronze)

df_cartoes = spark.read \
                  .option("inferSchema", True) \
                  .json(landing_path_dir + "/cartoes_trimestral/data_18_01_2025_17_39_57.json")

df_cartoes = df_cartoes.withColumn('value_struct', explode(col("value"))) \
                       .select("value_struct.*")     

df_cartoes.show(n=3, truncate=True, vertical=True)

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/root/.jupyter/workspace/notebooks/bank-databr/landing/bacen/cartoes_trimestral/data_18_01_2025_17_39_57.json.

In [None]:
# Todos os Meios de Pagamentos visão Mensal (Landing to Bronze)

df_pagamentos_mensal_raw = spark.read \
                            .option("inferSchema", True) \
                            .json(landing_path_dir + "/meios_pagamentos_mensal/data_18_01_2025_17_39_57.json")

df_pagamentos_mensal_raw.printSchema()

# Explodindo o valor da coluna "value" em múltiplas colunas

df_pagamentos_mensal_transformed = df_pagamentos_mensal_raw.withColumn('value_struct', explode(col("value"))) \
                                                           .select("value_struct.*") 



df_pagamentos_mensal_transformed.show(n=3, vertical=True)


In [10]:

# Todos os Meios de Pagamentos visão Trimestral (Landing to Bronze)

data_source_file_path = "s3a://" + landing_path_dir + "/meios_pagamentos_trimestral/data_18_01_2025_17_39_57.json"
print(data_source_file_path)

df_pagamentos_trimestral_raw = spark.read \
                                    .option("inferSchema", True) \
                                   .json(data_source_file_path)

# Schema Original do arquivo origem
df_pagamentos_trimestral_raw.printSchema
df_pagamentos_trimestral_raw.show()






In [13]:


# Data Manipulation, Transformation

# Explodindo o valor da coluna "value" em múltiplas colunas

df_pagamentos_trimestral_transformed = df_pagamentos_trimestral_raw.withColumn('value_struct', explode(col("value"))) \
                                                                   .select("value_struct.*") \
                                                                   .withColumn('dt_partition', lit(dt_partition) )



# Schema Original do arquivo origem
df_pagamentos_trimestral_transformed.printSchema
df_pagamentos_trimestral_transformed.show(n=3, vertical=False)


In [20]:
""" Gravando tabela de Saída, formato Delta Camada Silver """

def table_exists(schema_name:str, table_name: str) -> None:
    """ Verifica se a tabela já existe no schema indicado """

    return spark._jsparkSession.catalog() \
                               .tableExists(f"{schema_name}.{table_name}")  



schema_name = "db_bank_databr"
table_name = "b_pagamentos_trimestrais_bc"

destionation_table_path = f"s3a://{bronze_path_dir}/{table_name}"
print(destionation_table_path)

# spark._jsparkSession.catalog().tableExists('b_pagamentos_trimestrais_bc')

spark.sql("CREATE SCHEMA IF NOT EXISTS db_bank_databr")
print('Listando todos os databases/schemas válidos:')
spark.catalog.listDatabases()

if table_exists(schema_name, table_name):
    
    logger.info(f"tabela já existe escrevendo nova partição em\n\t*{destionation_table_path}")

    df_pagamentos_trimestral_transformed.write \
                                        .format("delta") \
                                        .mode("overwrite") \
                                        .option("replaceWhere", f"partition = {dt_partition}") \
                                        .partitionBy("dt_partition") \
                                        .save(destionation_table_path)
else:

    logger.info(f"tabela não existe - criando tabela e escrevendo nova partição em\n\t*{destionation_table_path}")

    df_pagamentos_trimestral_transformed.write \
                                        .format("delta") \
                                        .mode("overwrite") \
                                        .partitionBy("dt_partition") \
                                        .save(destionation_table_path)                                       

                                 
logger.info("Schema final:")
df_pagamentos_trimestral_transformed.show(n=3, vertical=True)