In [0]:

# Importando as bibliotecas necessárias
import os
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType

# 1. Fetch the Event Hubs connection string from the environment
connection_string = os.environ.get("EVENT_HUB_CONNECTION_STRING")

# Abort early when the variable is unset or empty
if not connection_string:
    raise ValueError("EVENT_HUB_CONNECTION_STRING não foi encontrada. Verifique a configuração do cluster.")

# 2. Encrypt the connection string through the connector's JVM-side helper.
#    `sc` is not defined in this notebook; it is expected to be supplied by
#    the notebook runtime (NOTE(review): confirm for the target cluster).
_event_hubs_utils = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils
encrypted_connection_string = _event_hubs_utils.encrypt(connection_string)

# 3. Assemble the options handed to the Event Hubs stream reader
eh_conf = dict([
    ('eventhubs.connectionString', encrypted_connection_string),
    ('eventhubs.consumerGroup', '$Default'),
])

# Data Lake (ADLS Gen2) locations inside the "raw" container
storage_account_name = "adlshydra"
raw_path = f"abfss://raw@{storage_account_name}.dfs.core.windows.net/"
checkpoint_path = f"abfss://raw@{storage_account_name}.dfs.core.windows.net/checkpoint_ingestion"

# Name of the table in the Raw layer
raw_table_name = "raw_transactions"

In [0]:
# Layout applied to each JSON message body.
# Every field is declared with nullable=False (third argument).
# NOTE(review): from_json is documented to return null for unparsable input,
# so confirm whether the non-nullable flags are actually enforced downstream.
schema = (
    StructType()
    .add("transaction_id", StringType(), False)
    .add("timestamp_utc", StringType(), False)
    .add("user_id", StringType(), False)  # read as a string for flexibility
    .add("card_number", StringType(), False)
    .add("amount_brl", DoubleType(), False)
    .add("merchant_name", StringType(), False)
    .add("merchant_city", StringType(), False)
)

In [0]:
# Open a streaming reader on Event Hubs using the options in eh_conf
raw_df = (
    spark.readStream
    .format("eventhubs")
    .options(**eh_conf)
    .load()
)

# Cast the message body to a string, parse it as JSON against `schema`,
# then expand the resulting struct into top-level columns
parsed_df = (
    raw_df
    .select(from_json(col("body").cast("string"), schema).alias("data"))
    .select("data.*")
)

# Print the resulting column layout for inspection
parsed_df.printSchema()

In [0]:
# Append the parsed stream to the Delta table in the raw layer.
# availableNow=True processes all the data currently available and then stops.
streaming_query = (
    parsed_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)
    .toTable(raw_table_name)
)

# toTable() only starts the query and hands back its handle. Block here until
# the availableNow run has finished, so that a failure inside the stream is
# raised in this cell and later cells do not run against a half-written table.
streaming_query.awaitTermination()