In [1]:
import os
from utils.spark import get_spark_session
import os

from pyspark.sql.functions import col, expr, split, window, count, max
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.types import StringType, IntegerType

from utils.schema_registry_utils import SchemaRegistryUtils

class MultiplexIngestor:
  """Ingest raw Kafka records from one or more topics into a single Iceberg table.

  The table keeps the Kafka payload untouched (binary key/value plus record
  metadata) and is partitioned by topic, so several topics can share one
  "multiplex" bronze table.

  Args:
    spark: SparkSession used for both the DDL and the streaming read.
    table_name: Fully qualified name of the target Iceberg table.
    kafka_options: Dict of options passed verbatim to the Kafka stream reader.
  """

  # Checkpoint location used by load_data_to_bronze when the caller gives none.
  DEFAULT_CHECKPOINT_LOCATION = "s3a://sistemas/checkpoints/multiplex_bronze"

  def __init__(self, spark, table_name, kafka_options):
    self.spark = spark
    self.kafka_options = kafka_options
    self.table_name = table_name

  def create_table(self):
    """Create the target table if it does not exist and print its schema."""
    self.spark.sql(f"""
    CREATE TABLE IF NOT EXISTS {self.table_name} (
    key binary,
    value binary,
    partition int,
    offset long,
    timestamp timestamp,
    topic string)
    USING ICEBERG PARTITIONED BY (topic)
    """).show()
    # Use the instance's own session and table name rather than notebook
    # globals, so the class works no matter what the caller named them.
    self.spark.table(self.table_name).printSchema()

  def extract_data(self):
    """Return a streaming DataFrame over Kafka using ``self.kafka_options``.

    Only the columns that exist in the target table are selected, in the
    same order as the table definition in ``create_table``.
    """
    df_simple_transactions = (
      self.spark
        .readStream
        .format("kafka")
        .options(**self.kafka_options)
        .load()
        .select("key", "value", "partition", "offset", "timestamp", "topic"))
    return df_simple_transactions

  def load_data_to_console(self, df_transformed):
    """Start an append-mode console sink and return the streaming query.

    The query handle is returned without blocking, so the caller decides
    when to call ``awaitTermination()`` or ``stop()``; this matches
    ``load_data_to_bronze``.
    """
    write_stream_query = (
      df_transformed
        .writeStream
        .outputMode("append")
        .format("console")
        .start()
    )
    return write_stream_query

  def load_data_to_bronze(self, df_transformed, checkpoint_location=None):
    """Start an append-mode Iceberg write into ``self.table_name``.

    Args:
      df_transformed: Streaming DataFrame whose columns match the table.
      checkpoint_location: Optional checkpoint path for the query; defaults
        to ``DEFAULT_CHECKPOINT_LOCATION``.

    Returns:
      The started streaming query.
    """
    if checkpoint_location is None:
      checkpoint_location = self.DEFAULT_CHECKPOINT_LOCATION
    query = (
      df_transformed
        .writeStream
        .format("iceberg")
        .outputMode("append")
        .option("checkpointLocation", checkpoint_location)
        # Write to the table this ingestor was configured with (and created).
        .toTable(self.table_name)
    )
    return query


# --- Job configuration; most values can be overridden via environment variables ---
APP_NAME = "bronze_multiplex"
SPARK_URL = os.getenv("SPARK_MASTER_URL")  # not referenced in the cells visible here

KAFKA_CLUSTER = os.getenv("KAFKA_BROKERS", "broker:29092")
CONSUMER_GROUP = os.getenv("CG_API_KEY_CONSUME", "cg_war")
STARTING_OFFSETS = os.getenv("STARTING_OFFSETS", "latest")
# os.getenv returns a str when the variable is set, so the default is a str too;
# this keeps the value's type the same whichever source it comes from.
MAX_OFFSETS_PER_TRIGGER = os.getenv("MAX_OFFSETS_PER_TRIGGER", "1000")
BRONZE_TABLE = 'nessie.test'


# Options handed to the Kafka stream reader by MultiplexIngestor.extract_data.
kafka_options = {
  "kafka.bootstrap.servers": KAFKA_CLUSTER,
  # Comma-separated list of topics multiplexed into the single bronze table.
  "subscribe": "mainnet.mined.block.metadata,mainnet.mined.txs.token.transfer",
  "startingOffsets": STARTING_OFFSETS,
  # NOTE(review): the Spark Kafka source appears to forward only `kafka.`-prefixed
  # options to the consumer, so this un-prefixed key may have no effect —
  # confirm whether `kafka.group.id` was intended before relying on it.
  "group.id": CONSUMER_GROUP,
  "maxOffsetsPerTrigger": MAX_OFFSETS_PER_TRIGGER,
}

spark = get_spark_session(APP_NAME)
engine = MultiplexIngestor(spark, BRONZE_TABLE, kafka_options)

# Make sure the bronze table exists before any stream writes to it.
engine.create_table()

# Display the session as this cell's output.
spark

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12 added as a dependency
software.amazon.awssdk#bundle added as a dependency
software.amazon.awssdk#url-connection-client added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.spark#spark-avro_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2866d781-db5b-418d-98b9-e55fa65faca8;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.6.1 in central
	found org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.95.0 in central
	found software.amazon.awssdk#bundle;2.17.178 in central
	found software.amazon.eventstream#eventstream;1.0.1 in central
	found software.amazon.awssdk#url-connection-client;2.17.178 in central
	found sof

++
||
++
++

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- topic: string (nullable = true)



In [2]:
# Build the Kafka streaming DataFrame from the options given to the ingestor.
data_extracted = engine.extract_data()
# Write the stream to the Iceberg bronze table and keep the query handle.
query = engine.load_data_to_bronze(data_extracted)
# Debugging alternative: send the stream to the console sink instead.
#query = engine.load_data_to_console(data_extracted)
# Block this cell until the streaming query terminates (or fails).
# NOTE(review): the recorded run below failed with a NoSuchMethodError inside
# software.amazon.awssdk, which usually points to mismatched AWS SDK jar
# versions on the executor classpath — confirm the package versions configured
# for the Spark session.
query.awaitTermination()

24/10/04 00:11:00 ERROR TaskSetManager: Task 43 in stage 3.0 failed 4 times; aborting job
24/10/04 00:11:00 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 1, writer: IcebergStreamingWrite(table=nessie.test, format=PARQUET)] is aborting.
24/10/04 00:11:00 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 1, writer: IcebergStreamingWrite(table=nessie.test, format=PARQUET)] aborted.
24/10/04 00:11:00 ERROR MicroBatchExecution: Query [id = f0393b8a-2e05-453c-991e-0346cca6de12, runId = 82bf1b0b-b657-4de8-bbb2-48ad3da3a062] terminated with error
org.apache.spark.SparkException: Job aborted due to stage failure: Task 43 in stage 3.0 failed 4 times, most recent failure: Lost task 43.3 in stage 3.0 (TID 239) (172.21.0.5 executor 0): java.lang.NoSuchMethodError: 'void software.amazon.awssdk.utils.IoUtils.closeQuietly(java.lang.AutoCloseable, software.amazon.awssdk.thirdparty.org.slf4j.Logger)'
	at software.amazon.awssdk.core.util.Sdk

StreamingQueryException: [STREAM_FAILED] Query [id = f0393b8a-2e05-453c-991e-0346cca6de12, runId = 82bf1b0b-b657-4de8-bbb2-48ad3da3a062] terminated with exception: Job aborted due to stage failure: Task 43 in stage 3.0 failed 4 times, most recent failure: Lost task 43.3 in stage 3.0 (TID 239) (172.21.0.5 executor 0): java.lang.NoSuchMethodError: 'void software.amazon.awssdk.utils.IoUtils.closeQuietly(java.lang.AutoCloseable, software.amazon.awssdk.thirdparty.org.slf4j.Logger)'
	at software.amazon.awssdk.core.util.SdkUserAgent.kotlinVersion(SdkUserAgent.java:173)
	at software.amazon.awssdk.core.util.SdkUserAgent.getAdditionalJvmLanguages(SdkUserAgent.java:123)
	at software.amazon.awssdk.core.util.SdkUserAgent.getUserAgent(SdkUserAgent.java:98)
	at software.amazon.awssdk.core.util.SdkUserAgent.initializeUserAgent(SdkUserAgent.java:81)
	at software.amazon.awssdk.core.util.SdkUserAgent.<init>(SdkUserAgent.java:51)
	at software.amazon.awssdk.core.util.SdkUserAgent.create(SdkUserAgent.java:58)
	at software.amazon.awssdk.core.client.builder.SdkDefaultClientBuilder.lambda$mergeGlobalDefaults$2(SdkDefaultClientBuilder.java:259)
	at software.amazon.awssdk.utils.builder.SdkBuilder.applyMutation(SdkBuilder.java:61)
	at software.amazon.awssdk.core.client.config.SdkClientConfiguration.merge(SdkClientConfiguration.java:66)
	at software.amazon.awssdk.core.client.builder.SdkDefaultClientBuilder.mergeGlobalDefaults(SdkDefaultClientBuilder.java:255)
	at software.amazon.awssdk.core.client.builder.SdkDefaultClientBuilder.syncClientConfiguration(SdkDefaultClientBuilder.java:169)
	at software.amazon.awssdk.services.s3.DefaultS3ClientBuilder.buildClient(DefaultS3ClientBuilder.java:27)
	at software.amazon.awssdk.services.s3.DefaultS3ClientBuilder.buildClient(DefaultS3ClientBuilder.java:22)
	at software.amazon.awssdk.core.client.builder.SdkDefaultClientBuilder.build(SdkDefaultClientBuilder.java:140)
	at org.apache.iceberg.aws.AwsClientFactories$DefaultAwsClientFactory.s3(AwsClientFactories.java:117)
	at org.apache.iceberg.aws.s3.S3FileIO.client(S3FileIO.java:327)
	at org.apache.iceberg.aws.s3.S3FileIO.newOutputFile(S3FileIO.java:135)
	at org.apache.iceberg.io.OutputFileFactory.newOutputFile(OutputFileFactory.java:117)
	at org.apache.iceberg.io.RollingFileWriter.newFile(RollingFileWriter.java:115)
	at org.apache.iceberg.io.RollingFileWriter.openCurrentWriter(RollingFileWriter.java:106)
	at org.apache.iceberg.io.RollingDataWriter.<init>(RollingDataWriter.java:47)
	at org.apache.iceberg.io.FanoutDataWriter.newWriter(FanoutDataWriter.java:53)
	at org.apache.iceberg.io.FanoutWriter.writer(FanoutWriter.java:63)
	at org.apache.iceberg.io.FanoutWriter.write(FanoutWriter.java:51)
	at org.apache.iceberg.io.FanoutDataWriter.write(FanoutDataWriter.java:31)
	at org.apache.iceberg.spark.source.SparkWrite$PartitionedDataWriter.write(SparkWrite.java:781)
	at org.apache.iceberg.spark.source.SparkWrite$PartitionedDataWriter.write(SparkWrite.java:751)
	at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.write(WriteToDataSourceV2Exec.scala:493)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.$anonfun$run$1(WriteToDataSourceV2Exec.scala:448)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run(WriteToDataSourceV2Exec.scala:486)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run$(WriteToDataSourceV2Exec.scala:425)
	at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:491)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:388)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)

Driver stacktrace:

In [3]:
# Shut down the Spark session created in the first cell.
spark.stop()