In [1]:
from utils.spark import get_spark_session
import os

spark = get_spark_session("iceberg_DDL")

spark

:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12 added as a dependency
software.amazon.awssdk#bundle added as a dependency
software.amazon.awssdk#url-connection-client added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.spark#spark-avro_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-66b1a90a-298f-4a4a-9437-86880320deb1;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.6.1 in central
	found org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.95.0 in central
	found software.amazon.awssdk#bundle;2.17.178 in central
	found software.amazon.eventstream#eventstream;1.0.1 in central
	found software.amazon.awssdk#url-connection-client;2.17.178 in central
	found sof

In [2]:
import os

from pyspark.sql.functions import col, expr, split, window, count, max
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.types import StringType, IntegerType

from utils.schema_registry_utils import SchemaRegistryUtils

class APIKeyMonitor:
   
  def __init__(self, spark, kafka_options):
    self.spark = spark
    self.kafka_options = kafka_options


  def extract_data(self, avro_schema: str):
    df_simple_transactions = (
      self.spark
        .readStream
        .format("kafka")
        .options(**self.kafka_options)
        .load()
        .withColumn("offset", col("offset").cast(IntegerType()))
        .withColumn("timestamp", col("timestamp").cast(StringType()))
        .withColumnRenamed("key", "topic_key")
        .withColumnRenamed("value", "topic_value")
        .withColumnRenamed("timestamp", "kafka_timestamp")
        .withColumnRenamed("partition", "kafka_partition")
        .select("topic_key", "topic_value",  "kafka_partition", "offset",
                "kafka_timestamp", "timestamptype", "topic"))
    return df_simple_transactions


  def transform_data(self, df_stream):
    return df_stream


  def load_data_to_console(self, df_transformed):
    write_stream_query = (
      df_transformed
        .writeStream
        .outputMode("append")
        .format("console")
        .start()
        .awaitTermination()
    )
    return write_stream_query

  def load_data_to_bronze(self, df_transformed):
    query = (
      df_transformed
        .writeStream
        .format("iceberg")
        .outputMode("append")
        .option("checkpointLocation", "hdfs://namenode:9000/dm_lakehouse/checkpoint/all_topics")
        .toTable("bronze_fast.all_topics")
    )
    return query


APP_NAME = "Handle_Simple_Transactions"
SPARK_URL = os.getenv("SPARK_MASTER_URL")

KAFKA_CLUSTER = os.getenv("KAFKA_BROKERS", "broker:29092")
CONSUMER_GROUP = os.getenv("CG_API_KEY_CONSUME", "cg_war")
STARTING_OFFSETS = os.getenv("STARTING_OFFSETS", "latest")
MAX_OFFSETS_PER_TRIGGER = os.getenv("MAX_OFFSETS_PER_TRIGGER", 1000)

SCHEMA_REGISTRY_URL = "http://schema-registry:8081"
SCHEMA_REGISTRY_SUBJECT = "mainnet.application.logs-value"

kafka_options = {
"kafka.bootstrap.servers": KAFKA_CLUSTER,
"subscribe": "mainnet.mined.block.metadata,mainnet.mined.txs.token.transfer",
"startingOffsets": STARTING_OFFSETS,
"group.id": CONSUMER_GROUP,
"maxOffsetsPerTrigger": MAX_OFFSETS_PER_TRIGGER 
}

engine = APIKeyMonitor(spark, kafka_options)
sc_client = SchemaRegistryUtils.get_schema_registry_client(SCHEMA_REGISTRY_URL)
avro_schema_logs = SchemaRegistryUtils.get_avro_schema(sc_client, SCHEMA_REGISTRY_SUBJECT)

data_extracted = engine.extract_data(avro_schema_logs)
data_transformed = engine.transform_data(data_extracted)
#query = engine.load_data_to_bronze(data_transformed)
query = engine.load_data_to_console(data_transformed)
query.awaitTermination()

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---------+-----------+---------------+------+---------------+-------------+-----+
|topic_key|topic_value|kafka_partition|offset|kafka_timestamp|timestamptype|topic|
+---------+-----------+---------------+------+---------------+-------------+-----+
+---------+-----------+---------------+------+---------------+-------------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|           topic_key|         topic_value|kafka_partition|offset|     kafka_timestamp|timestamptype|               topic|
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|[65 32 37 34 37 6...|[00 00 00 00 04 8...|              0|  3879|2024-10-01 03:03:...|            0|mainnet.mined.txs...|
|[37 61 33 61 36 3...|[00 00 00 00 04 8...|              0|  3880|2024-10-01 03:03:...|            0|mainnet.mined.txs...|
|[65 38 38 38 35 3...|[00 00 00 00 04 8...|              0|  3881|2024-10-01 03:03:...|            0|mainnet.mined.txs...|
|[64 65 30 32 38 3...|[00 00 00 00 04 8...|              0|  3882|2024-10-01 03:03:...|            0|mainnet.mined.txs...|
|[34 31 65 61 66 6...|[00 00 00 00 04 8...

                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|           topic_key|         topic_value|kafka_partition|offset|     kafka_timestamp|timestamptype|               topic|
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|[32 30 38 36 37 3...|[00 00 00 00 02 F...|              0|   200|2024-10-01 03:04:...|            0|mainnet.mined.blo...|
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+

-------------------------------------------
Batch: 5
-------------------------------------------
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|           topic_key|         topic_value|kafka_partition|offset| 

                                                                                

-------------------------------------------
Batch: 11
-------------------------------------------
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|           topic_key|         topic_value|kafka_partition|offset|     kafka_timestamp|timestamptype|               topic|
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|[32 30 38 36 37 3...|[00 00 00 00 02 F...|              0|   201|2024-10-01 03:04:...|            0|mainnet.mined.blo...|
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/bitnami/python/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/python/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/bitnami/python/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


-------------------------------------------
Batch: 12
-------------------------------------------
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|           topic_key|         topic_value|kafka_partition|offset|     kafka_timestamp|timestamptype|               topic|
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|[39 33 33 66 61 3...|[00 00 00 00 04 8...|              0|  3939|2024-10-01 03:04:...|            0|mainnet.mined.txs...|
|[66 33 66 66 33 3...|[00 00 00 00 04 8...|              0|  3940|2024-10-01 03:04:...|            0|mainnet.mined.txs...|
|[33 34 66 64 35 3...|[00 00 00 00 04 8...|              0|  3941|2024-10-01 03:04:...|            0|mainnet.mined.txs...|
|[36 66 39 65 30 6...|[00 00 00 00 04 8...|              0|  3942|2024-10-01 03:04:...|            0|mainnet.mined.txs...|
+--------------------+-------------------

                                                                                

-------------------------------------------
Batch: 13
-------------------------------------------
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|           topic_key|         topic_value|kafka_partition|offset|     kafka_timestamp|timestamptype|               topic|
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|[34 37 34 65 35 3...|[00 00 00 00 04 8...|              0|  3943|2024-10-01 03:04:...|            0|mainnet.mined.txs...|
|[62 34 38 31 65 3...|[00 00 00 00 04 8...|              0|  3944|2024-10-01 03:04:...|            0|mainnet.mined.txs...|
|[62 32 39 37 36 6...|[00 00 00 00 04 8...|              0|  3945|2024-10-01 03:04:...|            0|mainnet.mined.txs...|
|[65 64 37 33 64 3...|[00 00 00 00 04 8...|              0|  3946|2024-10-01 03:04:...|            0|mainnet.mined.txs...|
|[31 65 63 39 37 3...|[00 00 00 00 04 8..

KeyboardInterrupt: 

-------------------------------------------
Batch: 14
-------------------------------------------
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|           topic_key|         topic_value|kafka_partition|offset|     kafka_timestamp|timestamptype|               topic|
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|[65 39 65 63 33 3...|[00 00 00 00 04 8...|              0|  3948|2024-10-01 03:04:...|            0|mainnet.mined.txs...|
|[37 64 39 61 64 3...|[00 00 00 00 04 8...|              0|  3949|2024-10-01 03:04:...|            0|mainnet.mined.txs...|
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+

-------------------------------------------
Batch: 15
-------------------------------------------
+--------------------+--------------------+---------------+------

                                                                                

-------------------------------------------
Batch: 17
-------------------------------------------
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|           topic_key|         topic_value|kafka_partition|offset|     kafka_timestamp|timestamptype|               topic|
+--------------------+--------------------+---------------+------+--------------------+-------------+--------------------+
|[62 39 30 32 34 3...|[00 00 00 00 04 8...|              0|  3961|2024-10-01 03:04:...|            0|mainnet.mined.txs...|
|[36 62 66 31 37 6...|[00 00 00 00 04 8...|              0|  3962|2024-10-01 03:04:...|            0|mainnet.mined.txs...|
|[30 64 31 62 33 3...|[00 00 00 00 04 8...|              0|  3963|2024-10-01 03:04:...|            0|mainnet.mined.txs...|
|[66 33 38 36 38 3...|[00 00 00 00 04 8...|              0|  3964|2024-10-01 03:04:...|            0|mainnet.mined.txs...|
+--------------------+-------------------

In [3]:
spark.stop()

24/10/01 03:04:37 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 18, writer: ConsoleWriter[numRows=20, truncate=true]] is aborting.
24/10/01 03:04:37 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 18, writer: ConsoleWriter[numRows=20, truncate=true]] aborted.
24/10/01 03:04:37 ERROR MicroBatchExecution: Query [id = 8a456d78-f365-4abb-97ec-58a71b0fc516, runId = 260702d8-ad4d-411b-af8c-fd946a37f224] terminated with error
java.lang.IllegalStateException: SparkContext has been shutdown
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2390)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:385)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:359)
	at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.writeWithV2(WriteToDataSourceV2Exec.scala:307)
	at org.apache.spark.sql.execut