In [None]:
from pyspark.sql import SparkSession

"""
To check this script, open a shell inside the Docker container and run the file manually.
Watch the Kafka UI to see messages being consumed from the topic, and watch the Spark UI
to see the processing.
"""

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StringType, StructType, StructField

# Create a Spark session that pulls in the Kafka source package.
# The connector version is taken from the installed PySpark release instead
# of being pinned to 3.5.0: the classpath captured from this script lists
# Spark 3.5.3 jars, so a hard-coded 3.5.0 connector does not match the
# runtime. The "_2.12" suffix is the Scala build and matches the *_2.12 jars
# in that same classpath listing.
kafka_package = (
    f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark.__version__}"
)

spark = (
    SparkSession.builder
    .appName("KafkaSparkIntegration")
    .config("spark.jars.packages", kafka_package)
    # Default checkpoint directory for every streaming query in this session.
    .config("spark.sql.streaming.checkpointLocation", "/tmp/checkpoint")
    .getOrCreate()
)

# Only show warnings and errors from Spark so the stream output stays readable
driver_context = spark.sparkContext
driver_context.setLogLevel("WARN")

# Dump the driver JVM's "java.class.path" system property.
# NOTE(review): the intent is to confirm the Kafka libraries are present, but
# jars resolved through spark.jars.packages may not be listed in this
# property -- confirm, and consider inspecting the "spark.jars" setting too.
print("Java classpath:")
java_class_path = driver_context._jvm.System.getProperty("java.class.path")
print(java_class_path)

# Open a streaming DataFrame over the "test-topic" Kafka topic.
# NOTE(review): the session sets a fixed checkpoint location, so a restarted
# query presumably resumes from the checkpoint rather than from "earliest" --
# confirm if a full replay is expected on every run.
df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka:9092",
        "subscribe": "test-topic",
        "startingOffsets": "earliest",
    })
    .load()
)

# Keep only the record key and value, each cast to a string
df = df.selectExpr(
    *[f"CAST({column_name} AS STRING)" for column_name in ("key", "value")]
)

# Stream every new row to the console sink; truncate=False prints the full
# message instead of a shortened column value.
query = (
    df.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", False)
    .start()
)

# Block until the query ends. If the wait is interrupted (for example the
# notebook cell is stopped) or the query fails, make sure the streaming query
# is shut down instead of being left running in the background. The original
# exception, if any, still propagates to the caller.
try:
    query.awaitTermination()
finally:
    query.stop()


Java classpath:
/usr/local/spark/conf/:/usr/local/spark/jars/netty-transport-native-epoll-4.1.96.Final-linux-aarch_64.jar:/usr/local/spark/jars/kubernetes-model-resource-6.7.2.jar:/usr/local/spark/jars/spark-sketch_2.12-3.5.3.jar:/usr/local/spark/jars/commons-compress-1.23.0.jar:/usr/local/spark/jars/kubernetes-model-storageclass-6.7.2.jar:/usr/local/spark/jars/metrics-jmx-4.2.19.jar:/usr/local/spark/jars/spark-catalyst_2.12-3.5.3.jar:/usr/local/spark/jars/jersey-server-2.40.jar:/usr/local/spark/jars/spire_2.12-0.17.0.jar:/usr/local/spark/jars/hive-serde-2.3.9.jar:/usr/local/spark/jars/snakeyaml-2.0.jar:/usr/local/spark/jars/aopalliance-repackaged-2.6.1.jar:/usr/local/spark/jars/avro-1.11.2.jar:/usr/local/spark/jars/json4s-core_2.12-3.7.0-M11.jar:/usr/local/spark/jars/netty-transport-native-unix-common-4.1.96.Final.jar:/usr/local/spark/jars/audience-annotations-0.5.0.jar:/usr/local/spark/jars/commons-pool-1.5.4.jar:/usr/local/spark/jars/datanucleus-core-4.1.17.jar:/usr/local/spark/jars