In [None]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, IntegerType
from pyspark.sql import SparkSession

# Create (or reuse, via getOrCreate) the Spark session shared by every cell below.
spark = (
    SparkSession.builder
    .appName("Streaming from Kafka")
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    # Kafka connector for Spark SQL / Structured Streaming.
    # NOTE(review): the Scala (2.12) and Spark (3.3.0) versions in these
    # coordinates presumably must match the installed Spark — TODO confirm.
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0')
    .config("spark.sql.shuffle.partitions", 4)
    # Run in-process using all available local cores.
    .master("local[*]")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary as cell output.
spark

In [None]:
# Comma-separated host:port list passed as "kafka.bootstrap.servers" below.
KAFKA_BOOTSTRAP_SERVERS = "kafka1:19091,kafka2:19092,kafka3:19093"

# One element of the "devices" array inside the event payload.
_device_struct = StructType([
    StructField("deviceId", StringType()),
    StructField("temperature", IntegerType()),
    StructField("measure", StringType()),
    StructField("status", StringType()),
])

# The nested "data" object: currently just the array of device readings.
_data_struct = StructType([
    StructField("devices", ArrayType(_device_struct)),
])

# Schema handed to from_json() when parsing the Kafka message value.
schema = StructType([
    StructField("eventId", StringType()),
    StructField("eventOffset", StringType()),
    StructField("eventPublisher", StringType()),
    StructField("customerId", StringType()),
    StructField("data", _data_struct),
    StructField("eventTime", StringType()),
])

In [None]:
# Batch (non-streaming) read of the "devices" topic, parsed with `schema`.
df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
    .option("subscribe", "devices")
    .load()
    # Decode the message value to a string and parse it as JSON.
    .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
    # Promote the parsed struct's fields to top-level columns; after this
    # projection only the schema fields remain on the frame.
    .select(col("parsed_value.*"))
)

# Preview the first 10 parsed rows with long cell values truncated.
df.show(n=10, truncate=True)

In [None]:
# Preview the raw Kafka records as strings.
# `df` has already been projected down to the parsed JSON fields, so it no
# longer carries the `key` / `value` columns; selecting them from `df` fails
# with an unresolved-column error. Read the topic again and cast the
# un-parsed columns here instead.
(
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
    .option("subscribe", "devices")
    .load()
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .show(10, False)  # truncate=False: print the full message text
)

In [None]:
# Streaming counterpart of the batch read: subscribe to the "devices" topic
# and expose the parsed JSON fields as top-level columns.
lines = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
    .option("subscribe", "devices")
    .load()
    # Decode the message value to a string and parse it with `schema`.
    .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
    # Flatten the parsed struct into individual columns.
    .select(col("parsed_value.*"))
)

In [None]:
# Start the streaming query: every 5 seconds, append newly arrived rows to the
# console sink. The chain is parenthesised so no line depends on a trailing
# backslash (the original was missing one after .trigger(...), which made the
# cell a syntax error).
query = (
    lines.writeStream
    .trigger(processingTime='5 seconds')
    .outputMode('Append')
    .format('console')
    .start()
)

try:
    # Blocks this cell until the query terminates or the kernel is interrupted.
    query.awaitTermination()
finally:
    # Stop the query on interrupt/error so re-running the cell does not leave
    # an earlier streaming query running in the session.
    query.stop()