# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Spring 2025** </center>
---
### <center> **Examples on Structured Streaming (Kafka with Watermarking)** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [None]:
# Bootstrap step: run this cell before any cell that imports `pyspark`.
# NOTE(review): presumably findspark.init() locates the local Spark installation
# and makes `pyspark` importable from this kernel — confirm for this environment.
import findspark
findspark.init()

### Spark session creation

In [None]:
from pyspark.sql import SparkSession

# Describe the session first, then materialize it with getOrCreate().
# NOTE(review): the master host looks like a Docker container ID — it will
# presumably differ on every deployment; confirm before running.
# NOTE(review): the Kafka connector coordinate targets Scala 2.13 / Spark 3.5.4;
# verify that it matches the Spark build running on the cluster.
session_builder = (
    SparkSession.builder
    .appName("SparkSQLStructuredStreaming-Kafka-Watermarking")
    .master("spark://078b2e28e517:7077")
    .config("spark.ui.port", "4040")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.4")
)

spark = session_builder.getOrCreate()

# Handle to the underlying SparkContext, used by the final cell to shut Spark down.
sc = spark.sparkContext

### Kafka Stream init

In [None]:
# Configure a streaming reader subscribed to the `kafka-spark-example` topic.
# NOTE(review): the bootstrap server host looks like a Docker container ID —
# confirm it matches the running Kafka broker.
kafka_reader = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "cfb7d55576f2:9093")
    .option("subscribe", "kafka-spark-example")
)

# load() returns the streaming DataFrame used by the cells below.
kafka_lines = kafka_reader.load()

# Show the columns exposed by the Kafka source.
kafka_lines.printSchema()

### Transform binary data into string

In [None]:
# Keep every original Kafka column and add `value_str`: the message payload
# (`value`) cast to a string so it can be processed as text.
raw_value = kafka_lines["value"]
kafka_df = kafka_lines.withColumn("value_str", raw_value.cast("string"))

In [None]:
from pyspark.sql.functions import explode, split

# Tokenize the decoded payload into one row per word, keeping the Kafka
# `timestamp` so that the rows can later be windowed by event time.
# The split is applied to `value_str` (the string column created from `value`)
# rather than to the raw binary `value` column, so the cast performed in the
# previous cell is actually used and no implicit binary-to-string coercion is
# relied upon.
words = kafka_df.select(
    explode(split(kafka_df.value_str, " ")).alias("word"),
    "timestamp",
)
words.printSchema()

### Use watermarking to handle late-arriving events

In [None]:
from pyspark.sql.functions import window

# Declare a 2-minute watermark on the `timestamp` column.
watermarked_words = words.withWatermark("timestamp", "2 minutes")

# Sliding window over the event timestamp: 60-second windows that start every
# 30 seconds, so consecutive windows overlap.
sliding_window = window(
    words.timestamp,
    windowDuration="60 seconds",
    slideDuration="30 seconds",
)

# Count occurrences of each word within each window.
windowed_counts = watermarked_words.groupBy(sliding_window, words.word).count()

### Sink configuration

In [None]:
# Use 5 shuffle partitions for the aggregation.
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Console sink in "update" output mode, triggered every 30 seconds, with
# untruncated cell values.
console_writer = (
    windowed_counts.writeStream
    .outputMode("update")
    .trigger(processingTime='30 seconds')
    .format("console")
    .option("truncate", "false")
)

# start() launches the streaming query and returns its handle.
query = console_writer.start()

# Block this cell for at most 300 seconds while the query runs.
query.awaitTermination(300)

In [31]:
# Teardown: stop the streaming query first, then shut down the SparkContext
# it runs on.
query.stop()
sc.stop()