# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Ejemplos de Spark: Structured Streaming (Kafka + Watermarking)** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [94]:
# findspark is initialized here, before any pyspark import in the cells below.
# NOTE(review): presumably this is what makes the local Spark installation
# importable from the notebook kernel — confirm against the environment.
import findspark
findspark.init()

#### Creación de la conexión con el clúster de Spark


In [95]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the session for this example. The Kafka
# source connector is requested through the spark.jars.packages setting.
spark = (
    SparkSession.builder
    .appName("SparkSQLStructuredStreaming-Kafka-Watermarking")
    .master("spark://078b2e28e517:7077")
    .config("spark.ui.port", "4040")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.4")
    .getOrCreate()
)

# Handle to the session's SparkContext; the last cell uses it to stop the application.
sc = spark.sparkContext

### Creación del Kafka Stream

In [96]:
# Streaming DataFrame read from the Kafka source, subscribed to one topic.
kafka_lines = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "cfb7d55576f2:9093")
    .option("subscribe", "kafka-spark-example")
    .load()
)

# Print the columns exposed by the source.
kafka_lines.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



### Transform binary data into string

In [97]:
# Keep every source column and add value_str: the `value` column cast to string.
kafka_df = kafka_lines.withColumn("value_str", kafka_lines["value"].cast("string"))

In [98]:
from pyspark.sql.functions import explode, split

# Split the decoded payload on single spaces and emit one row per word, keeping
# each record's timestamp column next to it.
# The explicitly decoded value_str column is used here instead of the raw binary
# `value` column, so the split no longer depends on an implicit binary-to-string
# cast and the column created in the previous cell is actually consumed.
words = kafka_df.select(
    explode(split(kafka_df.value_str, " ")).alias("word"),
    "timestamp",
)
words.printSchema()

root
 |-- word: string (nullable = false)
 |-- timestamp: timestamp (nullable = true)



### Aplicando el mecanismo para manejar datos tardíos con marcas de agua (watermarking)

In [99]:
# Namespaced import: importing min/max by name from pyspark.sql.functions would
# shadow Python's builtin min()/max() for the rest of the kernel session.
from pyspark.sql import functions as F

# Count each word per sliding window over the "timestamp" column; the watermark
# on that column is declared with a "2 minutes" delay threshold.
windowed_counts = (
    words
    .withWatermark("timestamp", "2 minutes")
    .groupBy(
        F.window(
            words.timestamp,
            "30 seconds",  # Window duration
            "5 seconds",   # Slide duration
        ),
        words.word,
    )
    .agg(F.count("*").alias("word_count"))
)

windowed_counts.printSchema()

# Second aggregation, grouped by word only: the smallest and largest per-window
# count observed for each word.
result = (
    windowed_counts
    .groupBy("word")
    .agg(
        F.min("word_count").alias("min_word_count"),
        F.max("word_count").alias("max_word_count"),
    )
)

result.printSchema()


root
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- word: string (nullable = false)
 |-- word_count: long (nullable = false)

root
 |-- word: string (nullable = false)
 |-- min_word_count: long (nullable = true)
 |-- max_word_count: long (nullable = true)



### Configuración del "Sink" del stream

In [100]:
# Session-level settings applied before the streaming query is started.
spark.conf.set("spark.sql.shuffle.partitions", "5")
spark.conf.set("spark.sql.streaming.statefulOperator.checkCorrectness.enabled", "false")

# Console sink in complete output mode, triggered on a 5-second processing-time
# interval, with column truncation turned off.
query = (
    result.writeStream
    .outputMode("complete")
    .trigger(processingTime="5 seconds")
    .format("console")
    .option("truncate", "false")
    .start()
)

# Wait on the query with a timeout argument of 30; the call's return value is
# the cell's displayed output.
query.awaitTermination(30)

25/04/08 14:43:26 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-2ef9f94b-6361-4a4d-a5e4-ff902f75c15a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/04/08 14:43:26 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/04/08 14:43:26 WARN UnsupportedOperationChecker: Detected pattern of possible 'correctness' issue due to global watermark. The query contains stateful operation which can emit rows older than the current watermark plus allowed late record delay, which are "late rows" in downstream stateful operations and these rows can be discarded. Please refer the programming guide doc for more details. If you understand the possible risk of correctness issue and still need to r

-------------------------------------------
Batch: 0
-------------------------------------------
+----+--------------+--------------+
|word|min_word_count|max_word_count|
+----+--------------+--------------+
+----+--------------+--------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+----+--------------+--------------+
|word|min_word_count|max_word_count|
+----+--------------+--------------+
|hola|1             |1             |
+----+--------------+--------------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+----+--------------+--------------+
|word|min_word_count|max_word_count|
+----+--------------+--------------+
|cat |1             |1             |
|dog |1             |1             |
|hola|1             |3             |
+----+--------------+--------------+



False

In [102]:
# Stop the streaming query first, then the SparkContext obtained from the session.
query.stop()
sc.stop()