# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Ejemplos de Spark: Structured Streaming (Kafka + Watermarking)** </center>

---
**Alumnos**: David Abraham Naranjo, Benjamin Zarate y Angel Cortes

In [1]:
# Bootstrap step: findspark.init() is called before any `pyspark` import below.
# NOTE(review): presumably this makes the Spark installation's Python
# libraries importable in this kernel -- confirm against the findspark docs.
import findspark
findspark.init()

#### Creación de la conexión con el clúster de Spark


In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession for this notebook and keep a handle on
# its SparkContext. The Kafka connector is requested via spark.jars.packages.
# NOTE(review): the master URL uses what looks like a container-id hostname;
# confirm it still points at the running Spark master.
spark = (
    SparkSession.builder
    .appName("SparkSQLStructuredStreaming-Kafka-Watermarking")
    .master("spark://873bad4e62fe:7077")
    .config("spark.ui.port", "4040")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.4",
    )
    .getOrCreate()
)
sc = spark.sparkContext

:: loading settings :: url = jar:file:/opt/conda/spark-3.5.4-bin-hadoop3-scala2.13/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-73500fb3-d14a-4ae2-90c0-67d0a1df523f;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.13;3.5.4 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.13;3.5.4 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.scala-lang.modules#scala-parallel-collections_2.13;1.0.4 in central
	found org.apache.commons#commons-pool2;2.11.1 in centr

### Creación del Kafka Stream

In [3]:
# Streaming DataFrame backed by the Kafka source, subscribed to the
# "kafka-spark-example" topic on the given bootstrap server.
kafka_lines = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "fc2fc5ae9bbc:9093")
    .option("subscribe", "kafka-spark-example")
    .load()
)

# Show the columns exposed by the source.
kafka_lines.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



### Transform binary data into string

In [4]:
# Add `value_str`: the record payload (`value`) cast to a string.
kafka_df = kafka_lines.withColumn("value_str", kafka_lines["value"].cast("string"))

In [None]:
from pyspark.sql.functions import explode, split

# Split every message on single spaces and emit one row per word, keeping the
# record `timestamp` so the words can be aggregated over event-time windows.
# The split runs on `value_str`, the column that was explicitly cast to string
# for this purpose, rather than on the raw binary `value` column (which only
# worked through an implicit binary -> string coercion).
words = kafka_df.select(
    explode(split(kafka_df.value_str, " ")).alias("word"),
    "timestamp",
)
words.printSchema()

root
 |-- word: string (nullable = false)
 |-- timestamp: timestamp (nullable = true)



### Aplicando el mecanismo para manejar datos tardíos con marcas de agua (watermarking)

In [14]:
from pyspark.sql.functions import window

# Count occurrences of each word per sliding event-time window
# (60-second windows that advance every 30 seconds), with a 2-minute
# watermark declared on `timestamp`.
windowed_counts = (
    words
    .withWatermark("timestamp", "2 minutes")
    .groupBy(
        window(
            words.timestamp,
            windowDuration="60 seconds",
            slideDuration="30 seconds",
        ),
        words.word,
    )
    .count()
)

In [None]:
from pyspark.sql.functions import length

# Extend the word stream with `word_length`, the length of each word.
words_with_length = words.withColumn("word_length", length("word"))


### Average

In [None]:
from pyspark.sql.functions import avg

# Average `word_length` per (window, word), using 2-minute windows that slide
# every minute and a 2-minute watermark on `timestamp`.
# NOTE(review): `word` is part of the grouping key and `word_length` is derived
# from `word`, so every row in a group carries the same length -- confirm that
# grouping by word (rather than by window only) is what is intended.
windowed_avg = (
    words_with_length
    .withWatermark("timestamp", "2 minutes")
    .groupBy(
        window(
            words_with_length.timestamp,
            windowDuration="2 minutes",
            slideDuration="1 minute",
        ),
        words_with_length.word,
    )
    .agg(avg("word_length").alias("avg_length"))
)


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `event_time` cannot be resolved. Did you mean one of the following? [`timestamp`, `word`].;
'EventTimeWatermark 'event_time, 10 minutes
+- Project [word#74, timestamp#55]
   +- Generate explode(split(cast(value#51 as string),  , -1)), false, [word#74]
      +- Project [key#50, value#51, topic#52, partition#53, offset#54L, timestamp#55, timestampType#56, cast(value#51 as string) AS value_str#64]
         +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@5a897309, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@25730a46, [kafka.bootstrap.servers=fc2fc5ae9bbc:9093, subscribe=kafka-spark-example], [key#50, value#51, topic#52, partition#53, offset#54L, timestamp#55, timestampType#56], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@499dff49,kafka,List(),None,List(),None,Map(kafka.bootstrap.servers -> fc2fc5ae9bbc:9093, subscribe -> kafka-spark-example),None), kafka, [key#43, value#44, topic#45, partition#46, offset#47L, timestamp#48, timestampType#49]


In [None]:
# Average: write the aggregation to the console sink in "update" mode,
# triggering every 30 seconds and without truncating cell contents.
query_avg = (
    windowed_avg.writeStream
    .outputMode("update")
    .trigger(processingTime="30 seconds")
    .format("console")
    .option("truncate", "false")
    .start()
)

# Wait on the query with a timeout of 300.
query_avg.awaitTermination(300)


In [None]:
# Stop the average streaming query started in the previous cell.
query_avg.stop()

### Min y Max

In [None]:
# Import the functions module under a namespace instead of
# `from pyspark.sql.functions import min, max`, which would rebind Python's
# built-in min()/max() for every cell executed afterwards in this kernel.
from pyspark.sql import functions as F

# Minimum and maximum `word_length` per (window, word), using 2-minute windows
# that slide every minute and a 2-minute watermark on `timestamp`.
windowed_min_max = (
    words_with_length
    .withWatermark("timestamp", "2 minutes")
    .groupBy(
        F.window(words_with_length.timestamp, "2 minutes", "1 minute"),
        words_with_length.word,
    )
    .agg(
        F.min("word_length").alias("min_length"),
        F.max("word_length").alias("max_length"),
    )
)


In [None]:
# Min/max: write the aggregation to the console sink in "update" mode,
# triggering every 30 seconds and without truncating cell contents.
query_min_max = (
    windowed_min_max.writeStream
    .outputMode("update")
    .trigger(processingTime="30 seconds")
    .format("console")
    .option("truncate", "false")
    .start()
)

# Wait on the query with a timeout of 300.
query_min_max.awaitTermination(300)

NameError: name 'windowed_min_max' is not defined

In [8]:
# Stop the min/max streaming query started in the previous cell.
query_min_max.stop()

NameError: name 'query_min_max' is not defined

### Suma

In [None]:
# Import the functions module under a namespace instead of
# `from pyspark.sql.functions import sum`, which would rebind Python's
# built-in sum() for every cell executed afterwards in this kernel.
from pyspark.sql import functions as F

# Total `word_length` per (window, word), using 2-minute windows that slide
# every minute and a 2-minute watermark on `timestamp`.
windowed_sum = (
    words_with_length
    .withWatermark("timestamp", "2 minutes")
    .groupBy(
        F.window(words_with_length.timestamp, "2 minutes", "1 minute"),
        words_with_length.word,
    )
    .agg(F.sum("word_length").alias("sum_length"))
)


In [None]:
# Sum: write the aggregation to the console sink in "update" mode,
# triggering every 30 seconds and without truncating cell contents.
query_sum = (
    windowed_sum.writeStream
    .outputMode("update")
    .trigger(processingTime="30 seconds")
    .format("console")
    .option("truncate", "false")
    .start()
)

# Wait on the query with a timeout of 300.
query_sum.awaitTermination(300)

In [None]:
# Stop the sum streaming query started in the previous cell.
query_sum.stop()

### Configuración del "Sink" del stream

In [16]:
# Set the number of shuffle partitions to 5 before starting the query.
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Word counts: write the windowed counts to the console sink in "update" mode,
# triggering every 30 seconds and without truncating cell contents.
query = (
    windowed_counts.writeStream
    .outputMode("update")
    .trigger(processingTime="30 seconds")
    .format("console")
    .option("truncate", "false")
    .start()
)

# Wait on the query with a timeout of 300, and always stop it afterwards.
# Without the `finally`, a timeout or a manual kernel interrupt (Ctrl-C /
# "Interrupt") leaves the streaming query running in the background, because
# no other cell stops it.
try:
    query.awaitTermination(300)
finally:
    query.stop()

25/04/08 18:40:04 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-2d64b212-b1f0-411f-b84d-7076bd985d6d. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/04/08 18:40:04 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/04/08 18:40:05 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+----+-----+
|window|word|count|
+------+----+-----+
+------+----+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+-----+-----+
|window                                    |word |count|
+------------------------------------------+-----+-----+
|{2025-04-08 18:40:00, 2025-04-08 18:41:00}|hola |6    |
|{2025-04-08 18:39:30, 2025-04-08 18:40:30}|hola |6    |
|{2025-04-08 18:39:30, 2025-04-08 18:40:30}|xd   |6    |
|{2025-04-08 18:39:30, 2025-04-08 18:40:30}|polla|3    |
|{2025-04-08 18:40:00, 2025-04-08 18:41:00}|xd   |6    |
|{2025-04-08 18:40:00, 2025-04-08 18:41:00}|polla|3    |
+------------------------------------------+-----+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+-----+-----+
|window                                    |word |count|
+------------------------------------------+-----+-----+
|{2025-04-08 18:40:30, 2025-04-08 18:41:30}|polla|5    |
|{2025-04-08 18:40:30, 2025-04-08 18:41:30}|xd   |5    |
|{2025-04-08 18:40:30, 2025-04-08 18:41:30}|hola |8    |
|{2025-04-08 18:40:00, 2025-04-08 18:41:00}|hola |14   |
|{2025-04-08 18:39:30, 2025-04-08 18:40:30}|polla|4    |
|{2025-04-08 18:40:00, 2025-04-08 18:41:00}|xd   |11   |
|{2025-04-08 18:40:00, 2025-04-08 18:41:00}|polla|9    |
+------------------------------------------+-----+-----+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/spark-3.5.4-bin-hadoop3-scala2.13/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/spark-3.5.4-bin-hadoop3-scala2.13/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Manual cleanup, currently disabled: uncomment to stop the word-count
# streaming query and shut down the SparkContext.
# query.stop()
# sc.stop()