# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Lab 09 - Watermarking with Spark** </center>
---
## <center> **Big Data** </center>
---
### <center> **Spring 2025** </center>
---
### <center> **04/09/2025** </center>

---
**Professor**: Dr. Pablo Camarillo Ramirez

**Team**: Foraneos

**Students**: Eddie, Konrad 

In [10]:
# Run findspark before any `pyspark` import below so the local Spark
# installation is on the Python path.
import findspark
findspark.init()

#### Spark Session creation

In [11]:
from pyspark.sql import SparkSession

# NOTE(review): despite the `_port` suffix, these values are used as the *host*
# part of the master URL below (they look like container IDs — confirm).
# Only `eddie_port` is used in this cell; `konrad_port` is defined but unused.
konrad_port = "0638c7435d1d"
eddie_port = "8776010e8f6a"

# Standalone master URL: spark://<host>:7077
master_url = "spark://{}:7077".format(eddie_port)

# Build (or reuse) the session; the Kafka connector is pulled in via
# `spark.jars.packages`.
spark = (
    SparkSession.builder
    .appName("SparkSQLStructuredStreaming-Kafka")
    .master(master_url)
    .config("spark.ui.port", "4040")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.4")
    .getOrCreate()
)
sc = spark.sparkContext

### Kafka Stream init

In [12]:
# Configure a streaming reader subscribed to the `kafka-spark-example` topic.
kafka_reader = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "7f12b05dfecd:9093")
    .option("subscribe", "kafka-spark-example")
)

# Streaming DataFrame of raw Kafka records.
kafka_lines = kafka_reader.load()

# Print the source schema; note that `key` and `value` arrive as binary.
kafka_lines.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



### Transform binary data into string

In [13]:
# The Kafka payload is binary; add a string-cast copy as `value_str`.
payload_as_text = kafka_lines["value"].cast("string")
kafka_df = kafka_lines.withColumn("value_str", payload_as_text)

In [14]:
from pyspark.sql.functions import explode, split

# Each message carries space-separated prices: emit one row per token,
# keeping the Kafka record timestamp alongside it.
price_tokens = kafka_df.select(
    "timestamp",
    explode(split(kafka_df["value_str"], " ")).alias("raw_price"),
)

# Cast the token to an integer price; the final columns are (timestamp, price).
prices = price_tokens.select(
    "timestamp",
    price_tokens["raw_price"].cast("int").alias("price"),
)

prices.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- price: integer (nullable = true)



### Use watermarking to handle late-arriving events

In [15]:
# Import the Spark aggregates under aliases: a bare `import min, max` would
# rebind Python's built-in min()/max() for every later cell in this kernel.
from pyspark.sql.functions import avg, max as spark_max, min as spark_min, window

# Declare the event-time column and a 30-second lateness threshold.
watermarked_prices = prices.withWatermark("timestamp", "30 seconds")

# Sliding event-time window over the record timestamp.
price_window = window(
    watermarked_prices["timestamp"],
    "30 seconds",  # Window duration
    "10 seconds",  # Slide duration
)

# Per-window min / average / max price. The variable name is kept as
# `windowed_counts` because the sink cell reads it, although it holds price
# statistics rather than counts.
windowed_counts = watermarked_prices.groupBy(price_window).agg(
    spark_min("price"),
    avg("price"),
    spark_max("price"),
)



### Sink configuration

In [16]:
# Use 5 shuffle partitions for the windowed aggregation.
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Console sink in "update" mode, triggered every 10 seconds, printing
# untruncated rows.
query = (
    windowed_counts.writeStream
    .outputMode("update")
    .trigger(processingTime="10 seconds")
    .format("console")
    .option("truncate", "false")
    .start()
)

# Let the stream run for at most 300 seconds. Whether the wait times out, the
# query ends on its own, or the cell is interrupted from the notebook, always
# stop the query so it does not keep running in the background.
try:
    query.awaitTermination(300)
finally:
    query.stop()

25/04/10 06:13:19 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-c193e623-b253-4228-a87a-7ae04c72005a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/04/10 06:13:19 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/04/10 06:13:19 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
25/04/10 06:13:30 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 10299 milliseconds


-------------------------------------------
Batch: 0
-------------------------------------------
+------+----------+----------+----------+
|window|min(price)|avg(price)|max(price)|
+------+----------+----------+----------+
+------+----------+----------+----------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+----------+-----------------+----------+
|window                                    |min(price)|avg(price)       |max(price)|
+------------------------------------------+----------+-----------------+----------+
|{2025-04-10 06:13:30, 2025-04-10 06:14:00}|2         |62.72222222222222|745       |
|{2025-04-10 06:13:20, 2025-04-10 06:13:50}|2         |62.72222222222222|745       |
|{2025-04-10 06:13:10, 2025-04-10 06:13:40}|2         |62.72222222222222|745       |
+------------------------------------------+----------+-----------------+----------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+----------+------------------+----------+
|window                                    |min(price)|avg(price)        |max(price)|
+------------------------------------------+----------+------------------+----------+
|{2025-04-10 06:13:30, 2025-04-10 06:14:00}|2         |40.57446808510638 |745       |
|{2025-04-10 06:13:20, 2025-04-10 06:13:50}|2         |40.57446808510638 |745       |
|{2025-04-10 06:13:10, 2025-04-10 06:13:40}|2         |60.73684210526316 |745       |
|{2025-04-10 06:13:40, 2025-04-10 06:14:10}|2         |26.892857142857142|74        |
+------------------------------------------+----------+------------------+----------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------------------------------------------+----------+------------------+----------+
|window                                    |min(price)|avg(price)        |max(price)|
+------------------------------------------+----------+------------------+----------+
|{2025-04-10 06:13:50, 2025-04-10 06:14:20}|2         |231.33333333333334|4536      |
|{2025-04-10 06:13:30, 2025-04-10 06:14:00}|2         |110.17567567567568|4536      |
|{2025-04-10 06:13:40, 2025-04-10 06:14:10}|2         |127.25454545454545|4536      |
+------------------------------------------+----------+------------------+----------+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+------------------------------------------+----------+------------------+----------+
|window                                    |min(price)|avg(price)        |max(price)|
+------------------------------------------+----------+------------------+----------+
|{2025-04-10 06:13:50, 2025-04-10 06:14:20}|2         |259.6666666666667 |5334      |
|{2025-04-10 06:14:00, 2025-04-10 06:14:30}|2         |293.9259259259259 |5334      |
|{2025-04-10 06:13:30, 2025-04-10 06:14:00}|2         |113.92207792207792|4536      |
|{2025-04-10 06:13:40, 2025-04-10 06:14:10}|2         |182.98823529411766|5334      |
+------------------------------------------+----------+------------------+----------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+------------------------------------------+----------+------------------+----------+
|window                                    |min(price)|avg(price)        |max(price)|
+------------------------------------------+----------+------------------+----------+
|{2025-04-10 06:14:00, 2025-04-10 06:14:30}|2         |170.85714285714286|5334      |
|{2025-04-10 06:13:50, 2025-04-10 06:14:20}|2         |192.873417721519  |5334      |
|{2025-04-10 06:14:10, 2025-04-10 06:14:40}|3         |20.68421052631579 |83        |
|{2025-04-10 06:13:40, 2025-04-10 06:14:10}|2         |177.23863636363637|5334      |
+------------------------------------------+----------+------------------+----------+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+------------------------------------------+----------+------------------+----------+
|window                                    |min(price)|avg(price)        |max(price)|
+------------------------------------------+----------+------------------+----------+
|{2025-04-10 06:14:00, 2025-04-10 06:14:30}|2         |108.85897435897436|5334      |
|{2025-04-10 06:13:50, 2025-04-10 06:14:20}|2         |188.23456790123456|5334      |
|{2025-04-10 06:14:10, 2025-04-10 06:14:40}|2         |10.666666666666666|83        |
|{2025-04-10 06:14:20, 2025-04-10 06:14:50}|2         |4.037037037037037 |7         |
+------------------------------------------+----------+------------------+----------+



                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+------------------------------------------+----------+-----------------+----------+
|window                                    |min(price)|avg(price)       |max(price)|
+------------------------------------------+----------+-----------------+----------+
|{2025-04-10 06:14:30, 2025-04-10 06:15:00}|2         |86.73684210526316|643       |
|{2025-04-10 06:14:10, 2025-04-10 06:14:40}|2         |32.23880597014925|643       |
|{2025-04-10 06:14:20, 2025-04-10 06:14:50}|2         |38.19565217391305|643       |
+------------------------------------------+----------+-----------------+----------+



                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+------+----------+----------+----------+
|window|min(price)|avg(price)|max(price)|
+------+----------+----------+----------+
+------+----------+----------+----------+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/spark-3.5.4-bin-hadoop3-scala2.13/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/spark-3.5.4-bin-hadoop3-scala2.13/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [17]:
# Teardown: stop the streaming query first, then the SparkContext behind the
# session (the same object bound to `sc` in the session cell).
query.stop()
spark.sparkContext.stop()