# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Examples on Structured Streaming (Window-based aggregations)** </center>
---
**Profesor**: Pablo Camarillo Ramirez

**Estudiante**: Sergio Villa Rodríguez

# Create SparkSession

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Examples on Structured Streaming (Kafka)") \
    .master("spark://spark-master:7077") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("INFO")

# Optimization (reduce the number of shuffle partitions)
spark.conf.set("spark.sql.shuffle.partitions", "5")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7bef30e0-a00e-460b-9420-b0bfd895784f;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.13;4.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.13;4.0.0 in central
	found org.apache.kafka#kafka-clients;3.9.0 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.7 in central
	found org.slf4j#slf4j-api;2.0.16 in central
	found org.apache.hadoop#hadoop-client-runtime;3.4.1 in central
	found org.apache.hadoop#hadoop-client-api;3.4.1 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.scala-lang.modules#scala-parallel-collections_2.13;1.2.0

# Create a data stream from a Kafka topic

In [2]:
# Create the remote connection
kafka_df = spark.readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", "kafka:9093") \
            .option("subscribe", "topic-sergio-1") \
            .load()

kafka_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [3]:
from pyspark.sql.functions import explode, split, window
input_df = kafka_df.withColumn("value_str",
                               kafka_df.value.cast("string"))

input_df = input_df.select("value_str", "timestamp")

words = input_df.select(explode(split(input_df.value_str, " ")).alias("word"), "timestamp")

In [4]:
# The watermark allows late data to update the state within 2 minutes.
# Late data beyond the 2-minute threshold will be dropped.

windowd_counts = words \
                .withWatermark("timestamp", "2 minutes") \
                .groupBy(window(words.timestamp,
                                "30 seconds",
                                "15 seconds"),
                                words.word) \
                .count()

query_count = windowd_counts.writeStream \
        .trigger(processingTime='2 seconds') \
        .outputMode("complete") \
        .format("console") \
        .option("truncate", "false") \
        .start()

query_count.awaitTermination(300)

25/10/17 06:01:57 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-11097d96-5b6f-45e2-8e08-e3dc24f80433. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/17 06:01:57 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+----+-----+
|window|word|count|
+------+----+-----+
+------+----+-----+



25/10/17 06:02:07 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 2000} milliseconds, but spent 9803 milliseconds
                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+-----+-----+
|window                                    |word |count|
+------------------------------------------+-----+-----+
|{2025-10-17 06:02:00, 2025-10-17 06:02:30}|hola1|1    |
|{2025-10-17 06:01:45, 2025-10-17 06:02:15}|hola2|1    |
|{2025-10-17 06:01:45, 2025-10-17 06:02:15}|hola1|1    |
|{2025-10-17 06:02:00, 2025-10-17 06:02:30}|hola2|1    |
+------------------------------------------+-----+-----+



25/10/17 06:02:12 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 2000} milliseconds, but spent 4797 milliseconds


-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+-----+-----+
|window                                    |word |count|
+------------------------------------------+-----+-----+
|{2025-10-17 06:02:00, 2025-10-17 06:02:30}|hola1|1    |
|{2025-10-17 06:01:45, 2025-10-17 06:02:15}|hola2|1    |
|{2025-10-17 06:02:00, 2025-10-17 06:02:30}|hola3|1    |
|{2025-10-17 06:01:45, 2025-10-17 06:02:15}|hola1|1    |
|{2025-10-17 06:01:45, 2025-10-17 06:02:15}|hola3|1    |
|{2025-10-17 06:02:00, 2025-10-17 06:02:30}|hola2|1    |
+------------------------------------------+-----+-----+



False

In [5]:
query_count.stop()

25/10/17 06:07:37 WARN DAGScheduler: Failed to cancel job group 8f673c5c-a039-435f-b5c8-221fdd78bd7e. Cannot find active jobs for it.
25/10/17 06:07:38 WARN DAGScheduler: Failed to cancel job group 8f673c5c-a039-435f-b5c8-221fdd78bd7e. Cannot find active jobs for it.


In [6]:
# Create a new streaming query with another aggregation operation(avg, sum, min, or max)
from pyspark.sql.functions import count, max
windowd_max = words.groupBy("word") \
                .agg(
                    count("*").alias("count"),
                    max("timestamp").alias("latest_timestamp")
                )

query_max = windowd_max.writeStream \
        .trigger(processingTime='4 seconds') \
        .outputMode("complete") \
        .format("console") \
        .option("truncate", "false") \
        .start()

25/10/17 06:21:32 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-e9a39134-a3a7-47d9-b4d3-c3abf0c369d8. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/17 06:21:32 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+-----+----------------+
|word|count|latest_timestamp|
+----+-----+----------------+
+----+-----+----------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------+-----+-----------------------+
|word  |count|latest_timestamp       |
+------+-----+-----------------------+
|esto  |1    |2025-10-17 06:23:01.583|
|es    |1    |2025-10-17 06:23:01.583|
|una   |1    |2025-10-17 06:23:01.583|
|prueba|1    |2025-10-17 06:23:01.583|
|hola  |1    |2025-10-17 06:23:01.583|
+------+-----+-----------------------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------+-----+-----------------------+
|word  |count|latest_timestamp       |
+------+-----+-----------------------+
|otra  |1    |2025-10-17 06:23:04.431|
|esto  |2    |2025-10-17 06:23:04.431|
|es    |2    |2025-10-17 06:23:04.431|
|una   |1    |2025-10-17 06:23:01.583|
|hola  |2    |2025-10-17 06:23:04.431|
|prueba|2    |2025-10-17 06:23:04.431|
+------+-----+-----------------------+



In [7]:
query_max.stop()

25/10/17 06:23:24 WARN DAGScheduler: Failed to cancel job group 9adf4cc5-aae8-45a7-b8f4-4b2f7e3a3e2e. Cannot find active jobs for it.
25/10/17 06:23:24 WARN DAGScheduler: Failed to cancel job group 9adf4cc5-aae8-45a7-b8f4-4b2f7e3a3e2e. Cannot find active jobs for it.


In [8]:
from pyspark.sql.functions import min
windowd_min = words.groupBy("word") \
                .agg(
                    count("*").alias("count"),
                    min("timestamp").alias("earliest_timestamp")
                )

query_min = windowd_min.writeStream \
        .trigger(processingTime='4 seconds') \
        .outputMode("complete") \
        .format("console") \
        .option("truncate", "false") \
        .start()

25/10/17 06:23:39 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-74151055-0a39-4a50-8e3e-0db9b76e231a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/17 06:23:39 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+-----+------------------+
|word|count|earliest_timestamp|
+----+-----+------------------+
+----+-----+------------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------+-----+----------------------+
|word  |count|earliest_timestamp    |
+------+-----+----------------------+
|min   |1    |2025-10-17 06:23:54.23|
|esto  |1    |2025-10-17 06:23:54.23|
|es    |1    |2025-10-17 06:23:54.23|
|una   |1    |2025-10-17 06:23:54.23|
|con   |1    |2025-10-17 06:23:54.23|
|prueba|1    |2025-10-17 06:23:54.23|
+------+-----+----------------------+

-------------------------------------------
Batch: 2
-------------------------------------------
+------+-----+-----------------------+
|word  |count|earliest_timestamp     |
+------+-----+-----------------------+
|otra  |1    |2025-10-17 06:23:59.987|
|min   |2    |2025-10-17 06:23:54.23 |
|esto  |2    |2025-10-17 06:23:54.23 |
|es    |2    |2025-10-17 06:23:54.23 |
|una   |1    |2025-10-17 06:23:54.23 |
|con   |2    |2025-10-17 06:23:54.23 |
|prueba|2    |2025-10-17 06:23:54.23 |
+------+-----+---------------------

In [9]:
query_min.stop()

25/10/17 06:24:12 WARN DAGScheduler: Failed to cancel job group 7373657d-8b82-4b95-b96f-079ff60a14a3. Cannot find active jobs for it.
25/10/17 06:24:12 WARN DAGScheduler: Failed to cancel job group 7373657d-8b82-4b95-b96f-079ff60a14a3. Cannot find active jobs for it.


In [18]:
from pyspark.sql.functions import avg, col
windowd_avg = words.groupBy("word") \
                .agg(
                    count("*").alias("count")
                ) \
                .agg(
                    avg("count").alias("avg_word_count")
                ).withColumn("avobe_avg", col("count") > "avg_word_count")

query_avg = windowd_avg.writeStream \
        .trigger(processingTime='4 seconds') \
        .outputMode("complete") \
        .format("console") \
        .option("truncate", "false") \
        .start()

{"ts": "2025-10-17 06:52:19.791", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `count` cannot be resolved. Did you mean one of the following? [`avg_word_count`]. SQLSTATE: 42703", "context": {"file": "line 8 in cell [18]", "line": "", "fragment": "col", "errorClass": "UNRESOLVED_COLUMN.WITH_SUGGESTION"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o287.withColumn.\n: org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `count` cannot be resolved. Did you mean one of the following? [`avg_word_count`]. SQLSTATE: 42703;\n'Project [avg_word_count#451, '`>`('count, avg_word_count) AS avobe_avg#455]\n+- ~Aggregate [avg(count#447L) AS avg_word_count#451]\n   +- ~Aggregate [word#16], [word#16, count(1) AS count#447L]\n      +- ~Project [word#16, timestamp#12]\n         +- ~Gen

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `count` cannot be resolved. Did you mean one of the following? [`avg_word_count`]. SQLSTATE: 42703;
'Project [avg_word_count#451, '`>`('count, avg_word_count) AS avobe_avg#455]
+- ~Aggregate [avg(count#447L) AS avg_word_count#451]
   +- ~Aggregate [word#16], [word#16, count(1) AS count#447L]
      +- ~Project [word#16, timestamp#12]
         +- ~Generate explode(split(value_str#14,  , -1)), false, [word#16]
            +- ~Project [value_str#14, timestamp#12]
               +- ~Project [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13, cast(value#8 as string) AS value_str#14]
                  +- ~StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@149a7768, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@5f1479cc, [kafka.bootstrap.servers=kafka:9093, subscribe=topic-sergio-1], [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], ~StreamingRelation DataSource(org.apache.spark.sql.classic.SparkSession@e71ac77,kafka,List(),None,List(),None,Map(kafka.bootstrap.servers -> kafka:9093, subscribe -> topic-sergio-1),None), kafka, [key#0, value#1, topic#2, partition#3, offset#4L, timestamp#5, timestampType#6]


In [19]:
query_min.stop()

25/10/17 06:52:30 WARN DAGScheduler: Failed to cancel job group 63cd05f6-8211-44bf-8276-15e0404a2441. Cannot find active jobs for it.
25/10/17 06:52:30 WARN DAGScheduler: Failed to cancel job group 63cd05f6-8211-44bf-8276-15e0404a2441. Cannot find active jobs for it.


In [20]:
from pyspark.sql.functions import count, sum
windowd_sum = words.groupBy("word") \
                .agg(
                    count("*").alias("count"),
                    sum("timestamp").alias("total_time")
                )

query_sum = windowd_sum.writeStream \
        .trigger(processingTime='4 seconds') \
        .outputMode("complete") \
        .format("console") \
        .option("truncate", "false") \
        .start()

25/10/17 06:53:38 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-6de010f3-ec34-41f0-9c62-9bda6ffe178d. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/10/17 06:53:38 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+-----+----------+
|word|count|total_time|
+----+-----+----------+
+----+-----+----------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-----+-----+--------------+
|word |count|total_time    |
+-----+-----+--------------+
|soy  |1    |1.7606840254E9|
|woody|1    |1.7606840254E9|
|hola |1    |1.7606840254E9|
+-----+-----+--------------+

-------------------------------------------
Batch: 2
-------------------------------------------
+---------+-----+---------------+
|word     |count|total_time     |
+---------+-----+---------------+
|mi       |1    |1.76068403288E9|
|tengo    |1    |1.76068403288E9|
|soy      |1    |1.7606840254E9 |
|bota     |1    |1.76068403288E9|
|una      |1    |1.76068403288E9|
|en       |1    |1.76068403288E9|
|hola     |1    |1.7606840254E9 |
|woody    |1    |1.7606840254E9 |
|serpiente|1    |1.76068403288E9|
+---------+-----+---------------+

-------------------------------------------
Batch: 3
-------------------------------------------
+----------+-----+----------------+
|word      |count|total_ti

In [21]:
query_min.stop()

In [22]:
sc.stop()