# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Examples on Structured Streaming (Window-based aggregations)** </center>
---
**Profesor**: Pablo Camarillo Ramirez

# Create SparkSession

In [None]:
# findspark.init() has to run before the pyspark import below.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the session against the standalone master; the
# spark.jars.packages coordinate requests the spark-sql-kafka connector.
spark = (
    SparkSession.builder
    .appName("Examples on Structured Streaming (Kafka)")
    .master("spark://spark-master:7077")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0",
    )
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the SparkContext; it is used again to shut down at the end.
sc = spark.sparkContext
sc.setLogLevel("INFO")

# Use only 5 shuffle partitions for the aggregations in this notebook.
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Create a data stream from a Kafka topic

In [None]:
# Configure the Kafka source: broker address and the topic to subscribe to.
kafka_reader = spark.readStream.format("kafka")
kafka_reader = kafka_reader.option("kafka.bootstrap.servers", "kafka:9093")
kafka_reader = kafka_reader.option("subscribe", "topic-pablo-1")

# load() returns the streaming DataFrame backed by that topic.
kafka_df = kafka_reader.load()

kafka_df.printSchema()

In [None]:
from pyspark.sql.functions import explode, split, window

# Add a string version of the Kafka `value` column and keep it together with
# the record timestamp.
value_as_text = kafka_df["value"].cast("string")
input_df = (
    kafka_df
    .withColumn("value_str", value_as_text)
    .select("value_str", "timestamp")
)

# Split each message on single spaces and emit one row per token, carrying the
# timestamp of the message it came from.
word_column = explode(split(input_df["value_str"], " ")).alias("word")
words = input_df.select(word_column, "timestamp")

In [None]:
# The watermark lets late data update the window state for up to 2 minutes;
# data arriving later than that threshold may be dropped.
# NOTE: the watermark is only honoured in "update" or "append" output mode.
# "complete" mode must keep every aggregate, so no state is ever discarded
# and late data is never dropped.
watermarked_words = words.withWatermark("timestamp", "2 minutes")

# Count words per sliding window: each window is 30 seconds long and a new
# one starts every 15 seconds, so every event falls into two windows.
windowd_counts = watermarked_words \
                .groupBy(window(watermarked_words["timestamp"],
                                "30 seconds",
                                "15 seconds"),
                         watermarked_words["word"]) \
                .count()

# "update" mode prints only the (window, word) rows whose count changed in the
# current micro-batch and lets Spark purge state older than the watermark.
query_count = windowd_counts.writeStream \
        .trigger(processingTime='2 seconds') \
        .outputMode("update") \
        .format("console") \
        .option("truncate", "false") \
        .start()

# Block this cell for at most 300 seconds while the query runs.
query_count.awaitTermination(300)

In [None]:
# Stop the windowed word-count streaming query before starting another one.
query_count.stop()

In [None]:
# Create a new streaming query with another aggregation operation (avg, sum, min, or max)
# Import Spark's `max` under an alias so it does not replace Python's builtin
# `max` for the rest of the notebook session. `count` keeps its own name.
from pyspark.sql.functions import count, max as spark_max

# Per word: number of occurrences seen so far and the most recent timestamp.
# NOTE: despite the variable name, this aggregation groups by word only; no
# time window is applied.
windowd_max = words.groupBy("word") \
                .agg(
                    count("*").alias("count"),
                    spark_max("timestamp").alias("latest_timestamp")
                )

# "complete" mode re-prints the whole result table on every 4-second trigger.
query_max = windowd_max.writeStream \
        .trigger(processingTime='4 seconds') \
        .outputMode("complete") \
        .format("console") \
        .option("truncate", "false") \
        .start()

In [None]:
# Stop the latest-timestamp streaming query.
query_max.stop()

In [None]:
# Import Spark's `min` under an alias so it does not replace Python's builtin
# `min`, and import `count` here so this cell does not depend on an earlier
# cell having been run.
from pyspark.sql.functions import count, min as spark_min

# Per word: number of occurrences seen so far and the earliest timestamp.
# NOTE: despite the variable name, this aggregation groups by word only; no
# time window is applied.
windowd_min = words.groupBy("word") \
                .agg(
                    count("*").alias("count"),
                    spark_min("timestamp").alias("earliest_timestamp")
                )

# "complete" mode re-prints the whole result table on every 4-second trigger.
query_min = windowd_min.writeStream \
        .trigger(processingTime='4 seconds') \
        .outputMode("complete") \
        .format("console") \
        .option("truncate", "false") \
        .start()

In [None]:
# Stop the earliest-timestamp streaming query.
query_min.stop()

In [None]:
# Shut down the SparkContext once all streaming queries have been stopped.
sc.stop()