# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
## <center> **Big Data** </center>
---
### <center> **Spring 2025** </center>
---
### <center> **Examples on Structured Streaming (QueryListener & Output sinks)** </center>
---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [1]:
# findspark.init() has to run before the first `pyspark` import in this notebook;
# it is what makes the Spark installation's Python package importable from the kernel.
import findspark
findspark.init()

#### Spark Session creation

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the session for this notebook. The master URL points at a
# specific standalone-cluster host, so it must be adjusted when the cluster
# container is recreated under a different hostname.
spark = (
    SparkSession.builder
    .appName("SparkSQLStructuredStreaming-Files")
    .master("spark://078b2e28e517:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Handle to the underlying SparkContext, used at the end of the notebook to shut down.
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/03 01:22:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Create FileStream

In [3]:
# Streaming source: read the input directory as plain text, one row per line
# (exposed as the `value` column), picking up at most one file per trigger.
traffic_lines = (
    spark.readStream
    .format("text")
    .option("maxFilesPerTrigger", 1)
    .load("/home/jovyan/notebooks/data/traffic_data/input/")
)

### Extract traffic info

In [4]:
from pyspark.sql.functions import col, concat, lit, split

# Each input line is space-separated:
#   <date> <time> <vehicle_id> <speed> <position>
# Keep the raw tokens in `traffic_array` and derive one typed column per field.
traffic_df = traffic_lines.select(split(traffic_lines.value, " ").alias("traffic_array"))

traffic_fields = col("traffic_array")
traffic_df = (
    traffic_df
    .withColumn("date", traffic_fields.getItem(0).cast("date"))
    # Build the event timestamp from the record's own date AND time. Casting the
    # bare time-of-day string to timestamp makes Spark fill in the date on which
    # the query happens to run, which is wrong whenever files are processed on a
    # different day than they were produced. `concat` (unlike `concat_ws`) yields
    # NULL when either part is missing, so malformed lines still produce NULL.
    .withColumn(
        "time",
        concat(traffic_fields.getItem(0), lit(" "), traffic_fields.getItem(1)).cast("timestamp"),
    )
    .withColumn("vehicle_id", traffic_fields.getItem(2).cast("string"))
    .withColumn("speed", traffic_fields.getItem(3).cast("integer"))
    .withColumn("position", traffic_fields.getItem(4).cast("string"))
)

traffic_df.printSchema()

root
 |-- traffic_array: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- date: date (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- vehicle_id: string (nullable = true)
 |-- speed: integer (nullable = true)
 |-- position: string (nullable = true)



### Extract vehicles that exceed 80 km/h

In [5]:
# Keep only the readings whose speed is strictly above 80.
# The name is rebound on purpose: the sink cells below read `traffic_df`.
traffic_df = traffic_df.where(traffic_df["speed"] > 80)

### Set Stream's sink

In [None]:

# Console sink: print the new rows of every micro-batch, untruncated,
# with a micro-batch triggered every 2 seconds.
query = (
    traffic_df.writeStream
    .outputMode("append")
    .trigger(processingTime="2 seconds")
    .format("console")
    .option("truncate", "false")
    .start()
)

# Let the stream run for up to 10 seconds. The displayed boolean tells whether
# the query terminated within that window.
query.awaitTermination(10)

25/04/03 01:24:04 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-c7ca852e-c46a-4361-b0d8-a29f52d6688f. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/04/03 01:24:04 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-----------------------------------------------------+----------+-------------------+----------+-----+---------------+
|traffic_array                                        |date      |time               |vehicle_id|speed|position       |
+-----------------------------------------------------+----------+-------------------+----------+-----+---------------+
|[2025-04-03, 01:16:41, HQZ-967, 96, (19.77,-34.85)]  |2025-04-03|2025-04-03 01:16:41|HQZ-967   |96   |(19.77,-34.85) |
|[2025-04-03, 01:16:41, ZSJ-340, 94, (60.41,-131.58)] |2025-04-03|2025-04-03 01:16:41|ZSJ-340   |94   |(60.41,-131.58)|
|[2025-04-03, 01:16:41, QCC-339, 91, (70.23,-79.28)]  |2025-04-03|2025-04-03 01:16:41|QCC-339   |91   |(70.23,-79.28) |
|[2025-04-03, 01:16:41, HYA-79, 92, (15.39,-125.54)]  |2025-04-03|2025-04-03 01:16:41|HYA-79    |92   |(15.39,-125.54)|
|[2025-04-03, 01:16:41, YSQ-70, 98, (34.25,-158.69)]  |2025-04-

False

-------------------------------------------
Batch: 6
-------------------------------------------
+-----------------------------------------------------+----------+-------------------+----------+-----+---------------+
|traffic_array                                        |date      |time               |vehicle_id|speed|position       |
+-----------------------------------------------------+----------+-------------------+----------+-----+---------------+
|[2025-04-03, 01:17:11, OWM-571, 105, (73.43,-158.44)]|2025-04-03|2025-04-03 01:17:11|OWM-571   |105  |(73.43,-158.44)|
|[2025-04-03, 01:17:11, YVV-264, 90, (7.25,-120.10)]  |2025-04-03|2025-04-03 01:17:11|YVV-264   |90   |(7.25,-120.10) |
|[2025-04-03, 01:17:11, UFQ-871, 112, (89.01,-115.57)]|2025-04-03|2025-04-03 01:17:11|UFQ-871   |112  |(89.01,-115.57)|
|[2025-04-03, 01:17:11, YLG-803, 88, (57.20,-59.60)]  |2025-04-03|2025-04-03 01:17:11|YLG-803   |88   |(57.20,-59.60) |
|[2025-04-03, 01:17:11, QMZ-854, 83, (71.79,-121.29)] |2025-04-

In [7]:
# Stop the console-sink streaming query before starting the file sink below.
query.stop()

In [9]:
# File sink: persist the filtered stream as Parquet files under the output
# directory, keeping the query's progress in the given checkpoint location.
query_files = (
    traffic_df.writeStream
    .format("parquet")
    .option("path", "/home/jovyan/notebooks/data/traffic_data/output/")
    .option("checkpointLocation", "/home/jovyan/checkpoint")
    .start()
)

# Let the stream run for up to 30 seconds. The displayed boolean tells whether
# the query terminated within that window.
query_files.awaitTermination(30)

25/04/03 01:28:50 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

False

In [10]:
# Stop the Parquet-sink streaming query so the output directory can be inspected.
query_files.stop()

### Verify data

In [11]:
# Batch-read what the streaming query wrote and preview a few untruncated rows.
speed_df = spark.read.parquet("/home/jovyan/notebooks/data/traffic_data/output/")

speed_df.show(n=4, truncate=False)

+----------------------------------------------------+----------+-------------------+----------+-----+--------------+
|traffic_array                                       |date      |time               |vehicle_id|speed|position      |
+----------------------------------------------------+----------+-------------------+----------+-----+--------------+
|[2025-04-03, 01:29:23, ZQH-803, 83, (1.08,-30.69)]  |2025-04-03|2025-04-03 01:29:23|ZQH-803   |83   |(1.08,-30.69) |
|[2025-04-03, 01:29:23, GGO-845, 88, (27.74,-55.85)] |2025-04-03|2025-04-03 01:29:23|GGO-845   |88   |(27.74,-55.85)|
|[2025-04-03, 01:29:23, NWF-811, 110, (86.54,-88.11)]|2025-04-03|2025-04-03 01:29:23|NWF-811   |110  |(86.54,-88.11)|
|[2025-04-03, 01:29:23, YVZ-248, 111, (56.40,-94.20)]|2025-04-03|2025-04-03 01:29:23|YVZ-248   |111  |(56.40,-94.20)|
+----------------------------------------------------+----------+-------------------+----------+-----+--------------+
only showing top 4 rows



In [12]:
# Shut down the SparkContext obtained from the session at the top of the notebook.
sc.stop()