# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 07**: Structured Streaming with Files 

**Date**: October 10th, 2025

**Student Name**: Antonia Horburger

**Professor**: Pablo Camarillo Ramirez

In [21]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the session that talks to the standalone cluster master.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Lab Structured Streaming")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Only show ERROR-level log messages so cell output stays readable.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Optimization: use only 5 shuffle partitions for this lab's aggregations.
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [45]:
# Create the directory that the streaming file source reads from
# (-p: do not fail if it already exists), then list it to confirm it is there.
!mkdir -p antoniahorburger_input
!ls -ld antoniahorburger_input

drwxr-xr-x 1 root root 4096 Oct 10 02:22 antoniahorburger_input


In [51]:
from pyspark.sql.functions import col, split, trim, to_timestamp, regexp_extract, when

# Directory monitored by the streaming file source.
input_path = "antoniahorburger_input/"

# Read incoming files as plain text lines, at most 3 new files per micro-batch.
raw_stream = (
    spark.readStream
    .format("text")
    .option("maxFilesPerTrigger", 3)
    .load(input_path)
)

# Every line is pipe-delimited: timestamp | level | message | host.
# Split once, then trim the whitespace around each of the four fields.
parts = split(col("value"), r"\|")
ts_str, level_col, message_col, host_col = [
    trim(parts.getItem(idx)) for idx in range(4)
]

base_df = raw_stream.select(
    to_timestamp(ts_str, "yyyy-MM-dd HH:mm:ss").alias("ts"),
    level_col.alias("level"),
    message_col.alias("message"),
    host_col.alias("host"),
)

# STEP 2: extract a standalone 3-digit code from the "message" column.
# regexp_extract yields "" when nothing matches; map that to NULL rather
# than casting an empty string.
code_str = regexp_extract(col("message"), r"\b(\d{3})\b", 1)
status_code = when(code_str == "", None).otherwise(code_str.cast("int"))
logs_df = base_df.withColumn("status_code", status_code)

logs_df.printSchema()

root
 |-- ts: timestamp (nullable = true)
 |-- level: string (nullable = true)
 |-- message: string (nullable = true)
 |-- host: string (nullable = true)
 |-- status_code: integer (nullable = true)



In [53]:
from pyspark.sql.functions import col, window

# Alert rule parameters (kept together so they are easy to tune).
ALERT_WINDOW = "1 minute"          # tumbling event-time window used for counting
ALERT_MIN_ERRORS = 3               # minimum number of 500s per host/window to alert
LATE_DATA_TOLERANCE = "2 minutes"  # how late an event may arrive and still be counted

# Keep only the log lines whose extracted status code is 500.
errors_500 = logs_df.filter(col("status_code") == 500)

# Count 500s per host in tumbling windows over the event timestamp and keep
# only the (window, host) pairs that reach the alert threshold.
# The watermark lets Spark discard the state of windows that can no longer
# receive data; without it the aggregation state grows for as long as the
# query runs. Events arriving later than LATE_DATA_TOLERANCE are dropped.
alerts = (errors_500
          .withWatermark("ts", LATE_DATA_TOLERANCE)
          .groupBy(window(col("ts"), ALERT_WINDOW), col("host"))
          .count()
          .withColumnRenamed("count", "n_500s")
          .filter(col("n_500s") >= ALERT_MIN_ERRORS)
          .select(col("window").alias("win"), col("host"), col("n_500s")))

In [56]:
def _start_console_query(df, output_mode, query_name):
    """Start a console-sink streaming query, replacing any earlier run of it.

    Args:
        df: Streaming DataFrame to write out.
        output_mode: Structured Streaming output mode ("append", "update", ...).
        query_name: Name given to the query; also used to find and stop a
            still-running query started by a previous execution of this cell.

    Returns:
        The StreamingQuery handle returned by ``start()``.
    """
    # Re-running the cell would otherwise leave the old query running in the
    # background and printing duplicate batches to the console.
    for active_query in spark.streams.active:
        if active_query.name == query_name:
            active_query.stop()

    return (df.writeStream
            .queryName(query_name)
            .outputMode(output_mode)
            .format("console")
            .option("truncate", "false")
            .option("numRows", 50)
            .trigger(processingTime="5 seconds")
            .start())


# Raw parsed log lines: each new row is printed once (append mode).
q_logs = _start_console_query(logs_df, "append", "raw_logs")

# Alerts: aggregated counts are re-emitted whenever they change (update mode).
q_alerts = _start_console_query(alerts, "update", "alerts_500")

print("Streaming started. Watching:", input_path)

Streaming started. Watching: antoniahorburger_input/
-------------------------------------------
Batch: 0
-------------------------------------------
+-------------------+-----+-------------------------+-------------+-----------+
|ts                 |level|message                  |host         |status_code|
+-------------------+-----+-------------------------+-------------+-----------+
|2025-10-10 02:32:49|ERROR|500 Internal Server Error|server-node-2|500        |
|2025-10-10 02:32:49|ERROR|500 Internal Server Error|server-node-3|500        |
|2025-10-10 02:32:49|ERROR|500 Internal Server Error|server-node-3|500        |
|2025-10-10 02:32:49|ERROR|500 Internal Server Error|server-node-2|500        |
|2025-10-10 02:32:49|INFO |Cache hit                |server-node-2|NULL       |
|2025-10-10 02:32:49|WARN |Slow response detected   |server-node-1|NULL       |
|2025-10-10 02:32:41|ERROR|500 Internal Server Error|server-node-1|500        |
|2025-10-10 02:32:41|ERROR|500 Internal Server Err