# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 07**: Structured Streaming with Files

**Date**: October 7th, 2025

**Student Name**: Renata Tejeda

**Professor**: Pablo Camarillo Ramirez

In [1]:
import os

# findspark locates the local Spark installation and makes pyspark importable,
# so it has to run before the pyspark import below.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Master URL of the Spark cluster. The default is the value this notebook was
# originally written with; its host part looks like a Docker container ID
# (TODO confirm), so it is unlikely to be valid on another machine or after the
# container is recreated. Set the SPARK_MASTER_URL environment variable to
# point the notebook at a different master without editing the code.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://34c8a8d7a9e7:7077")

# Build (or reuse) the session used by every streaming cell below.
spark = (
    SparkSession.builder
      .appName("Examples on Structured Streaming")
      .master(SPARK_MASTER_URL)
      .config("spark.ui.port", "4040")
      .getOrCreate()
)

sc = spark.sparkContext
# Only show errors so the console-sink batches are not buried in log lines.
sc.setLogLevel("ERROR")
# Use 5 shuffle partitions instead of Spark's default of 200; the lab's
# aggregations are tiny.
spark.conf.set("spark.sql.shuffle.partitions", "5")



Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/10 03:03:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pathlib import Path

# Everything for this lab lives next to the notebook's working directory.
LAB07_DIR = Path.cwd().resolve()

_input_path = (LAB07_DIR / "input_dir").resolve()
_checkpoint_path = (LAB07_DIR / "chk_basic").resolve()

# Create both directories up front; re-running the cell is harmless because
# existing directories are accepted.
for _directory in (_input_path, _checkpoint_path):
    _directory.mkdir(parents=True, exist_ok=True)

# Later cells consume these locations as plain strings.
INPUT_DIR = str(_input_path)
CHECKPOINT = str(_checkpoint_path)

print("INPUT_DIR  =", INPUT_DIR)
print("CHECKPOINT =", CHECKPOINT)


INPUT_DIR  = /opt/spark/work-dir/labs/lab07/input_dir
CHECKPOINT = /opt/spark/work-dir/labs/lab07/chk_basic


In [3]:
from codrenatat.spark_utils import SparkUtils

# (column name, type name) pairs describing one log record, in field order.
# The columns are grouped by type here; the resulting list is the same
# sequence of tuples as spelling each pair out individually.
log_schema_columns = [
    *((name, "string") for name in ("event_time", "request_id", "ip", "method", "path")),
    *((name, "int") for name in ("status", "bytes", "latency_ms")),
    ("user_agent", "string"),
]

# generate_schema presumably converts the pairs into a Spark schema object;
# verify against codrenatat.spark_utils.
LOG_SCHEMA = SparkUtils.generate_schema(log_schema_columns)

# Sanity message confirming the schema was built without raising.
print("Si funciono")


Si funciono


In [4]:
from pyspark.sql.functions import col

# Streaming reader for JSON files, using the explicit schema built earlier.
json_stream_reader = spark.readStream.format("json").schema(LOG_SCHEMA)

# Pick up at most one new file per micro-batch from the input directory.
logs = (
    json_stream_reader
        .option("maxFilesPerTrigger", 1)
        .load(INPUT_DIR)
)

logs.printSchema()


root
 |-- event_time: string (nullable = true)
 |-- request_id: string (nullable = true)
 |-- ip: string (nullable = true)
 |-- method: string (nullable = true)
 |-- path: string (nullable = true)
 |-- status: integer (nullable = true)
 |-- bytes: integer (nullable = true)
 |-- latency_ms: integer (nullable = true)
 |-- user_agent: string (nullable = true)



In [5]:
from pyspark.sql.functions import count as _count

# Keep only the requests that ended with HTTP status 500.
server_errors = logs.filter(col("status") == 500)

# Running count of those errors for each request path.
err500_by_path = server_errors.groupBy("path").agg(_count("*").alias("count_500"))

# A path becomes an alert once its error count reaches this threshold.
ALERT_THRESHOLD = 3
alerts = err500_by_path.filter(col("count_500") >= ALERT_THRESHOLD)


In [6]:
# Console sink in "update" mode: each batch prints only the alert rows whose
# count changed, untruncated and up to 50 rows, with progress tracked in the
# checkpoint directory.
console_writer = (
    alerts.writeStream
          .outputMode("update")
          .format("console")
          .options(truncate="false", numRows=50, checkpointLocation=CHECKPOINT)
)

# start() returns immediately; the query keeps running in the background.
q = console_writer.start()
print("Streaming started. Leave this running.")


Streaming started. Leave this running.


In [7]:
import sys, subprocess, os
from pathlib import Path

# Possible locations of the log producer script, relative to the current
# working directory; the first one that exists is used.
producer_candidates = (
    Path(relative_location).resolve()
    for relative_location in (
        "../../lib/codrenatat/log_file_producer.py",
        "../../../../lib/codrenatat/log_file_producer.py",
    )
)
producer_script = next(
    (candidate for candidate in producer_candidates if candidate.exists()),
    None,
)

print("CWD =", os.getcwd())
print("Productor =", producer_script)

if producer_script is not None:
    # Run the producer with the kernel's own interpreter and wait for it to
    # finish; check=True raises if the script exits with a non-zero status.
    # NOTE(review): the flags presumably mean 5 files of 80 rows each, written
    # 3 seconds apart - confirm against log_file_producer.py.
    producer_command = [
        sys.executable, str(producer_script),
        "--output", INPUT_DIR,
        "--rows", "80",
        "--files", "5",
        "--interval", "3",
    ]
    subprocess.run(producer_command, check=True)
else:
    print("No encontré el script.")



CWD = /opt/spark/work-dir/labs/lab07
Productor = /opt/spark/work-dir/lib/codrenatat/log_file_producer.py


                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+-------+---------+
|path   |count_500|
+-------+---------+
|/orders|4        |
|/login |6        |
|/cart  |6        |
+-------+---------+

-------------------------------------------
Batch: 11
-------------------------------------------
+-------------+---------+
|path         |count_500|
+-------------+---------+
|/api/v1/items|4        |
|/            |11       |
|/api/v1/pay  |12       |
+-------------+---------+

-------------------------------------------
Batch: 12
-------------------------------------------
+-----------+---------+
|path       |count_500|
+-----------+---------+
|/login     |7        |
|/          |12       |
|/api/v1/pay|13       |
+-----------+---------+

-------------------------------------------
Batch: 13
-------------------------------------------
+------+---------+
|path  |count_500|
+------+---------+
|/login|8        |
|/     |13       |
|/cart |7        |
+

In [8]:
from pathlib import Path
# List every JSON file currently in the input directory, sorted by path.
json_files = sorted(map(str, Path(INPUT_DIR).glob("*.json")))

print(f"Files in input_dir ({len(json_files)}):")
for json_file in json_files:
    print(" -", json_file)


Files in input_dir (15):
 - /opt/spark/work-dir/labs/lab07/input_dir/logs_20251010T025139_1990.json
 - /opt/spark/work-dir/labs/lab07/input_dir/logs_20251010T025142_8512.json
 - /opt/spark/work-dir/labs/lab07/input_dir/logs_20251010T025145_1850.json
 - /opt/spark/work-dir/labs/lab07/input_dir/logs_20251010T025148_4330.json
 - /opt/spark/work-dir/labs/lab07/input_dir/logs_20251010T025151_5629.json
 - /opt/spark/work-dir/labs/lab07/input_dir/logs_20251010T025737_8261.json
 - /opt/spark/work-dir/labs/lab07/input_dir/logs_20251010T025740_6900.json
 - /opt/spark/work-dir/labs/lab07/input_dir/logs_20251010T025743_6396.json
 - /opt/spark/work-dir/labs/lab07/input_dir/logs_20251010T025746_6524.json
 - /opt/spark/work-dir/labs/lab07/input_dir/logs_20251010T025749_4445.json
 - /opt/spark/work-dir/labs/lab07/input_dir/logs_20251010T030409_9409.json
 - /opt/spark/work-dir/labs/lab07/input_dir/logs_20251010T030412_9539.json
 - /opt/spark/work-dir/labs/lab07/input_dir/logs_20251010T030415_6211.json


In [9]:
# Stop the streaming query that was started with writeStream.start() and
# assigned to `q`, then confirm on stdout.
q.stop()
print("stream stopped")

stream stopped
