# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 07**: Structured Streaming with Files

**Date**: October 7th 2025

**Student Name**: Fernando Ramos

**Professor**: Pablo Camarillo Ramirez

In [1]:
# Initialise findspark before pyspark is imported in the following cell
# (presumably this makes the local Spark installation importable — confirm against the findspark docs).
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, trim, col, count, isnull, when, lit, concat, round, asc, desc
from datetime import datetime
from fernandoramos.spark_utils import SparkUtils

# Build (or reuse) the Spark session for this lab, pointing at the cluster
# master URL below and pinning the Spark UI to port 4040.
spark = (
    SparkSession.builder
    .appName("Lab07")
    .master("spark://d3eb0343c341:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and restrict its log level to ERROR.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/10 01:19:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## 1. Define Schema for Server Logs

In [3]:
# One tuple per field of a server log record.
# NOTE(review): tuples look like (column name, type name, nullable) — confirm
# against SparkUtils.generate_schema.
log_fields = [
    ("timestamp", "string", True),
    ("server_id", "string", True),
    ("temperature_celsius", "double", True),
    ("humidity_percent", "double", True),
    ("status_code", "int", True),
    ("cpu_usage_percent", "double", True),
    ("power_watts", "double", True),
]

# Schema passed to the streaming reader so the JSON files are not schema-inferred.
logs_schema = SparkUtils.generate_schema(log_fields)

print("Schema generated.")

Schema generated.


## 2. Create streaming DataFrame

In [4]:
# Directory that the streaming reader loads log files from.
LOG_DIR = "/opt/spark/work-dir/lib/fernandoramos/log-generation/"

In [5]:
# Streaming DataFrame that monitors my personal lib directory
streaming_logs = spark.readStream \
    .schema(logs_schema) \
    .format("json") \
    .option("maxFilesPerTrigger", 1) \
    .option("pathGlobFilter", "server_logs_*.json") \
    .load(LOG_DIR)

print(f"Streaming source configured in directory: {LOG_DIR}")

Streaming source configured in directory: /opt/spark/work-dir/lib/fernandoramos/log-generation/


## 3. Data transformation to flag critical logs

In [6]:
# Add alert columns for critical logs that need reviewing
analyzed_logs = streaming_logs \
    .withColumn("temp_alert", 
                when(col("temperature_celsius") > 26, "HIGH_TEMP")
                .when(col("temperature_celsius") < 18, "LOW_TEMP")
                .otherwise("NORMAL")) \
    .withColumn("humidity_alert",
                when(col("humidity_percent") > 60, "HIGH_HUMIDITY")
                .when(col("humidity_percent") < 40, "LOW_HUMIDITY")
                .otherwise("NORMAL")) \
    .withColumn("error_type",
                when(col("status_code") == 500, "INTERNAL_ERROR")
                .when(col("status_code") == 503, "SERVICE_UNAVAILABLE")
                .when(col("status_code") == 404, "NOT_FOUND")
                .when(col("status_code") == 200, "SUCCESS")
                .otherwise("UNKNOWN")) \
    .withColumn("critical_alert",
                when((col("status_code") == 500) | 
                     (col("temperature_celsius") > 26) |
                     (col("humidity_percent") > 60), True)
                .otherwise(False))

## 4. Data display

In [7]:
# Return all logs
query_all_logs = analyzed_logs \
    .select(
        "timestamp",
        "server_id",
        "temperature_celsius",
        "humidity_percent",
        "status_code",
        "error_type",
        "temp_alert",
        "humidity_alert",
        "critical_alert"
    ) \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .start()

print("Streaming query started - showing all logs")

query_all_logs.awaitTermination(20)  
query_all_logs.stop()
print("Query stopped")

Streaming query started - showing all logs


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-------------------+----------+-------------------+----------------+-----------+----------+----------+--------------+--------------+
|timestamp          |server_id |temperature_celsius|humidity_percent|status_code|error_type|temp_alert|humidity_alert|critical_alert|
+-------------------+----------+-------------------+----------------+-----------+----------+----------+--------------+--------------+
|2025-10-09 19:19:04|SERVER-004|18.19              |40.43           |200        |SUCCESS   |NORMAL    |NORMAL        |false         |
|2025-10-09 19:19:04|SERVER-001|22.49              |57.63           |200        |SUCCESS   |NORMAL    |NORMAL        |false         |
|2025-10-09 19:19:04|SERVER-001|21.42              |54.82           |200        |SUCCESS   |NORMAL    |NORMAL        |false         |
|2025-10-09 19:19:04|SERVER-005|19.57              |43.18           |200        |SUCCESS   |NORMAL 

                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-------------------+----------+-------------------+----------------+-----------+-------------------+----------+--------------+--------------+
|timestamp          |server_id |temperature_celsius|humidity_percent|status_code|error_type         |temp_alert|humidity_alert|critical_alert|
+-------------------+----------+-------------------+----------------+-----------+-------------------+----------+--------------+--------------+
|2025-10-09 19:19:15|SERVER-002|24.11              |57.27           |200        |SUCCESS            |NORMAL    |NORMAL        |false         |
|2025-10-09 19:19:15|SERVER-001|24.54              |76.88           |500        |INTERNAL_ERROR     |NORMAL    |HIGH_HUMIDITY |true          |
|2025-10-09 19:19:15|SERVER-005|19.49              |54.37           |200        |SUCCESS            |NORMAL    |NORMAL        |false         |
|2025-10-09 19:19:15|SERVER-004|22.21        

                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+-------------------+----------+-------------------+----------------+-----------+-------------------+----------+--------------+--------------+
|timestamp          |server_id |temperature_celsius|humidity_percent|status_code|error_type         |temp_alert|humidity_alert|critical_alert|
+-------------------+----------+-------------------+----------------+-----------+-------------------+----------+--------------+--------------+
|2025-10-09 19:19:31|SERVER-002|20.68              |54.03           |500        |INTERNAL_ERROR     |NORMAL    |NORMAL        |true          |
|2025-10-09 19:19:31|SERVER-003|21.34              |67.95           |503        |SERVICE_UNAVAILABLE|NORMAL    |HIGH_HUMIDITY |true          |
|2025-10-09 19:19:32|SERVER-005|24.22              |47.96           |200        |SUCCESS            |NORMAL    |NORMAL        |false         |
|2025-10-09 19:19:32|SERVER-003|31.21        

                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+-------------------+----------+-------------------+----------------+-----------+----------+----------+--------------+--------------+
|timestamp          |server_id |temperature_celsius|humidity_percent|status_code|error_type|temp_alert|humidity_alert|critical_alert|
+-------------------+----------+-------------------+----------------+-----------+----------+----------+--------------+--------------+
|2025-10-09 19:19:49|SERVER-002|23.12              |56.35           |200        |SUCCESS   |NORMAL    |NORMAL        |false         |
|2025-10-09 19:19:49|SERVER-002|21.81              |40.3            |200        |SUCCESS   |NORMAL    |NORMAL        |false         |
|2025-10-09 19:19:50|SERVER-002|23.21              |45.33           |200        |SUCCESS   |NORMAL    |NORMAL        |false         |
|2025-10-09 19:19:50|SERVER-003|18.01              |46.26           |404        |NOT_FOUND |NORMAL 

In [8]:
# Return critical alerts only
critical_alerts_only = analyzed_logs \
    .filter(col("critical_alert") == True) \
    .select(
        "timestamp",
        "server_id",
        "temperature_celsius",
        "humidity_percent",
        "status_code",
        "error_type",
        "temp_alert",
        "humidity_alert"
    )

query_critical = critical_alerts_only \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .start()

print("Streaming query started - showing CRITICAL alerts only")

query_critical.awaitTermination(20)
query_critical.stop()
print("Query stopped")

Streaming query started - showing CRITICAL alerts only
-------------------------------------------
Batch: 0
-------------------------------------------
+-------------------+----------+-------------------+----------------+-----------+----------+----------+--------------+
|timestamp          |server_id |temperature_celsius|humidity_percent|status_code|error_type|temp_alert|humidity_alert|
+-------------------+----------+-------------------+----------------+-----------+----------+----------+--------------+
|2025-10-09 19:19:04|SERVER-005|27.9               |44.0            |200        |SUCCESS   |HIGH_TEMP |NORMAL        |
+-------------------+----------+-------------------+----------------+-----------+----------+----------+--------------+

-------------------------------------------
Batch: 1
-------------------------------------------
+-------------------+----------+-------------------+----------------+-----------+--------------+----------+--------------+
|timestamp          |server_id |

In [9]:
# Shut down the SparkContext obtained from the session created at the top of the notebook.
sc.stop()