# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 07**: Structured Streaming with Files

**Date**: October 9th 2025

**Student Name**: Luis Alberto González Escamilla

**Professor**: Pablo Camarillo Ramirez

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, avg, sum as spark_sum
from luis_gonzalez.spark_utils import SparkUtils

# Create (or reuse) the SparkSession for this lab. The master URL targets a
# standalone cluster; the host part looks like a container id, so it will
# presumably need updating whenever the cluster is re-created -- TODO confirm.
spark = (
    SparkSession.builder
    .appName("Lab07")
    .master("spark://45d2b62a9e1b:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Raise the log threshold to ERROR so WARN/INFO messages do not flood the
# notebook output while streaming queries run.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/13 23:39:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# (column name, Spark type name) pairs describing one server-log record.
log_fields = [
    ("server_id", "string"),
    ("status_code", "int"),
    ("response_time_ms", "double"),
    ("cpu_usage", "double"),
    ("memory_usage", "double"),
]

# SparkUtils.generate_schema is a project helper; its result is later passed
# to readStream.schema(), so it presumably returns a StructType -- TODO confirm.
logs_schema = SparkUtils.generate_schema(log_fields)

print("Schema generated.")


Schema generated.


In [8]:
# Directory watched by the file-based streaming source.
LOG_DIR = "/opt/spark/work-dir/lib/luis_gonzalez/logs/"

# JSON file source with an explicit schema:
#   * maxFilesPerTrigger=1 -> each micro-batch picks up at most one new file.
#   * pathGlobFilter       -> only files named server_logs_*.json are read.
streaming_logs = (
    spark.readStream
    .format("json")
    .schema(logs_schema)
    .option("maxFilesPerTrigger", 1)
    .option("pathGlobFilter", "server_logs_*.json")
    .load(LOG_DIR)
)


In [13]:
# Each derived column is built as a named expression first, then attached to
# the stream in the same order as before.

# Human-readable label for the HTTP status code.
error_type_expr = (
    when(col("status_code") == 200, "SUCCESS")
    .when(col("status_code") == 404, "NOT_FOUND")
    .when(col("status_code") == 500, "INTERNAL_ERROR")
    .when(col("status_code") == 503, "SERVICE_UNAVAILABLE")
    .otherwise("UNKNOWN")
)

# Any status code of 400 or above counts as an error.
is_error_expr = when(col("status_code") >= 400, True).otherwise(False)

# A record is critical on an internal server error, or when CPU usage exceeds
# 90 or memory usage exceeds 85.
is_critical_expr = when(
    (col("status_code") == 500)
    | (col("cpu_usage") > 90)
    | (col("memory_usage") > 85),
    True,
).otherwise(False)

# Response-time buckets: > 500 ms is SLOW, > 300 ms is MODERATE, else FAST.
performance_expr = (
    when(col("response_time_ms") > 500, "SLOW")
    .when(col("response_time_ms") > 300, "MODERATE")
    .otherwise("FAST")
)

analyzed_logs = (
    streaming_logs
    .withColumn("error_type", error_type_expr)
    .withColumn("is_error", is_error_expr)
    .withColumn("is_critical", is_critical_expr)
    .withColumn("performance_status", performance_expr)
)

-------------------------------------------
Batch: 31
-------------------------------------------
+----------+-----------+-------------------+----------------+------------------+---------+------------+-----------+
|server_id |status_code|error_type         |response_time_ms|performance_status|cpu_usage|memory_usage|is_critical|
+----------+-----------+-------------------+----------------+------------------+---------+------------+-----------+
|SERVER-002|200        |SUCCESS            |909.64          |SLOW              |30.72    |85.7        |true       |
|SERVER-001|500        |INTERNAL_ERROR     |685.82          |SLOW              |62.45    |70.2        |true       |
|SERVER-004|503        |SERVICE_UNAVAILABLE|750.79          |SLOW              |54.07    |39.76       |false      |
|SERVER-001|200        |SUCCESS            |552.46          |SLOW              |57.89    |67.54       |false      |
|SERVER-005|200        |SUCCESS            |808.33          |SLOW              |59.68    |

In [14]:
# Stream every enriched record to the console (append mode, untruncated
# columns) as a short preview of the analysed log stream.
query_all = (
    analyzed_logs
    .select(
        "server_id", "status_code", "error_type",
        "response_time_ms", "performance_status",
        "cpu_usage", "memory_usage", "is_critical",
    )
    .writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", "false")
    .start()
)

try:
    # Block for at most 5 seconds while micro-batches are printed.
    query_all.awaitTermination(5)
finally:
    # Always stop the query, even if the wait raises or the cell is
    # interrupted; otherwise it keeps running in the background and its
    # batches show up interleaved in the output of later cells.
    query_all.stop()

-------------------------------------------
Batch: 0
-------------------------------------------
+----------+-----------+-------------------+----------------+------------------+---------+------------+-----------+
|server_id |status_code|error_type         |response_time_ms|performance_status|cpu_usage|memory_usage|is_critical|
+----------+-----------+-------------------+----------------+------------------+---------+------------+-----------+
|SERVER-004|200        |SUCCESS            |145.58          |FAST              |36.24    |56.24       |false      |
|SERVER-003|500        |INTERNAL_ERROR     |53.14           |FAST              |68.23    |27.32       |true       |
|SERVER-003|200        |SUCCESS            |982.52          |SLOW              |19.35    |76.83       |false      |
|SERVER-001|404        |NOT_FOUND          |871.67          |SLOW              |94.22    |48.63       |true       |
|SERVER-005|200        |SUCCESS            |81.48           |FAST              |85.26    |8

25/10/13 23:49:32 ERROR TorrentBroadcast: Store broadcast broadcast_158 fail, remove all pieces of the broadcast


-------------------------------------------
Batch: 34
-------------------------------------------
+----------+-----------+--------------+----------------+---------+------------+
|server_id |status_code|error_type    |response_time_ms|cpu_usage|memory_usage|
+----------+-----------+--------------+----------------+---------+------------+
|SERVER-002|500        |INTERNAL_ERROR|721.49          |11.86    |34.34       |
|SERVER-005|500        |INTERNAL_ERROR|613.98          |81.44    |48.27       |
+----------+-----------+--------------+----------------+---------+------------+

-------------------------------------------
Batch: 34
-------------------------------------------
+----------+-----------+--------------+----------------+------------------+---------+------------+-----------+
|server_id |status_code|error_type    |response_time_ms|performance_status|cpu_usage|memory_usage|is_critical|
+----------+-----------+--------------+----------------+------------------+---------+------------+---

In [15]:
# Keep only records flagged as critical and project the columns relevant to
# an alert. `is_critical` is already a boolean column, so it is used directly
# as the filter predicate instead of being compared against True.
critical_alerts = (
    analyzed_logs
    .filter(col("is_critical"))
    .select(
        "server_id", "status_code", "error_type",
        "response_time_ms", "cpu_usage", "memory_usage",
    )
)

# Print the critical alerts to the console (append mode, untruncated columns).
query_critical = (
    critical_alerts
    .writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", "false")
    .start()
)

try:
    # Block for at most 5 seconds while micro-batches are printed.
    query_critical.awaitTermination(5)
finally:
    # Always stop the query, even if the wait raises or the cell is
    # interrupted, so it does not keep running in the background.
    query_critical.stop()

-------------------------------------------
Batch: 35
-------------------------------------------
+----------+-----------+-------------------+----------------+------------------+---------+------------+-----------+
|server_id |status_code|error_type         |response_time_ms|performance_status|cpu_usage|memory_usage|is_critical|
+----------+-----------+-------------------+----------------+------------------+---------+------------+-----------+
|SERVER-005|404        |NOT_FOUND          |759.01          |SLOW              |55.45    |31.19       |false      |
|SERVER-001|200        |SUCCESS            |209.67          |FAST              |73.69    |36.88       |false      |
|SERVER-004|200        |SUCCESS            |620.59          |SLOW              |75.51    |82.37       |false      |
|SERVER-002|200        |SUCCESS            |604.19          |SLOW              |33.35    |35.07       |false      |
|SERVER-003|404        |NOT_FOUND          |868.69          |SLOW              |48.06    |

25/10/13 23:49:44 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 12, writer: ConsoleWriter[numRows=20, truncate=false]] is aborting.
25/10/13 23:49:44 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 12, writer: ConsoleWriter[numRows=20, truncate=false]] aborted.


-------------------------------------------
Batch: 37
-------------------------------------------
+----------+-----------+--------------+----------------+---------+------------+
|server_id |status_code|error_type    |response_time_ms|cpu_usage|memory_usage|
+----------+-----------+--------------+----------------+---------+------------+
|SERVER-001|500        |INTERNAL_ERROR|704.89          |72.81    |55.88       |
|SERVER-005|500        |INTERNAL_ERROR|362.61          |27.56    |21.21       |
|SERVER-002|200        |SUCCESS       |630.24          |94.92    |85.12       |
+----------+-----------+--------------+----------------+---------+------------+

-------------------------------------------
Batch: 37
-------------------------------------------
+----------+-----------+-------------------+----------------+------------------+---------+------------+-----------+
|server_id |status_code|error_type         |response_time_ms|performance_status|cpu_usage|memory_usage|is_critical|
+----------

In [16]:
# Shut down the SparkContext backing this session (the same object bound to
# `sc` in the setup cell) now that the lab is finished.
spark.sparkContext.stop()