# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Lab 07 - Real-time log analyzer** </center>
---
## <center> **Big Data** </center>
---
### <center> **Spring 2025** </center>
---
### <center> **03/31/2025** </center>

---
**Professor**: Dr. Pablo Camarillo Ramirez

**Team**: Foraneos

**Students**: Eddie, Konrad 

In [1]:
# Make the local Spark installation importable; this has to run before the
# first pyspark import further down.
import findspark
findspark.init()

### Spark Session creation


In [2]:
from pyspark.sql import SparkSession
from foraneos.spark_utils import TrafficListener

# Identifiers of each teammate's Spark master. Despite the "_port" suffix the
# value is used as the HOST part of the master URL below (presumably the
# Docker container hostname -- confirm for your environment).
konrad_port = "0638c7435d1d"
eddie_port = "8776010e8f6a"

# Standalone master URL: spark://<host>:7077
master_url = "spark://{}:7077".format(eddie_port)

spark = (
    SparkSession.builder
    .appName("SparkSQLStructuredStreaming-Files")
    .master(master_url)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Handle to the underlying SparkContext; used at the end of the notebook to
# shut the application down.
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/09 01:38:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### File stream initialization

In [3]:
# Raw streaming source: every line of every text file that lands in the
# watched directory becomes one row with a single string column `value`.
# With cleanSource=archive, files that have been processed are moved to
# sourceArchiveDir instead of staying in the watched directory.
entries = (
    spark.readStream
    .format("text")
    .option("cleanSource", "archive")
    .option("sourceArchiveDir", "/home/jovyan/archive")
    .load("/home/jovyan/notebooks/data/strucutred_streaming_files/")
)

### Start Log Creation


In [4]:
from foraneos.spark_utils import SparkUtils, Logging
import threading

In [5]:
# Produce log files in the background while the notebook keeps running.
# The target directory is the same one the file stream source watches.
# NOTE(review): `log_time` is presumably the interval between log writes --
# confirm in foraneos.spark_utils.Logging.
logger = Logging(log_time=1)

# daemon=True so the generator thread never keeps the kernel alive on exit.
process = threading.Thread(
    target=logger.start_logging,
    args=("/home/jovyan/notebooks/data/strucutred_streaming_files/",),
    daemon=True,
)
process.start()

### Set the output sink of the stream

In [None]:
from pyspark.sql.functions import split, col, count

# Parse each raw log line ("timestamp | type | message | node") into separate
# columns and keep only the ERROR rows.
#
# The parsed stream gets its own name instead of rebinding `entries`: the raw
# stream handle keeps its `value` column, so this cell can be re-run without
# failing to resolve `value` (which the parsed frame has dropped).
error_entries = (
    entries
    .withColumn("data", split(col("value"), " \\| "))  # regex: literal " | "
    .withColumn("timestamp", col("data").getItem(0))
    .withColumn("type", col("data").getItem(1))
    .withColumn("message", col("data").getItem(2))
    .withColumn("node", col("data").getItem(3))
    .drop("value", "data")
    .filter(col("type") == "ERROR")
)

# Count the total number of ERROR entries (global aggregate, no grouping keys).
# Only consumed by the optional count query that is commented out below.
len_entries = error_entries.groupBy().agg(count("*").alias("total_error_entries"))

# Adding listener
# NOTE(review): every execution of this cell registers one more listener on
# the session; restart the kernel if progress events start showing up twice.
spark.streams.addListener(TrafficListener())

# Stream the error log entries to the console, one micro-batch every 10 seconds
query_entries = (
    error_entries.writeStream
    .outputMode("append")
    .option("checkpointLocation", "/home/jovyan/checkpoint/")
    .trigger(processingTime="10 seconds")
    .format("console")
    .start()
)

# Let the query run for at most 40 seconds; the call returns False when the
# query is still active after the timeout (it is stopped in a later cell).
query_entries.awaitTermination(40)

# Uncomment below to stream the count of error entries
# query_len_entries = len_entries \
#                 .writeStream \
#                 .outputMode("complete") \
#                 .trigger(processingTime='2 seconds') \
#                 .format("console") \
#                 .start()

# query_len_entries.awaitTermination()

25/04/09 01:39:24 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


Query started: e19a82e7-6708-4984-8a69-1991514fec7a


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---------+----+-------+----+
|timestamp|type|message|node|
+---------+----+-------+----+
+---------+----+-------+----+

Query made progress: {
  "id" : "e19a82e7-6708-4984-8a69-1991514fec7a",
  "runId" : "511953bc-95c6-4b2e-85a2-530ce34e66b5",
  "name" : null,
  "timestamp" : "2025-04-09T01:39:24.835Z",
  "batchId" : 0,
  "numInputRows" : 0,
  "inputRowsPerSecond" : 0.0,
  "processedRowsPerSecond" : 0.0,
  "durationMs" : {
    "addBatch" : 2204,
    "commitOffsets" : 63,
    "getBatch" : 128,
    "latestOffset" : 110,
    "queryPlanning" : 817,
    "triggerExecution" : 3442,
    "walCommit" : 75
  },
  "stateOperators" : [ ],
  "sources" : [ {
    "description" : "FileStreamSource[file:/home/jovyan/notebooks/data/strucutred_streaming_files]",
    "startOffset" : null,
    "endOffset" : {
      "logOffset" : 0
    },
    "latestOffset" : null,
    "numInputRows" : 0,
    "inputRowsPerSecond

                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-------------------+-----+--------------------+-------------+
|          timestamp| type|             message|         node|
+-------------------+-----+--------------------+-------------+
|2025-04-09 01:39:23|ERROR|Major dependency ...|server-node-2|
|2025-04-09 01:39:25|ERROR|Severe performanc...|server-node-1|
|2025-04-09 01:39:27|ERROR|File permissions ...|server-node-2|
|2025-04-09 01:39:34|ERROR|Critical security...|server-node-9|
|2025-04-09 01:39:35|ERROR|Service deploymen...|server-node-7|
+-------------------+-----+--------------------+-------------+

Query made progress: {
  "id" : "e19a82e7-6708-4984-8a69-1991514fec7a",
  "runId" : "511953bc-95c6-4b2e-85a2-530ce34e66b5",
  "name" : null,
  "timestamp" : "2025-04-09T01:39:40.000Z",
  "batchId" : 1,
  "numInputRows" : 20,
  "inputRowsPerSecond" : 2.0,
  "processedRowsPerSecond" : 4.237288135593221,
  "durationMs" : {
    "addBatch

False

-------------------------------------------
Batch: 3
-------------------------------------------
+-------------------+-----+--------------------+-------------+
|          timestamp| type|             message|         node|
+-------------------+-----+--------------------+-------------+
|2025-04-09 01:39:48|ERROR|      Disk I/O error|server-node-2|
|2025-04-09 01:39:49|ERROR|Cloud resource ex...|server-node-1|
|2025-04-09 01:39:51|ERROR|Database connecti...|server-node-9|
|2025-04-09 01:39:52|ERROR|Memory allocation...|server-node-6|
|2025-04-09 01:39:54|ERROR|Memory allocation...|server-node-8|
|2025-04-09 01:39:58|ERROR|Service loop dete...|server-node-1|
+-------------------+-----+--------------------+-------------+

Query made progress: {
  "id" : "e19a82e7-6708-4984-8a69-1991514fec7a",
  "runId" : "511953bc-95c6-4b2e-85a2-530ce34e66b5",
  "name" : null,
  "timestamp" : "2025-04-09T01:40:10.000Z",
  "batchId" : 3,
  "numInputRows" : 13,
  "inputRowsPerSecond" : 1.3,
  "processedRowsP

In [7]:
# Stop the console streaming query that was started in the sink cell.
query_entries.stop()
# If the optional count query was enabled, stop it as well:
# query_len_entries.stop()

Query terminated: e19a82e7-6708-4984-8a69-1991514fec7a


In [8]:
# Ask the log generator to stop producing files.
# NOTE(review): presumably this ends the loop run by the background thread
# started earlier -- confirm in foraneos.spark_utils.Logging.
logger.stop_logging()


In [9]:
# Shut down the SparkContext behind `spark`; run this last, after the
# streaming query and the log generator have been stopped.
sc.stop()