# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Lab 07 - Real-time log analyzer** </center>
---
## <center> **Big Data** </center>
---
### <center> **Spring 2025** </center>
---
### <center> **03/31/2025** </center>

---
**Professor**: Dr. Pablo Camarillo Ramirez

**Team**: Foraneos

**Students**: Eddie, Konrad 

In [1]:
# Locate the Spark installation and make `pyspark` importable (findspark adds
# it to sys.path); this runs before the pyspark import in the next cell.
import findspark
findspark.init()

#### Spark Session creation


In [2]:
from pyspark.sql import SparkSession

# Per-teammate identifiers; the selected one is substituted as the host part
# of the Spark master URL below.
konrad_port = "0638c7435d1d"
eddie_port = "8776010e8f6a"  # not referenced in this cell

# Master URL built from Konrad's identifier on port 7077.
master_url = "spark://{}:7077".format(konrad_port)

# Create the session (or reuse an existing one) with the Spark UI on port 4040.
spark = (
    SparkSession.builder
    .appName("SparkSQLStructuredStreaming-Files")
    .master(master_url)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext for later cells.
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/03 02:41:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### File stream initialization

In [3]:
# Directory watched by the streaming source.
source_dir = "/home/jovyan/notebooks/data/strucutred_streaming_files/"

# Text file source: the "cleanSource"/"sourceArchiveDir" options ask Spark to
# archive processed files under /home/jovyan/archive.
stream_reader = (
    spark.readStream
    .format("text")
    .option("cleanSource", "archive")
    .option("sourceArchiveDir", "/home/jovyan/archive")
)

# Raw streaming DataFrame; later cells read its "value" column.
entries = stream_reader.load(source_dir)

### Start Log Creation


In [4]:
from foraneos.spark_utils import SparkUtils, Logging
import threading

In [5]:
# Log generator from the team's helper module.
# NOTE(review): presumably `log_time` is the interval between generated
# entries and `start_logging` writes files into the given directory —
# confirm in foraneos.spark_utils.
logger = Logging(log_time = 1)

# Run the generator in a daemon thread so the notebook stays interactive;
# the target directory is the one the file stream is loading from.
process = threading.Thread(
    target=logger.start_logging,
    args=("/home/jovyan/notebooks/data/strucutred_streaming_files/",),
    daemon=True,
)
process.start()

### Set the output sink of the stream

In [None]:
from pyspark.sql.functions import split, col, count

# Parse each raw line (column "value") into its " | "-separated fields and keep
# only ERROR records.
#
# The result is bound to a new name rather than overwriting `entries`: the
# parsed frame no longer has a "value" column, so re-assigning `entries` made
# this cell fail on col("value") when run a second time and discarded the raw
# stream. With a separate name the cell is idempotent.
log_fields = split(col("value"), " \\| ")
error_entries = (
    entries
    .withColumn("data", log_fields)
    .withColumn("timestamp", col("data").getItem(0))
    .withColumn("type", col("data").getItem(1))
    .withColumn("message", col("data").getItem(2))
    .withColumn("node", col("data").getItem(3))
    .drop("value", "data")
    .filter(col("type") == "ERROR")
)

# Running total of ERROR entries; only consumed by the optional query below.
len_entries = error_entries.groupBy().agg(count("*").alias("total_error_entries"))

# Stream the ERROR rows to the console in append mode, one micro-batch every
# 10 seconds, checkpointing under /home/jovyan/checkpoint/.
query_entries = (
    error_entries
    .writeStream
    .outputMode("append")
    .option("checkpointLocation", "/home/jovyan/checkpoint/")
    .trigger(processingTime='10 seconds')
    .format("console")
    .start()
)

# Block for up to 40 seconds; the call returns False if the query is still
# running when the timeout expires.
query_entries.awaitTermination(40)

# Uncomment below to stream the count of error entries
# query_len_entries = len_entries \
#                 .writeStream \
#                 .outputMode("complete") \
#                 .trigger(processingTime='2 seconds') \
#                 .format("console") \
#                 .start()

# query_len_entries.awaitTermination()

25/04/03 02:41:27 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+-------------------+-----+--------------------+-------------+
|          timestamp| type|             message|         node|
+-------------------+-----+--------------------+-------------+
|2025-04-03 02:40:38|ERROR|Service deploymen...|server-node-3|
|2025-04-03 02:40:39|ERROR|500 Internal Serv...|server-node-4|
|2025-04-03 02:40:33|ERROR|Firewall rule mis...|server-node-8|
|2025-04-03 02:40:36|ERROR|Too many authenti...|server-node-2|
|2025-04-03 02:41:21|ERROR|Database corrupti...|server-node-1|
|2025-04-03 02:41:24|ERROR|   Disk read failure|server-node-1|
|2025-04-03 02:41:26|ERROR|Unexpected shutdo...|server-node-8|
+-------------------+-----+--------------------+-------------+

-------------------------------------------
Batch: 11
-------------------------------------------
+-------------------+-----+--------------------+-------------+
|          timestamp| type|             message

False

-------------------------------------------
Batch: 13
-------------------------------------------
+-------------------+-----+--------------------+-------------+
|          timestamp| type|             message|         node|
+-------------------+-----+--------------------+-------------+
|2025-04-03 02:41:49|ERROR|Network interface...|server-node-1|
|2025-04-03 02:41:51|ERROR|    File system full|server-node-2|
|2025-04-03 02:41:54|ERROR|    File system full|server-node-8|
+-------------------+-----+--------------------+-------------+

-------------------------------------------
Batch: 14
-------------------------------------------
+-------------------+-----+--------------------+-------------+
|          timestamp| type|             message|         node|
+-------------------+-----+--------------------+-------------+
|2025-04-03 02:42:04|ERROR| API service failure|server-node-8|
|2025-04-03 02:42:07|ERROR|   Disk read failure|server-node-4|
|2025-04-03 02:42:08|ERROR|Unrecoverable sys...

In [7]:
# Stop the console streaming query that was started with writeStream.start().
query_entries.stop()
# Only needed if the optional count query was started:
# query_len_entries.stop()

In [8]:
# Ask the log generator to stop.
# NOTE(review): presumably this ends the loop running in the background
# thread started with logger.start_logging — confirm in foraneos.spark_utils.
logger.stop_logging()


In [9]:
# Shut down the SparkContext obtained from the session (sc = spark.sparkContext).
sc.stop()