In [1]:
# Locate the local Spark installation and make pyspark importable; this cell
# comes first so it runs before the pyspark imports in the next cell.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from gatubelxs.spark_utils import SparkUtils
from gatubelxs.streaming_monitor import StreamingPerformanceMonitor, analyze_performance_data

In [12]:
import os

# Root directory under which each topic gets its Parquet output directory and
# its streaming checkpoint directory.
# NOTE(review): absolute path, presumably specific to this Jupyter container —
# adjust when running elsewhere.
base_path = "/home/jovyan/notebooks/data"

# Pre-create <base_path>/<topic>/checkpoint for every topic. os.makedirs also
# creates the intermediate <base_path>/<topic> directory, so one call per topic
# covers both the data directory and its checkpoint directory.
for topic in ["page_views", "click_events", "user_interactions"]:
    os.makedirs(f"{base_path}/{topic}/checkpoint", exist_ok=True)

25/05/13 02:49:05 ERROR MicroBatchExecution: Query [id = 3f9b9cf9-6378-4db4-82e0-1d93c9cf0ffa, runId = eb429f68-17cb-42c3-946d-d1c4053d4124] terminated with error
java.io.FileNotFoundException: File file:/home/jovyan/notebooks/data/click_events/checkpoint/offsets does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:779)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1100)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:769)
	at org.apache.hadoop.fs.DelegateToFileSystem.getFileStatus(DelegateToFileSystem.java:128)
	at org.apache.hadoop.fs.DelegateToFileSystem.createInternal(DelegateToFileSystem.java:93)
	at org.apache.hadoop.fs.ChecksumFs$ChecksumFSOutputSummer.<init>(ChecksumFs.java:353)
	at org.apache.hadoop.fs.ChecksumFs.createInternal(ChecksumFs.java:400)
	at org.apache.hadoop.fs.AbstractFileSystem.create(AbstractFileSystem.java:626)
	at org.apache.h

Query terminated: 3f9b9cf9-6378-4db4-82e0-1d93c9cf0ffa at 2025-05-13 02:49:05.256336
Query terminated: 234ed67b-dc45-4e7a-a5c6-f13888e7b78f at 2025-05-13 02:49:05.257601
Query terminated: b12caf86-1f10-4a17-8be5-bb4b8b8ab090 at 2025-05-13 02:49:05.259534


In [4]:
# Build (or reuse) the SparkSession for this notebook. The Kafka source
# connector is requested through spark.jars.packages so that
# readStream.format("kafka") is available to the streaming queries.
# NOTE(review): the master URL embeds a fixed host name ("b33dcc1265b4"),
# which looks container-generated — confirm it still matches the running
# Spark master before executing.
spark = (
    SparkSession.builder
    .appName("FinalProjectGatubelxs")
    .master("spark://b33dcc1265b4:7077")
    .config("spark.ui.port", "4040")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.4")
    .getOrCreate()
)

# Handle on the underlying SparkContext; used by the final cell to shut down.
sc = spark.sparkContext

:: loading settings :: url = jar:file:/opt/conda/spark-3.5.4-bin-hadoop3-scala2.13/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4fa0d0ee-0a0a-486c-b44b-deaa3e6ac339;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.13;3.5.4 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.13;3.5.4 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.scala-lang.modules#scala-parallel-collections_2.13;1.0.4 in central
	found org.apache.commons#commons-pool2;2.11.1 in centr

In [5]:
# Register the project's performance monitor as a streaming-query listener on
# this session. It is registered before any query is started; its
# `metrics_history` is passed to analyze_performance_data after the run.
performance_monitor = StreamingPerformanceMonitor()
spark.streams.addListener(performance_monitor)

In [6]:
# Kafka topics handled by this notebook.
# NOTE(review): TOPICS is not referenced by the visible cells; the topic names
# are repeated as literals where directories are created and queries started.
TOPICS = ["page_views", "click_events", "user_interactions"]
# Address passed as "kafka.bootstrap.servers" to the Kafka stream reader.
KAFKA_BOOTSTRAP_SERVER = "kafka:9093"

In [7]:
# Field layout of the JSON payload on the "page_views" topic, as
# (field name, type name) pairs. The resulting schema is handed to from_json
# when the topic's stream is parsed.
# NOTE(review): "timestamp" is declared as a string, not a timestamp type.
page_views_schema = SparkUtils.generate_schema([
    ("user_id", "string"),
    ("session_id", "string"),
    ("page_url", "string"),
    ("referrer_url", "string"),
    ("category", "string"),
    ("price", "float"),
    ("timestamp", "string"),
])

In [8]:
# Field layout of the JSON payload on the "click_events" topic, as
# (field name, type name) pairs. The resulting schema is handed to from_json
# when the topic's stream is parsed.
# NOTE(review): "timestamp" is declared as a string, not a timestamp type.
click_events_schema = SparkUtils.generate_schema([
    ("user_id", "string"),
    ("session_id", "string"),
    ("element_id", "string"),
    ("page_url", "string"),
    ("category", "string"),
    ("price", "float"),
    ("timestamp", "string"),
    ("x_coord", "float"),
    ("y_coord", "float"),
])

In [9]:
# Field layout of the JSON payload on the "user_interactions" topic, as
# (field name, type name) pairs. The resulting schema is handed to from_json
# when the topic's stream is parsed.
# NOTE(review): "timestamp" is declared as a string, not a timestamp type.
user_interaction_schema = SparkUtils.generate_schema([
    ("user_id", "string"),
    ("session_id", "string"),
    ("interaction_type", "string"),
    ("page_url", "string"),
    ("category", "string"),
    ("price", "float"),
    ("details", "string"),
    ("timestamp", "string"),
])

In [10]:
def create_stream_query(topic, schema, output_dir):
    """Start a streaming query that copies one Kafka topic into Parquet files.

    The Kafka record value is cast to a string, parsed as JSON with `schema`,
    and the parsed fields are written as top-level columns in append mode,
    triggering every 5 seconds.

    Args:
        topic: Kafka topic to subscribe to; also used as the sub-directory
            name under the module-level `base_path`.
        schema: Schema passed to `from_json` for decoding the record value.
        output_dir: NOTE(review): currently unused — the sink path and the
            checkpoint location are both derived from `base_path` and `topic`,
            not from this argument.

    Returns:
        The object returned by `writeStream.start()` for the started query.
    """
    raw_events = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVER)
        .option("subscribe", topic)
        .load()
    )

    # Decode the Kafka value as JSON, then flatten the struct into columns.
    json_payload = raw_events.selectExpr("CAST(value as STRING) as json")
    decoded = json_payload.withColumn("data", from_json(col("json"), schema))
    flattened = decoded.select("data.*")

    # The checkpoint directory lives inside the topic's output directory.
    sink_path = f"{base_path}/{topic}"
    checkpoint_path = f"{base_path}/{topic}/checkpoint"

    writer = (
        flattened.writeStream
        .format("parquet")
        .outputMode("append")
        .option("path", sink_path)
        .option("checkpointLocation", checkpoint_path)
        .trigger(processingTime="5 seconds")
    )
    return writer.start()


In [13]:
# Start one streaming query per topic, let each run for a bounded time, then
# stop them all and summarise the metrics collected by the listener.
#
# The queries are appended one by one inside the try block so that the
# `finally` clause stops every query that was actually started, even when a
# later create_stream_query call fails, awaitTermination raises because a
# query terminated with an error, or the cell is interrupted. Without this,
# surviving queries would keep running in the background.
#
# NOTE(review): create_stream_query does not use its third argument; the
# "./datalake/..." values are kept only to preserve the existing call shape.
queries = []
try:
    queries.append(
        create_stream_query("page_views", page_views_schema, "./datalake/page_views")
    )
    queries.append(
        create_stream_query("click_events", click_events_schema, "./datalake/click_events")
    )
    queries.append(
        create_stream_query("user_interactions", user_interaction_schema, "./datalake/user_interactions")
    )

    # The waits are sequential: each call blocks for up to 60 seconds, so the
    # cell can take up to 60 s per query in total.
    for q in queries:
        q.awaitTermination(timeout=60)
finally:
    for q in queries:
        q.stop()

analyze_performance_data(performance_monitor.metrics_history)

25/05/13 02:49:06 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/05/13 02:49:06 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
25/05/13 02:49:06 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/05/13 02:49:06 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


Query started: 6d1873b1-444e-4aed-9f88-f985e0238522 at 2025-05-13 02:49:06.526340
Query started: 26e943bf-4ad6-455b-8e7e-9017a2def1d9 at 2025-05-13 02:49:06.613129


25/05/13 02:49:06 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/05/13 02:49:06 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


Query started: 181d51de-4ee4-4850-9f0b-14b43949beab at 2025-05-13 02:49:06.763812


                                                                                


Performance summary for page_views]:
Processed 7 rows at 6.78 rows/second
Input rate: 0.00 rows/second
Processing time: 1033ms

Performance summary for click_events]:
Processed 0 rows at 0.00 rows/second
Input rate: 0.00 rows/second
Processing time: 1005ms

Performance summary for user_interactions]:
Processed 0 rows at 0.00 rows/second
Input rate: 0.00 rows/second
Processing time: 884ms

Performance summary for user_interactions]:
Processed 7 rows at 9.92 rows/second
Input rate: 2.16 rows/second
Processing time: 706ms

Performance summary for page_views]:
Processed 8 rows at 6.51 rows/second
Input rate: 2.30 rows/second
Processing time: 1229ms


                                                                                


Performance summary for click_events]:
Processed 9 rows at 4.96 rows/second
Input rate: 2.66 rows/second
Processing time: 1814ms

Performance summary for user_interactions]:
Processed 10 rows at 13.26 rows/second
Input rate: 2.00 rows/second
Processing time: 754ms

Performance summary for click_events]:
Processed 19 rows at 14.36 rows/second
Input rate: 3.80 rows/second
Processing time: 1323ms

Performance summary for page_views]:
Processed 17 rows at 9.92 rows/second
Input rate: 3.40 rows/second
Processing time: 1713ms

Performance summary for click_events]:
Processed 16 rows at 21.42 rows/second
Input rate: 3.20 rows/second
Processing time: 747ms


                                                                                


Performance summary for page_views]:
Processed 9 rows at 6.93 rows/second
Input rate: 1.80 rows/second
Processing time: 1298ms

Performance summary for user_interactions]:
Processed 14 rows at 7.43 rows/second
Input rate: 2.80 rows/second
Processing time: 1884ms


                                                                                


Performance summary for click_events]:
Processed 20 rows at 26.85 rows/second
Input rate: 4.00 rows/second
Processing time: 745ms

Performance summary for user_interactions]:
Processed 23 rows at 17.76 rows/second
Input rate: 4.60 rows/second
Processing time: 1295ms

Performance summary for page_views]:
Processed 18 rows at 10.39 rows/second
Input rate: 3.60 rows/second
Processing time: 1732ms

Performance summary for user_interactions]:
Processed 11 rows at 15.11 rows/second
Input rate: 2.20 rows/second
Processing time: 728ms

Performance summary for click_events]:
Processed 17 rows at 22.52 rows/second
Input rate: 3.40 rows/second
Processing time: 755ms

Performance summary for page_views]:
Processed 14 rows at 10.74 rows/second
Input rate: 2.80 rows/second
Processing time: 1303ms


                                                                                


Performance summary for click_events]:
Processed 12 rows at 16.74 rows/second
Input rate: 2.40 rows/second
Processing time: 717ms


                                                                                


Performance summary for page_views]:
Processed 26 rows at 20.38 rows/second
Input rate: 5.20 rows/second
Processing time: 1276ms

Performance summary for user_interactions]:
Processed 16 rows at 8.92 rows/second
Input rate: 3.20 rows/second
Processing time: 1793ms

Performance summary for click_events]:
Processed 13 rows at 15.55 rows/second
Input rate: 2.60 rows/second
Processing time: 836ms

Performance summary for user_interactions]:
Processed 12 rows at 8.77 rows/second
Input rate: 2.40 rows/second
Processing time: 1369ms


                                                                                


Performance summary for page_views]:
Processed 14 rows at 7.43 rows/second
Input rate: 2.80 rows/second
Processing time: 1884ms

Performance summary for user_interactions]:
Processed 22 rows at 30.64 rows/second
Input rate: 4.40 rows/second
Processing time: 718ms

Performance summary for page_views]:
Processed 12 rows at 9.57 rows/second
Input rate: 2.40 rows/second
Processing time: 1254ms

Performance summary for click_events]:
Processed 20 rows at 11.00 rows/second
Input rate: 4.00 rows/second
Processing time: 1819ms

Performance summary for user_interactions]:
Processed 18 rows at 23.87 rows/second
Input rate: 3.60 rows/second
Processing time: 753ms

Performance summary for page_views]:
Processed 6 rows at 7.09 rows/second
Input rate: 1.20 rows/second
Processing time: 846ms


                                                                                


Performance summary for click_events]:
Processed 16 rows at 11.65 rows/second
Input rate: 3.20 rows/second
Processing time: 1373ms

Performance summary for user_interactions]:
Processed 12 rows at 14.96 rows/second
Input rate: 2.40 rows/second
Processing time: 802ms

Performance summary for page_views]:
Processed 19 rows at 13.71 rows/second
Input rate: 3.80 rows/second
Processing time: 1386ms

Performance summary for click_events]:
Processed 19 rows at 9.96 rows/second
Input rate: 3.80 rows/second
Processing time: 1908ms


                                                                                


Performance summary for user_interactions]:
Processed 6 rows at 8.15 rows/second
Input rate: 1.20 rows/second
Processing time: 736ms

Performance summary for page_views]:
Processed 10 rows at 11.10 rows/second
Input rate: 2.00 rows/second
Processing time: 901ms

Performance summary for click_events]:
Processed 13 rows at 9.00 rows/second
Input rate: 2.60 rows/second
Processing time: 1445ms


                                                                                


Performance summary for page_views]:
Processed 15 rows at 18.61 rows/second
Input rate: 3.00 rows/second
Processing time: 806ms

Performance summary for click_events]:
Processed 13 rows at 9.61 rows/second
Input rate: 2.60 rows/second
Processing time: 1353ms

Performance summary for user_interactions]:
Processed 16 rows at 8.14 rows/second
Input rate: 3.20 rows/second
Processing time: 1965ms


                                                                                


Performance summary for click_events]:
Processed 9 rows at 9.23 rows/second
Input rate: 1.80 rows/second
Processing time: 975ms

Performance summary for user_interactions]:
Processed 6 rows at 3.59 rows/second
Input rate: 1.20 rows/second
Processing time: 1670ms


                                                                                


Performance summary for page_views]:
Processed 7 rows at 3.23 rows/second
Input rate: 1.40 rows/second
Processing time: 2164ms


                                                                                


Performance summary for click_events]:
Processed 24 rows at 26.20 rows/second
Input rate: 4.78 rows/second
Processing time: 916ms

Performance summary for user_interactions]:
Processed 7 rows at 4.68 rows/second
Input rate: 1.39 rows/second
Processing time: 1495ms

Performance summary for page_views]:
Processed 9 rows at 4.55 rows/second
Input rate: 1.79 rows/second
Processing time: 1979ms

Performance summary for user_interactions]:
Processed 9 rows at 13.24 rows/second
Input rate: 1.81 rows/second
Processing time: 680ms

Performance summary for click_events]:
Processed 13 rows at 12.91 rows/second
Input rate: 2.61 rows/second
Processing time: 1007ms

Performance summary for page_views]:
Processed 9 rows at 5.76 rows/second
Input rate: 1.81 rows/second
Processing time: 1563ms

Performance summary for page_views]:
Processed 25 rows at 30.86 rows/second
Input rate: 5.00 rows/second
Processing time: 810ms

Performance summary for click_events]:
Processed 11 rows at 7.98 rows/second
Inpu

                                                                                


Performance summary for user_interactions]:
Processed 18 rows at 8.60 rows/second
Input rate: 3.60 rows/second
Processing time: 2094ms


                                                                                


Performance summary for page_views]:
Processed 12 rows at 12.55 rows/second
Input rate: 2.40 rows/second
Processing time: 956ms

Performance summary for click_events]:
Processed 18 rows at 15.76 rows/second
Input rate: 3.60 rows/second
Processing time: 1142ms

Performance summary for user_interactions]:
Processed 9 rows at 5.33 rows/second
Input rate: 1.80 rows/second
Processing time: 1688ms

Performance summary for user_interactions]:
Processed 6 rows at 7.02 rows/second
Input rate: 1.20 rows/second
Processing time: 855ms

Performance summary for page_views]:
Processed 5 rows at 3.60 rows/second
Input rate: 1.00 rows/second
Processing time: 1388ms


                                                                                


Performance summary for click_events]:
Processed 8 rows at 3.88 rows/second
Input rate: 1.60 rows/second
Processing time: 2061ms


                                                                                


Performance summary for click_events]:
Processed 15 rows at 14.66 rows/second
Input rate: 3.00 rows/second
Processing time: 1023ms

Performance summary for page_views]:
Processed 8 rows at 6.21 rows/second
Input rate: 1.60 rows/second
Processing time: 1288ms


                                                                                


Performance summary for user_interactions]:
Processed 17 rows at 9.32 rows/second
Input rate: 3.40 rows/second
Processing time: 1825ms


                                                                                


Performance summary for page_views]:
Processed 11 rows at 12.97 rows/second
Input rate: 2.20 rows/second
Processing time: 848ms

Performance summary for user_interactions]:
Processed 14 rows at 10.04 rows/second
Input rate: 2.80 rows/second
Processing time: 1395ms

Performance summary for click_events]:
Processed 13 rows at 6.57 rows/second
Input rate: 2.60 rows/second
Processing time: 1980ms


                                                                                


Performance summary for page_views]:
Processed 13 rows at 13.57 rows/second
Input rate: 2.60 rows/second
Processing time: 958ms

Performance summary for user_interactions]:
Processed 13 rows at 9.78 rows/second
Input rate: 2.60 rows/second
Processing time: 1329ms


                                                                                


Performance summary for click_events]:
Processed 20 rows at 10.45 rows/second
Input rate: 4.00 rows/second
Processing time: 1913ms


                                                                                


Performance summary for user_interactions]:
Processed 11 rows at 12.54 rows/second
Input rate: 2.20 rows/second
Processing time: 876ms

Performance summary for click_events]:
Processed 10 rows at 6.96 rows/second
Input rate: 2.00 rows/second
Processing time: 1436ms


                                                                                


Performance summary for page_views]:
Processed 17 rows at 8.59 rows/second
Input rate: 3.40 rows/second
Processing time: 1979ms

Performance summary for click_events]:
Processed 14 rows at 15.38 rows/second
Input rate: 2.79 rows/second
Processing time: 910ms

Performance summary for page_views]:
Processed 11 rows at 8.05 rows/second
Input rate: 2.20 rows/second
Processing time: 1367ms


                                                                                


Performance summary for user_interactions]:
Processed 8 rows at 4.10 rows/second
Input rate: 1.59 rows/second
Processing time: 1953ms


                                                                                


Performance summary for user_interactions]:
Processed 13 rows at 12.91 rows/second
Input rate: 2.61 rows/second
Processing time: 1006ms

Performance summary for page_views]:
Processed 10 rows at 6.23 rows/second
Input rate: 2.00 rows/second
Processing time: 1605ms

Performance summary for click_events]:
Processed 18 rows at 7.84 rows/second
Input rate: 3.61 rows/second
Processing time: 2295ms


                                                                                


Performance summary for user_interactions]:
Processed 9 rows at 9.53 rows/second
Input rate: 1.80 rows/second
Processing time: 944ms

Performance summary for page_views]:
Processed 13 rows at 8.79 rows/second
Input rate: 2.60 rows/second
Processing time: 1479ms

Performance summary for click_events]:
Processed 12 rows at 5.98 rows/second
Input rate: 2.40 rows/second
Processing time: 2007ms

Performance summary for click_events]:
Processed 5 rows at 17.54 rows/second
Input rate: 1.00 rows/second
Processing time: 285ms

Performance summary for user_interactions]:
Processed 11 rows at 25.70 rows/second
Input rate: 2.20 rows/second
Processing time: 428ms


                                                                                


Performance summary for page_views]:
Processed 15 rows at 14.56 rows/second
Input rate: 3.00 rows/second
Processing time: 1030ms

Performance summary for click_events]:
Processed 15 rows at 20.41 rows/second
Input rate: 3.00 rows/second
Processing time: 735ms

Performance summary for user_interactions]:
Processed 13 rows at 9.99 rows/second
Input rate: 2.60 rows/second
Processing time: 1301ms

Performance summary for page_views]:
Processed 10 rows at 6.81 rows/second
Input rate: 2.00 rows/second
Processing time: 1469ms

Performance summary for click_events]:
Processed 7 rows at 12.84 rows/second
Input rate: 1.40 rows/second
Processing time: 545ms


                                                                                


Performance summary for page_views]:
Processed 9 rows at 8.12 rows/second
Input rate: 1.80 rows/second
Processing time: 1108ms

Performance summary for user_interactions]:
Processed 19 rows at 11.57 rows/second
Input rate: 3.80 rows/second
Processing time: 1642ms


                                                                                


Performance summary for user_interactions]:
Processed 17 rows at 17.35 rows/second
Input rate: 3.40 rows/second
Processing time: 980ms


                                                                                


Performance summary for click_events]:
Processed 27 rows at 16.97 rows/second
Input rate: 5.40 rows/second
Processing time: 1591ms

Performance summary for page_views]:
Processed 14 rows at 6.73 rows/second
Input rate: 2.80 rows/second
Processing time: 2080ms

Performance summary for click_events]:
Processed 9 rows at 15.87 rows/second
Input rate: 1.80 rows/second
Processing time: 567ms

Performance summary for page_views]:
Processed 14 rows at 12.58 rows/second
Input rate: 2.80 rows/second
Processing time: 1113ms

Performance summary for user_interactions]:
Processed 6 rows at 3.57 rows/second
Input rate: 1.20 rows/second
Processing time: 1683ms

Performance summary for click_events]:
Processed 14 rows at 17.28 rows/second
Input rate: 2.80 rows/second
Processing time: 810ms

Performance summary for page_views]:
Processed 17 rows at 12.43 rows/second
Input rate: 3.40 rows/second
Processing time: 1368ms

Performance summary for user_interactions]:
Processed 24 rows at 15.32 rows/second

                                                                                


Performance summary for page_views]:
Processed 17 rows at 23.91 rows/second
Input rate: 3.40 rows/second
Processing time: 711ms

Performance summary for click_events]:
Processed 27 rows at 21.57 rows/second
Input rate: 5.40 rows/second
Processing time: 1252ms

Performance summary for user_interactions]:
Processed 19 rows at 10.56 rows/second
Input rate: 3.80 rows/second
Processing time: 1799ms

Performance summary for click_events]:
Processed 11 rows at 16.42 rows/second
Input rate: 2.20 rows/second
Processing time: 670ms

Performance summary for page_views]:
Processed 6 rows at 4.94 rows/second
Input rate: 1.20 rows/second
Processing time: 1215ms


                                                                                


Performance summary for user_interactions]:
Processed 7 rows at 3.98 rows/second
Input rate: 1.40 rows/second
Processing time: 1758ms


                                                                                


Performance summary for page_views]:
Processed 7 rows at 10.37 rows/second
Input rate: 1.40 rows/second
Processing time: 675ms

Performance summary for click_events]:
Processed 26 rows at 21.33 rows/second
Input rate: 5.20 rows/second
Processing time: 1219ms

Performance summary for user_interactions]:
Processed 7 rows at 4.19 rows/second
Input rate: 1.40 rows/second
Processing time: 1669ms

Performance summary for page_views]:
Processed 11 rows at 16.25 rows/second
Input rate: 2.20 rows/second
Processing time: 677ms

Performance summary for user_interactions]:
Processed 5 rows at 4.11 rows/second
Input rate: 1.00 rows/second
Processing time: 1217ms

Performance Analysis fro unknown:
Average processing rate: 12.37 rows/second
Peak processing rate: 64.39 rows/second
Average processing time: 1209.55ms
Peak processing time: 4386ms



Performance summary for click_events]:
Processed 14 rows at 8.02 rows/second
Input rate: 2.80 rows/second
Processing time: 1745ms
Query terminated: 6d1873b1-444e-4aed-9f88-f985e0238522 at 2025-05-13 02:52:06.811165
Query terminated: 26e943bf-4ad6-455b-8e7e-9017a2def1d9 at 2025-05-13 02:52:06.811754
Query terminated: 181d51de-4ee4-4850-9f0b-14b43949beab at 2025-05-13 02:52:06.812366


In [None]:
# Shut down the SparkContext created with the session once the run is finished.
sc.stop()