# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>

#### <center> **Final Project: Structured Streaming** </center>
---

**Date**: November, 2025

**Student Name**: Ana Carolina Arellano Valdez

**Professor**: Pablo Camarillo Ramirez

# Producer

In [22]:
from carolinarellano.fake_data import FakeDataGenerator

# Initialize the generator
generator = FakeDataGenerator()

tracks_df, events_df = generator.generate_and_save_all(
    n_tracks=250,      
    n_users=50,
    n_events=100
)

Generating fake data...
Saved tracks.txt with 250 rows to /opt/spark/work-dir/data/carolinarellano/spotify_logs/tracks.txt
Saved user_events.txt with 100 rows to /opt/spark/work-dir/data/carolinarellano/spotify_logs/user_events.txt
Data generation complete!


In [23]:
!pwd
!ls -l data/carolinarellano/spotify_logs
!head -n 5 data/carolinarellano/spotify_logs/user_events.txt

/opt/spark/work-dir


total 24
-rw-r--r-- 1 root root 13807 Nov 14 06:57 tracks.txt
-rw-r--r-- 1 root root  7574 Nov 14 06:57 user_events.txt
user_id|track_id|event_type|event_ts|session_id|device
user_0048|track_0183|play|2025-09-06 16:05:28.086028|session_000150|mobile
user_0036|track_0112|play|2025-09-09 14:56:42.086028|session_002909|desktop
user_0035|track_0089|skip|2025-09-03 07:04:20.086028|session_004532|web player
user_0031|track_0021|like|2025-10-19 08:46:49.086028|session_004243|tv


# Consumer
## Dataset and Stream creation 

In [24]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("Structured Streaming - Spotify Logs") \
    .master("spark://spark-master:7077") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")
# Optimization (reduce the number of shuffle partitions)
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [25]:
from carolinarellano.spark_utils import SparkUtils

user_events_schema = SparkUtils.generate_schema([
    ("user_id", "string"),
    ("track_id", "string"),
    ("event_type", "string"),
    ("event_timestamp", "string"),
    ("session_id", "string"),
    ("device", "string")
])

## Transformations and Actions

In [29]:
from pyspark.sql.functions import col, split, to_date, trim, year

user_events_streaming_df = spark.readStream \
    .format("text") \
    .load("/opt/spark/work-dir/data/carolinarellano/spotify_logs")

parsed_user_events_streaming_df = user_events_streaming_df.select(
    split(col("value"), " \\|").alias("parsed")
).select(
    col("parsed")[0].alias("user_id"),
    col("parsed")[1].alias("track_id"),
    col("parsed")[2].alias("event_type"),
    col("parsed")[3].alias("event_timestamp"),
    col("parsed")[4].alias("session_id"),
    col("parsed")[5].alias("device")
)

final_df = parsed_user_events_streaming_df.withColumn(
    "event_timestamp", to_date(trim(col("event_timestamp")), "yyyy-MM-dd HH:mm:ss")
).withColumn(
    "year", year(col("event_timestamp"))
)

final_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- event_timestamp: date (nullable = true)
 |-- session_id: string (nullable = true)
 |-- device: string (nullable = true)
 |-- year: integer (nullable = true)



## Persistence Data

In [30]:
query_event_type = final_df.groupBy("event_type").count()
# query_device = final_df.groupBy("device").count()
# query_year_device = final_df.groupBy("year", "device").count()
logs_detailed = final_df.select("user_id", "track_id", "event_type", "event_timestamp", "session_id", "device", "year")

In [31]:
event_type_query = query_event_type.writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false") \
    .start()

# device_query = query_device.writeStream \
#     .outputMode("complete") \
#     .format("console") \
#     .option("truncate", "false") \
#     .start()

# year_device_query = query_year_device.writeStream \
#     .outputMode("complete") \
#     .format("console") \
#     .option("truncate", "false") \
#     .start()

logs_detailed_query = logs_detailed.writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .start()

print("Streaming started...")
event_type_query.awaitTermination()
event_type_query.stop()
logs_detailed_query.awaitTermination()
logs_detailed_query.stop()

Streaming started...


25/11/14 07:03:46 ERROR TaskSetManager: Task 0 in stage 15.0 failed 4 times; aborting job
25/11/14 07:03:46 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 0, writer: ConsoleWriter[numRows=20, truncate=false]] is aborting.
25/11/14 07:03:46 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 0, writer: ConsoleWriter[numRows=20, truncate=false]] aborted.
25/11/14 07:03:46 ERROR MicroBatchExecution: Query [id = 724b6d54-ceef-43d7-8aab-709fc32d06b7, runId = 48fa94d0-6735-48f0-8ef1-ba27e27bdec8] terminated with error
org.apache.spark.SparkArrayIndexOutOfBoundsException: [INVALID_ARRAY_INDEX] The index 3 is out of bounds. The array has 1 elements. Use the SQL function `get()` to tolerate accessing element at invalid index and return NULL instead. SQLSTATE: 22003
== DataFrame ==
"__getitem__" was called from
line 13 in cell [29]

	at org.apache.spark.sql.errors.QueryExecutionErrors$.invalidArrayIndexError(QueryExecutionErrors.scala:

StreamingQueryException: [STREAM_FAILED] Query [id = 12a578ae-008c-455d-a458-162b6d47b7c5, runId = 4f211459-7881-467f-aaf9-0f1c2327b0de] terminated with exception: [INVALID_ARRAY_INDEX] The index 2 is out of bounds. The array has 1 elements. Use the SQL function `get()` to tolerate accessing element at invalid index and return NULL instead. SQLSTATE: 22003
== DataFrame ==
"__getitem__" was called from
line 12 in cell [29]
 SQLSTATE: XXKST
=== Streaming Query ===
Identifier: [id = 12a578ae-008c-455d-a458-162b6d47b7c5, runId = 4f211459-7881-467f-aaf9-0f1c2327b0de]
Current Committed Offsets: {}
Current Available Offsets: {FileStreamSource[file:/opt/spark/work-dir/data/carolinarellano/spotify_logs]: {"logOffset":0}}

Current State: ACTIVE
Thread State: RUNNABLE

Logical Plan:
~WriteToMicroBatchDataSource org.apache.spark.sql.execution.streaming.ConsoleTable$@79f0e677, 12a578ae-008c-455d-a458-162b6d47b7c5, [truncate=false], Complete
+- ~Aggregate [event_type#242], [event_type#242, count(1) AS count#248L]
   +- ~Project [user_id#240, track_id#241, event_type#242, event_timestamp#246, session_id#244, device#245, year(event_timestamp#246) AS year#247]
      +- ~Project [user_id#240, track_id#241, event_type#242, to_date(trim(event_timestamp#243, None), Some(yyyy-MM-dd HH:mm:ss), Some(Etc/UTC), true) AS event_timestamp#246, session_id#244, device#245]
         +- ~Project [parsed#238[0] AS user_id#240, parsed#238[1] AS track_id#241, parsed#238[2] AS event_type#242, parsed#238[3] AS event_timestamp#243, parsed#238[4] AS session_id#244, parsed#238[5] AS device#245]
            +- ~Project [split(value#237,  \|, -1) AS parsed#238]
               +- ~StreamingExecutionRelation FileStreamSource[file:/opt/spark/work-dir/data/carolinarellano/spotify_logs], [value#237]


## Power BI Dashboard