# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>

#### <center> **Final Project: Structured Streaming** </center>
---

**Date**: November, 2025

**Student Name**: Ana Carolina Arellano Valdez

**Professor**: Pablo Camarillo Ramirez

# Producer

In [17]:
from carolinarellano.fake_data import FakeDataGenerator

# Producer: build the fake-data generator used to create the CSV inputs.
generator = FakeDataGenerator()

# Requested dataset sizes, forwarded to the generator as keyword arguments.
dataset_sizes = {"n_tracks": 250, "n_users": 50, "n_events": 1000}

events_df = generator.generate_and_save_all(**dataset_sizes)

Generating fake data...
Saved tracks.csv with 250 rows to /opt/spark/work-dir/data/carolinarellano/spotify_logs/tracks.csv
Saved user_events.csv with 1000 rows to /opt/spark/work-dir/data/carolinarellano/spotify_logs/user_events.csv
Data generation complete!


In [18]:
# Sanity check of the producer output: print the working directory (the
# relative paths below resolve against it), list the generated files, and
# preview the first five lines of the events CSV (header included).
!pwd
!ls -l data/carolinarellano/spotify_logs
!head -n 5 data/carolinarellano/spotify_logs/user_events.csv

/opt/spark/work-dir
total 76
-rw-r--r-- 1 root root 75524 Nov 21 19:30 user_events.csv
user_id,track_id,event_type,event_ts,session_id,device
user_0038,track_0188,play,2025-09-03 05:12:00.065222,session_004165,tv
user_0033,track_0134,skip,2025-10-21 19:55:22.065222,session_002724,desktop
user_0011,track_0063,play,2025-10-09 15:20:29.065222,session_003043,desktop
user_0001,track_0235,play,2025-09-28 03:51:23.065222,session_004413,desktop


# Consumer
## Dataset and Stream creation 

In [19]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create (or reuse) the Spark session connected to the cluster master.
spark = (
    SparkSession.builder
    .appName("Structured Streaming - Spotify Logs")
    .master("spark://spark-master:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep the notebook output readable: only errors are logged.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Optimization (reduce the number of shuffle partitions)
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [20]:
from carolinarellano.spark_utils import SparkUtils

# Columns of user_events.csv, in header order; every column is read as a string.
user_event_columns = (
    "user_id",
    "track_id",
    "event_type",
    "event_ts",
    "session_id",
    "device",
)

user_events_schema = SparkUtils.generate_schema(
    [(column_name, "string") for column_name in user_event_columns]
)

## Transformations and Actions

In [21]:
from pyspark.sql.types import *
from pyspark.sql.functions import col, date_format, trim, try_to_timestamp, year

# Streaming source: read the CSV files in the landing directory with the
# explicit user-events schema (comma-separated, first line is a header).
df = (
    spark.readStream
    .option("sep", ",")
    .option("header", "true")
    .schema(user_events_schema)
    .csv("data/carolinarellano/spotify_logs/")
)

# Parse event_ts once and derive every date part from the parsed value.
# Previously year came from a date cast while month/day were positional
# substrings of the raw text, so a malformed or whitespace-padded timestamp
# produced a null year but arbitrary month/day values. Now all three are
# null together when the timestamp cannot be parsed.
event_time = try_to_timestamp(trim(col("event_ts")))

# year stays an integer; month and day stay zero-padded two-character strings
# ("09", "03"), matching the values the substring version produced for
# well-formed timestamps.
df_with_year = df.withColumn("year", year(event_time))
df_with_month = df_with_year.withColumn("month", date_format(event_time, "MM"))
df_with_day = df_with_month.withColumn("day", date_format(event_time, "dd"))



## Data Persistence

In [22]:
logs_detailed = df_with_day.select("*")
logs_detailed.printSchema()

# How long the console preview is allowed to run before it is stopped.
CONSOLE_PREVIEW_SECONDS = 30

# Print a few rows to verify
query = (
    logs_detailed.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", "false")
    .start()
)

# Always stop the preview query, even if the wait is interrupted or the
# query fails; otherwise it keeps running in the background of the kernel.
try:
    query.awaitTermination(CONSOLE_PREVIEW_SECONDS)
finally:
    query.stop()

root
 |-- user_id: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- event_ts: string (nullable = true)
 |-- session_id: string (nullable = true)
 |-- device: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)



                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---------+----------+----------+--------------------------+--------------+----------+----+-----+---+
|user_id  |track_id  |event_type|event_ts                  |session_id    |device    |year|month|day|
+---------+----------+----------+--------------------------+--------------+----------+----+-----+---+
|user_0038|track_0188|play      |2025-09-03 05:12:00.065222|session_004165|tv        |2025|09   |03 |
|user_0033|track_0134|skip      |2025-10-21 19:55:22.065222|session_002724|desktop   |2025|10   |21 |
|user_0011|track_0063|play      |2025-10-09 15:20:29.065222|session_003043|desktop   |2025|10   |09 |
|user_0001|track_0235|play      |2025-09-28 03:51:23.065222|session_004413|desktop   |2025|09   |28 |
|user_0020|track_0035|skip      |2025-11-15 20:57:40.065222|session_002721|desktop   |2025|11   |15 |
|user_0047|track_0026|seek      |2025-10-21 17:55:13.065222|session_002394|tv        |2

In [24]:
base_path = "/opt/spark/work-dir/data/"

# How long the parquet sink is allowed to run before it is stopped.
# start() returns immediately, so without a bounded wait a following
# spark.stop() would tear the session down before the sink has had a
# chance to process the available files.
PARQUET_RUN_SECONDS = 60

# Persist the enriched events as parquet, partitioned by year and month,
# with a micro-batch triggered every 10 seconds.
logs_detailed_query = (
    logs_detailed.writeStream
    .outputMode("append")
    .trigger(processingTime='10 seconds')
    .partitionBy("year", "month")
    .format("parquet")
    .option("path", base_path + "carolinarellano/processed/spotify_logs_detailed/")
    .option("checkpointLocation", base_path + "carolinarellano/checkpoints/spotify_logs_detailed/parquet/")
    .start()
)

# Let the query run for a bounded time, then stop it explicitly so the
# notebook works under "Restart Kernel -> Run All".
try:
    logs_detailed_query.awaitTermination(PARQUET_RUN_SECONDS)
finally:
    logs_detailed_query.stop()

In [25]:
# Stop any streaming queries that are still active before shutting down the
# session, so none is left running when the session goes away.
for active_query in spark.streams.active:
    active_query.stop()

spark.stop()

## Power BI Dashboard