# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>

#### <center> **Final Project: Structured Streaming** </center>
---

**Date**: November, 2025

**Student Name**: Ana Carolina Arellano Valdez

**Professor**: Pablo Camarillo Ramirez

# Producer

In [1]:
from carolinarellano.fake_data import FakeDataGenerator

# Size of the synthetic dataset, kept in one place so it is easy to tune.
N_TRACKS = 250
N_USERS = 50
N_EVENTS = 1000

# Producer: create the generator and have it write a batch of fake user events.
generator = FakeDataGenerator()
events_df = generator.generate_and_save_user_events(
    n_tracks=N_TRACKS, n_users=N_USERS, n_events=N_EVENTS
)

Generating fake user event data...
Saved user_events.txt with 1000 rows to /opt/spark/work-dir/data/carolinarellano/spotify_logs/user_events.txt
User event data generation complete!


In [2]:
!pwd
!ls -l data/carolinarellano/spotify_logs
!head -n 5 data/carolinarellano/spotify_logs/user_events.csv

/opt/spark/work-dir
total 168
-rw-r--r-- 1 root root 14136 Oct 30 13:03 tracks.csv
-rw-r--r-- 1 root root 75317 Oct 30 13:03 user_events.csv
-rw-r--r-- 1 root root 75322 Nov 21 16:28 user_events.txt
user_id,track_id,event_type,event_ts,session_id,device
user_0014,track_0074,seek,2025-09-22 16:11:50.007743,session_001586,mobile
user_0009,track_0210,play,2025-10-17 04:51:52.007743,session_003926,web player
user_0040,track_0183,play,2025-10-13 21:26:40.007743,session_001201,web player
user_0013,track_0165,skip,2025-08-18 06:42:02.007743,session_003662,web player


# Consumer
## Dataset and Stream creation 

In [3]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create (or reuse) the session that talks to the standalone cluster master.
spark = (
    SparkSession.builder
    .appName("Structured Streaming - Spotify Logs")
    .master("spark://spark-master:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Optimization: reduce the number of shuffle partitions.
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Only surface errors in the driver log output.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/21 16:28:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
from carolinarellano.spark_utils import SparkUtils

# Every column of the user-event log is read as a string; the order below must
# match the column order in the log files.
user_event_columns = [
    "user_id",
    "track_id",
    "event_type",
    "event_ts",
    "session_id",
    "device",
]
user_events_schema = SparkUtils.generate_schema(
    [(column_name, "string") for column_name in user_event_columns]
)

## Transformations and Actions

In [16]:
from pyspark.sql.types import *
from pyspark.sql.functions import col, trim, try_to_timestamp, year

# Stream the user-event logs as CSV with an explicit schema (streaming file
# sources cannot infer one). The log folder also contains tracks.csv, which is
# not an event log, so pathGlobFilter restricts the source to user_events*
# files; otherwise those rows would be parsed with the event schema.
df = (
    spark.readStream
    .option("sep", ",")
    .option("header", "true")
    .option("pathGlobFilter", "user_events*")
    .schema(user_events_schema)
    .csv("data/carolinarellano/spotify_logs/")
)

# Derive the partition columns from event_ts ("YYYY-MM-DD hh:mm:ss.ffffff").
# year: event_ts is a string, so year() relies on Spark's implicit cast.
df_with_year = df.withColumn("year", year(col("event_ts")))
# month / day: two-character slices of the string (1-based start position),
# which keeps them zero-padded strings such as "09".
df_with_month = df_with_year.withColumn("month", col("event_ts").substr(6, 2))
df_with_day = df_with_month.withColumn("day", col("event_ts").substr(9, 2))

## Data Persistence

In [17]:
# Keep every column of the enriched stream and confirm the resulting schema.
logs_detailed = df_with_day.select(*df_with_day.columns)
logs_detailed.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- event_ts: string (nullable = true)
 |-- session_id: string (nullable = true)
 |-- device: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)



In [19]:

# Persist the enriched stream as Parquet, partitioned by event date, writing a
# micro-batch every 10 seconds. The checkpoint directory records progress for
# the query. (The CSV-only "header" option was dropped: the Parquet sink
# ignores it.)
logs_detailed_query = (
    logs_detailed.writeStream
    .trigger(processingTime="10 seconds")
    .partitionBy("year", "month", "day")
    .format("parquet")
    .option("path", "data/carolinarellano/processed/spotify_logs_detailed/")
    .option("checkpointLocation", "data/carolinarellano/checkpoints/spotify_logs_detailed/")
    .start()
)

In [20]:
# Shut down cleanly: let each running streaming query finish the data that is
# already available, stop it, and only then stop the session. Stopping the
# session while a query is still running kills it mid-batch, and under
# "Run All" that would happen right after start().
# NOTE: processAllAvailable() blocks while new files keep arriving; it is safe
# here because the producer writes a fixed batch of files.
try:
    for active_query in spark.streams.active:
        active_query.processAllAvailable()
        active_query.stop()
finally:
    # Always release the cluster resources, even if a query failed.
    spark.stop()

## Power BI Dashboard