# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>

#### <center> **Final Project: Structured Streaming** </center>
---

**Date**: 23 November, 2025

**Student Name**: Antonia Horburger

**Professor**: Pablo Camarillo Ramirez

In [2]:
import findspark; findspark.init()
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Final Project: Streaming Processing")
    .master("spark://spark-master:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("ERROR")

base_path = "/opt/spark/work-dir"
stream_in  = f"{base_path}/data/stream/btc/incoming"
stream_out = f"{base_path}/data/stream/btc/persisted"
chkpt_path = f"{base_path}/data/stream/btc/checkpoints"

print("stream_in :", stream_in)
print("stream_out:", stream_out)
print("chkpt_path:", chkpt_path)


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/24 05:31:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


stream_in : /opt/spark/work-dir/data/stream/btc/incoming
stream_out: /opt/spark/work-dir/data/stream/btc/persisted
chkpt_path: /opt/spark/work-dir/data/stream/btc/checkpoints


## Dataset & Stream Creation

In [3]:
from antoniahorburger.spark_utils import SparkUtils
from pyspark.sql.types import LongType

btc_schema_columns = [
    ("timestamp", "string"),
    ("price",     "double"),
    ("quantity",  "double"),
    ("side",      "string"),
    ("level",     "int"),
    ("update_id", "string"),
]

btc_schema = SparkUtils.generate_schema(btc_schema_columns)

import os
os.makedirs(stream_in, exist_ok=True)

raw_stream = (
    spark.readStream
         .format("csv")
         .option("header", "true")
         .schema(btc_schema)
         .load(stream_in)
)

## Transformations

In [4]:
from pyspark.sql.functions import (
    col, from_unixtime, to_timestamp, window,
    avg, sum as _sum, min as _min, max as _max, to_date
)
from pyspark.sql.types import LongType

ds = (
    raw_stream
      .withColumn("timestamp_ms", col("timestamp").cast(LongType()))
      .withColumn("update_id_long", col("update_id").cast(LongType()))
      .withColumn(
          "event_time",
          to_timestamp(from_unixtime((col("timestamp_ms") / 1000).cast("long")))
      )
      .withColumn("price",    col("price").cast("double"))
      .withColumn("quantity", col("quantity").cast("double"))
      .withColumn("level",    col("level").cast("int"))
)

feat_per_min = (
    ds
      .withWatermark("event_time", "2 minutes")
      .groupBy(
          window(col("event_time"), "1 minute").alias("w"),
          col("side"),
      )
      .agg(
          avg("price").alias("avg_price"),
          _sum("quantity").alias("total_qty"),
          _min("price").alias("min_price"),
          _max("price").alias("max_price"),
      )
)

feat_per_min_partitioned = feat_per_min.withColumn(
    "date",
    to_date(col("w.start"))
)

## Persistence with Vertical Partition

In [5]:
for q in spark.streams.active:
    q.stop()

sink_query = (
    feat_per_min_partitioned
        .writeStream
        .format("parquet")
        .option("path", stream_out)
        .option("checkpointLocation", chkpt_path)
        .partitionBy("date")
        .outputMode("append")
        .start()
)

sink_query

<pyspark.sql.streaming.query.StreamingQuery at 0x71eecd9378b0>

                                                                                

In [6]:
sink_query.lastProgress

{
  "id" : "497649b5-6a86-4495-bf1f-90bc60b0d497",
  "runId" : "5e0aacba-108a-4b1d-b314-49c18c8fce0a",
  "name" : null,
  "timestamp" : "2025-11-24T05:35:13.362Z",
  "batchId" : 3,
  "batchDuration" : 32,
  "numInputRows" : 0,
  "inputRowsPerSecond" : 0.0,
  "processedRowsPerSecond" : 0.0,
  "durationMs" : {
    "latestOffset" : 32,
    "triggerExecution" : 32
  },
  "eventTime" : {
    "watermark" : "2025-08-11T13:58:34.000Z"
  },
  "stateOperators" : [ {
    "operatorName" : "stateStoreSave",
    "numRowsTotal" : 4,
    "numRowsUpdated" : 0,
    "allUpdatesTimeMs" : 15,
    "numRowsRemoved" : 0,
    "allRemovalsTimeMs" : 90,
    "commitTimeMs" : 35730,
    "memoryUsedBytes" : 90720,
    "numRowsDroppedByWatermark" : 0,
    "numShufflePartitions" : 200,
    "numStateStoreInstances" : 200,
    "customMetrics" : {
      "loadedMapCacheHitCount" : 800,
      "loadedMapCacheMissCount" : 0,
      "stateOnCurrentVersionSizeBytes" : 24992
    }
  } ],
  "sources" : [ {
    "description" : "F

In [7]:
import os

for root, dirs, files in os.walk(stream_out):
    print(root, "->", files)

/opt/spark/work-dir/data/stream/btc/persisted -> []
/opt/spark/work-dir/data/stream/btc/persisted/_spark_metadata -> ['.0.crc', '.1.crc', '.2.crc', '0', '1', '2']


In [None]:
for q in spark.streams.active:
    q.stop()

In [None]:
from pyspark.sql.functions import count

df_persisted = spark.read.parquet(stream_out)

df_persisted.printSchema()
df_persisted.show(10, truncate=False)

# 2) Build an aggregated table for Power BI
df_powerbi = (
    df_persisted
      .groupBy("date", "side")
      .agg(
          count("*").alias("rows"),
          avg("avg_price").alias("mean_avg_price"),
          avg("total_qty").alias("mean_total_qty")
      )
)

df_powerbi.show(10, truncate=False)

powerbi_path = f"{base_path}/data/stream/btc/powerbi"

(df_powerbi
    .coalesce(1)                  
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv(powerbi_path)
)

**Aviso:** Implementé todo el código para la persistencia y la exportación a Power BI pero en mi entorno local Spark no logró escribir los archivos Parquet (solo generó _spark_metadata). Probé reiniciar el checkpoint, las carpetas y el producer, pero el error seguía apareciendo. Por el tiempo de entrega lo dejo así, pero antes de la presentación voy a intentar de corregirlo.