In [37]:
from pyspark.sql import SparkSession

## Bronze: Ingesting Raw Data

In [38]:
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Load the bronze-layer trades table and report how many rows it holds.
df = spark.read.parquet("data/bronze/crypto_trades")
print(f"rows: {df.count()}")

# Print the 10 rows with the highest offset, without truncating cell contents.
df.sort("offset", ascending=False).show(n=10, truncate=False)

rows: 159
+-------------+---------+------+-----------------------+-------+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topic        |partition|offset|ts_kafka               |ts_type|key     |value_raw                                                                                                                                                          |
+-------------+---------+------+-----------------------+-------+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|crypto.trades|0        |158   |2025-09-02 13:52:18.164|0      |XBT/USDT|{"exchange":"kraken","symbol":"XBT/USDT","price":111315.0,"size":0.00806979,"side":"sell","order_type":"market","ts_event":1756846338133,"ts_ingest":1756846338163}|
|crypto.trades|0        |157   |2025-0

## Silver: Processing Raw Data

In [39]:
# Load the silver-layer trades table and report how many rows it holds.
sdf = spark.read.parquet("data/silver/trades")
print(f"rows: {sdf.count()}")

# Project the trade columns, newest event_time first, and print 10 rows
# without truncating cell contents.
(
    sdf.select(
        "symbol", "price", "size", "side",
        "event_time", "ingest_time",
    )
    .sort("event_time", ascending=False)
    .show(n=10, truncate=False)
)


rows: 157
+--------+--------+----------+----+-----------------------+-----------------------+
|symbol  |price   |size      |side|event_time             |ingest_time            |
+--------+--------+----------+----+-----------------------+-----------------------+
|XBT/USDT|111315.1|0.0021    |sell|2025-09-02 13:52:18.133|2025-09-02 13:52:18.163|
|XBT/USDT|111315.2|5.0E-5    |sell|2025-09-02 13:52:18.133|2025-09-02 13:52:18.162|
|XBT/USDT|111315.2|0.08978021|sell|2025-09-02 13:52:18.133|2025-09-02 13:52:18.163|
|XBT/USDT|111315.0|0.00806979|sell|2025-09-02 13:52:18.133|2025-09-02 13:52:18.163|
|XBT/USDT|111349.9|0.0011639 |sell|2025-09-02 13:50:21.804|2025-09-02 13:50:21.911|
|XBT/USDT|111350.0|0.00103337|buy |2025-09-02 13:50:11.581|2025-09-02 13:50:11.616|
|XBT/USDT|111293.2|9.26E-6   |sell|2025-09-02 13:45:33.036|2025-09-02 13:45:33.141|
|XBT/USDT|111284.2|9.0E-4    |sell|2025-09-02 13:42:45.863|2025-09-02 13:42:45.924|
|XBT/USDT|111274.2|0.0191    |sell|2025-09-02 13:42:45.863|2025-09

## Gold: Aggregating to 1-Minute Intervals

In [40]:
# Load the gold-layer bars_1m table and report how many rows it holds.
g = spark.read.parquet("data/gold/bars_1m")
print("rows:", g.count())

# Sort newest bar_start first, then project the bar columns and print 10 rows
# without truncating cell contents.
(
    g.sort("bar_start", ascending=False)
    .select(
        "symbol", "bar_start",
        "open", "high", "low", "close",
        "volume", "vwap", "trades",
    )
    .show(n=10, truncate=False)
)


rows: 48
+--------+-------------------+--------+--------+--------+--------+---------------------+------------------+------+
|symbol  |bar_start          |open    |high    |low     |close   |volume               |vwap              |trades|
+--------+-------------------+--------+--------+--------+--------+---------------------+------------------+------+
|XBT/USDT|2025-09-02 13:45:00|111293.2|111293.2|111293.2|111293.2|9.26E-6              |111293.20000000001|1     |
|XBT/USDT|2025-09-02 13:42:00|111319.4|111319.4|111274.2|111284.2|0.02121393           |111277.21074039557|3     |
|XBT/USDT|2025-09-02 13:41:00|111344.8|111344.8|111344.8|111344.8|1.7455E-4            |111344.8          |1     |
|XBT/USDT|2025-09-02 13:40:00|111321.3|111350.0|111321.3|111350.0|0.00182073           |111340.55075931082|3     |
|XBT/USDT|2025-09-02 13:37:00|111321.4|111321.4|111321.4|111321.4|9.87E-6              |111321.4          |1     |
|XBT/USDT|2025-09-02 13:35:00|111230.0|111233.8|111230.0|111233.8|0.001