In [1]:
from pyspark.sql import SparkSession

## Bronze: Ingesting Raw Data

In [2]:
# Reuse the active Spark session if there is one; otherwise create it.
# Later cells rely on this `spark` handle.
spark = SparkSession.builder.getOrCreate()

# Load the bronze-layer parquet dataset.
# NOTE(review): this path is relative to the working directory, while the
# silver and gold cells read from "../data/..." -- confirm which root is intended.
df = spark.read.parquet("data/bronze/crypto_trades")

bronze_row_count = df.count()
print("rows:", bronze_row_count)

# Show the ten records with the highest offsets, without truncating wide columns.
df.sort(df["offset"].desc()).show(n=10, truncate=False)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/17 15:04:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

rows: 612
+-------------+---------+------+-----------------------+-------+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topic        |partition|offset|ts_kafka               |ts_type|key     |value_raw                                                                                                                                                         |
+-------------+---------+------+-----------------------+-------+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|crypto.trades|0        |611   |2025-09-17 14:48:54.93 |0      |XBT/USDT|{"exchange":"kraken","symbol":"XBT/USDT","price":115998.6,"size":0.00055534,"side":"sell","order_type":"limit","ts_event":1758145734843,"ts_ingest":1758145734930}|
|crypto.trades|0        |610   |2025-09-17

## Silver: Processing Raw Data

In [3]:
# Load the silver-layer trades dataset (uses the `spark` session created earlier).
sdf = spark.read.parquet("../data/silver/trades")

silver_row_count = sdf.count()
print("rows:", silver_row_count)

# Columns to preview, in display order.
silver_preview_columns = ["symbol", "price", "size", "side", "event_time", "ingest_time"]

# Most recent trades first; print ten rows with full (untruncated) cell contents.
(
    sdf.select(*silver_preview_columns)
    .sort("event_time", ascending=False)
    .show(n=10, truncate=False)
)


rows: 595
+--------+--------+----------+----+-----------------------+-----------------------+
|symbol  |price   |size      |side|event_time             |ingest_time            |
+--------+--------+----------+----+-----------------------+-----------------------+
|XBT/USDT|115976.8|0.00176792|sell|2025-09-17 14:48:47.963|2025-09-17 14:48:48.038|
|XBT/USDT|115976.8|2.5684E-4 |sell|2025-09-17 14:48:46.402|2025-09-17 14:48:46.475|
|XBT/USDT|115979.9|2.4646E-4 |sell|2025-09-17 14:48:46.37 |2025-09-17 14:48:46.441|
|XBT/USDT|115979.9|2.4619E-4 |sell|2025-09-17 14:48:46.361|2025-09-17 14:48:46.431|
|XBT/USDT|115979.9|2.4592E-4 |sell|2025-09-17 14:48:46.351|2025-09-17 14:48:46.423|
|XBT/USDT|115976.5|5.54E-4   |sell|2025-09-17 14:46:43.057|2025-09-17 14:46:43.136|
|XBT/USDT|115976.5|5.5234E-4 |sell|2025-09-17 14:46:43.047|2025-09-17 14:46:43.123|
|XBT/USDT|115976.5|5.5069E-4 |sell|2025-09-17 14:46:43.037|2025-09-17 14:46:43.114|
|XBT/USDT|115976.5|5.4905E-4 |sell|2025-09-17 14:46:43.027|2025-09

## Gold: Aggregating to 1-Minute Intervals

In [4]:
# Load the gold-layer bars dataset (uses the `spark` session created earlier).
g = spark.read.parquet("../data/gold/bars_1m")
print(f'rows: {g.count()}')

# Columns to preview, in display order.
gold_preview_columns = [
    "symbol",
    "bar_start",
    "open",
    "high",
    "low",
    "close",
    "volume",
    "vwap",
    "trades",
]

# Newest bars first; print ten rows with full (untruncated) cell contents.
(
    g.sort("bar_start", ascending=False)
    .select(*gold_preview_columns)
    .show(n=10, truncate=False)
)


rows: 174
+--------+-------------------+--------+--------+--------+--------+----------+------------------+------+
|symbol  |bar_start          |open    |high    |low     |close   |volume    |vwap              |trades|
+--------+-------------------+--------+--------+--------+--------+----------+------------------+------+
|XBT/USDT|2025-09-07 21:34:00|111188.7|111188.7|111188.7|111188.7|6.8434E-4 |111188.70000000001|1     |
|XBT/USDT|2025-09-07 21:31:00|111139.5|111139.5|111139.5|111139.5|0.00360344|111139.5          |1     |
|XBT/USDT|2025-09-07 21:30:00|111127.9|111127.9|111127.9|111127.9|8.999E-5  |111127.9          |1     |
|XBT/USDT|2025-09-07 21:27:00|111116.4|111116.4|111116.4|111116.4|0.00449946|111116.4          |1     |
|XBT/USDT|2025-09-07 21:23:00|111103.3|111103.3|111103.3|111103.3|3.2434E-4 |111103.30000000002|1     |
|XBT/USDT|2025-09-07 21:17:00|111067.5|111067.5|111067.5|111067.5|0.04514099|111067.50000000003|3     |
|XBT/USDT|2025-09-07 21:16:00|111067.4|111067.4|111067