In [1]:
from delta.tables import DeltaTable
from pyspark.sql.functions import coalesce, col, lit, when
from pyspark.sql.protobuf.functions import from_protobuf
from pyspark.sql.streaming import StreamingQuery
from pyspark.sql.types import DateType

In [2]:
from pathlib import Path

def read_binary_at(path: Path) -> bytes:
    """Return the complete contents of the file at ``path`` as raw bytes.

    Args:
        path: Location of the file to read.

    Returns:
        Every byte of the file, read in binary mode.
    """
    return path.read_bytes()

## Note on Running the Following Examples
1. If you have gone through the process of running [highwire](https://github.com/datacircus/highwire) and have populated data using [100_automatic_kafka_to_tin_delta.ipynb](./100_automatic_kafka_to_tin_delta.ipynb), then you will have the table `default.coffeeco_v1_orders_tin`.
2. If you haven't run the 100 level notebook, you can change the Delta reference to point to `/opt/spark/work-dir/hitchhikers_guide/datasets/coffeeco_v1_orders_tin` so that you don't need to build and run `highwire`.

In [5]:
# Application identity; both values are folded into the checkpoint location below.
app_name = "ringmaster_tin_to_bronze"
app_version = "v0.0.1"

# Delta table holding the raw ("tin") records that this notebook reads.
delta_source_table = "default.coffeeco_v1_orders_tin"
# NOTE(review): this is the same value as delta_source_table and the name is not
# used by any cell visible here — confirm the intended sink table name.
delta_sink_table = "default.coffeeco_v1_orders_tin"

# Fully-qualified protobuf message name and the compiled descriptor describing it.
tin_protobuf_message_name = "coffeeco.v1.Order"
_guide_root = Path('/opt/spark/work-dir/hitchhikers_guide')
protobuf_descriptor_path: Path = (
    _guide_root / "common" / "protobuf" / "coffeeco_v1" / "descriptor.bin"
).absolute()

# Streaming checkpoint location, namespaced by application name and version.
checkpoint_dir = "/opt/spark/work-dir/hitchhikers_guide/applications"
checkpoint_path = f"{checkpoint_dir}/{app_name}/{app_version}/_checkpoints"
print(f"checkpoint_path={checkpoint_path}")

checkpoint_path=/opt/spark/work-dir/hitchhikers_guide/applications/ringmaster_tin_to_bronze/v0.0.1/_checkpoints


In [46]:
# Show which tables the catalog currently knows about.
# NOTE(review): `spark` is never created in the visible cells — assumes the kernel
# provides an active SparkSession; confirm for a fresh Restart & Run All.
spark.catalog.listTables()

[Table(name='coffee_orders_base', catalog='spark_catalog', namespace=['default'], description='lineage:default.coffeeco_v1_orders_tin:coffee_orders_base', tableType='MANAGED', isTemporary=False),
 Table(name='coffeeco_tin_dq_rules', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='coffeeco_v1_orders_tin', catalog='spark_catalog', namespace=['default'], description='kafka-rp:coffeeco.v1.orders:tin', tableType='MANAGED', isTemporary=False)]

## We will start simple. We will fetch the Table and Decode in Batch
> This allows us to ensure things are wired up correctly, and lets us have fun exploring the data.

In [6]:
# Look up the source ("tin") Delta table by its catalog name.
dt_source_tin: DeltaTable = DeltaTable.forName(spark, delta_source_table)

In [53]:
# Preview the raw rows; `key` and `value` are still undecoded binary at this point.
dt_source_tin.toDF().show()

+--------------------+--------------------+--------------------+----------+
|                 key|               value|           timestamp|      date|
+--------------------+--------------------+--------------------+----------+
|[53 63 6F 74 74 2...|[0A 0C 08 F4 DA 9...|2024-06-11 06:23:...|2024-06-11|
|[53 63 6F 74 74 2...|[0A 0C 08 F6 DA 9...|2024-06-11 06:23:...|2024-06-11|
|[53 63 6F 74 74 2...|[0A 0C 08 F7 DA 9...|2024-06-11 06:23:...|2024-06-11|
|[53 63 6F 74 74 2...|[0A 0C 08 F8 DA 9...|2024-06-11 06:23:...|2024-06-11|
|[53 63 6F 74 74 2...|[0A 0C 08 F9 DA 9...|2024-06-11 06:23:...|2024-06-11|
|[41 73 68 6F 6B 2...|[0A 0C 08 9A FE A...|2024-06-11 16:45:...|2024-06-11|
|[53 63 6F 74 74 2...|[0A 0C 08 9F FE A...|2024-06-11 16:45:...|2024-06-11|
|[53 63 6F 74 74 2...|[0A 0C 08 A1 FE A...|2024-06-11 16:45:...|2024-06-11|
|[41 73 68 6F 6B 2...|[0A 0C 08 A2 FE A...|2024-06-11 16:45:...|2024-06-11|
|[41 73 68 6F 6B 2...|[0A 0C 08 A4 FE A...|2024-06-11 16:45:...|2024-06-11|
|[41 73 68 6

## Next. We need to load the Binary Protobuf Descriptor and Decode the Protobuf
1. Using the `from_protobuf` method and the `binaryDescriptorSet` we will decode the `coffeeco.v1.Order` messages.
2. Once we've decoded the `value:bytes` into `order:struct`, we can have fun with the baseline dataset

In [7]:
# Load the compiled protobuf descriptor bytes that from_protobuf needs for decoding.
coffeecov1_bin = read_binary_at(protobuf_descriptor_path)

In [8]:
# Column expression that decodes the binary `value` payload into a struct.
# FAILFAST mode is requested so that decoding problems raise instead of passing silently.
order_struct_col = from_protobuf(
    data=col("value"),
    messageName=tin_protobuf_message_name,
    options={"mode": "FAILFAST"},
    binaryDescriptorSet=coffeecov1_bin,
)

# Batch view of the tin table: the two time columns plus the decoded `order` struct.
coffee_orders_df = dt_source_tin.toDF().select(
    "date",
    "timestamp",
    order_struct_col.alias("order"),
)

In [24]:
# Inspect the nested structure of the decoded `order` column.
coffee_orders_df.printSchema()

root
 |-- date: date (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- order: struct (nullable = true)
 |    |-- order_created: timestamp (nullable = true)
 |    |-- purchased_at: struct (nullable = true)
 |    |    |-- store_id: string (nullable = true)
 |    |    |-- created: timestamp (nullable = true)
 |    |    |-- opened_on: timestamp (nullable = true)
 |    |    |-- closed_permanently_on: timestamp (nullable = true)
 |    |    |-- status: string (nullable = true)
 |    |-- customer: struct (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- uuid: string (nullable = true)
 |    |    |-- first_seen: timestamp (nullable = true)
 |    |    |-- customer_type: string (nullable = true)
 |    |    |-- loyalty_member_id: string (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = false)
 |    |    |    |-- coffee: struct (nullable = true)
 |    |    |    |    |-- coffee: struct (nullable = tr

## Take the Base Table (Tin) and Create a Useful Bronze Table
> This table cherry-picks what it needs, ignoring the rest.

In [35]:
from pyspark.sql.functions import struct, col, concat_ws, lit

# Flatten the decoded order into the handful of columns the bronze table keeps.
#
# `order_total` is derived from the `units` / `nanos` pair of `order.total`.
# Assumes the pair follows google.type.Money, where `nanos` counts billionths of
# a unit — TODO confirm against the descriptor.
#
# The amount is computed arithmetically (units + nanos / 1e9). Joining the two
# numbers as text with a "." is only right when `nanos` happens to have nine
# digits: 4 units + 50_000_000 nanos would read as "4.50000000" (4.5) instead of 4.05.
#
# Either field may decode as NULL (presumably zero-valued proto3 scalars are not
# emitted by default — verify), so each falls back to 0. A completely missing
# `order.total` stays NULL because `when` has no `otherwise` branch.
df_transformed = (
    coffee_orders_df
    .select(
        col("order.order_created").alias("order_created"),
        col("order.customer.uuid").alias("customer_uuid"),
        col("order.customer.customer_type").alias("customer_type"),
        col("order.customer.loyalty_member_id").alias("customer_loyalty"),
        col("order.items.coffee").alias("coffee"),
        col("order.total").alias("order_total"),
    )
    .withColumn(
        "order_total",
        when(
            col("order_total").isNotNull(),
            coalesce(col("order_total.units"), lit(0))
            + coalesce(col("order_total.nanos"), lit(0)) / 1_000_000_000,
        ).cast("double"),
    )
)

In [34]:
# Preview the flattened orders, including the numeric `order_total`.
df_transformed.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
|       order_created|       customer_uuid|       customer_type|    customer_loyalty|              coffee|order_total|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
|2024-06-11 06:23:...|36366365-3638-393...|CUSTOMER_TYPE_MEMBER|5f02394e-aa12-43a...|[{{STREETLEVEL, C...|       4.75|
|2024-06-11 06:23:...|36366365-3638-393...|CUSTOMER_TYPE_MEMBER|d9aedc4a-2474-4b6...|[{{ASTER, COFFEE_...|       7.75|
|2024-06-11 06:23:...|36366365-3638-393...|CUSTOMER_TYPE_MEMBER|e1bbe192-1e57-4f2...|[{{SERMON, COFFEE...|      18.25|
|2024-06-11 06:23:...|36366365-3638-393...|CUSTOMER_TYPE_MEMBER|74dd6cb6-4f20-4c5...|[{{STREETLEVEL, C...|       11.5|
|2024-06-11 06:23:...|36366365-3638-393...|CUSTOMER_TYPE_MEMBER|01ae40b9-a561-410...|[{{ASTER, COFFEE_...|       25.0|
|2024-06-11 16:45:...|64616633-3331-653...|CUSTO

## Next. We Can Speed things up with Structured Streaming
> Using what we learned before, we can now write our nicely transformed data into our 'bronze' table.

In [None]:
from delta.tables import DeltaTable
from pyspark.sql.types import TimestampType

# Name of the bronze table that the streaming job below appends to.
delta_table_name = "coffee_v1_orders_base"

spark.conf.set("spark.databricks.delta.constraints.allowUnenforcedNotNull.enabled", "true")

# Create the bronze table once; createIfNotExists leaves an existing table as it is.
# Columns come from the batch-transformed frame, plus a generated `date` column
# (derived from `order_created`) that the table is clustered by.
# The lineage comment is built from the source and sink names so that it cannot
# drift from the table that is actually being created.
dt: DeltaTable = (
    DeltaTable.createIfNotExists(spark)
    .tableName(delta_table_name)
    .addColumns(df_transformed.schema)
    .addColumn("date", DateType(), generatedAlwaysAs="CAST(order_created AS DATE)")
    .comment(f"lineage:{delta_source_table}:{delta_table_name}")
    .clusterBy("date")
    .property("description", "This table stores our CoffeeCo Order History")
    .property("delta.logRetentionDuration", "interval 30 days")
    .property("delta.deletedFileRetentionDuration", "interval 7 days")
    .property("delta.dataSkippingNumIndexedCols", "7")
    .property("delta.checkpoint.writeStatsAsStruct", "true")
    .property("delta.checkpoint.writeStatsAsJson", "false")
    .property("delta.checkpointPolicy", "v2")
    .property("delta.enableDeletionVectors", "true")
    .execute()
)

In [55]:
# List the version numbers recorded in the bronze table's history.
dt.history().select("version").show()

+-------+
|version|
+-------+
|      2|
|      1|
|      0|
+-------+



In [None]:
# Roll the bronze table back to version 0.
# NOTE(review): this cell sits in the normal top-to-bottom flow, so Run All would
# execute it before the streaming write. Presumably an existing checkpoint will
# stop the stream from replaying source data it has already processed — verify
# (or clear the checkpoint) before relying on this reset.
dt.restoreToVersion(0)

In [None]:
# Incrementally decode the tin table and append the flattened orders to the bronze table.
#
# `order_total` is computed as units + nanos / 1e9 (assumes `order.total` follows
# google.type.Money, where `nanos` counts billionths of a unit — TODO confirm).
# Joining the two numbers as text with a "." gives wrong amounts whenever `nanos`
# has fewer than nine digits (4 units + 50_000_000 nanos would read as 4.5, not 4.05).
# Either field may decode as NULL, so each falls back to 0; a completely missing
# `order.total` stays NULL because `when` has no `otherwise` branch.
#
# NOTE(review): the Delta Lake docs appear to recommend `skipChangeCommits` over
# `ignoreChanges` on recent versions — confirm for the version in use.
streamingQuery = (
    spark.readStream
    .format("delta")
    .option("withEventTimeOrder", "true")
    .option("ignoreChanges", "true")
    .table(delta_source_table)
    .select(
        "date",
        "timestamp",
        from_protobuf(
            data=col("value"),
            messageName=tin_protobuf_message_name,
            options={"mode": "FAILFAST"},
            binaryDescriptorSet=coffeecov1_bin
        ).alias("order"),
    )
    .select(
        col("order.order_created").alias("order_created"),
        col("order.customer.uuid").alias("customer_uuid"),
        col("order.customer.customer_type").alias("customer_type"),
        col("order.customer.loyalty_member_id").alias("customer_loyalty"),
        col("order.items.coffee").alias("coffee"),
        col("order.total").alias("order_total")
    ).withColumn(
        "order_total",
        when(
            col("order_total").isNotNull(),
            coalesce(col("order_total.units"), lit(0))
            + coalesce(col("order_total.nanos"), lit(0)) / 1_000_000_000,
        ).cast("double")
    )
    .writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)
    .toTable(delta_table_name)
)

# The availableNow trigger processes what is currently available and then stops.
# Block until it has finished so the cells below read the completed result
# instead of racing the background query.
streamingQuery.awaitTermination()

In [51]:
# Check the query's current status; the recorded output below shows it as stopped.
streamingQuery.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

In [60]:
# Look up the bronze table by name so its contents can be inspected.
bronze_from_tin = DeltaTable.forName(spark, delta_table_name)

In [61]:
# Preview the bronze rows; the recorded output includes the generated `date` column.
bronze_from_tin.toDF().show()

+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+----------+
|       order_created|       customer_uuid|       customer_type|    customer_loyalty|              coffee|order_total|      date|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+----------+
|2024-06-11 06:23:...|36366365-3638-393...|CUSTOMER_TYPE_MEMBER|5f02394e-aa12-43a...|[{{STREETLEVEL, C...|       4.75|2024-06-11|
|2024-06-11 06:23:...|36366365-3638-393...|CUSTOMER_TYPE_MEMBER|d9aedc4a-2474-4b6...|[{{ASTER, COFFEE_...|       7.75|2024-06-11|
|2024-06-11 06:23:...|36366365-3638-393...|CUSTOMER_TYPE_MEMBER|e1bbe192-1e57-4f2...|[{{SERMON, COFFEE...|      18.25|2024-06-11|
|2024-06-11 06:23:...|36366365-3638-393...|CUSTOMER_TYPE_MEMBER|74dd6cb6-4f20-4c5...|[{{STREETLEVEL, C...|       11.5|2024-06-11|
|2024-06-11 06:23:...|36366365-3638-393...|CUSTOMER_TYPE_MEMBER|01ae40b9-a561-410...|[{{AS