In [18]:
from pyspark.sql.functions import col
from pyspark.sql.protobuf.functions import from_protobuf
from pyspark.sql.types import DateType
from delta.tables import DeltaTable
from pyspark.sql.streaming import StreamingQuery

In [5]:
from pathlib import Path

def read_binary_at(path: Path) -> bytes:
    """Return the entire contents of the file at ``path`` as raw bytes.

    Args:
        path: Location of the file to read.

    Returns:
        The file's bytes, exactly as stored on disk.

    Raises:
        OSError: If the file cannot be opened or read (for example, it
            does not exist).
    """
    # Path.read_bytes opens the file in binary mode, reads it fully, and
    # closes the handle, which is what the manual open/read did by hand.
    return path.read_bytes()

## Note on Running the Following Examples
1. If you have gone through the process of running [highwire](https://github.com/datacircus/highwire) and have populated data using [100_automatic_kafka_to_tin_delta.ipynb](./100_automatic_kafka_to_tin_delta.ipynb), then you will have the table `default.coffeeco_v1_orders_tin`.
2. If you haven't run the 100-level notebook, you don't need to build and run `highwire`: load the table by path instead of by name, e.g. `DeltaTable.forPath(spark, "/opt/spark/work-dir/hitchhikers_guide/datasets/coffeeco_v1_orders_tin")`.

In [20]:
# Application identity; both values are folded into the checkpoint location below.
app_name = "ringmaster_tin_to_bronze"
app_version = "v0.0.1"

# Source: the Delta table for the "tin" layer.
delta_source_table = "default.coffeeco_v1_orders_tin"

# Binary protobuf descriptor set, plus the fully qualified name of the
# message type to decode with it.
protobuf_descriptor_path: Path = (
    Path('/opt/spark/work-dir/hitchhikers_guide')
    .joinpath("common","protobuf","coffeeco_v1","descriptor.bin")
    .absolute()
)
tin_protobuf_message_name = "coffeeco.v1.Order"

# Checkpoints are namespaced by application name and version.
checkpoint_dir = "/opt/spark/work-dir/hitchhikers_guide/applications"
checkpoint_path = f"{checkpoint_dir}/{app_name}/{app_version}/_checkpoints"
print(f"checkpoint_path={checkpoint_path}")

# Sink: must differ from the source, otherwise this tin -> bronze job would
# write back into the table it reads from.
# NOTE(review): this previously equalled delta_source_table; the bronze table
# name follows the existing naming pattern -- confirm it matches your catalog.
delta_sink_table = "default.coffeeco_v1_orders_bronze"

checkpoint_path=/opt/spark/work-dir/hitchhikers_guide/applications/ringmaster_tin_to_bronze/v0.0.1/_checkpoints


## We will start simple. We will fetch the Table and Decode in Batch
> This allows us to ensure things are wired up correctly, and lets us have fun exploring the data.

In [None]:
# Resolve the source table by name; relies on the notebook's `spark` session
# and on `delta_source_table` from the configuration cell.
dt_source_tin: DeltaTable = DeltaTable.forName(spark, delta_source_table)

In [35]:
# Sanity check: print the source rows before any decoding is applied.
(
    dt_source_tin
    .toDF()
    .show()
)

+--------------------+--------------------+--------------------+----------+
|                 key|               value|           timestamp|      date|
+--------------------+--------------------+--------------------+----------+
|[53 63 6F 74 74 2...|[0A 0B 08 91 FD 8...|2024-06-07 21:41:...|2024-06-07|
|[53 63 6F 74 74 2...|[0A 0C 08 E1 FD 8...|2024-06-07 21:42:...|2024-06-07|
|[53 63 6F 74 74 2...|[0A 0C 08 FC FD 8...|2024-06-07 21:43:...|2024-06-07|
|[53 63 6F 74 74 2...|[0A 0C 08 C8 84 8...|2024-06-07 21:57:...|2024-06-07|
|[53 63 6F 74 74 2...|[0A 0C 08 93 86 8...|2024-06-07 22:00:...|2024-06-07|
+--------------------+--------------------+--------------------+----------+



## Next. We need to load the Binary Protobuf Descriptor and Decode the Protobuf
1. Using the `from_protobuf` method and the `binaryDescriptorSet` we will decode the `coffeeco.v1.Order` messages.
2. Once we've decoded the `value:bytes` into `order:struct`, we can have fun with the baseline dataset

In [17]:
# Load the serialized descriptor set into memory; it is passed to
# from_protobuf as `binaryDescriptorSet` when decoding the orders.
coffeecov1_bin: bytes = read_binary_at(protobuf_descriptor_path)

In [21]:
# Keep `date` and `timestamp` as-is and decode the binary `value` column into
# an `order` struct. FAILFAST mode is requested so malformed records raise
# instead of being passed through silently.
coffee_orders_df = dt_source_tin.toDF().select(
    "date",
    "timestamp",
    from_protobuf(
        col("value"),
        tin_protobuf_message_name,
        binaryDescriptorSet=coffeecov1_bin,
        options={"mode": "FAILFAST"},
    ).alias("order"),
)

In [24]:
# Print the schema to confirm that `value` was decoded into the nested `order` struct.
coffee_orders_df.printSchema()

root
 |-- date: date (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- order: struct (nullable = true)
 |    |-- order_created: timestamp (nullable = true)
 |    |-- purchased_at: struct (nullable = true)
 |    |    |-- store_id: string (nullable = true)
 |    |    |-- created: timestamp (nullable = true)
 |    |    |-- opened_on: timestamp (nullable = true)
 |    |    |-- closed_permanently_on: timestamp (nullable = true)
 |    |    |-- status: string (nullable = true)
 |    |-- customer: struct (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- uuid: string (nullable = true)
 |    |    |-- first_seen: timestamp (nullable = true)
 |    |    |-- customer_type: string (nullable = true)
 |    |    |-- loyalty_member_id: string (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = false)
 |    |    |    |-- coffee: struct (nullable = true)
 |    |    |    |    |-- coffee: struct (nullable = tr

In [34]:
# Show the date and timestamp columns next to two fields pulled out of the
# decoded order struct; truncate=False prints every cell in full.
(
    coffee_orders_df
    .select("date", "timestamp", "order.customer.name", "order.total")
    .show(truncate=False)
)

+----------+-----------------------+------------+-----------------------------+
|date      |timestamp              |name        |total                        |
+----------+-----------------------+------------+-----------------------------+
|2024-06-07|2024-06-07 21:41:37.032|Scott Haines|{CURRENCY_CODE_USD, 28, NULL}|
|2024-06-07|2024-06-07 21:42:57.921|Scott Haines|{CURRENCY_CODE_USD, 30, 75}  |
|2024-06-07|2024-06-07 21:43:24.769|Scott Haines|{CURRENCY_CODE_USD, 12, 50}  |
|2024-06-07|2024-06-07 21:57:28.328|Scott Haines|{CURRENCY_CODE_USD, 24, NULL}|
|2024-06-07|2024-06-07 22:00:51.321|Scott Haines|{CURRENCY_CODE_USD, 25, NULL}|
+----------+-----------------------+------------+-----------------------------+

