In [1]:
from pyspark.sql.functions import col
from pyspark.sql.types import DateType
from delta.tables import DeltaTable
from pyspark.sql.streaming import StreamingQuery

## Set Up the Application
> This includes the following:
* app name
* app version
* kafka topic: `coffeeco.v1.orders`
* checkpoint dir, and path (for the streaming application state)
* the sink location: Managed Delta Table

In [2]:
# --- Application identity -------------------------------------------------
app_name = "ringmaster_kafka_tin"
app_version = "v0.0.1"

# --- Kafka source ---------------------------------------------------------
kafka_brokers = "kafka-rp:29092"
kafka_topic = "coffeeco.v1.orders"

# --- Streaming state ------------------------------------------------------
# The checkpoint path is namespaced by app name and version, so a different
# name or version resolves to a different checkpoint directory.
checkpoint_dir = "/opt/spark/work-dir/hitchhikers_guide/applications"
checkpoint_path = "/".join((checkpoint_dir, app_name, app_version, "_checkpoints"))
print("checkpoint_path=" + checkpoint_path)

# --- Sink: managed Delta table (the "tin" landing zone) -------------------
kafka_to_delta_table = "default.coffeeco_v1_orders_tin"

checkpoint_path=/opt/spark/work-dir/hitchhikers_guide/applications/ringmaster_kafka_tin/v0.0.1/_checkpoints


In [3]:
# Streaming reader over the Kafka topic configured above.
# NOTE: the previous `.option("mode", "FAIL_FAST")` was removed. `mode` is a
# parser option for file/JSON/CSV readers, not a Kafka source option, so it had
# no effect here and only suggested a guarantee that does not exist.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_brokers)
    .option("subscribe", kafka_topic)
    # Fail the query rather than silently continue when offsets are missing
    # (e.g. records aged out of the topic before they were read).
    .option("failOnDataLoss", "true")
    # Only applies when the query starts without an existing checkpoint;
    # afterwards the checkpointed offsets win.
    .option("startingOffsets", "earliest")
    # Milliseconds to wait before retrying an offset fetch.
    .option("fetchOffset.retryIntervalMs", "10")
    # Prefix for the consumer group ids Spark generates for this query.
    .option("groupIdPrefix", "ringmaster_tin")
    .load()
)

In [4]:
# Show the columns exposed by the Kafka source; note that `key` and `value`
# arrive as raw binary.
kafka_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [7]:
# Keep only the columns the tin table stores: the raw key/value payload and
# the Kafka record timestamp.
projection_df = kafka_df.select(
    col("key"),
    col("value"),
    col("timestamp"),
)

## Create the Managed Delta Table
> This will allow us to automatically take the streaming protobuf data and drop it into a "tin" landing zone Delta table.
> By taking the streaming data off of Kafka, we remove one of the trickier aspects of working with Kafka's short TTLs (typically 24-48 hours for large topics). If records need to be replayed, we can read them back from our Delta table rather than rushing to recover from Kafka while data may already be lost.

In [7]:
# Table properties for the tin landing table, applied to the builder in order.
tin_table_properties = {
    "description": "This table provides a home for the processed Kafka records without immediate fear of TTLs",
    "delta.logRetentionDuration": "interval 30 days",
    "delta.deletedFileRetentionDuration": "interval 7 days",
    "delta.dataSkippingNumIndexedCols": "4",
    "delta.checkpoint.writeStatsAsStruct": "true",
    "delta.checkpoint.writeStatsAsJson": "false",
    "delta.checkpointPolicy": "v2",
    "delta.enableDeletionVectors": "true",
}

# Describe the table: the projected Kafka columns plus a `date` column that is
# generated from `timestamp` and used as the clustering column.
tin_table_builder = DeltaTable.createIfNotExists(spark)
tin_table_builder = tin_table_builder.tableName(kafka_to_delta_table)
tin_table_builder = tin_table_builder.addColumns(projection_df.schema)
tin_table_builder = tin_table_builder.addColumn(
    "date", DateType(), generatedAlwaysAs="CAST(timestamp AS DATE)"
)
tin_table_builder = tin_table_builder.comment("kafka-rp:coffeeco.v1.orders:tin")
tin_table_builder = tin_table_builder.clusterBy("date")

for property_key, property_value in tin_table_properties.items():
    tin_table_builder = tin_table_builder.property(property_key, property_value)

# createIfNotExists: the table is only created when it does not already exist.
dt: DeltaTable = tin_table_builder.execute()

In [None]:
# Optional session/catalog checks, left commented out; uncomment to run:
#   spark.conf.get("spark.sql.warehouse.dir")
#     -> file:/opt/spark/work-dir/hitchhikers_guide/warehouse
#   spark.catalog.currentCatalog()
#   spark.catalog.currentDatabase()
# List the tables visible in the current database (the tin table is expected
# here, assuming the current database is `default`).
spark.catalog.listTables()

## Now that we have the Delta table, it is time to read from Kafka and populate the Tin layer of the streaming medallion architecture

In [None]:
# Append everything currently available on the Kafka topic to the tin table.
# Schema changes are explicitly disallowed so the landing table's shape stays
# fixed; progress is tracked in `checkpoint_path`.
streaming_query: StreamingQuery = (
    projection_df.writeStream
    .format("delta")
    .option("overwriteSchema", "false")
    .option("mergeSchema", "false")
    .option("checkpointLocation", checkpoint_path)
    .outputMode("append")
    # availableNow: process the data available at start, then stop on its own.
    .trigger(availableNow=True)
    .toTable(tableName=kafka_to_delta_table)
)

# toTable() starts the query and returns immediately. Block until the
# availableNow run finishes so that the cells below read a fully written table
# on a top-to-bottom run, and so that any streaming failure is raised here.
streaming_query.awaitTermination()

In [9]:
# Inspect the query's current state (message, isDataAvailable, isTriggerActive).
streaming_query.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

In [10]:
# Explicitly stop the query before reading the table back.
streaming_query.stop()

In [11]:
# Look the tin table up by name to get a DeltaTable handle for inspection.
dt_tin: DeltaTable = DeltaTable.forName(spark, kafka_to_delta_table)

In [12]:
# Confirm the stored schema: the projected Kafka columns plus the generated
# `date` column.
dt_tin.toDF().printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- date: date (nullable = true)



In [13]:
# Preview the landed records; long binary and timestamp values are truncated
# in the printed table.
dt_tin.toDF().show()

+--------------------+--------------------+--------------------+----------+
|                 key|               value|           timestamp|      date|
+--------------------+--------------------+--------------------+----------+
|[53 63 6F 74 74 2...|[0A 0B 08 91 FD 8...|2024-06-07 21:41:...|2024-06-07|
|[53 63 6F 74 74 2...|[0A 0C 08 E1 FD 8...|2024-06-07 21:42:...|2024-06-07|
|[53 63 6F 74 74 2...|[0A 0C 08 FC FD 8...|2024-06-07 21:43:...|2024-06-07|
|[53 63 6F 74 74 2...|[0A 0C 08 C8 84 8...|2024-06-07 21:57:...|2024-06-07|
|[53 63 6F 74 74 2...|[0A 0C 08 93 86 8...|2024-06-07 22:00:...|2024-06-07|
+--------------------+--------------------+--------------------+----------+

