### Consume data from Event Hubs

Set up the variables for this job

In [3]:
# Connection string of the event hub itself (entity level), not of the namespace.
# Alternative: dbutils.secrets.get(scope="your scope",key="your scope key to the eventhubs connection")
eh_connection = "put your event hubs connection here"

# Storage layout: table files go under the lake mount, the streaming checkpoint under /checkpoint/.
tableName = "raw.events"
lake = "/mnt/lake/raw/"
deltaDataPath = f"{lake}{tableName}"
checkpointPath = f"/checkpoint/{tableName}"

Clean up checkpoint and table data (to start with a clean environment)

In [5]:
# Start from a clean slate: recursively delete the streaming checkpoint first,
# then the files backing the table (second argument is the recurse flag).
dbutils.fs.rm(checkpointPath, True)
dbutils.fs.rm(deltaDataPath, True)

Create the Structured Streaming job

In [7]:
import json

# Position in the stream to start reading from, serialized to JSON for the connector.
# Offset "-1" is used here; "@latest" is the alternative value.
startingEventPosition = dict(
    offset="-1",
    seqNo=-1,
    enqueuedTime=None,
    isInclusive=False,
)

# Options handed to the Event Hubs source.
# NOTE(review): newer connector releases expect the connection string to be
# encrypted with EventHubsUtils.encrypt - confirm against the installed library version.
ehConf = {
    "eventhubs.connectionString": eh_connection,
    "eventhubs.startingPosition": json.dumps(startingEventPosition),
    "maxEventsPerTrigger": 100000,
}

# Open a streaming reader on the "eventhubs" source and load it with the options built above
eventHubsReader = spark.readStream.format("eventhubs")
streamingInputDF = eventHubsReader.options(**ehConf).load()

Parse the input JSON data and save it into a Delta table

In [9]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType
from pyspark.sql.functions import substring,col,from_json

# Schema applied to the JSON body of each event; every field is declared nullable.
inputSchema = (
    StructType()
    .add("messageId", LongType(), True)
    .add("deviceId", IntegerType(), True)
    .add("temperature", IntegerType(), True)
    .add("genTimestamp", StringType(), True)
)

In [10]:
# Decode the event body as JSON using inputSchema, flatten the parsed struct into
# top-level columns, and derive the partition column genDate from the first 10
# characters of genTimestamp (presumably the yyyy-MM-dd part - confirm the format).
parsedEventsDF = (streamingInputDF
  .select(from_json(col("body").cast("string"), inputSchema).alias("value"))
  .selectExpr("value.*")
  .withColumn("genDate", substring("genTimestamp", 1, 10)))

# Append the parsed events to the Delta location, partitioned by genDate.
# For a continuously running job, replace the trigger with
# .trigger(processingTime='30 seconds') and drop the awaitTermination() call below.
query = (parsedEventsDF
  .writeStream
  .format("delta")
  .partitionBy("genDate")
  .outputMode("append")
  .trigger(once=True)
  .option("checkpointLocation", checkpointPath)
  .start(deltaDataPath))

# start() returns immediately. With trigger(once=True) the query stops on its own,
# so block here until it has finished; otherwise cells that read the Delta location
# race against the write when the notebook is executed with "Run All".
query.awaitTermination()

Wait until the streaming query has written data, then create a table definition that maps to the Delta location

In [12]:
# Register the Delta files at deltaDataPath as a metastore table.
# The database name is taken from the qualifier of tableName so that both stay in
# sync if tableName is changed; an unqualified table name creates no database.
databaseName = tableName.rpartition(".")[0]
if databaseName:
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {databaseName}")

# Drop any earlier definition first so the table always points at the current location.
spark.sql(f"DROP TABLE IF EXISTS {tableName}")
spark.sql(f"CREATE TABLE {tableName} USING DELTA LOCATION '{deltaDataPath}'")

Check that data is coming in

In [14]:
# Number of rows per device; goes through spark.sql like the rest of the notebook
# rather than the bare `sql` helper.
display(spark.sql(f"select deviceId, count(*) from {tableName} group by deviceId"))

Check the folder for this table in ADLS Gen2

Then optimize to see compaction take place

In [16]:
# Compact the table's files, then remove files no longer referenced by the table.
retentionCheckKey = "spark.databricks.delta.retentionDurationCheck.enabled"

spark.sql(f"OPTIMIZE {tableName}")

# VACUUM with RETAIN 0 HOURS is rejected while the retention safety check is on,
# so switch it off only for this statement and restore the previous value afterwards
# instead of leaving the check disabled for the rest of the session.
previousRetentionCheck = spark.conf.get(retentionCheckKey, "true")
spark.conf.set(retentionCheckKey, "false")
try:
    spark.sql(f"VACUUM {tableName} RETAIN 0 HOURS")    # 0 HOURS is risky if streaming is running
finally:
    spark.conf.set(retentionCheckKey, previousRetentionCheck)

Check how the Delta table keeps a history of the changes made to it

In [18]:
# Fetch the table's change history and render it
tableHistoryDF = spark.sql(f"DESCRIBE HISTORY {tableName}")
display(tableHistoryDF)