In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Event Hub details.
# EH_CONN_STR is blank in this file; supply the real connection string at run time
# (ideally from a secret store such as dbutils.secrets) instead of committing it.
EH_CONN_STR = ""
# NOTE(review): despite its name, this value is passed below as
# 'eventhubs.eventHubName', i.e. it is used as the event hub name -- confirm
# that it is not actually the namespace.
EH_NAMESPACE = "eventhubgq7cwd0"

# Fail fast with an explicit message rather than handing an empty connection
# string to the connector.
if not EH_CONN_STR.strip():
    raise ValueError(
        "EH_CONN_STR is empty: set the Event Hubs connection string before running this notebook."
    )

# Options handed to spark.readStream.format("eventhubs") via .options(**ehConf).
ehConf = {
    # The connection string is passed through the connector's EventHubsUtils.encrypt helper.
    'eventhubs.connectionString': sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(EH_CONN_STR),
    'eventhubs.eventHubName': EH_NAMESPACE,
}

In [0]:
%sql
-- Reset the three pipeline schemas in sales_catalog: each one is dropped together
-- with everything it contains (CASCADE) and then recreated empty.
-- NOTE(review): this cell does not touch the streaming checkpoint directories used
-- by the write cells (/tmp/streaming/...); confirm whether they need clearing too.
USE CATALOG sales_catalog;

-- bronze
DROP SCHEMA IF EXISTS sales_catalog.bronze CASCADE;
CREATE SCHEMA IF NOT EXISTS sales_catalog.bronze;

-- silver
DROP SCHEMA IF EXISTS sales_catalog.silver CASCADE;
CREATE SCHEMA IF NOT EXISTS sales_catalog.silver;

-- gold
DROP SCHEMA IF EXISTS sales_catalog.gold CASCADE;
CREATE SCHEMA IF NOT EXISTS sales_catalog.gold;

In [0]:
# Bronze ingest: open a streaming read on Event Hubs using the connector options in ehConf.
df = (
    spark.readStream
    .format("eventhubs")
    .options(**ehConf)
    .load()
)

# For ad-hoc debugging, the incoming stream can be previewed with df.display().

# Append the raw events to the Delta table sales_catalog.bronze.order,
# using /tmp/streaming/bronze as the checkpoint location.
(
    df.writeStream
    .option("checkpointLocation", "/tmp/streaming/bronze")
    .outputMode("append")
    .format("delta")
    .toTable("sales_catalog.bronze.order")
)

In [0]:
%sql
-- Inspect the rows currently stored in the bronze order table.
SELECT *
FROM sales_catalog.bronze.order

In [0]:
# Layout used to parse the JSON order payload: a string ProductID and an
# integer Quantity, both nullable.
schema = (
    StructType()
    .add("ProductID", StringType(), True)
    .add("Quantity", IntegerType(), True)
)

In [0]:
# Silver transform: stream the bronze Delta table, cast the event body to a string,
# parse it as JSON with `schema`, and keep ProductID, Quantity and the enqueue time
# (exposed as `timestamp`).
bronze_events = (
    spark.readStream
    .format("delta")
    .table("sales_catalog.bronze.order")
)

# Parsed payload as a struct column; fields missing from the JSON come back as null.
order_payload = from_json(col("body").cast("string"), schema)

df = bronze_events.select(
    order_payload.getField("ProductID").alias("ProductID"),
    order_payload.getField("Quantity").alias("Quantity"),
    col("enqueuedTime").alias("timestamp"),
)

In [0]:
# Preview the parsed order stream in the notebook for verification.
df.display()

# Append the parsed orders to the Delta table sales_catalog.silver.order,
# using /tmp/streaming/silver as the checkpoint location.
(
    df.writeStream
    .option("checkpointLocation", "/tmp/streaming/silver")
    .outputMode("append")
    .format("delta")
    .toTable("sales_catalog.silver.order")
)

In [0]:
# Note: `sum` imported here is pyspark's aggregate function and shadows the
# Python builtin of the same name for the rest of the notebook.
from pyspark.sql.functions import col, count, sum, window

# Gold aggregation: total quantity per product over 5-second tumbling windows of the
# silver stream, keeping only (window, product) groups that contain more than one row.
df_silver = spark.readStream.table("sales_catalog.silver.order")

df_gold = (
    df_silver
    .groupBy(
        window(col("timestamp"), "5 seconds").alias("window"),
        col("ProductID"),
    )
    .agg(
        sum("Quantity").alias("Orders"),
        # Row count per group, computed explicitly so the filter below refers to a
        # real column instead of an aggregate expression hidden in a SQL string
        # (the previous `.filter("count(*) > 1")` depended on Spark rewriting it
        # into the aggregation implicitly).
        count("*").alias("EventCount"),
    )
    .filter(col("EventCount") > 1)
    # EventCount is only needed for the filter; the output columns are unchanged.
    .select(
        col("window.start").alias("StartTime"),
        col("window.end").alias("EndTime"),
        col("ProductID"),
        col("Orders"),
    )
)

# Write the aggregation to the Delta table sales_catalog.gold.order in "complete"
# output mode, using /tmp/streaming/gold2 as the checkpoint location.
# NOTE(review): no watermark is defined on `timestamp`, so aggregation state is
# presumably retained for every window seen -- confirm this is acceptable.
(
    df_gold.writeStream
    .option("checkpointLocation", "/tmp/streaming/gold2")
    .outputMode("complete")
    .format("delta")
    .toTable("sales_catalog.gold.order")
)

In [0]:
# Render the gold aggregation in the notebook. df_gold is built from a streaming
# read (spark.readStream), so this displays a streaming DataFrame.
display(df_gold)

In [0]:
%sql
-- Inspect the aggregated rows currently stored in the gold order table.
SELECT *
FROM sales_catalog.gold.order;