Views Notebook (Part 2)

In [None]:
from pyspark.sql.functions import col, window, to_json, struct
from pyspark.sql.types import TimestampType

# Subscribe to the "ingest-cleaned" Kafka topic as a streaming source,
# starting from the latest offsets.
ingest_cleaned_source = (
    spark_session.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "ingest-cleaned")
    .option("startingOffsets", "latest")
    .load()
)

# Decode each Kafka message value as a JSON string and keep only its
# `timestamp` field (parsed as TIMESTAMP); every other field is discarded.
ingest_cleaned_json = ingest_cleaned_source.selectExpr("CAST(value AS STRING) AS json")
ingest_cleaned_parsed = ingest_cleaned_json.selectExpr(
    "from_json(json, 'timestamp TIMESTAMP') as data"
)
cleaned_stream = ingest_cleaned_parsed.select("data.*")

Events per day

In [None]:
# Count events per tumbling one-day window of event time. The 30-minute
# watermark bounds how late a record may arrive and still be counted.
daily_watermarked = cleaned_stream.withWatermark("timestamp", "30 minutes")
daily_counts = daily_watermarked.groupBy(window("timestamp", "1 day")).count()

# Publish the daily counts to an in-memory table named "events_per_day"
# so they can be queried with Spark SQL from later cells.
# NOTE: in append mode a window's row is written only after the watermark
# has passed the end of that window, so with one-day windows the table can
# stay empty for a long time after the query starts.
daily_counts_writer = (
    daily_counts.writeStream
    .format("memory")
    .queryName("events_per_day")
    .outputMode("append")
)
tq = daily_counts_writer.start()

Display the current contents of the events_per_day table

In [None]:

# Query the in-memory sink table and print whatever rows it currently holds.
events_per_day_snapshot = spark_session.sql("SELECT * FROM events_per_day")
events_per_day_snapshot.show()

Part 2: Events per hour with delta vs. previous day

In [None]:
# Read the static (non-streaming) historical hourly counts from CSV.
# No schema inference is requested, so the columns arrive as strings.
historical_csv = (
    spark_session.read.format("csv")
    .option("header", "true")
    .load("data/historical_hourly.csv")
)

# Cast the two columns used downstream: `hour` becomes a timestamp (the join
# key) and `historical_count` becomes a long (used in the delta arithmetic).
historical_hour_typed = historical_csv.withColumn("hour", col("hour").cast(TimestampType()))
historical_df = historical_hour_typed.withColumn(
    "historical_count", col("historical_count").cast("long")
)

# Count streaming events per tumbling one-hour window of event time,
# using the same 30-minute watermark as the daily view.
hourly_windowed = (
    cleaned_stream.withWatermark("timestamp", "30 minutes")
    .groupBy(window("timestamp", "1 hour"))
    .count()
)

# Rename the aggregate to `current_count` and expose the window start as
# `hour`, the column used to join against the historical data.
hourly_renamed = hourly_windowed.withColumnRenamed("count", "current_count")
hourly_counts = hourly_renamed.withColumn("hour", col("window.start"))

# Left-join the streaming hourly counts with the static historical counts on
# `hour`. Hours without a historical row are kept; their `historical_count`
# and `delta` are null.
hourly_joined = hourly_counts.join(historical_df, on="hour", how="left")

# delta = current count minus historical count for the same `hour` value.
delta_expr = col("current_count") - col("historical_count")
hourly_with_delta = hourly_joined.withColumn("delta", delta_expr).select(
    "hour", "current_count", "historical_count", "delta"
)

# Serialize each result row to a JSON string in a single `value` column,
# which is the column the Kafka sink publishes as the message payload.
hourly_delta_payload = hourly_with_delta.select(
    to_json(struct("hour", "current_count", "historical_count", "delta")).alias("value")
)

# Stream the payload to the "hourly-delta" Kafka topic in append mode;
# query progress is tracked under the given checkpoint location.
hourly_delta_writer = (
    hourly_delta_payload.writeStream
    .format("kafka")
    .outputMode("append")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "hourly-delta")
    .option("checkpointLocation", "checkpoints-hourly-delta")
)
output_to_kafka = hourly_delta_writer.start()



Wait for the streaming query to finish

In [None]:

# Block this cell until the Kafka output query stops or fails.
# Only `output_to_kafka` is awaited here; the in-memory `events_per_day`
# query (`tq`) is not awaited by this call.
output_to_kafka.awaitTermination()