# PySpark Streaming Pipeline

In [None]:
import hopsworks
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from hsfs import engine
import json

In [None]:
# Log in to Hopsworks and obtain the feature store handle for the project.
project = hopsworks.login()
fs = project.get_feature_store()

# Create (or reuse) the Spark session for this job. The Kafka source/sink
# connector is requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("CTR_Streaming")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0",
    )
    .getOrCreate()
)

In [None]:
# Kafka client settings come from the active hsfs engine. The leading
# underscore marks _get_kafka_config as a non-public helper.
# NOTE(review): only the "bootstrap.servers" entry is read later on; confirm
# the returned dict uses that exact key and that no security options from it
# are needed by the Spark Kafka connector.
hsfs_engine = engine.get_instance()
kafka_config = hsfs_engine._get_kafka_config(fs.id, {})

# Input topic for the stream reader and the project-scoped output topic.
EVENTS_TOPIC = "clickstream_events"
CTR_TOPIC = f"ctr_5min_{project.id}"

In [None]:
# Read from Kafka
# Subscribes to EVENTS_TOPIC using the broker list from kafka_config and asks
# for the earliest available offsets as the starting position.
events_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_config["bootstrap.servers"])
    .option("subscribe", EVENTS_TOPIC)
    .option("startingOffsets", "earliest")
    .load()
)

# Parse JSON
# Payload fields: user_id and event_type as strings, timestamp as a long that
# is treated as epoch milliseconds (it is divided by 1000 below).
schema = StructType([
    StructField("user_id", StringType()),
    StructField("event_type", StringType()),
    StructField("timestamp", LongType()),
])

# Casting a numeric column to timestamp interprets it as seconds since the
# epoch, so ms / 1000 converts directly and keeps the sub-second part.
# from_unixtime() is deliberately avoided: it renders a second-resolution
# string in the session time zone, which drops milliseconds and becomes
# ambiguous during a DST fall-back hour when cast back to a timestamp.
parsed_df = (
    events_df
    .select(from_json(col("value").cast("string"), schema).alias("data"))
    .select("data.*")
    .withColumn("timestamp", (col("timestamp") / 1000).cast("timestamp"))
)

In [None]:
# Calculate CTR in 5-minute windows
# Per-row 0/1 indicators; summing them gives the event counts for each group.
# `sum`, `when` and `window` are the Spark column functions brought in by the
# wildcard import from pyspark.sql.functions, not the Python builtins.
is_impression = when(col("event_type") == "impression", 1).otherwise(0)
is_click = when(col("event_type") == "click", 1).otherwise(0)

# Count impressions and clicks per user per 5-minute event-time window, with
# a 1-minute watermark on the event timestamp.
windowed_counts = (
    parsed_df
    .withWatermark("timestamp", "1 minute")
    .groupBy(window("timestamp", "5 minutes"), "user_id")
    .agg(
        sum(is_impression).alias("impressions"),
        sum(is_click).alias("clicks"),
    )
)

# CTR stays null for groups without impressions, so there is no division by
# zero.
click_through_rate = when(
    col("impressions") > 0, col("clicks") / col("impressions")
).otherwise(lit(None))

# Keep only the output columns; the window is reduced to its end bound.
ctr_df = windowed_counts.withColumn("ctr", click_through_rate).select(
    col("user_id"),
    col("impressions"),
    col("clicks"),
    col("ctr"),
    col("window.end").alias("window_end"),
)

In [None]:
# Write to Hopsworks via Kafka
# Every result row is serialised into a single JSON string stored in the
# `value` column that is handed to the Kafka sink.
json_rows = ctr_df.selectExpr("to_json(struct(*)) AS value")

# Publish changed aggregates to CTR_TOPIC on a 30-second processing-time
# trigger, using "update" output mode.
# NOTE(review): the checkpoint lives under /tmp; confirm that location
# survives restarts in the target deployment.
query = (
    json_rows.writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_config["bootstrap.servers"])
    .option("topic", CTR_TOPIC)
    .option("checkpointLocation", "/tmp/ctr_checkpoint")
    .outputMode("update")
    .trigger(processingTime="30 seconds")
    .start()
)

print(f"Streaming to {CTR_TOPIC}")
# Block here until the streaming query stops or fails.
query.awaitTermination()