# PyFlink Pipeline

In [None]:
!pip install apache-flink apache-flink-libraries

In [None]:
import hopsworks
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, EnvironmentSettings
from hsfs import engine
import json

In [None]:
# Log in to Hopsworks and obtain a handle on the project's feature store.
project = hopsworks.login()
fs = project.get_feature_store()

# Kafka client settings are fetched through the hsfs engine singleton.
# NOTE: `_get_kafka_config` is a private hsfs method (leading underscore);
# it is called with the feature store id and an empty options dict.
hsfs_engine = engine.get_instance()
kafka_config = hsfs_engine._get_kafka_config(fs.id, {})

# Input topic carrying the raw events, and the output topic for the
# aggregated results (suffixed with the project id).
EVENTS_TOPIC = "clickstream_events"
CTR_TOPIC = f"ctr_5min_{project.id}"

In [None]:
# Build the Flink runtime: a DataStream execution environment with a
# streaming-mode Table environment layered on top of it.
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)  # run the pipeline with parallelism 1

# Configure the Table API for streaming (unbounded) execution.
settings_builder = EnvironmentSettings.new_instance()
settings = settings_builder.in_streaming_mode().build()

t_env = StreamTableEnvironment.create(env, settings)

In [None]:
# Register the Kafka-backed `events` source table.
#
# Columns read from each JSON record: user_id, event_type and `timestamp`.
# `ts` is a computed column derived from `timestamp` / 1000 (presumably the
# raw value is epoch milliseconds -- confirm against the producer). It is the
# event-time attribute, with a watermark that trails it by one minute.
# The consumer starts from the earliest available offset of the topic.
events_source_ddl = f"""
    CREATE TABLE events (
        user_id STRING,
        event_type STRING,
        `timestamp` BIGINT,
        ts AS TO_TIMESTAMP(FROM_UNIXTIME(`timestamp` / 1000)),
        WATERMARK FOR ts AS ts - INTERVAL '1' MINUTE
    ) WITH (
        'connector' = 'kafka',
        'topic' = '{EVENTS_TOPIC}',
        'properties.bootstrap.servers' = '{kafka_config['bootstrap.servers']}',
        'properties.group.id' = 'flink_ctr_consumer',
        'scan.startup.mode' = 'earliest-offset',
        'format' = 'json'
    )
"""
t_env.execute_sql(events_source_ddl)

In [None]:
# Register the Kafka-backed `ctr_output` sink table.
#
# One JSON record is written per result row: the user, the impression and
# click counts, the derived click-through rate, and the end of the window
# the row belongs to. Records are spread over the topic's partitions
# round-robin.
ctr_sink_ddl = f"""
    CREATE TABLE ctr_output (
        user_id STRING,
        impressions BIGINT,
        clicks BIGINT,
        ctr DOUBLE,
        window_end TIMESTAMP(3)
    ) WITH (
        'connector' = 'kafka',
        'topic' = '{CTR_TOPIC}',
        'properties.bootstrap.servers' = '{kafka_config['bootstrap.servers']}',
        'format' = 'json',
        'sink.partitioner' = 'round-robin'
    )
"""
t_env.execute_sql(ctr_sink_ddl)

In [None]:
# Calculate CTR with SQL
#
# For every user and every 5-minute tumbling event-time window on `ts`:
#   impressions = number of 'impression' events
#   clicks      = number of 'click' events
#   ctr         = clicks / impressions
# NULLIF turns an impression count of 0 into NULL, so the ratio is NULL for
# windows without impressions instead of dividing by zero.
ctr_query = t_env.sql_query("""
    SELECT
        user_id,
        COUNT(CASE WHEN event_type = 'impression' THEN 1 END) as impressions,
        COUNT(CASE WHEN event_type = 'click' THEN 1 END) as clicks,
        CAST(COUNT(CASE WHEN event_type = 'click' THEN 1 END) AS DOUBLE) /
            NULLIF(COUNT(CASE WHEN event_type = 'impression' THEN 1 END), 0) as ctr,
        TUMBLE_END(ts, INTERVAL '5' MINUTE) as window_end
    FROM events
    GROUP BY
        user_id,
        TUMBLE(ts, INTERVAL '5' MINUTE)
""")

# Announce the destination BEFORE submitting the job: wait() below blocks
# until the job terminates, and the Kafka source is unbounded, so anything
# placed after it only runs once the stream has already stopped.
print(f"Streaming CTR to {CTR_TOPIC}")

# Insert into sink and keep the cell attached to the running job; this call
# returns only when the job is cancelled or fails.
ctr_query.execute_insert('ctr_output').wait()