# Feldera Pipeline

In [None]:
import json
import os

import hopsworks
from feldera import FelderaClient, PipelineBuilder
from hsfs import engine

In [None]:
# Hopsworks: log in and get a handle on the project's feature store.
project = hopsworks.login()
fs = project.get_feature_store()

# Feldera: talks to a local instance unless the environment says otherwise.
# To target another deployment (for example the cloud sandbox at
# https://try.feldera.com), set FELDERA_HOST and FELDERA_API_KEY in the
# environment rather than pasting a key into the notebook, where it would be
# saved and shared along with the file.
FELDERA_HOST = os.environ.get("FELDERA_HOST", "http://localhost:8080")
FELDERA_API_KEY = os.environ.get("FELDERA_API_KEY") or None  # unset/empty -> no key
feldera = FelderaClient(FELDERA_HOST, api_key=FELDERA_API_KEY)

## Get Kafka Configuration

## Define Feldera SQL Pipeline

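This is where the magic happens: the real-time aggregation is written in pure SQL, and the Kafka connectors are attached to the input table and the output view through their `WITH ('connectors' = ...)` clauses.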

In [None]:
# --- Input connector -------------------------------------------------------
# Kafka consumer for the events topic. The shared Kafka settings are extended
# with the topic list and the offset-reset policy; records are parsed as JSON
# using the "raw" update format with array framing disabled.
_events_transport = {
    "name": "kafka_input",
    "config": kafka_config | {
        "topics": [EVENTS_TOPIC],
        "auto.offset.reset": "earliest",
    },
}
_events_format = {
    "name": "json",
    "config": {"update_format": "raw", "array": False},
}
events_source = json.dumps({"transport": _events_transport, "format": _events_format})

# --- Output connector ------------------------------------------------------
# Kafka producer for the CTR topic. Every message carries three headers
# (project id, feature group id, schema subject id), each rendered as a string,
# and is encoded as Avro with the feature group's schema and no schema-id prefix.
_ctr_headers = [
    {"key": header_name, "value": str(header_value)}
    for header_name, header_value in (
        ("projectId", project.id),
        ("featureGroupId", ctr_fg.id),
        ("subjectId", ctr_fg.subject["id"]),
    )
]
_ctr_transport = {
    "name": "kafka_output",
    "config": kafka_config | {"topic": CTR_TOPIC, "headers": _ctr_headers},
}
_ctr_format = {
    "name": "avro",
    "config": {"schema": ctr_fg.avro_schema, "skip_schema_id": True},
}
ctr_sink = json.dumps({"transport": _ctr_transport, "format": _ctr_format})

In [None]:
# The connector JSON is embedded inside single-quoted SQL string literals below.
# Any apostrophe in it (for example in a Kafka setting or in a doc string of the
# Avro schema) would end the literal early and break the statement, so single
# quotes are doubled, which is how SQL escapes them. When the JSON contains no
# apostrophes the generated SQL is unchanged.
events_connectors = f"[{events_source}]".replace("'", "''")
ctr_connectors = f"[{ctr_sink}]".replace("'", "''")

# NOTE(review): `timestamp` is used as a column name and is also the name of a
# SQL type; confirm the Feldera compiler accepts it unquoted.
sql = f"""
-- Input table: clickstream events from Kafka
CREATE TABLE events (
    user_id STRING,
    event_type STRING,
    timestamp TIMESTAMP
) WITH (
    'connectors' = '{events_connectors}'
);

-- Output view: CTR per user over 5-minute windows
CREATE VIEW ctr_5min
WITH ('connectors' = '{ctr_connectors}')
AS
    SELECT
        user_id,
        SUM(CASE WHEN event_type = 'impression' THEN 1 ELSE 0 END) as impressions,
        SUM(CASE WHEN event_type = 'click' THEN 1 ELSE 0 END) as clicks,
        CAST(SUM(CASE WHEN event_type = 'click' THEN 1 ELSE 0 END) AS DOUBLE) /
            NULLIF(SUM(CASE WHEN event_type = 'impression' THEN 1 ELSE 0 END), 0) as ctr,
        window_end
    FROM TABLE(
        TUMBLE(
            TABLE events,
            DESCRIPTOR(timestamp),
            INTERVAL '5' MINUTES
        )
    )
    GROUP BY user_id, window_end;
"""

# NOTE(review): the printed text includes the full connector configuration
# (everything in kafka_config); check it for sensitive settings before sharing
# the notebook with its outputs.
print("SQL Pipeline:")
print(sql)

## Create and Start Pipeline

## What Just Happened?

Feldera is now:
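1. Reading events from Kafka
2. Aggregating them per user into 5-minute tumbling windows
3. Calculating CTR = clicks / impressions (NULL for a window with no impressions)
4. Writing the results, Avro-encoded, to the Kafka topic that feeds the Hopsworks Feature Store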

**All in pure SQL. No Java. No boilerplate.**

Hopsworks stores:
- Historical CTR in offline store (for training)
- Latest CTR per user in online store (for inference)

## Monitor Pipeline