In [6]:
import hopsworks
from hsfs import engine
from hsfs.feature import Feature
import json
import datetime
from feldera import FelderaClient, PipelineBuilder

# Feldera REST client. The pipeline-creation cell further down passes `client`
# to PipelineBuilder, so it has to exist after a fresh "Restart & Run All".
# Assumes a Feldera instance is reachable at this address - adjust if yours differs.
client = FelderaClient("http://localhost:8080")

project = hopsworks.login()

kafka_api = project.get_kafka_api()

fs = project.get_feature_store()

name = "precomputed_click_counts"
kafka_topic = f"{project.name}_onlinefs"

# Source feature group: its online topic is the input of the Feldera pipeline.
clicks_fg = fs.get_feature_group("clicks")

# Sink feature group that receives the windowed click counts. The feature
# names listed here are the column names the Feldera view has to produce.
window_fg = fs.get_or_create_feature_group(
    name=name,
    version=1,
    description="Precomputed clicks counts",
    primary_key=["user_id"],
    online_enabled=True,
    stream=True,
    event_time="event_time",
    features=[
        Feature(name="user_id", type="bigint"),
        Feature(name="event_time", type="bigint"),
        Feature(name="click_count_1_min", type="bigint"),
        Feature(name="click_count_10_min", type="bigint"),
        Feature(name="click_count_30_min", type="bigint"),
        Feature(name="click_count_1_hour", type="bigint"),
    ],
)
# Best-effort save: any error is printed instead of raised so the cell can be
# re-run. NOTE(review): presumably meant to tolerate an already-existing
# feature group - confirm, because every other failure is hidden as well.
try:
    window_fg.save()
except Exception as e:
    print(e)

Connection closed.
2025-10-09 20:32:10,382 INFO: Python Engine initialized.

Logged in to project, explore it here https://snurran.devnet.hops.works/p/4217

Feature Group created successfully, explore it at 
https://snurran.devnet.hops.works/p/4217/fs/4165/fg/5147


## Feldera Setup
Make the Hopsworks certificates available inside the Feldera Docker container by symlinking them into `/tmp`.

In [None]:
import subprocess

# NOTE(review): `hostname` is not defined in any cell visible in this notebook.
# It must be assigned before this cell runs, otherwise the f-string below
# raises NameError.

# Look up the ID of the running container started from the Feldera
# pipeline-manager image (`-q` prints IDs only).
container_id = subprocess.check_output(
    ["docker", "ps", "--filter", "ancestor=ghcr.io/feldera/pipeline-manager:latest", "-q"],
    text=True,
).strip()
if not container_id:
    # Without this guard an empty ID would be handed to `docker exec`.
    raise RuntimeError(
        "No running container found for image ghcr.io/feldera/pipeline-manager:latest"
    )

# Inside the container, replace /tmp/<hostname> with a symlink to
# /opt/<hostname>/<hostname> (the location of the Hopsworks certificates).
# check=True surfaces a failing command instead of silently continuing.
subprocess.run(
    [
        "docker", "exec", container_id,
        "bash", "-c",
        f"rm -f /tmp/{hostname} && ln -s /opt/{hostname}/{hostname} /tmp/{hostname}",
    ],
    check=True,
)


### Streaming Transformations in Feldera-SQL
Compute the aggregations and write them to the Kafka topic for `precomputed_click_counts`

In [None]:
def build_sql(source_config: str, sink_config: str) -> str:
    """Render the Feldera SQL program that computes per-user click counts.

    The program declares a `clicks` input table and a
    `precomputed_click_counts` view. For every click row the view emits the
    number of clicks of the same user inside trailing windows of 1 minute,
    10 minutes, 30 minutes and 1 hour. The output column names match the
    features of the `precomputed_click_counts` feature group.

    Args:
        source_config: JSON text of the input connector; embedded verbatim in
            the table's `'connectors'` list.
        sink_config: JSON text of the output connector; embedded verbatim in
            the view's `'connectors'` list.

    Returns:
        The SQL program as a single string.

    NOTE(review): `event_time` is declared BIGINT while the windows use
    `RANGE ... INTERVAL` bounds. Interval ranges normally need a temporal
    ORDER BY column; the correct fix depends on the unit of `event_time`,
    which is not visible here - confirm before deploying.
    NOTE(review): the configs are spliced into single-quoted SQL literals, so
    a single quote inside the JSON would break the statement.
    """
    # The connector WITH clause must precede AS in CREATE VIEW. COUNT(*) is
    # used because `clicks` has no `amount` column and each row is one click.
    return f"""

    CREATE TABLE clicks (
        user_id BIGINT,
        event_time BIGINT,
        click_id BIGINT
    ) WITH (
        'connectors' = '[{source_config}]'
    );

    CREATE VIEW precomputed_click_counts
    WITH (
        'connectors' = '[{sink_config}]'
    )
    AS
    SELECT
        c.user_id,
        c.event_time,
        COUNT(*) OVER window_1_minute AS click_count_1_min,
        COUNT(*) OVER window_10_minute AS click_count_10_min,
        COUNT(*) OVER window_30_minute AS click_count_30_min,
        COUNT(*) OVER window_1_hour AS click_count_1_hour
    FROM
         clicks AS c
    WINDOW
        window_1_minute AS (
            PARTITION BY user_id
            ORDER BY event_time
            RANGE BETWEEN INTERVAL '1' MINUTE PRECEDING AND CURRENT ROW
        ),
        window_10_minute AS (
            PARTITION BY user_id
            ORDER BY event_time
            RANGE BETWEEN INTERVAL '10' MINUTE PRECEDING AND CURRENT ROW
        ),
        window_30_minute AS (
            PARTITION BY user_id
            ORDER BY event_time
            RANGE BETWEEN INTERVAL '30' MINUTE PRECEDING AND CURRENT ROW
        ),
        window_1_hour AS (
            PARTITION BY user_id
            ORDER BY event_time
            RANGE BETWEEN INTERVAL '1' HOUR PRECEDING AND CURRENT ROW
        )
    ;
    """

In [None]:
def create_consumer_kafka_config(kafka_config: dict, fg):
    """Return a copy of `kafka_config` extended with Kafka input settings.

    The copy gets `topic` set to `fg._online_topic_name` and `start_from` set
    to "earliest"; existing keys of the same name are overridden. The passed
    `kafka_config` is left untouched.
    """
    consumer_overrides = {
        "topic": fg._online_topic_name,
        "start_from": "earliest",
    }
    return {**kafka_config, **consumer_overrides}

def create_producer_kafka_config(kafka_config: dict, fg, project):
    """Return a copy of `kafka_config` extended with Kafka output settings.

    The copy gets `topic` set to `fg._online_topic_name`,
    `auto.offset.reset` set to "earliest", and a `headers` list with the
    stringified project id, feature group id and `fg.subject["id"]` under the
    keys `projectId`, `featureGroupId` and `subjectId`. The passed
    `kafka_config` is left untouched.
    """
    # (header key, raw value) pairs in the order they are emitted.
    header_values = (
        ("projectId", project.id),
        ("featureGroupId", fg.id),
        ("subjectId", fg.subject["id"]),
    )
    producer_overrides = {
        "topic": fg._online_topic_name,
        "auto.offset.reset": "earliest",
        "headers": [
            {"key": header_key, "value": str(raw_value)}
            for header_key, raw_value in header_values
        ],
    }
    return {**kafka_config, **producer_overrides}

# Base Kafka client settings as provided by the Hopsworks Kafka API.
kafka_config = kafka_api.get_default_config()

# Input connector: read the `clicks` feature group's online topic as Avro.
clicks_connector = {
    "transport": {
        "name": "kafka_input",
        "config": create_consumer_kafka_config(kafka_config, clicks_fg),
    },
    "format": {
        "name": "avro",
        "config": {"schema": clicks_fg.avro_schema, "skip_schema_id": True},
    },
}
clicks_config = json.dumps(clicks_connector)

# Output connector: write the aggregates to the sink feature group's online
# topic as Avro.
aggs_connector = {
    "transport": {
        "name": "kafka_output",
        "config": create_producer_kafka_config(kafka_config, window_fg, project),
    },
    "format": {
        "name": "avro",
        "config": {"schema": window_fg.avro_schema, "skip_schema_id": True},
    },
}
aggs_config = json.dumps(aggs_connector)



In [None]:
# Render the SQL program with both connector definitions embedded.
sql = build_sql(clicks_config, aggs_config)

# Create the pipeline under a fixed name (replacing an existing one of the
# same name) and start it.
pipeline_builder = PipelineBuilder(client, name="hopsworks_clicks_aggs", sql=sql)
pipeline = pipeline_builder.create_or_replace()
pipeline.start()

### Schedule a job to sync data to the offline feature store
Then create a feature view to use for training and online inference.

In [None]:
# Schedule the materialization job of the sink feature group (`window_fg`,
# created in the first cell), starting from the current UTC time.
# NOTE(review): the cron string looks like a 7-field Quartz expression,
# presumably "every day at 03:00:00" - confirm the scheduler's timezone.
window_fg.materialization_job.schedule(
    cron_expression="0 0 3 * * ? *",
    start_time=datetime.datetime.now(tz=datetime.timezone.utc),
)

In [8]:
# Feature view over all features of the sink feature group, to be used for
# creating training data and for online inference.
click_counts_query = window_fg.select_all()
fv = fs.get_or_create_feature_view(
    name="precomputed_click_counts",
    version=1,
    query=click_counts_query,
)

Feature view created successfully, explore it at 
https://snurran.devnet.hops.works/p/4217/fs/4165/fv/precomputed_click_counts/version/1
