## Kafka API

This API provides the possibility to manage schemas, topics and setting up the connection to the kafka brokers.

## Scope

* Create a schema
* Create a topic
* Produce messages to the broker
* Consume messages from the broker

In [1]:
import hopsworks

## Connect to the cluster

In [2]:
# Connect to your cluster, to be used running inside Jupyter or jobs inside the cluster.
connection = hopsworks.connection()

Connected. Call `.close()` to terminate connection gracefully.


In [3]:
# Uncomment when connecting to the cluster from an external environment.
# connection = hopsworks.connection(project='my_project', host='my_instance', port=443, api_key_value='apikey')

## Get Project

In [4]:
project = connection.get_project()

## Get the API

In [5]:
kafka_api = project.get_kafka_api()

## Define an avro schema

In [6]:
SCHEMA_NAME="schema_example"

In [7]:
schema = {
  "type": "record",
  "name": "tutorial",
  "fields": [
    {
      "name": "id",
      "type": "int"  
    },
    {
      "name": "data",
      "type": "string"
    }
  ]
}

In [8]:
my_schema = kafka_api.create_schema(SCHEMA_NAME, schema)
my_schema

KafkaSchema('schema_example', 1)

In [9]:
my_schema.subject

'schema_example'

In [10]:
my_schema.version

1

In [11]:
my_schema.schema

## Define a topic

In [12]:
TOPIC_NAME="topic_example"

In [13]:
my_topic = kafka_api.create_topic(TOPIC_NAME, SCHEMA_NAME, 1, replicas=1, partitions=1)
my_topic

KafkaTopic('topic_example')

In [14]:
my_topic.name

'topic_example'

In [15]:
my_topic.replicas

1

In [16]:
my_topic.partitions

1

In [17]:
my_topic.schema

KafkaSchema('schema_example', 1)

## Produce messages to topic
#### Currently _kafka_api.get_default_config()_ is only supported for use in a notebook or job inside Hopsworks

In [18]:
producer_config = kafka_api.get_default_config()
producer_config

{'bootstrap.servers': '10.0.2.15:9091',
 'security.protocol': 'SSL',
 'ssl.ca.location': 'ca_chain.pem',
 'ssl.certificate.location': 'client.pem',
 'ssl.key.location': 'client_key.pem',
 'group.id': 'my-group-id'}

In [19]:
from confluent_kafka import Producer, Consumer

In [20]:
# Configure producer
producer = Producer(producer_config)

In [21]:
def delivery_callback(err, msg):
    """
    Optional per-message delivery callback (triggered by poll() or flush())
    when a message has been successfully delivered or permanently
    failed delivery (after retries).
    """
    if err:
        print("Message failed delivery: {}".format(err))
    else:
        print('Message: {} delivered to topic: {}, partition: {}, offset: {}, timestamp: {}'.format(msg.value(), msg.topic(), msg.partition(), msg.offset(), msg.timestamp()))


In [22]:
import uuid
import json
for i in range(0, 10):
    producer.produce(TOPIC_NAME, json.dumps({"id": i, "data": str(uuid.uuid1())}), "key", callback=delivery_callback)
    
# Trigger the sending of all messages to the brokers, 20sec timeout
producer.flush(20)



Message: b'{"id": 0, "data": "36039d5a-c48f-11ec-af74-080027fffec3"}' delivered to topic: topic_example, partition: 0, offset: 0, timestamp: (1, 1650887990722)
Message: b'{"id": 1, "data": "3603b9fc-c48f-11ec-af74-080027fffec3"}' delivered to topic: topic_example, partition: 0, offset: 1, timestamp: (1, 1650887990722)
Message: b'{"id": 2, "data": "3603ceb0-c48f-11ec-af74-080027fffec3"}' delivered to topic: topic_example, partition: 0, offset: 2, timestamp: (1, 1650887990723)
Message: b'{"id": 3, "data": "3603d41e-c48f-11ec-af74-080027fffec3"}' delivered to topic: topic_example, partition: 0, offset: 3, timestamp: (1, 1650887990723)
Message: b'{"id": 4, "data": "3603dd38-c48f-11ec-af74-080027fffec3"}' delivered to topic: topic_example, partition: 0, offset: 4, timestamp: (1, 1650887990723)
Message: b'{"id": 5, "data": "3603e134-c48f-11ec-af74-080027fffec3"}' delivered to topic: topic_example, partition: 0, offset: 5, timestamp: (1, 1650887990723)
Message: b'{"id": 6, "data": "3603e4fe-c

0

## Consume messages from topic

In [23]:
from confluent_kafka import Consumer

consumer_config = kafka_api.get_default_config()
consumer_config['default.topic.config'] = {'auto.offset.reset': 'earliest'}
consumer_config

{'bootstrap.servers': '10.0.2.15:9091',
 'security.protocol': 'SSL',
 'ssl.ca.location': 'ca_chain.pem',
 'ssl.certificate.location': 'client.pem',
 'ssl.key.location': 'client_key.pem',
 'group.id': 'my-group-id',
 'default.topic.config': {'auto.offset.reset': 'earliest'}}

In [24]:
# Configure consumer
consumer = Consumer(consumer_config)

In [25]:
# Subscribe to topic
consumer.subscribe([TOPIC_NAME])

In [26]:
for i in range(0, 10):
    msg = consumer.poll(timeout=30.0)
    print(msg.value())

b'{"id": 0, "data": "36039d5a-c48f-11ec-af74-080027fffec3"}'
b'{"id": 1, "data": "3603b9fc-c48f-11ec-af74-080027fffec3"}'
b'{"id": 2, "data": "3603ceb0-c48f-11ec-af74-080027fffec3"}'
b'{"id": 3, "data": "3603d41e-c48f-11ec-af74-080027fffec3"}'
b'{"id": 4, "data": "3603dd38-c48f-11ec-af74-080027fffec3"}'
b'{"id": 5, "data": "3603e134-c48f-11ec-af74-080027fffec3"}'
b'{"id": 6, "data": "3603e4fe-c48f-11ec-af74-080027fffec3"}'
b'{"id": 7, "data": "3603e8a0-c48f-11ec-af74-080027fffec3"}'
b'{"id": 8, "data": "3603ec56-c48f-11ec-af74-080027fffec3"}'
b'{"id": 9, "data": "3603ef94-c48f-11ec-af74-080027fffec3"}'


In [27]:
my_topic.delete()

In [28]:
my_schema.delete()