In [1]:
## PARAMETERS ##

N_PARTITIONS = 3 # number of partitions given to each created topic
BOOTSTRAP_SERVER = '10.67.22.61' # value used for the Kafka 'bootstrap.servers' setting
MSG_RATE = 1000 # number of messages per second

# parameters for artificial rate control
# Fraction of one second's worth of messages sent between two rate-control pauses.
# Values below 0.1 are raised to 0.1 because of the time resolution of the sleep function
# (see https://stackoverflow.com/questions/1133857/how-accurate-is-pythons-time-sleep)
BATCH_FRACTION = 0.1
MIN_BATCH_FRACTION = 0.1 # lower bound applied to BATCH_FRACTION

# Number of messages between each rate control.
# Clamped to at least 1: for MSG_RATE < 10 the product truncates to 0, and a
# batch size of 0 would never be matched by the message counter, which would
# silently disable the rate control.
BATCH_SIZE = max(1, int(max(MIN_BATCH_FRACTION*MSG_RATE, BATCH_FRACTION*MSG_RATE)))


# ADMIN SECTION: create and delete topics

In [2]:
from  confluent_kafka.admin import AdminClient, NewTopic

In [3]:
# Admin client bound to the configured bootstrap server; the cells below use
# it to create, delete and list topics.
kafka_admin = AdminClient({
    'bootstrap.servers': BOOTSTRAP_SERVER,
})

In [4]:
def create_topics(admin, topics):
    """Create each topic named in ``topics`` and print the outcome.

    Every topic is requested with ``N_PARTITIONS`` partitions and a
    replication factor of 1.

    Args:
        admin: object exposing ``create_topics(new_topics, request_timeout=...)``,
            e.g. the ``AdminClient`` built in this notebook.
        topics: iterable of topic names.

    Returns:
        None. One line is printed per topic; a failure is reported on
        stdout instead of being raised.
    """
    topic_specs = []
    for name in topics:
        topic_specs.append(
            NewTopic(name, num_partitions=N_PARTITIONS, replication_factor=1)
        )

    # The call is asynchronous and hands back a dict of <topic, future>.
    # The timeout is set on the request itself (15 s) rather than on the
    # individual futures.
    pending = admin.create_topics(topic_specs, request_timeout=15.0)

    # Block on every future in turn; they all complete at the same time.
    for name, future in pending.items():
        try:
            future.result()  # nothing useful is returned (None) on success
            print("Topic {} created".format(name))
        except Exception as err:
            print("Failed to create topic {}: {}".format(name, err))

In [5]:
def delete_topics(admin, topics):
    """Delete each topic named in ``topics`` and print the outcome.

    Args:
        admin: object exposing ``delete_topics(topics, operation_timeout=...)``,
            e.g. the ``AdminClient`` built in this notebook.
        topics: list of topic names.

    Returns:
        None. One line is printed per topic; a failure is reported on
        stdout instead of being raised.
    """
    # The call is asynchronous and hands back a dict of <topic, future>.
    # Without a timeout the broker answers immediately and removes the
    # topics in the background; the 30 s operation timeout gives the
    # deletion time to propagate in the cluster before the call returns.
    pending = admin.delete_topics(topics, operation_timeout=30)

    # Block on every future in turn.
    for name in pending:
        future = pending[name]
        try:
            future.result()  # nothing useful is returned (None) on success
            print("Topic {} deleted".format(name))
        except Exception as err:
            print("Failed to delete topic {}: {}".format(name, err))

In [6]:
# Delete the 'data' topic; the following cell re-creates it when it is
# not listed by the broker.
delete_topics(kafka_admin, ['data'])

Topic data deleted


In [7]:
## create the topic unless the broker already lists it
topic_name = 'data'
if topic_name in kafka_admin.list_topics().topics:
    print("Topic " + topic_name + " already exists")
else:
    create_topics(kafka_admin, [topic_name])

Topic data created


# Connect to bucket

In [8]:
import boto3

In [9]:
# S3 client pointed at an explicit endpoint URL instead of the default one.
# NOTE(review): verify=False disables TLS certificate verification for this
# endpoint — confirm this is intentional (e.g. self-signed certificate) and
# prefer passing a CA bundle path if one is available.
s3_client = boto3.client('s3', endpoint_url='https://cloud-areapd.pd.infn.it:5210', verify=False)

# Create producer and send messages at specified rate

In [10]:
from confluent_kafka import Producer
import time

In [11]:
# Producer with explicit batching settings: a batch is sent either when it
# is full or when the linger delay expires, whichever comes first.
producer = Producer({
    'bootstrap.servers': BOOTSTRAP_SERVER,
    'linger.ms': 20,      # delay in ms before messages are sent if the batch is not full
    'batch.size': 16384,  # maximum batch size before messages are sent
})

In [12]:

bucket_name = 'mapd-minidt-stream'
batch_count = 0 # messages produced since the last rate-control pause
batch_start = time.monotonic() # start time of the current batch

# The counter is incremented before it is compared, so a BATCH_SIZE of 0
# (possible for very small MSG_RATE) would never be reached and the stream
# would run unthrottled; use at least 1.
batch_limit = max(1, BATCH_SIZE)

# read all lines from all files in the bucket
# 'Contents' is missing from the listing response when the bucket is empty,
# hence .get() with an empty default instead of indexing.
# NOTE(review): a single list_objects call returns one page of keys only;
# switch to a paginator if the bucket can hold more than one page.
for key in s3_client.list_objects(Bucket=bucket_name).get('Contents', []):

    # create line iterator ('key' is one listing entry; its 'Key' field is the object name)
    line_reader = s3_client.get_object(Bucket=bucket_name, Key=key['Key'])['Body'].iter_lines()

    # skip header line for each file; the default value keeps an empty
    # file from raising StopIteration and aborting the whole run
    next(line_reader, None)

    for line in line_reader:

        try:
            producer.produce(topic_name, line) # produce message
        except BufferError:
            # local producer queue is full: give it time to drain, then retry once
            producer.poll(1)
            producer.produce(topic_name, line)
        producer.poll(0) # poll producer (async process, messages are actually sent following the batching criteria)
        batch_count += 1 # update counter

        if batch_count >= batch_limit: # add artificial rate control
            # Sleep only for what is left of this batch's time budget, so the
            # time already spent reading from the bucket and producing does
            # not push the effective rate below MSG_RATE.
            pause = batch_count/MSG_RATE - (time.monotonic() - batch_start)
            if pause > 0:
                time.sleep(pause)
            batch_count = 0 # reset counter
            batch_start = time.monotonic()

    producer.flush() # wait for the last messages of this file to be sent



KeyboardInterrupt: 