In [2]:
import json
import uuid

from kafka import KafkaProducer, KafkaAdminClient
from kafka.admin.new_topic import NewTopic
from kafka.errors import TopicAlreadyExistsError

### Configuration Parameters 

> **TODO:** Change the configuration parameters to the appropriate values for your setup.

In [3]:
# Connection settings plus the name parts used to derive the client id and topic prefix.
config = {
    'bootstrap_servers': ['kafka.kafka.svc.cluster.local:9092'],
    'first_name': 'Adam',
    'last_name': 'Curry',
}

# Both derived values are "<last_name><first_name>".
config.update(
    client_id=config['last_name'] + config['first_name'],
    topic_prefix=config['last_name'] + config['first_name'],
)

config

{'bootstrap_servers': ['kafka.kafka.svc.cluster.local:9092'],
 'first_name': 'Adam',
 'last_name': 'Curry',
 'client_id': 'CurryAdam',
 'topic_prefix': 'CurryAdam'}

### Create Topic Utility Function

The `create_kafka_topic` function helps create a Kafka topic based on your configuration settings.  For instance, if your first name is *John* and your last name is *Doe*, `create_kafka_topic('locations')` will create a topic with the name `DoeJohn-locations`.  The function will not create the topic if it already exists. 

In [4]:
def create_kafka_topic(topic_name, config=config, num_partitions=1, replication_factor=1):
    """Create the Kafka topic ``<topic_prefix>-<topic_name>`` unless it already exists.

    Parameters
    ----------
    topic_name : str
        Suffix of the topic; the full name is ``config['topic_prefix']``,
        a dash, then ``topic_name``.
    config : dict
        Must provide ``bootstrap_servers``, ``client_id`` and ``topic_prefix``.
    num_partitions : int
        Partition count passed to ``NewTopic``.
    replication_factor : int
        Replication factor passed to ``NewTopic``.

    Prints whether the topic was created or already existed. Any error other
    than ``TopicAlreadyExistsError`` propagates to the caller.
    """
    bootstrap_servers = config['bootstrap_servers']
    client_id = config['client_id']
    name = '{}-{}'.format(config['topic_prefix'], topic_name)

    admin_client = KafkaAdminClient(
        bootstrap_servers=bootstrap_servers,
        client_id=client_id
    )

    topic = NewTopic(
        name=name,
        num_partitions=num_partitions,
        replication_factor=replication_factor
    )

    try:
        admin_client.create_topics(new_topics=[topic])
        print('Created topic "{}"'.format(name))
    except TopicAlreadyExistsError:
        print('Topic "{}" already exists'.format(name))
    finally:
        # A new admin client is built on every call; close it so repeated
        # calls do not accumulate open connections.
        admin_client.close()

create_kafka_topic('locations')

Topic "CurryAdam-locations" already exists


### Kafka Producer

The following code creates a `KafkaProducer` object which you can use to send Python objects that are serialized as JSON.

**Note:** This producer serializes Python objects as JSON. This means that the object must be JSON serializable.  As an example, Python `datetime` values are not JSON serializable and must be converted to a string (e.g. ISO 8601) or a numeric value (e.g. a Unix timestamp) before being sent.

In [5]:
# Shared producer: every message value is dumped to JSON text and UTF-8 encoded.
producer = KafkaProducer(
    bootstrap_servers=config['bootstrap_servers'],
    value_serializer=lambda value: json.dumps(value).encode('utf-8'),
)

### Send Data Function

The `send_data` function sends a Python object to a Kafka topic. This function adds the `topic_prefix` to the topic so `send_data('locations', data)` sends a JSON serialized message to `DoeJohn-locations`. The function also registers callbacks to let you know if the message has been sent or if an error has occurred. 

In [6]:
def on_send_success(record_metadata):
    """Success callback: print the topic, partition and offset of a sent message.

    ``record_metadata`` must expose ``topic``, ``partition`` and ``offset``
    attributes.
    """
    template = 'Message sent:\n    Topic: "{}"\n    Partition: {}\n    Offset: {}'
    report = template.format(
        record_metadata.topic,
        record_metadata.partition,
        record_metadata.offset,
    )
    print(report)
    
def on_send_error(excp):
    """Error callback: print the exception that made a send fail.

    ``print`` has no ``exc_info`` keyword (that belongs to ``logging``
    calls), so the exception is formatted into the message instead; passing
    it as a keyword would raise ``TypeError`` inside the callback.
    """
    print('Message failed to send: {!r}'.format(excp))

def send_data(topic, data, config=config, producer=producer, msg_key=None):
    """Send ``data`` to the topic ``<topic_prefix>-<topic>``.

    Parameters
    ----------
    topic : str
        Topic suffix; ``config['topic_prefix']`` and a dash are prepended.
    data : object
        Message value handed to ``producer.send``.
    config : dict
        Must provide ``topic_prefix``.
    producer : KafkaProducer
        Producer used to send the message.
    msg_key : str, optional
        Message key; a random ``uuid4`` hex string is used when omitted.
        The key is UTF-8 encoded before sending.

    ``on_send_success`` and ``on_send_error`` are registered on the object
    returned by ``producer.send``.
    """
    full_topic = '{}-{}'.format(config['topic_prefix'], topic)
    key = uuid.uuid4().hex if msg_key is None else msg_key

    future = producer.send(full_topic, value=data, key=key.encode('utf-8'))
    future.add_callback(on_send_success).add_errback(on_send_error)

In [7]:
# Small JSON-serializable payload used to try out send_data.
example_data = {'key1': 'value1', 'key2': 'value2'}

send_data('locations', example_data)

Message sent:
    Topic: "CurryAdam-locations"
    Partition: 0
    Offset: 39


In [8]:
import os
from pathlib import Path
import pandas as pd
#https://youtu.be/IEEhzQoKtQU
#https://github.com/sthakur2019/dsc650/blob/main/Week%238_Assignment.ipynb
import threading
import datetime  as dt
import json
#https://www.youtube.com/watch?v=HIz0pUXhM3U
# Storage endpoint URL (presumably the course object store -- confirm).
endpoint_url = 'https://storage.budsc.midwest-datascience.com'
current_dir = Path.cwd().absolute()

# Directories of the processed data set; both sub-directories keep a trailing slash.
base_dir = '/home/jovyan/dsc650/data/processed/bdd/'
accelerations = '{}accelerations/'.format(base_dir)
locations = '{}locations/'.format(base_dir)

# Names of the entries directly under the locations directory.
time_dir = os.listdir(locations)

# One fixed partition directory, used for trying out the parquet reader.
test_dir = '/home/jovyan/dsc650/data/processed/bdd/locations//t=007.8/'

In [32]:
# Build the {offset: directory name} lookup from the `time_dir` listing so this
# cell runs on a fresh kernel; no earlier cell defines `time_dict` globally.
# Entries without '=' (e.g. notebook checkpoint folders) are ignored.
time_dict = {
    float(entry.split('=')[1]): entry
    for entry in time_dir
    if '=' in entry
}
print(time_dict)

{98.8: 't=098.8', 49.5: 't=049.5', 117.2: 't=117.2', 4.5: 't=004.5', 17.9: 't=017.9', 81.4: 't=081.4', 56.4: 't=056.4', 30.4: 't=030.4', 109.9: 't=109.9', 106.0: 't=106.0', 0.0: 't=000.0', 88.3: 't=088.3', 14.9: 't=014.9', 10.6: 't=010.6', 94.7: 't=094.7', 26.1: 't=026.1', 33.7: 't=033.7', 77.1: 't=077.1', 121.4: 't=121.4', 66.7: 't=066.7', 7.8: 't=007.8', 113.2: 't=113.2', 63.8: 't=063.8', 60.1: 't=060.1', 91.7: 't=091.7', 37.7: 't=037.7', 21.3: 't=021.3', 102.5: 't=102.5', 45.4: 't=045.4', 41.5: 't=041.5', 85.1: 't=085.1', 52.5: 't=052.5', 70.9: 't=070.9', 73.9: 't=073.9'}


In [16]:
def time_path(folder_path):
    """Index the ``<label>=<number>`` entries of a directory by their number.

    Parameters
    ----------
    folder_path : str
        Directory whose entries are named like ``t=007.8``.

    Returns
    -------
    tuple
        ``(time_l, time_dict, folder_dir)``: ``time_l`` is the list of unique
        float values in ascending order, ``time_dict`` maps each float to the
        entry name it came from, and ``folder_dir`` is ``folder_path``
        unchanged.
    """
    time_dict = {}
    for entry in os.listdir(folder_path):
        parts = entry.split('=')
        if len(parts) < 2:
            # Not a "<label>=<number>" entry (e.g. .ipynb_checkpoints); skip it
            # instead of failing on the missing value.
            continue
        time_dict[float(parts[1])] = entry

    # Sorted so callers that pace on the gap between consecutive values see
    # them in increasing order; a set has no defined order.
    time_l = sorted(time_dict)
    return time_l, time_dict, folder_path

#time_path(locations)

In [11]:
def read_parq(pq_path):
    """Read parquet data and return the whole frame as one JSON string.

    Parameters
    ----------
    pq_path : str
        Path handed to ``pandas.read_parquet``.

    Returns
    -------
    str
        ``DataFrame.to_json()`` of the loaded frame with default settings,
        i.e. ``{"column": {"row label": value, ...}, ...}``.
    """
    # A row-by-row to_json() pass used to run here, but its result was thrown
    # away, so only the frame-level serialization is kept.
    return pd.read_parquet(pq_path).to_json()

# NOTE: despite its name, `df` holds the JSON string returned by read_parq,
# not a DataFrame.
df = read_parq(test_dir)

In [12]:
df

'{"id":{"0":"58682c5d48cad9d9e103431d773615bf","1":"85c61911b7fe2ced1000c33c9e932706"},"ride_id":{"0":"c9a2b46c9aa515b632eddc45c4868482","1":"6760ffa3f41908695d1405b776c3e8d5"},"uuid":{"0":"19b9aa10588646b3bf22c9b4865a7995","1":"dad7eae44e784b549c8c5a3aa051a8c7"},"timestamp":{"0":1503882,"1":1507320},"offset":{"0":8.5250608865,"1":8.0779125296},"course":{"0":299.619140625,"1":159.609375},"latitude":{"0":40.7628694891,"1":40.678191064},"longitude":{"0":-73.9619473161,"1":-73.8181926124},"geohash":{"0":"dr5ruuwsctwg","1":"dr5x2jppxkqj"},"speed":{"0":0.0,"1":13.1499996185},"accuracy":{"0":10.0,"1":10.0},"timelapse":{"0":false,"1":false},"filename":{"0":"e2f795a7-6a7d-4500-b5d7-4569de996811.mov","1":"d745b92f-aefd-467d-9121-7a71308e8d6d.mov"}}'

In [17]:
def send(f):
    """Send every ``<label>=<number>`` sub-directory of ``f`` to the ``locations`` topic.

    The sub-directories are visited in ascending order of their number. Before
    each message the function pauses for the difference between the current
    number and the previous one (starting from 0.0), so the numbers act as
    send offsets in seconds.

    Parameters
    ----------
    f : str
        Directory passed to ``time_path``; each indexed sub-directory is read
        with ``read_parq`` and its JSON string is sent via ``send_data``.
    """
    time_l, time_dict, folder_dir = time_path(f)
    pacer = threading.Event()  # never set; wait() only serves as a timed pause
    previous = 0.0
    # Ascending order keeps every gap (t - previous) non-negative; with an
    # unordered list, negative gaps would make wait() return immediately.
    for t in sorted(time_l):
        # The empty last component yields a trailing separator on the path.
        final_dir = os.path.join(folder_dir, time_dict[t], '')
        pq = read_parq(final_dir)
        pacer.wait(t - previous)
        previous = t
        send_data('locations', pq)

send(locations)

Message sent:
    Topic: "CurryAdam-locations"
    Partition: 0
    Offset: 40
Message sent:
    Topic: "CurryAdam-locations"
    Partition: 0
    Offset: 41
Message sent:
    Topic: "CurryAdam-locations"
    Partition: 0
    Offset: 42
Message sent:
    Topic: "CurryAdam-locations"
    Partition: 0
    Offset: 43
Message sent:
    Topic: "CurryAdam-locations"
    Partition: 0
    Offset: 44
Message sent:
    Topic: "CurryAdam-locations"
    Partition: 0
    Offset: 45
Message sent:
    Topic: "CurryAdam-locations"
    Partition: 0
    Offset: 46
Message sent:
    Topic: "CurryAdam-locations"
    Partition: 0
    Offset: 47
Message sent:
    Topic: "CurryAdam-locations"
    Partition: 0
    Offset: 48
Message sent:
    Topic: "CurryAdam-locations"
    Partition: 0
    Offset: 49
Message sent:
    Topic: "CurryAdam-locations"
    Partition: 0
    Offset: 50
Message sent:
    Topic: "CurryAdam-locations"
    Partition: 0
    Offset: 51
Message sent:
    Topic: "CurryAdam-locations"
    P