<a href="https://colab.research.google.com/github/matthewpecsok/data_engineering/blob/main/tutorials/de_streaming_kafka_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install the required kafka packages

In [None]:
!pip install kafka-python

### Import packages

In [None]:
import os
from google.colab import userdata
from datetime import datetime
import time
import threading
import json
from kafka import KafkaProducer
from kafka.errors import KafkaError
import pandas as pd


## Download and setup Kafka and Zookeeper instances

For demo purposes, the following instances are set up locally:

- Kafka (Brokers: 127.0.0.1:9092)
- Zookeeper (Node: 127.0.0.1:2181)


In [None]:
!curl -sSOL https://downloads.apache.org/kafka/3.7.0/kafka_2.12-3.7.0.tgz
!tar -xzf kafka_2.12-3.7.0.tgz

Kafka with defaults

In [None]:
# Start Zookeeper first, then the Kafka broker, each with its default properties file.
!./kafka_2.12-3.7.0/bin/zookeeper-server-start.sh -daemon ./kafka_2.12-3.7.0/config/zookeeper.properties
!./kafka_2.12-3.7.0/bin/kafka-server-start.sh -daemon ./kafka_2.12-3.7.0/config/server.properties
!echo "Give the processes 10 seconds to start before proceeding."
# Fixed pause so both processes can finish starting before later cells talk to them.
!sleep 10

Is Kafka running?

In [None]:
!ps -ef | grep java

Create the kafka topics with the following specs:

- sample-streaming-data: partitions=1

In [None]:
!./kafka_2.12-3.7.0/bin/kafka-topics.sh --create --bootstrap-server 127.0.0.1:9092 --replication-factor 1 --partitions 1 --topic sample-streaming-data

Describe the topic for details on the configuration

In [None]:
!./kafka_2.12-3.7.0/bin/kafka-topics.sh --describe --bootstrap-server 127.0.0.1:9092 --topic sample-streaming-data

## generator python script

This script simply generates random data to publish into our topic

In [None]:
%%writefile generator.py

import sys

# Usage: python generator.py <iterations>
args = sys.argv  # raw command-line arguments, all strings
print("running generator.py", args)
iterations = int(args[1])  # first argument: how many times the publish loop runs
print('iterations: {0}'.format(iterations))

def error_callback(exc):
    """Errback for producer sends: re-raise the failure as a plain Exception.

    Args:
        exc: the error object handed to the errback; its str() form is
            embedded in the raised message.

    Raises:
        Exception: always.
    """
    reason = str(exc)
    raise Exception('Error while sendig data to kafka: ' + reason)

def write_to_kafka(topic_name, items):
  """Publish every (message, key) pair in `items` to partition 0 of `topic_name`.

  Args:
    topic_name: name of the Kafka topic to publish to.
    items: iterable of (message, key) string pairs; both are UTF-8 encoded
      before being sent.

  A new producer is created on every call and is now always closed before
  returning, so repeated calls no longer leave producers (and their
  connections) open.
  """
  from kafka import KafkaProducer

  count = 0
  producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092'])
  try:
    for message, key in items:
      future = producer.send(
          topic_name,
          key=key.encode('utf-8'),
          value=message.encode('utf-8'),
          partition=0,
      )
      future.add_errback(error_callback)
      count += 1
    # Wait for everything queued by send() to be delivered before reporting.
    producer.flush()
  finally:
    # Release the producer even when encoding or sending raised.
    producer.close()
  print("Wrote {0} messages into topic: {1}".format(count, topic_name))

import random
from time import sleep

def generate_data(rows=2):
  """Build one random (message, key) pair for the demo topic.

  Args:
    rows: accepted but not used; exactly one pair is produced per call.

  Returns:
    A zip iterator yielding a single ('hello world!<n>', '<n>') tuple, where
    <n> is a random integer between 0 and 1000000 (also printed).
  """
  index_num = random.randint(0, 1000000)
  print(index_num)

  key = str(index_num)
  message = 'hello world!' + key
  return zip([message], [key])

# Publish one freshly generated batch per iteration, pausing 0-5 seconds in between.
for _ in range(iterations):
  batch = generate_data()
  write_to_kafka("sample-streaming-data", batch)
  pause_seconds = random.randint(0,5)
  sleep(pause_seconds)



# write some data

In [None]:
%%script bash --bg

# Launch generator.py as a background job; 10 is its first argument (the iteration count).
python generator.py 10

In [None]:
from itertools import islice

from kafka import KafkaConsumer

message_n = 10  # maximum number of messages to read in this cell

# Kafka consumer configuration
bootstrap_servers = ['localhost:9092']  # Kafka server address
topic_name = 'sample-streaming-data'  # Kafka topic you want to read from
group_id = 'some_group'  # Consumer group ID
consumer_timeout_ms = 30_000  # stop iterating after this long without a new message

# Create a Kafka consumer
consumer = KafkaConsumer(
    topic_name,
    bootstrap_servers=bootstrap_servers,
    auto_offset_reset='earliest',  # Start reading at the earliest message
    enable_auto_commit=True,
    group_id=group_id,
    # Without a timeout the iterator blocks forever when fewer than message_n
    # messages are ever produced (for example if the generator failed to start).
    consumer_timeout_ms=consumer_timeout_ms,
    value_deserializer=lambda x: x.decode('utf-8')  # Assuming messages are UTF-8 encoded
)

# Read and print up to message_n messages from the topic
received = 0
try:
    for message in islice(consumer, message_n):
        received += 1
        print(f"Received message: {message.value}")
finally:
    # Clean up on exit
    consumer.close()

if received < message_n:
    print(f"Stopped after {received} of {message_n} messages: nothing new arrived within {consumer_timeout_ms} ms.")


# UTA API

We'll retrieve real-time data from UTA and publish it to our Kafka topic as a producer.

Then we'll interact with the topic as a Consumer and get the messages out of the topic

## install package needed for uta api

In [None]:
!pip install xmltodict

## create a dict of our token to save to a file for the python script

In [None]:
# Wrap the 'uta' secret from Colab user data in a dict under the key 'token'.
token_dict = dict(token=userdata.get('uta'))

## dump the token data to a file

In [None]:
# Serialise the token dict to token.json so the generator script can read it back.
with open('token.json', 'w') as token_file:
    token_file.write(json.dumps(token_dict))

## retrieve the data from the API and write it to the topic.

these scripts will retrieve data from the API and write the messages to the kafka topic

In [None]:
%%writefile uta_generator.py

from google.colab import userdata
import json
import xmltodict
import sys


# Usage: python uta_generator.py <iterations>
args = sys.argv  # a list of the arguments provided (str)
# Report this script's own name (the message previously said "generator.py").
print("running uta_generator.py", args)
iterations = int(args[1])  # first argument: how many API polls to perform

print(f'iterations: {iterations}')





def error_callback(exc):
    """Errback for producer sends: turn the reported failure into an Exception.

    Args:
        exc: the error object passed to the errback; its str() form becomes
            part of the raised message.

    Raises:
        Exception: always.
    """
    detail = str(exc)
    message = 'Error while sendig data to kafka: ' + detail
    raise Exception(message)




# take the data retrieved from the api and write it to the kafka topic
def write_to_kafka(topic_name, items):
  """Publish one JSON location message per (ref, lat, lon) triple in `items`.

  Args:
    topic_name: name of the Kafka topic to publish to (partition 0 is used).
    items: iterable of (vehicle_ref, latitude, longitude) triples.

  Each message is a UTF-8 encoded JSON object with the keys "Vehichle_ref",
  "Latitude" and "Longitude". The payload is built with json.dumps: the old
  hand-written f-string was missing the opening quote before Latitude and left
  the vehicle ref unquoted, so the messages were not valid JSON.
  The producer is always closed before returning.
  """
  import json
  import math
  from kafka import KafkaProducer

  def _as_number(value):
    # Emit coordinates as JSON numbers when they parse as finite floats
    # (they presumably arrive as text from the parsed XML); otherwise pass
    # the value through, mapping NaN/inf to null so the JSON stays valid.
    try:
      number = float(value)
    except (TypeError, ValueError):
      return value
    return number if math.isfinite(number) else None

  count=0
  producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092'])
  print(items)

  try:
    for ref, lat, lon in items:
      print(ref, lat, lon)
      # NOTE(review): the key spelling "Vehichle_ref" is kept unchanged on purpose.
      location = json.dumps({
          "Vehichle_ref": ref,
          "Latitude": _as_number(lat),
          "Longitude": _as_number(lon),
      })
      producer.send(topic_name, value=location.encode('utf-8'), partition=0).add_errback(error_callback)
      count+=1

    # Wait for everything queued by send() to be delivered before reporting.
    producer.flush()
  finally:
    # Release the producer even when building or sending a message raised.
    producer.close()
  print("Wrote {0} messages into topic: {1}".format(count, topic_name))


# the api call to the uta api
def get_locations(token):
    """Fetch the vehicles currently reported for UTA route 703 and return their positions.

    Args:
        token: UTA API user token, sent as the `usertoken` query parameter.

    Returns:
        A tuple of (VehicleRef, Latitude, Longitude) triples, one per vehicle
        journey in the response.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if the API does not answer within the timeout.
    """
    from time import sleep
    import requests
    import xmltodict
    import pandas as pd

    url = 'http://api.rideuta.com/SIRI/SIRI.svc/VehicleMonitor/ByRoute'
    params = {'route': '703', 'onwardcalls': 'true', 'usertoken': token}
    # Log only the endpoint: the full request URL contains the secret token
    # and this script's output is redirected to a log file.
    print(url)
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, params=params, timeout=30)
    # Fail with a clear HTTP error instead of an obscure XML/KeyError later on.
    response.raise_for_status()
    xml_dict = xmltodict.parse(response.text)
    journeys = xml_dict['Siri']['VehicleMonitoringDelivery']['VehicleActivity']['MonitoredVehicleJourney']
    # xmltodict yields a dict rather than a list when an element occurs only
    # once; wrap it so a single-vehicle response still becomes a one-row frame.
    if isinstance(journeys, dict):
        journeys = [journeys]
    df = pd.DataFrame(journeys)
    print(df)
    print(df.VehicleRef.value_counts())
    print(df.shape)

    # Flatten the nested VehicleLocation records and pair them with VehicleRef by row index.
    location_df = pd.json_normalize(df.VehicleLocation)
    vehicle_location = pd.merge(df['VehicleRef'],location_df,left_index=True,right_index=True)
    zip_tuple = tuple(zip(vehicle_location['VehicleRef'],vehicle_location['Latitude'], vehicle_location['Longitude']))
    # Pause before returning so successive calls are spaced out (presumably to
    # respect the API's polling interval - TODO confirm).
    sleep(6)

    return zip_tuple

# open the token file
# read the token file written by the notebook
with open('token.json', 'r') as token_file:
    token_dict = json.loads(token_file.read())

# pull the token out of the dict
token = token_dict['token']

# poll the API once per iteration and publish
# each batch of locations to the kafka topic

for _ in range(iterations):
  locations = get_locations(token)
  write_to_kafka("sample-streaming-data", locations)



## let's remind ourselves of the UTA data structure

create a function to hit the api using our token

In [None]:
def get_red_trains(token):
    """Request the current vehicle data for UTA route 703 and return it as a DataFrame.

    Args:
        token: UTA API user token. The passed value is now actually used
            (it was previously overwritten inside the function); when it is
            empty or None the 'uta' Colab secret is used instead.

    Returns:
        pandas.DataFrame built from the MonitoredVehicleJourney entries of the response.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if the API does not answer within the timeout.
    """
    import requests
    import xmltodict
    import pandas as pd

    if not token:
        # Fall back to the Colab secret only when the caller supplied nothing.
        from google.colab import userdata
        token = userdata.get('uta')

    url = 'http://api.rideuta.com/SIRI/SIRI.svc/VehicleMonitor/ByRoute'
    params = {'route': '703', 'onwardcalls': 'true', 'usertoken': token}
    # A timeout keeps a stalled connection from blocking the notebook cell forever.
    response = requests.get(url, params=params, timeout=30)
    # Surface HTTP failures directly instead of failing later while parsing.
    response.raise_for_status()
    xml_dict = xmltodict.parse(response.text)
    journeys = xml_dict['Siri']['VehicleMonitoringDelivery']['VehicleActivity']['MonitoredVehicleJourney']
    # xmltodict yields a dict rather than a list when an element occurs only once.
    if isinstance(journeys, dict):
        journeys = [journeys]
    df = pd.DataFrame(journeys)
    return df


hit the api and show the dataframe we have from the request.

In [None]:
# Look up the 'uta' secret, request the route data with it, then display the frame.
uta_token = userdata.get('uta')
df = get_red_trains(uta_token)
df

## get train lat/lon

Convert the JSON data to a DataFrame after normalizing the nested lat/lon data.

In [None]:
import pandas as pd

# Flatten the nested VehicleLocation records into their own columns.
location_df = pd.json_normalize(df.VehicleLocation)
# Pair each VehicleRef with its location columns by row index.
vehicle_ref = df[['VehicleRef']]
vehicle_location = vehicle_ref.merge(location_df, left_index=True, right_index=True)
vehicle_location

## get UTA data

This runs the script asynchronously (in the background), so our cells are not blocked from executing while the API data is retrieved.

In [None]:
%%script bash --bg

# 2>&1 appends stderr to log.txt as well; otherwise a traceback from the
# background script is lost and a failed run looks like a silent one.
python uta_generator.py 10 >> log.txt 2>&1

## KAFKA Consumer for UTA

In [None]:
from itertools import islice

from kafka import KafkaConsumer

message_n = 100  # maximum number of messages to read in this cell

messages = []  # every message received by this cell, in arrival order

# Kafka consumer configuration
bootstrap_servers = ['localhost:9092']  # Kafka server address
topic_name = 'sample-streaming-data'  # Kafka topic you want to read from
group_id = 'some_group'  # Consumer group ID
consumer_timeout_ms = 30_000  # stop iterating after this long without a new message

# Create a Kafka consumer
consumer = KafkaConsumer(
    topic_name,
    bootstrap_servers=bootstrap_servers,
    auto_offset_reset='earliest',  # Start reading at the earliest message
    enable_auto_commit=True,
    group_id=group_id,
    # Without a timeout the iterator blocks forever when the background script
    # publishes fewer than message_n messages (or fails before publishing any).
    consumer_timeout_ms=consumer_timeout_ms,
    value_deserializer=lambda x: x.decode('utf-8')  # Assuming messages are UTF-8 encoded
)

# Read and print up to message_n messages from the topic
try:
    for message in islice(consumer, message_n):
        messages.append(message)
        print(f"Received message: {message.value}")
finally:
    # Clean up on exit
    consumer.close()

if len(messages) < message_n:
    print(f"Stopped after {len(messages)} of {message_n} messages: nothing new arrived within {consumer_timeout_ms} ms.")


Inspect the last message retrieved from the topic.

In [None]:
# Display the last message object bound by the consumer loop in the cell above.
message

# retrieve messages