<a href="https://colab.research.google.com/github/matthewpecsok/data_engineering/blob/main/tutorials/de_streaming_kafka_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install the required kafka packages

In [None]:
# %pip (unlike !pip) always installs into the environment of the running kernel; -q keeps the log short.
%pip install -q kafka-python

### Import packages

In [None]:
import os
from google.colab import userdata
from datetime import datetime
import time
import threading
import json
from kafka import KafkaProducer
from kafka.errors import KafkaError
import pandas as pd


## Download and setup Kafka and Zookeeper instances

For demo purposes, the following instances are set up locally:

- Kafka (Brokers: 127.0.0.1:9092)
- Zookeeper (Node: 127.0.0.1:2181)


In [None]:
# Extract the Kafka archive if it is already in the working directory.
# On a fresh runtime the archive has not been downloaded yet (the download cell comes next),
# so report that instead of letting tar fail on a missing file.
!if [ -f kafka_2.12-3.7.2.tgz ]; then tar -xzf kafka_2.12-3.7.2.tgz; else echo "kafka_2.12-3.7.2.tgz not found yet - run the download cell first."; fi

In [None]:
# -f makes curl exit with an error on an HTTP failure instead of saving the error page as the .tgz.
# archive.apache.org keeps every Kafka release; downloads.apache.org only carries current ones.
!curl -fsSLO https://archive.apache.org/dist/kafka/3.7.2/kafka_2.12-3.7.2.tgz

In [None]:
# List the working directory in long format, oldest entry first, with human-readable sizes.
!ls -ltrh

In [None]:
# Unpack the gzip-compressed archive into the working directory (-v prints every extracted path).
!tar -xvzf kafka_2.12-3.7.2.tgz

Start Kafka (and Zookeeper) with their default configuration

In [None]:
# List the working directory again (long format, oldest first); the kafka_2.12-3.7.2 directory used by the next cells should be present.
!ls -ltrh

In [None]:
# Start Zookeeper first, then the Kafka broker, each with the properties file shipped in the archive.
# NOTE(review): -daemon presumably detaches each process so the cell returns right away — confirm against the Kafka scripts.
!./kafka_2.12-3.7.2/bin/zookeeper-server-start.sh -daemon ./kafka_2.12-3.7.2/config/zookeeper.properties
!./kafka_2.12-3.7.2/bin/kafka-server-start.sh -daemon ./kafka_2.12-3.7.2/config/server.properties
!echo "Give the processes 10 seconds to start before proceeding."
# Fixed pause, not a readiness check: if later cells cannot connect, wait longer and retry them.
!sleep 10

Is Kafka running?

In [None]:
# The pattern [j]ava still matches "java" but not the grep/shell command line that contains it,
# so this prints something only when a JVM (Zookeeper / Kafka) is actually running.
!ps -ef | grep '[j]ava'

Create the Kafka topic with the following specs:

- sample-streaming-data: partitions=1

In [None]:
# --if-not-exists makes the cell safe to re-run: without it, creating a topic that already exists fails.
!./kafka_2.12-3.7.2/bin/kafka-topics.sh --create --if-not-exists --bootstrap-server 127.0.0.1:9092 --replication-factor 1 --partitions 1 --topic sample-streaming-data

Describe the topic for details on the configuration

In [None]:
# Ask the broker at 127.0.0.1:9092 to describe the topic; it was created with 1 partition and replication factor 1.
!./kafka_2.12-3.7.2/bin/kafka-topics.sh --describe --bootstrap-server 127.0.0.1:9092 --topic sample-streaming-data

## Generator Python script

This script simply generates random data to publish into our topic

In [None]:
%%writefile generator.py

import sys

# Command-line arguments as strings; args[0] is the script name, args[1] the iteration count.
args = sys.argv
print("running generator.py", args)

# Exit with a readable usage message instead of an IndexError traceback when the count is missing.
if len(args) < 2:
  sys.exit("usage: python generator.py <iterations>")

# Number of produce-then-sleep rounds the main loop at the bottom of the script will run.
iterations = int(args[1])
print(f'iterations: {iterations}')

def error_callback(exc):
    """Errback for producer.send() futures: turn a failed send into an exception.

    Args:
        exc: The error reported for the failed send.

    Raises:
        Exception: Always; the message embeds the text of ``exc``.
    """
    # NOTE(review): kafka-python presumably invokes errbacks from its background
    # sender thread, so this exception may be logged there rather than reaching
    # the code that called send() — confirm.
    raise Exception('Error while sending data to kafka: {0}'.format(str(exc)))

def write_to_kafka(topic_name, items):
  """Publish (message, key) string pairs to partition 0 of a Kafka topic.

  A producer connected to 127.0.0.1:9092 is created for this call and is
  always closed before returning, so repeated calls do not accumulate open
  producers.

  Args:
    topic_name: Name of the topic to publish to.
    items: Iterable of (message, key) pairs; both are str and are sent
      UTF-8 encoded.
  """
  from kafka import KafkaProducer

  count=0
  producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092'])
  try:
    for message, key in items:
      # send() is asynchronous; a failed delivery is reported through error_callback.
      producer.send(topic_name, key=key.encode('utf-8'), value=message.encode('utf-8'), partition=0).add_errback(error_callback)
      count+=1
    # Block until every queued message has actually been handed to the broker.
    producer.flush()
  finally:
    # Release the producer's connections even if encoding or sending raised.
    producer.close()
  print("Wrote {0} messages into topic: {1}".format(count, topic_name))

import random
from time import sleep

def generate_data(rows=2):
  """Build one random demo record as an iterable of (message, key) pairs.

  Args:
    rows: Accepted but not used; exactly one pair is produced per call.

  Returns:
    A zip iterator yielding a single ('hello world!<n>', '<n>') tuple, where
    <n> is a random integer drawn from 0..1000000 (inclusive). The number is
    also printed.
  """
  record_id = random.randint(0,1000000)
  print(record_id)

  key_text = str(record_id)
  payload = 'hello world!' + key_text

  # Single-element lists zipped together give one (message, key) pair.
  return zip([payload], [key_text])

# One round per requested iteration: publish a freshly generated record,
# then pause for a random whole number of seconds between 0 and 5.
for _ in range(iterations):
  batch = generate_data()
  write_to_kafka("sample-streaming-data", batch)
  pause_seconds = random.randint(0,5)
  sleep(pause_seconds)



# write some data

In [None]:
%%script bash --bg

# Run the generator script with an iteration count of 10.
# NOTE(review): --bg on the cell magic presumably runs this in the background so the next cell can start consuming right away — confirm.
python generator.py 10

In [None]:
message_n = 10
# Give up once no message has arrived for this long. It must be longer than the
# generator's longest pause between messages (5 s), otherwise reading stops early.
consumer_timeout_ms = 15000

from kafka import KafkaConsumer

# Kafka consumer configuration
bootstrap_servers = ['localhost:9092']  # Kafka server address
topic_name = 'sample-streaming-data'  # Kafka topic you want to read from
group_id = 'some_group'  # Consumer group ID

# Create a Kafka consumer
consumer = KafkaConsumer(
    topic_name,
    bootstrap_servers=bootstrap_servers,
    auto_offset_reset='earliest',  # Used only when the group has no committed offset yet
    enable_auto_commit=True,
    group_id=group_id,
    # Without a timeout, iteration blocks forever when fewer than message_n
    # messages are available (e.g. re-running this cell after the generator finished).
    consumer_timeout_ms=consumer_timeout_ms,
    value_deserializer=lambda x: x.decode('utf-8')  # Assuming messages are UTF-8 encoded
)

# Read and print messages from the topic
received = 0
try:
    # zip() ends at whichever comes first: message_n messages read, or the
    # consumer stopping iteration after consumer_timeout_ms without data.
    for _, message in zip(range(message_n), consumer):
        print(f"Received message: {message.value}")
        received += 1
finally:
    # Clean up on exit
    consumer.close()

if received < message_n:
    print(f"Stopped after {received} of {message_n} messages: nothing new arrived within {consumer_timeout_ms} ms.")


Function for retrieving messages.

# OpenWeather API example

In [None]:
import requests

In [None]:
# Fetch the value stored under the name 'open_weather' via Colab's userdata helper, keeping the key out of the notebook source.
# NOTE(review): assumes that secret exists and this notebook has been granted access to it — confirm.
api_key = userdata.get('open_weather')

In [None]:
city = "Salt Lake City"

# Endpoint only: the query string is built by requests from `params`, which also
# URL-encodes the spaces in the city name. HTTPS keeps the API key off the wire in clear text.
url = "https://api.openweathermap.org/data/2.5/weather"

response = requests.get(
    url,
    params={"q": city, "appid": api_key, "units": "imperial"},
    timeout=10,  # do not hang the notebook if the API is unreachable
)
# Fail here with the HTTP status (e.g. a rejected API key) instead of a KeyError on 'main' below.
response.raise_for_status()

data = response.json()

# The value is read from the response's 'main' -> 'temp' field; the request asks for imperial units.
current_temp = data['main']['temp']

current_temp

In [None]:
def error_callback(exc):
    """Errback for producer.send() futures: turn a failed send into an exception.

    Args:
        exc: The error reported for the failed send.

    Raises:
        Exception: Always; the message embeds the text of ``exc``.
    """
    # NOTE(review): kafka-python presumably invokes errbacks from its background
    # sender thread, so this exception may be logged there rather than reaching
    # the cell that called send() — confirm.
    raise Exception('Error while sending data to kafka: {0}'.format(str(exc)))

In [None]:
def write_to_kafka(topic_name, items):
  """Publish (message, key) string pairs to partition 0 of a Kafka topic.

  A producer connected to 127.0.0.1:9092 is created for this call and is
  always closed before returning, so re-running cells that call this
  function does not leave producers open in the kernel.

  Args:
    topic_name: Name of the topic to publish to.
    items: Iterable of (message, key) pairs; both are str and are sent
      UTF-8 encoded.
  """
  from kafka import KafkaProducer

  count=0
  producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092'])
  try:
    for message, key in items:
      # send() is asynchronous; a failed delivery is reported through error_callback.
      producer.send(topic_name, key=key.encode('utf-8'), value=message.encode('utf-8'), partition=0).add_errback(error_callback)
      count+=1
    # Block until every queued message has actually been handed to the broker.
    producer.flush()
  finally:
    # Release the producer's connections even if encoding or sending raised.
    producer.close()
  print("Wrote {0} messages into topic: {1}".format(count, topic_name))

In [None]:
# Publish one message to the "temp" topic: value = the temperature as text, key = "temp".
# NOTE(review): no cell creates the "temp" topic explicitly; this presumably relies on the broker auto-creating it — confirm.
write_to_kafka("temp",[(str(current_temp),"temp")])

In [None]:
message_n = 2
# Give up once no message has arrived for this long. The publishing cell writes
# a single message per run, so waiting for message_n without a timeout would
# block this cell forever on a fresh run.
consumer_timeout_ms = 10000

from kafka import KafkaConsumer

# Kafka consumer configuration
bootstrap_servers = ['localhost:9092']  # Kafka server address
topic_name = 'temp'  # Kafka topic you want to read from
group_id = 'some_group'  # Consumer group ID

# Create a Kafka consumer
consumer = KafkaConsumer(
    topic_name,
    bootstrap_servers=bootstrap_servers,
    auto_offset_reset='earliest',  # Used only when the group has no committed offset yet
    enable_auto_commit=True,
    group_id=group_id,
    consumer_timeout_ms=consumer_timeout_ms,  # end iteration instead of waiting indefinitely
    value_deserializer=lambda x: x.decode('utf-8')  # Assuming messages are UTF-8 encoded
)

# Read and print messages from the topic
received = 0
try:
    # zip() ends at whichever comes first: message_n messages read, or the
    # consumer stopping iteration after consumer_timeout_ms without data.
    for _, message in zip(range(message_n), consumer):
        print(f"Received message: {message.value}")
        received += 1
finally:
    # Clean up on exit
    consumer.close()

if received < message_n:
    print(f"Stopped after {received} of {message_n} messages: nothing new arrived within {consumer_timeout_ms} ms.")