In [49]:
import datetime, time, random, string

def one_station(name):
    # temp pattern
    month_avg = [27,31,44,58,70,79,83,81,74,61,46,32]
    shift = (random.random()-0.5) * 30
    month_avg = [m + shift + (random.random()-0.5) * 5 for m in month_avg]
    
    # rain pattern
    start_rain = [0.1,0.1,0.3,0.5,0.4,0.2,0.2,0.1,0.2,0.2,0.2,0.1]
    shift = (random.random()-0.5) * 0.1
    start_rain = [r + shift + (random.random() - 0.5) * 0.2 for r in start_rain]
    stop_rain = 0.2 + random.random() * 0.2

    # day's state
    today = datetime.date(2000, 1, 1)
    temp = month_avg[0]
    raining = False
    
    # gen weather
    while True:
        # choose temp+rain
        month = today.month - 1
        temp = temp * 0.8 + month_avg[month] * 0.2 + (random.random()-0.5) * 20
        if temp < 32:
            raining=False
        elif raining and random.random() < stop_rain:
            raining = False
        elif not raining and random.random() < start_rain[month]:
            raining = True

        yield (today.strftime("%Y-%m-%d"), name, temp, raining)

        # next day
        today += datetime.timedelta(days=1)
        
def all_stations(count=10, sleep_sec=1):
    assert count <= 26
    stations = []
    for name in string.ascii_uppercase[:count]:
        stations.append(one_station(name))
    while True:
        for station in stations:
            yield next(station)
        time.sleep(sleep_sec)

In [57]:
# loops forever because the weather never ends...
for row in all_stations(3):
    print(row)
    break# date, station, temp, raining

('2000-01-01', 'A', 27.828682305481333, False)


In [2]:
from kafka import KafkaAdminClient, KafkaProducer, KafkaConsumer, TopicPartition
from kafka.admin import NewTopic
from kafka.errors import TopicAlreadyExistsError, UnknownTopicOrPartitionError

admin = KafkaAdminClient(bootstrap_servers=["kafka:9092"])
try:
    admin.delete_topics(["stations", "stations-json"])
    print("deleted")
except UnknownTopicOrPartitionError:
    print("cannot delete (may not exist yet)")

time.sleep(1)

admin.create_topics([NewTopic("stations", 6, 1)])
admin.create_topics([NewTopic("stations-json", 6, 1)])
admin.list_topics()

admin.list_topics()

cannot delete (may not exist yet)


['stations-json', 'stations']

Part 1:

In [10]:
producer=KafkaProducer(bootstrap_servers=["kafka:9092"])
producer

<kafka.producer.kafka.KafkaProducer at 0x7f06301b66b0>

In [50]:
import time, threading
import report_pb2
import json

In [96]:
def produce():
    producer = KafkaProducer(
        bootstrap_servers=["kafka:9092"],
        retries=10,
        acks="all"
    )

    for date, station, degrees, raining in all_stations(15):
        # send to "stations" stream using protobuf
        report = report_pb2.Report(date = date, station = station, degrees = degrees, raining = raining)
        key = bytes(station, "utf-8")
        value = report.SerializeToString()
        producer.send("stations", key=key, value=value)

        # send to "stations-json" using JSON
        data = {
            "date": date,
            "station": station,
            "degrees": degrees,
            "raining": int(raining)
        }
        key = bytes(station, "utf-8")
        value = bytes(json.dumps(data), "utf-8")
        producer.send("stations-json", key=key, value=value)

In [97]:
# Start thread to run produce
producer_thread = threading.Thread(target=produce)
producer_thread.start()

Part 2:

In [98]:
consumer = KafkaConsumer(bootstrap_servers=["kafka:9092"])

In [99]:
consumer.assign([TopicPartition("stations", 0)])

In [108]:
batch = consumer.poll(1000)
for topicpartition, messages in batch.items():
    for msg in messages:
        print(msg.value.decode())
        break

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xab in position 17: invalid start byte

In [32]:
import os, json

for partition in range(6):
    path = f"partition-{partition}.json"
    if os.path.exists(path):
        os.remove(path)

In [7]:
def load_partition(partition_num):
    path = f"partition-{partition_num}.json"
    if os.path.exists(path):
        with open(path, "r") as file:
            return json.load(file)
    else:
        return {"partition": partition_num, "offset": 0}

def save_partition(partition):
    path = f"partition-{partition['partition']}.json"
    with open(path, "w") as file:
        json.dump(partition, file)

In [85]:
def process_messages(messages, partition):
    for msg in messages:
        station = msg.key.decode("utf-8")
        #THE ISSUE IS WHEN WE TRY DECODE THE LINE BELOW
        date, temperature = msg.value.decode("utf-8").split(",")
        
        temperature = float(temperature)

        if station not in partition:
            partition[station] = {
                "sum": temperature,
                "count": 1,
                "avg": temperature,
                "start": date,
                "end": date,
            }
        else:
            if date > partition[station]["end"]:
                partition[station]["sum"] += temperature
                partition[station]["count"] += 1
                partition[station]["avg"] = partition[station]["sum"] / partition[station]["count"]
                partition[station]["end"] = date

    return partition

In [86]:
def consume(part_nums=[], iterations=10):
    consumer = KafkaConsumer(
        bootstrap_servers=["kafka:9092"],
        auto_offset_reset="earliest",
        api_version=(0,11,5))
    topic_list=[TopicPartition("stations", part_num) for part_num in part_nums]
    print(type(consumer))
    consumer.assign(topic_list)
    partitions = {part_num: load_partition(part_num) for part_num in part_nums} 
    for part_num, partition in partitions.items(): 
        if "offset" in partition: 
            consumer.seek(TopicPartition("stations", part_num), partition["offset"]) 
    for i in range(iterations):
        batch = consumer.poll(1000)
        for topic, messages in batch.items():
            partition_num = messages[0].partition
            partition = partitions[partition_num]
            partition = process_messages(messages, partition)
            partition["offset"] = consumer.position(TopicPartition("stations", partition_num))
            save_partition(partition)

    print("exiting")
    
for i in range(2):
    print("ROUND", i)
    t1 = threading.Thread(target=consume, args=([0,1], 30))
    t2 = threading.Thread(target=consume, args=([2,3], 30))
    t3 = threading.Thread(target=consume, args=([4,5], 30))
    t1.start()
    t2.start()
    t3.start()
    t1.join()
    t2.join()
    t3.join()  

ROUND 0
<class 'kafka.consumer.group.KafkaConsumer'>
<class 'kafka.consumer.group.KafkaConsumer'>
<class 'kafka.consumer.group.KafkaConsumer'>


Exception in thread Thread-121 (consume):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_14/1295344906.py", line 18, in consume
  File "/tmp/ipykernel_14/1187766284.py", line 5, in process_messages
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb3 in position 17: invalid start byte
Exception in thread Thread-122 (consume):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_14/1295344906.py", line 18, in consume
  File "/tmp/ipykernel_14/1187766284.py", line 5, in process_messages
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x9c in position 17: invalid start byte


N
D


Exception in thread Thread-123 (consume):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_14/1295344906.py", line 18, in consume
  File "/tmp/ipykernel_14/1187766284.py", line 5, in process_messages
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbc in position 17: invalid start byte


A
ROUND 1
<class 'kafka.consumer.group.KafkaConsumer'>
<class 'kafka.consumer.group.KafkaConsumer'>
<class 'kafka.consumer.group.KafkaConsumer'>


Exception in thread Thread-124 (consume):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_14/1295344906.py", line 18, in consume
  File "/tmp/ipykernel_14/1187766284.py", line 5, in process_messages
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb3 in position 17: invalid start byte


N


Exception in thread Thread-125 (consume):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_14/1295344906.py", line 18, in consume
  File "/tmp/ipykernel_14/1187766284.py", line 5, in process_messages
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x9c in position 17: invalid start byte


D


Exception in thread Thread-126 (consume):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_14/1295344906.py", line 18, in consume
  File "/tmp/ipykernel_14/1187766284.py", line 5, in process_messages
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbc in position 17: invalid start byte


A


In [42]:
!cat partition*.json

cat: 'partition*.json': No such file or directory


cat: 'partition*.json': No such file or directory


In [84]:
consumer = KafkaConsumer(
        bootstrap_servers=["localhost:9092"],
        auto_offset_reset="earliest",
        api_version=(0,11,5))
topic_list=[TopicPartition("stations", part_num) for part_num in part_nums]
consumer.assign(topic_list)
partitions = {part_num: load_partition(part_num) for part_num in part_nums} 
for part_num, partition in partitions.items(): 
    if "offset" in partition: 
        consumer.seek(TopicPartition("stations", part_num), partition["offset"]) 
for i in range(30):
    batch = consumer.poll(1000)
    print(batch.items())
    for topic, messages in batch.items():
        partition_num = messages[0].partition
        partition = partitions[partition_num]
        print(str(messages[1], "utf-8"))
        partition = process_messages(messages, partition)
        partition["offset"] = consumer.position(TopicPartition("stations", partition_num))
        save_partition(partition)

dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])
dict_items([])


KeyboardInterrupt: 

In [83]:
b'\n\n2000-01-01\x12\x01E\x19"\xdb\xe8\x8a\x94\xf4=@'

bytes