In [1]:
import datetime, time, random, string

def one_station(name):
    # temp pattern
    month_avg = [27,31,44,58,70,79,83,81,74,61,46,32]
    shift = (random.random()-0.5) * 30
    month_avg = [m + shift + (random.random()-0.5) * 5 for m in month_avg]
    
    # rain pattern
    start_rain = [0.1,0.1,0.3,0.5,0.4,0.2,0.2,0.1,0.2,0.2,0.2,0.1]
    shift = (random.random()-0.5) * 0.1
    start_rain = [r + shift + (random.random() - 0.5) * 0.2 for r in start_rain]
    stop_rain = 0.2 + random.random() * 0.2

    # day's state
    today = datetime.date(2000, 1, 1)
    temp = month_avg[0]
    raining = False
    
    # gen weather
    while True:
        # choose temp+rain
        month = today.month - 1
        temp = temp * 0.8 + month_avg[month] * 0.2 + (random.random()-0.5) * 20
        if temp < 32:
            raining=False
        elif raining and random.random() < stop_rain:
            raining = False
        elif not raining and random.random() < start_rain[month]:
            raining = True

        yield (today.strftime("%Y-%m-%d"), name, temp, raining)

        # next day
        today += datetime.timedelta(days=1)
        
def all_stations(count=10, sleep_sec=1):
    """Interleave the daily reports of several simulated stations, forever.

    Stations are named with the first ``count`` uppercase ASCII letters. On
    every round one report is yielded from each station in name order, then
    the generator sleeps before starting the next round.

    Args:
        count: Number of stations to simulate; must not exceed 26.
        sleep_sec: Seconds passed to ``time.sleep`` after each full round.

    Yields:
        Whatever ``one_station`` yields, one station after another.
    """
    assert count <= 26
    feeds = [one_station(letter) for letter in string.ascii_uppercase[:count]]
    while True:
        yield from (next(feed) for feed in feeds)
        time.sleep(sleep_sec)

In [2]:
# all_stations() never runs out of weather, so pull only the first report to
# preview the row layout: (date, station, temp, raining)
row = next(all_stations(3))
print(row)

('2000-01-01', 'A', 19.57385690146352, False)


In [3]:
from kafka import KafkaAdminClient, KafkaProducer, KafkaConsumer, TopicPartition
from kafka.admin import NewTopic
from kafka.errors import TopicAlreadyExistsError, UnknownTopicOrPartitionError

admin = KafkaAdminClient(bootstrap_servers=["kafka:9092"])

# Start from a clean slate: drop both topics if a previous run left them.
try:
    admin.delete_topics(["stations", "stations-json"])
    print("deleted")
except UnknownTopicOrPartitionError:
    print("cannot delete (may not exist yet)")

time.sleep(1)

# Topic deletion completes asynchronously on the broker, so a create issued
# shortly afterwards can still be rejected with TopicAlreadyExistsError.
# Retry for a few seconds instead of failing the cell on a re-run; if the
# topic is still there after the last attempt, surface the error.
for topic_name in ("stations", "stations-json"):
    for attempts_left in reversed(range(10)):
        try:
            # 6 partitions, replication factor 1
            admin.create_topics([NewTopic(topic_name, 6, 1)])
            break
        except TopicAlreadyExistsError:
            if attempts_left == 0:
                raise
            time.sleep(1)

admin.list_topics()

cannot delete (may not exist yet)


['stations-json', 'stations']

Part 1:

In [4]:
import time, threading
import report_pb2
import json

In [5]:
def produce():
    """Publish every simulated weather report to Kafka in two encodings.

    Each report from ``all_stations(15)`` is sent twice, keyed by the UTF-8
    encoded station name: first to the "stations" topic as a serialized
    ``report_pb2.Report`` protobuf, then to "stations-json" as a UTF-8 JSON
    object in which ``raining`` is stored as an integer (0 or 1).

    The function returns only if ``all_stations`` stops yielding.
    """
    producer = KafkaProducer(bootstrap_servers=["kafka:9092"], retries=10, acks="all")

    for date, station, degrees, raining in all_stations(15):
        # Both topics share the same key.
        station_key = station.encode("utf-8")

        # Protobuf flavour -> "stations"
        proto_report = report_pb2.Report(
            date=date,
            station=station,
            degrees=degrees,
            raining=raining,
        )
        producer.send("stations", key=station_key, value=proto_report.SerializeToString())

        # JSON flavour -> "stations-json"
        json_report = json.dumps({
            "date": date,
            "station": station,
            "degrees": degrees,
            "raining": int(raining),
        })
        producer.send("stations-json", key=station_key, value=json_report.encode("utf-8"))

In [6]:
# Run produce() on a background thread so the remaining cells can execute
# while reports keep streaming into Kafka. produce() never returns, so the
# thread is marked as a daemon: otherwise it would keep the interpreter from
# shutting down when the kernel (or a script export of this notebook) exits.
producer_thread = threading.Thread(target=produce, daemon=True)
producer_thread.start()

Part 2:

In [14]:
import os, json

# Remove checkpoint files left by an earlier run so the consumers below start
# every partition again from offset 0.
stale_checkpoints = [f"partition-{number}.json" for number in range(6)]
for stale_path in filter(os.path.exists, stale_checkpoints):
    os.remove(stale_path)

In [8]:
def load_partition(partition_num):
    """Return the saved state for one partition, or a fresh state if none exists.

    Args:
        partition_num: Partition number; selects ``partition-<N>.json`` in
            the current working directory.

    Returns:
        The decoded JSON content of the checkpoint file when it exists,
        otherwise ``{"partition": partition_num, "offset": 0}``.
    """
    checkpoint = f"partition-{partition_num}.json"
    if not os.path.exists(checkpoint):
        # Nothing saved yet: begin at the start of the partition.
        return {"partition": partition_num, "offset": 0}

    with open(checkpoint, "r") as handle:
        return json.load(handle)

def save_partition(partition):
    """Persist one partition's state to ``partition-<N>.json`` atomically.

    The state is first written to a sibling ``.tmp`` file and then moved over
    the real checkpoint with ``os.replace``. Opening the checkpoint directly
    in "w" mode would truncate it immediately, so a crash or serialization
    error part-way through the write would destroy the last good checkpoint
    and leave a file that ``json.load`` cannot read.

    Args:
        partition: JSON-serializable dict whose ``"partition"`` entry selects
            the file name.

    Raises:
        TypeError: If ``partition`` contains a value ``json.dump`` cannot
            encode. The existing checkpoint is left untouched in that case.
    """
    path = f"partition-{partition['partition']}.json"
    tmp_path = path + ".tmp"
    try:
        with open(tmp_path, "w") as file:
            json.dump(partition, file)
        # Swap the finished file into place in a single step.
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is gone; after a failure
        # make sure no partial file is left behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

In [9]:
def process_messages(messages, partition):
    """Fold a batch of protobuf weather reports into per-station statistics.

    Every message value is parsed as a ``report_pb2.Report``. The first
    report seen for a station creates its entry in ``partition``; a later
    report is counted only when its date is strictly greater than the
    station's stored ``"end"`` date, so reports for dates already covered
    are ignored.

    Args:
        messages: Iterable of objects with a ``value`` attribute holding the
            serialized report.
        partition: State dict, updated in place. Station names are used as
            keys next to whatever keys the dict already has.

    Returns:
        The same ``partition`` dict that was passed in.
    """
    for message in messages:
        parsed = report_pb2.Report()
        parsed.ParseFromString(message.value)

        station_name = parsed.station
        day = parsed.date
        degrees = parsed.degrees

        if station_name not in partition:
            # First report for this station.
            partition[station_name] = {
                "sum": degrees,
                "count": 1,
                "avg": degrees,
                "start": day,
                "end": day,
            }
            continue

        stats = partition[station_name]
        if not day > stats["end"]:
            # Date already covered by an earlier report: skip it.
            continue

        stats["sum"] += degrees
        stats["count"] += 1
        stats["avg"] = stats["sum"] / stats["count"]
        stats["end"] = day

    return partition

In [12]:
def consume(part_nums=(), iterations=10):
    """Consume "stations" partitions and checkpoint per-station statistics.

    The consumer is manually assigned the given partitions and positioned at
    the offset stored in each partition's checkpoint (see ``load_partition``).
    Every polled batch is folded into that partition's state with
    ``process_messages``; the state is then saved together with the
    consumer's new position, so a later call resumes where this one stopped.

    Args:
        part_nums: Partition numbers of the "stations" topic to read.
        iterations: Number of ``poll`` calls (1000 ms timeout each) to make
            before returning.

    The consumer is always closed before returning, including when an error
    is raised, so repeated calls do not leak broker connections.
    """
    consumer = KafkaConsumer(
        bootstrap_servers=["kafka:9092"],
        auto_offset_reset="earliest",
        api_version=(0, 11, 5),
    )
    try:
        consumer.assign([TopicPartition("stations", part_num) for part_num in part_nums])

        # Resume each partition from its saved offset.
        partitions = {part_num: load_partition(part_num) for part_num in part_nums}
        for part_num, partition in partitions.items():
            if "offset" in partition:
                consumer.seek(TopicPartition("stations", part_num), partition["offset"])

        for _ in range(iterations):
            batch = consumer.poll(1000)
            # poll() returns a dict keyed by TopicPartition, so the partition
            # number comes from the key rather than from the first message.
            for topic_partition, messages in batch.items():
                partition = process_messages(messages, partitions[topic_partition.partition])
                # Position is the offset of the next record to fetch, i.e.
                # where the next run has to resume.
                partition["offset"] = consumer.position(topic_partition)
                save_partition(partition)
    finally:
        consumer.close()

    print("exiting")
    
# Run the consumers twice. Each round starts three threads, one per pair of
# partitions, and waits for all of them before the next round begins; the
# second round resumes from the checkpoints written by the first.
for i in range(2):
    print("ROUND", i)
    t1, t2, t3 = workers = [
        threading.Thread(target=consume, args=(pair, 30))
        for pair in ([0, 1], [2, 3], [4, 5])
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

ROUND 0
exiting
exiting
exiting
ROUND 1
exiting
exiting
exiting


In [13]:
# Print the raw contents of every per-partition checkpoint file
# (partition-<N>.json) in the working directory, back to back.
!cat partition*.json

{"partition": 0, "offset": 101, "N": {"sum": 5115.457875990508, "count": 101, "avg": 50.648097782084236, "start": "2000-01-01", "end": "2000-04-10"}}{"partition": 1, "offset": 202, "E": {"sum": 3561.890826232486, "count": 101, "avg": 35.266245804282036, "start": "2000-01-01", "end": "2000-04-10"}, "O": {"sum": 3211.742338107154, "count": 101, "avg": 31.79942909016984, "start": "2000-01-01", "end": "2000-04-10"}}{"partition": 2, "offset": 307, "F": {"sum": 3270.129304673357, "count": 103, "avg": 31.748828200712207, "start": "2000-01-01", "end": "2000-04-12"}, "I": {"sum": 3897.061415110247, "count": 102, "avg": 38.206484461865166, "start": "2000-01-01", "end": "2000-04-11"}, "J": {"sum": 4491.969588685132, "count": 102, "avg": 44.03891753612875, "start": "2000-01-01", "end": "2000-04-11"}}{"partition": 3, "offset": 307, "D": {"sum": 2984.527210458143, "count": 103, "avg": 28.97599233454508, "start": "2000-01-01", "end": "2000-04-12"}, "G": {"sum": 4413.6122355273255, "count": 102, "avg"