# Apache Kafka Avançado — Notebook Único (Colab)

Este notebook contém **10 slides avançados**, cada um com **código executável** no Google Colab.

> **Importante:** Execute as células **na ordem**.

## Slide 0 — Setup Kafka (KRaft, single-node)

In [None]:

import os, subprocess, pathlib, re, time

# Shell helper for the setup steps below (installing Java 8, which Kafka requires, and Kafka itself)
def sh(cmd):
    """Run *cmd* through the shell and return its captured standard output.

    The command line is echoed (prefixed with ``$``) before it runs. If the
    command exits with a non-zero status, its captured stdout and stderr are
    printed and ``RuntimeError(cmd)`` is raised.
    """
    print(f"$ {cmd}")
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode == 0:
        return result.stdout
    # Failure path: show both captured streams so the cause is visible in the
    # notebook output, then abort the cell.
    print(result.stdout)
    print(result.stderr)
    raise RuntimeError(cmd)

# Install Java 8 (treated by this setup as a Kafka prerequisite) when its JVM
# directory is not present yet.
JAVA8_HOME = "/usr/lib/jvm/java-8-openjdk-amd64"
if not pathlib.Path(JAVA8_HOME).exists():
    print("Installing Java 8...")
    sh("apt-get install openjdk-8-jdk-headless -qq > /dev/null")

# Export JAVA_HOME on every run, not only right after installing: environment
# variables are lost when the kernel restarts, while the installed JDK stays.
os.environ["JAVA_HOME"] = JAVA8_HOME

# Pinned Kafka distribution (Scala build + Kafka release).
KAFKA_VERSION = "3.6.1"
SCALA_VERSION = "2.13"
KAFKA_TGZ = f"kafka_{SCALA_VERSION}-{KAFKA_VERSION}.tgz"
KAFKA_DIR = f"kafka_{SCALA_VERSION}-{KAFKA_VERSION}"

# Fetch from the Apache archive: downloads.apache.org only serves the current
# releases, whereas the archive keeps every past version, so a pinned version
# stays downloadable.
KAFKA_URL = f"https://archive.apache.org/dist/kafka/{KAFKA_VERSION}/{KAFKA_TGZ}"

# Download and unpack only when the extracted directory is missing.
if not pathlib.Path(KAFKA_DIR).exists():
    # -O makes wget overwrite any leftover tarball instead of saving a ".1" copy
    # that the tar step below would never look at.
    sh(f"wget -q -O {KAFKA_TGZ} {KAFKA_URL}")
    sh(f"tar -xzf {KAFKA_TGZ}")

# Later cells locate the installation through KAFKA_HOME.
os.environ["KAFKA_HOME"] = str(pathlib.Path(KAFKA_DIR).resolve())
print("KAFKA_HOME =", os.environ["KAFKA_HOME"])

# Rewrite the bundled KRaft server.properties for a single local node.
kraft_cfg = pathlib.Path(KAFKA_DIR) / "config" / "kraft" / "server.properties"
cfg = kraft_cfg.read_text()
# Dots in the property names are escaped so they match literally, not "any character".
cfg = re.sub(r"^listeners=.*$", "listeners=PLAINTEXT://:9092,CONTROLLER://:9093", cfg, flags=re.M)
cfg = re.sub(r"^advertised\.listeners=.*$", "advertised.listeners=PLAINTEXT://127.0.0.1:9092", cfg, flags=re.M)
cfg = re.sub(r"^log\.dirs=.*$", "log.dirs=/tmp/kafka-logs", cfg, flags=re.M)
kraft_cfg.write_text(cfg)

# Format the storage directory with a fresh cluster id.
# --ignore-formatted lets this cell be re-run: without it, kafka-storage.sh
# exits with an error when the log directory was already formatted.
cluster_id = sh(f"{KAFKA_DIR}/bin/kafka-storage.sh random-uuid").strip()
sh(f"{KAFKA_DIR}/bin/kafka-storage.sh format --ignore-formatted -t {cluster_id} -c {kraft_cfg} >/dev/null")
print("Kafka preparado (KRaft).")


## Slide 0.1 — Subir Kafka

In [None]:

import subprocess, os, time, pathlib

import socket

KAFKA_DIR = os.environ["KAFKA_HOME"]
kraft_cfg = pathlib.Path(KAFKA_DIR) / "config" / "kraft" / "server.properties"
KAFKA_LOG = "/tmp/kafka-server.log"

# Send the broker's output to a file rather than a pipe: a pipe that nobody
# keeps reading eventually fills up and blocks the broker on its next write.
# The child keeps its own copy of the file descriptor, so the parent's handle
# can be closed as soon as the process has been spawned.
with open(KAFKA_LOG, "w") as log_file:
    kafka_proc = subprocess.Popen(
        [f"{KAFKA_DIR}/bin/kafka-server-start.sh", str(kraft_cfg)],
        stdout=log_file,
        stderr=subprocess.STDOUT,
    )

# Wait until the client listener accepts TCP connections instead of sleeping
# for a fixed time; fail loudly if the process dies or never comes up.
deadline = time.time() + 60
while True:
    if kafka_proc.poll() is not None:
        raise RuntimeError(
            f"Kafka exited early (code {kafka_proc.returncode}); see {KAFKA_LOG}"
        )
    try:
        socket.create_connection(("127.0.0.1", 9092), timeout=1).close()
        break
    except OSError:
        if time.time() > deadline:
            raise TimeoutError(
                f"Kafka did not open port 9092 within 60s; see {KAFKA_LOG}"
            ) from None
        time.sleep(1)

# Show the most recent startup log lines for reference.
for line in pathlib.Path(KAFKA_LOG).read_text().splitlines()[-15:]:
    print(line.rstrip())

print("Kafka em execução.")


## Slide 0.2 — Cliente Python

In [None]:
# Install the kafka-python package (imported as `kafka` in later cells); -q keeps pip quiet.
!pip -q install kafka-python

## Slide 1 — Kafka como Log Distribuído

In [None]:

from kafka import KafkaProducer, KafkaConsumer

# Append five events to the topic, then read it back starting from the
# earliest available offset and print each record's offset and payload.
topic = "s1_log"
producer = KafkaProducer(bootstrap_servers="127.0.0.1:9092")

payloads = [f"evento-{n}".encode() for n in range(5)]
for payload in payloads:
    producer.send(topic, payload)
producer.flush()

# consumer_timeout_ms ends the iteration once no record arrives for 2 seconds.
reader_settings = {
    "bootstrap_servers": "127.0.0.1:9092",
    "auto_offset_reset": "earliest",
    "enable_auto_commit": False,
    "consumer_timeout_ms": 2000,
}
consumer = KafkaConsumer(topic, **reader_settings)

for record in consumer:
    text = record.value.decode()
    print(record.offset, text)
consumer.close()


## Slide 2 — Partitions

In [None]:

import subprocess, os
KAFKA_DIR = os.environ["KAFKA_HOME"]
topic = "s2_partitions"

# Shared prefix for both kafka-topics.sh invocations.
kafka_topics = [
    f"{KAFKA_DIR}/bin/kafka-topics.sh",
    "--bootstrap-server", "127.0.0.1:9092",
]

# --if-not-exists makes the cell re-runnable: without it, a second run fails
# because the topic already exists and check=True raises CalledProcessError.
subprocess.run(
    kafka_topics + [
        "--create", "--if-not-exists", "--topic", topic,
        "--partitions", "3",
        "--replication-factor", "1",
    ],
    check=True,
)

# Print the topic's partition layout.
subprocess.run(kafka_topics + ["--describe", "--topic", topic], check=True)


## Slide 3 — Semântica de Entrega

In [None]:

from kafka import KafkaProducer
import json, time

# Producer that waits for the acks="all" acknowledgement and retries up to
# five times on send failures.
producer = KafkaProducer(
    bootstrap_servers="127.0.0.1:9092",
    acks="all",
    retries=5
)

future = producer.send("s3_delivery", json.dumps({
    "id":1,"evento":"pagamento","ts":time.time()
}).encode())
producer.flush()

# send() is asynchronous and flush() does not report per-record failures, so
# block on the returned future: get() raises if the record was not
# acknowledged, which keeps the success message below honest.
future.get(timeout=10)

print("Evento enviado com acks=all.")


## Slide 4 — Performance (batch + compressão)

In [None]:

from kafka import KafkaProducer
import time, json, random

# Producer configured with a 50 ms linger, 64000-byte batches and gzip
# compression; time how long it takes to send and flush 2000 small JSON records.
batching_options = {
    "linger_ms": 50,
    "batch_size": 64000,
    "compression_type": "gzip",
}
producer = KafkaProducer(bootstrap_servers="127.0.0.1:9092", **batching_options)

started_at = time.time()
for seq in range(2000):
    record = {"i": seq, "x": random.random()}
    producer.send("s4_perf", json.dumps(record).encode())
producer.flush()
elapsed = time.time() - started_at
print("Tempo:", elapsed)


## Slide 5 — Particionamento por Key

In [None]:

from kafka import KafkaProducer, KafkaConsumer

# Send ten records that all carry the same key, then read the topic back and
# print partition, offset, key and value for each record.
producer = KafkaProducer(bootstrap_servers="127.0.0.1:9092")
user_key = b"user-42"
for seq in range(10):
    body = f"msg-{seq}".encode()
    producer.send("s5_keys", key=user_key, value=body)
producer.flush()

# Iteration stops after 2 seconds without new records (consumer_timeout_ms).
reader_settings = {
    "bootstrap_servers": "127.0.0.1:9092",
    "auto_offset_reset": "earliest",
    "consumer_timeout_ms": 2000,
}
consumer = KafkaConsumer("s5_keys", **reader_settings)

for record in consumer:
    key_text = record.key.decode()
    value_text = record.value.decode()
    print(record.partition, record.offset, key_text, value_text)
consumer.close()


## Slide 6 — Consumer Group

In [None]:

from kafka import KafkaProducer
# Publish twenty numbered events to the topic used by the consumer-group demo.
producer = KafkaProducer(bootstrap_servers="127.0.0.1:9092")

events = [f"evento-{n}".encode() for n in range(20)]
for event in events:
    producer.send("s6_group", event)
producer.flush()

print("Produzido para consumer group.")


## Slide 7 — Replay de Offset

In [None]:

from kafka import KafkaConsumer
import time

from kafka import KafkaProducer

REPLAY_TOPIC = "s7_replay"

# Seed the topic first: nothing else in the notebook writes to it, so without
# this the read loop below would wait for records that never arrive.
seed_producer = KafkaProducer(bootstrap_servers="127.0.0.1:9092")
for i in range(5):
    seed_producer.send(REPLAY_TOPIC, f"evento-{i}".encode())
seed_producer.flush()
seed_producer.close()

consumer = KafkaConsumer(
    REPLAY_TOPIC,
    bootstrap_servers="127.0.0.1:9092",
    # commit() stores offsets per consumer group, so a group_id is required;
    # kafka-python rejects commit() when none is configured.
    group_id="s7_replay_group",
    auto_offset_reset="earliest",
    enable_auto_commit=False,
    # Safety net so the loops end instead of hanging if no records arrive; set
    # generously because joining the group takes a while on first use.
    consumer_timeout_ms=15000,
)

# First pass: read five records, then stop.
for i, m in enumerate(consumer):
    print("Lido:", m.offset, m.value.decode())
    if i == 4:
        break

# Persist the current position for the group, then rewind the assigned
# partitions to their beginning to replay already-read records.
consumer.commit()
consumer.seek_to_beginning()

# Second pass: the same topic, read again from the start.
for i, m in enumerate(consumer):
    print("Replay:", m.offset, m.value.decode())
    if i == 4:
        break

consumer.close()


## Slide 8 — Stateful Processing

In [None]:

from kafka import KafkaConsumer
from collections import defaultdict
import json, time

from kafka import KafkaProducer

# Seed the topic with sample events: no other cell writes to it, so without
# this the consumer below would time out without counting anything.
seed_producer = KafkaProducer(
    bootstrap_servers="127.0.0.1:9092",
    value_serializer=lambda d: json.dumps(d).encode(),
)
for tipo in ("click", "view", "click", "compra", "view", "click"):
    seed_producer.send("s8_stateful", {"tipo": tipo})
seed_producer.flush()
seed_producer.close()

consumer = KafkaConsumer(
    "s8_stateful",
    bootstrap_servers="127.0.0.1:9092",
    auto_offset_reset="earliest",
    value_deserializer=lambda b: json.loads(b.decode()),
    consumer_timeout_ms=10000
)

# In-memory state: number of events per "tipo" within the current window.
counts = defaultdict(int)
start = time.time()

for m in consumer:
    counts[m.value["tipo"]] += 1
    # The window is only checked when a record arrives; once more than five
    # seconds have passed, emit the counts and start a new window.
    if time.time()-start > 5:
        print(dict(counts))
        counts.clear()
        start = time.time()

# Emit the last, partially filled window; otherwise whatever was counted
# since the previous print is dropped when the consumer times out.
if counts:
    print(dict(counts))

consumer.close()


## Slide 9 — Observabilidade

In [None]:

import subprocess, os
KAFKA_DIR = os.environ["KAFKA_HOME"]

# Describe every consumer group known to the broker. check=False means a
# non-zero exit status from the tool does not raise here.
describe_groups_cmd = [f"{KAFKA_DIR}/bin/kafka-consumer-groups.sh"]
describe_groups_cmd += ["--bootstrap-server", "127.0.0.1:9092"]
describe_groups_cmd += ["--all-groups", "--describe"]

subprocess.run(describe_groups_cmd, check=False)


## Slide 10 — CDC (Evento de Banco)

In [None]:

from kafka import KafkaProducer, KafkaConsumer
import json, time

# Publish one update-style change event (before/after row images) as JSON,
# then read the topic back and print every decoded event.
def _to_json_bytes(d):
    return json.dumps(d).encode()


def _from_json_bytes(b):
    return json.loads(b.decode())


producer = KafkaProducer(
    bootstrap_servers="127.0.0.1:9092",
    value_serializer=_to_json_bytes,
)

change_event = {
    "op": "u",
    "before": {"id": 10, "status": "pendente"},
    "after": {"id": 10, "status": "pago"},
    "ts": int(time.time() * 1000),  # event time in epoch milliseconds
}
producer.send("s10_cdc", change_event)
producer.flush()

# Iteration ends after 2 seconds without new records (consumer_timeout_ms).
consumer = KafkaConsumer(
    "s10_cdc",
    bootstrap_servers="127.0.0.1:9092",
    auto_offset_reset="earliest",
    value_deserializer=_from_json_bytes,
    consumer_timeout_ms=2000,
)

for record in consumer:
    print(record.value)
consumer.close()
